In [1]:
# required for jupyter notebook
%matplotlib inline 

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

import seaborn as sns
sns.set(rc={'figure.figsize':(8,6)}) # set sns figure size

import os
import math

In [2]:
# Load the raw Gazipur CSV; the markers listed below are parsed as NaN
# (they are passed to read_csv as additional missing-value strings).
missing_values = ['NIL', 'nil', '']
gazipur_csv_path = os.path.join(
    '..', '..', 'Datasets', 'brri-datasets', 'gazipur_2016-2020', 'gazipur.csv'
)
df = pd.read_csv(gazipur_csv_path, na_values=missing_values)

# show five randomly drawn rows as a quick sanity check of the parse
df.sample(5)

Unnamed: 0,Station,Year,Month,Day,Max Temp. (degree Celcius),Min Temp. (degree Celcius),Rainfall (mm),Actual Evaporation (mm),"Relative Humidity (morning, %)","Relative Humidity (afternoon, %)",Sunshine (hour/day),Cloudy (hour/day),Solar Radiation (cal/cm^2/day)
77,Gazipur,2016,3,18,34.0,18.2,5.6,2.6,59.0,44.0,7.5,,401.28
1800,Gazipur,2020,12,5,22.6,16.2,0.0,1.0,90.0,56.0,1.8,8.8,110.27
822,Gazipur,2018,4,2,31.2,20.6,0.0,3.0,62.0,62.0,2.6,9.9,261.08
1426,Gazipur,2019,11,27,30.5,18.6,0.0,2.0,82.0,58.0,7.7,3.2,318.12
267,Gazipur,2016,9,24,33.5,26.6,16.6,3.6,76.0,77.0,3.4,8.8,264.19


In [3]:
def get_weekly_df(_df, num_all_days=7, num_days_before=3):
    '''
    Turn a STATION-WISE daily dataframe into a lagged-window feature frame.

    Every output row corresponds to one "target" row of the input:
      * 'Station', 'Year', 'Month', 'Day' and 'Rainfall (mm)' are copied from
        the target row unchanged;
      * every other column -- and 'Rainfall (mm)' as well, in addition to its
        unchanged copy -- is expanded into 'num_all_days' columns named
        '<column><k>' (k = 0 .. num_all_days-1) holding the values of a window
        of consecutive rows. k = 0 is the oldest row of the window.

    Rows are taken purely by position; the date columns are never inspected,
    so the input is expected to hold one row per day in chronological order.

    example: num_all_days=7, num_days_before=3
        then the row for January 11 has rainfall, year, month and day data of
            Jan 11 and the suffixed columns 0..6 come from Jan 1 to Jan 7
            (Jan 8, 9 and 10 are the 3 skipped days)

    Parameters
    ----------
    _df : pandas.DataFrame
        Daily observations of a single station. It is only read, never
        modified.
    num_all_days : int
        Length of the feature window in rows.
    num_days_before : int
        Number of rows left out between the last row of the window and the
        target row (0 = the window ends on the row just before the target;
        -1 makes the window end on the target row itself).

    Returns
    -------
    pandas.DataFrame
        New frame with max(len(_df) - (num_all_days + num_days_before), 0)
        rows. The first num_all_days + num_days_before input rows have no
        complete window behind them and therefore produce no output row.

    Raises
    ------
    ValueError
        If num_all_days + num_days_before < 0 or num_days_before < -1, i.e.
        the window would start after the target row or run past it.
    '''
    STATION_COL = 'Station'
    MONTH_COL = 'Month'
    YEAR_COL = 'Year'
    DAY_COL = 'Day'
    RAINFALL_COL = 'Rainfall (mm)'
    unchanged_cols = (STATION_COL, MONTH_COL, YEAR_COL, DAY_COL, RAINFALL_COL)

    # distance, in rows, from the first row of a window to its target row
    offset = num_all_days + num_days_before

    # A negative offset would index the value lists with negative positions
    # (silently wrapping around to the end of the data), and a window reaching
    # past the target row runs off the end of the data for the last rows.
    # Reject both up front instead of returning misaligned rows.
    if offset < 0 or num_days_before < -1:
        raise ValueError(
            'the feature window must not start after, or run past, the target row '
            '(got num_all_days={}, num_days_before={})'.format(num_all_days, num_days_before)
        )

    num_out_rows = max(_df.shape[0] - offset, 0)

    new_col_dict = {}
    for col in _df.columns:
        # plain Python list: slicing it replaces the per-element appends, and
        # building the result from lists keeps pandas' dtype inference
        values = _df[col].tolist()

        if col in unchanged_cols:
            # value of the target row itself
            new_col_dict[col] = values[offset:]
            # rainfall is the only unchanged column that is ALSO windowed
            if col != RAINFALL_COL:
                continue

        # output row r targets input row offset + r, whose window starts at
        # input row r; so window position k of every row is values[k + r]
        for day_num in range(num_all_days):
            new_col_dict[col + str(day_num)] = values[day_num:day_num + num_out_rows]

    # form new dataframe from dict and return
    return pd.DataFrame.from_dict(new_col_dict)

In [4]:
# Build the lagged frame with the function defaults: a 7-row feature window
# that ends 3 rows before each target row.
new_df = get_weekly_df(df)

In [7]:
# spot-check five random rows of the lagged frame (suffix 0..6 = position in the window)
new_df.sample(5)

Unnamed: 0,Station,Year,Month,Day,Max Temp. (degree Celcius)0,Max Temp. (degree Celcius)1,Max Temp. (degree Celcius)2,Max Temp. (degree Celcius)3,Max Temp. (degree Celcius)4,Max Temp. (degree Celcius)5,...,Cloudy (hour/day)4,Cloudy (hour/day)5,Cloudy (hour/day)6,Solar Radiation (cal/cm^2/day)0,Solar Radiation (cal/cm^2/day)1,Solar Radiation (cal/cm^2/day)2,Solar Radiation (cal/cm^2/day)3,Solar Radiation (cal/cm^2/day)4,Solar Radiation (cal/cm^2/day)5,Solar Radiation (cal/cm^2/day)6
661,Gazipur,2017,11,2,32.8,31.9,31.5,31.2,31.7,30.5,...,5.8,4.8,4.6,352.33,297.46,222.37,398.54,309.01,337.89,343.17
317,Gazipur,2016,11,23,31.7,31.4,30.8,32.0,32.4,30.8,...,2.3,2.7,1.7,315.56,341.13,330.9,361.59,341.13,330.9,356.48
1517,Gazipur,2020,3,7,28.2,27.5,29.5,32.2,32.2,31.8,...,3.0,5.5,4.0,315.16,346.19,349.02,413.91,442.58,363.15,410.81
121,Gazipur,2016,5,11,36.5,32.2,32.4,33.6,34.7,32.8,...,,,,459.13,277.84,264.89,407.33,342.59,400.86,290.79
871,Gazipur,2018,5,31,28.7,29.8,29.6,31.8,34.0,34.6,...,6.9,6.6,4.5,226.04,187.2,187.2,332.87,394.38,404.09,472.08


In [9]:
# the station/date columns and the target rainfall are carried over without a suffix
new_df[['Station', 'Year', 'Month', 'Day', 'Rainfall (mm)']].sample(5)

Unnamed: 0,Station,Year,Month,Day,Rainfall (mm)
20,Gazipur,2016,1,31,0.0
299,Gazipur,2016,11,5,8.2
427,Gazipur,2017,3,13,0.0
928,Gazipur,2018,7,27,21.0
497,Gazipur,2017,5,22,0.0
