In [1]:
# required for jupyter notebook
%matplotlib inline 

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

import seaborn as sns
sns.set(rc={'figure.figsize':(8,6)}) # set sns figure size

import os
import math

## 1. Read station-wise datasets separately

In [2]:
# read Gazipur raw csv by marking missing values as NaN
missing_values = ['NIL', 'nil', '']
gazipur_df = pd.read_csv(os.path.join('..', '..', 'Datasets', 'brri-datasets', 'gazipur_2016-2020', 'gazipur.csv'), 
                     na_values=missing_values)

gazipur_df.sample(5)

Unnamed: 0,Station,Year,Month,Day,Max Temp. (degree Celcius),Min Temp. (degree Celcius),Rainfall (mm),Actual Evaporation (mm),"Relative Humidity (morning, %)","Relative Humidity (afternoon, %)",Sunshine (hour/day),Cloudy (hour/day),Solar Radiation (cal/cm^2/day)
1030,Gazipur,2018,10,27,34.0,21.8,0.2,4.2,83.0,57.0,7.6,3.9,363.88
601,Gazipur,2017,8,24,37.4,26.7,0.0,5.0,83.0,67.0,10.6,2.3,530.58
1443,Gazipur,2019,12,14,28.4,14.8,0.0,2.0,95.0,53.0,7.3,3.3,284.75
1643,Gazipur,2020,7,1,36.2,27.5,0.0,,74.0,65.0,7.8,5.7,438.29
42,Gazipur,2016,2,12,27.2,16.8,0.0,1.0,71.0,46.0,4.8,,272.83


In [3]:
rangpur_df = pd.read_csv(os.path.join('..', '..', 'Datasets', 'brri-datasets', 'rangpur_mid2017-2020', 'rangpur.csv'), 
                     na_values=missing_values)

rangpur_df.sample(5)

Unnamed: 0,Station,Year,Month,Day,Max Temp. (degree Celcius),Min Temp. (degree Celcius),Rainfall (mm),Actual Evaporation (mm),"Relative Humidity (morning, %)","Relative Humidity (afternoon, %)",Sunshine (hour/day),Cloudy (hour/day),Solar Radiation (cal/cm^2/day)
771,Rangpur,2020,1,11,19.5,11.5,0.0,2.0,94.0,70,3.0,7.7,187.18
778,Rangpur,2020,1,18,24.5,13.8,0.0,1.0,94.0,61,4.3,6.4,218.98
156,Rangpur,2018,5,6,28.2,23.5,0.0,2.0,91.0,76,2.3,2.3,262.21
1040,Rangpur,2020,10,6,31.5,25.2,19.5,2.0,91.0,91,1.0,10.5,171.83
438,Rangpur,2019,2,12,23.5,12.5,0.0,2.0,83.0,53,8.0,3.2,359.42


In [4]:
barisal_df = pd.read_csv(os.path.join('..', '..', 'Datasets', 'brri-datasets', 'barisal_2017-2020', 'barisal.csv'), 
                     na_values=missing_values)

barisal_df.sample(5)

Unnamed: 0,Station,Year,Month,Day,Max Temp. (degree Celcius),Min Temp. (degree Celcius),Rainfall (mm),Actual Evaporation (mm),"Relative Humidity (morning, %)","Relative Humidity (afternoon, %)",Sunshine (hour/day),Cloudy (hour/day),Solar Radiation (cal/cm^2/day)
764,Barisal,2019,2,12,29.0,11.8,0.0,3.0,87,43.0,8.5,2.7,384.38
1432,Barisal,2020,12,11,20.6,17.0,0.0,1.0,88,90.0,2.1,8.6,164.59
711,Barisal,2018,12,21,22.4,12.6,0.0,1.0,94,55.0,6.0,4.7,259.65
663,Barisal,2018,11,3,33.6,22.2,,2.0,95,67.0,7.1,3.9,308.58
384,Barisal,2018,1,28,24.6,10.8,0.0,2.0,93,49.0,7.2,3.6,300.45


In [5]:
habiganj_df = pd.read_csv(os.path.join('..', '..', 'Datasets', 'brri-datasets', 'habiganj_2019-2020', 'habiganj.csv'), 
                     na_values=missing_values)

habiganj_df.sample(5)

Unnamed: 0,Station,Year,Month,Day,Max Temp. (degree Celcius),Min Temp. (degree Celcius),Rainfall (mm),Actual Evaporation (mm),"Relative Humidity (morning, %)","Relative Humidity (afternoon, %)",Sunshine (hour/day),Cloudy (hour/day),Solar Radiation (cal/cm^2/day)
505,Habiganj,2020,5,20,30.1,24.5,88.5,,91,75,2.1,11.2,446.66
52,Habiganj,2019,2,22,27.8,18.8,0.0,,100,79,2.4,8.8,203.92
308,Habiganj,2019,11,5,33.2,22.5,0.0,,90,64,8.7,2.2,341.43
62,Habiganj,2019,3,4,23.5,19.2,7.8,1.0,85,64,1.6,10.2,213.31
74,Habiganj,2019,3,16,31.0,19.1,0.0,1.0,86,80,9.0,2.8,447.84


## 2. Pre-process each station's dataset with the techniques used in 'brri-dataset_pre-process.ipynb' notebook

### 2.1. Replace invalid values with NaN

- Max/Min Temp. (degree Celcius) > 50 
- Relative Humidity (afternoon, %) > 100, 
- Sunshine/Cloudy (hour/day) > 24, 
- Solar Radiation (cal/cm^2/day) > 1000 (from the box plot)

### 2.2. Fill up missing values with monthly average (DO NOT drop values that are still missing after fillup)

In [6]:
def pre_process(_df):
    df = _df.copy()
    
    # apply step 2.1
    df.loc[df['Max Temp. (degree Celcius)'] > 50, 'Max Temp. (degree Celcius)'] = math.nan
    df.loc[df['Min Temp. (degree Celcius)'] > 50, 'Min Temp. (degree Celcius)'] = math.nan
    df.loc[df['Relative Humidity (afternoon, %)'] > 100, 'Relative Humidity (afternoon, %)'] = math.nan
    df.loc[df['Sunshine (hour/day)'] > 24, 'Sunshine (hour/day)'] = math.nan
    df.loc[df['Cloudy (hour/day)'] > 24, 'Cloudy (hour/day)'] = math.nan
    df.loc[df['Solar Radiation (cal/cm^2/day)'] > 1000, 'Solar Radiation (cal/cm^2/day)'] = math.nan
    
    # apply step 2.2
    for column in df.columns:
        if column in ['Station', 'Year', 'Month', 'Day']:
            continue

        df[column] = df.groupby(['Station', 'Month'])[column].transform(
            lambda grp: grp.fillna(np.mean(grp))
        )
        
    # cannot drop missing values here it will mess up the average calculation
    # df.dropna(inplace=True)
    
    return df

def show_missing_data(_df):
    df = _df.copy()
    total_cnt = df.shape[0]
    missing_cnt = df.shape[0]-df.dropna().shape[0]
    print(f'Total instances={total_cnt}, missing={missing_cnt}({round(missing_cnt*100.0/total_cnt, 2)}%)')

In [7]:
gazipur_preProcessed_df = pre_process(gazipur_df)
# gazipur_df.sample(5)
habiganj_preProcessed_df = pre_process(habiganj_df)
barisal_preProcessed_df = pre_process(barisal_df)
rangpur_preProcessed_df = pre_process(rangpur_df)

In [8]:
show_missing_data(gazipur_preProcessed_df)
show_missing_data(habiganj_preProcessed_df)
show_missing_data(rangpur_preProcessed_df)
show_missing_data(barisal_preProcessed_df)

print()

print(gazipur_preProcessed_df.shape, rangpur_preProcessed_df.shape,
      barisal_preProcessed_df.shape, habiganj_preProcessed_df.shape) 

Total instances=1827, missing=0(0.0%)
Total instances=547, missing=210(38.39%)
Total instances=1127, missing=0(0.0%)
Total instances=1453, missing=0(0.0%)

(1827, 13) (1127, 13) (1453, 13) (547, 13)


## 3. Form each station wise weekly-all datasets

In [9]:
def get_weekly_df(_df, num_all_days=7, num_days_before=3):
    '''
    input STATION-WISE dataframe with all expected columns
    returns dataframe with station, month, day and rainfall columns unchanged 
        and other features of 'num_all_days' starting from 'num_days_before' ago
        
    example: num_all_days=7, num_days_before=3
        then rows for January 11 will have rainfall, year, month and day data of Jan 11 
            and other columns will be from Jan 1 to 7
    '''
    df=_df.copy()

    STATION_COL = 'Station'    
    MONTH_COL = 'Month'
    YEAR_COL = 'Year'
    DAY_COL = 'Day'
    RAINFALL_COL = 'Rainfall (mm)'

    # populate dict with daily features
    curr_col_dict = {}
    for col in df.columns:
        curr_col_dict[col] = []
        for val in df[col]:
            curr_col_dict[col].append(val)

    # initialize new columns dict 
    new_col_dict = {}
    for col in df.columns:
        # station, month, year, day columns unchanged
        if col in [STATION_COL, MONTH_COL, YEAR_COL, DAY_COL, RAINFALL_COL]:
            new_col_dict[col] = []
            if col != RAINFALL_COL: 
                continue

        for day_num in range(0, num_all_days):
            new_col_dict[col+str(day_num)] = []

    # populate the new columns dict
    for curr_day in range(num_all_days+num_days_before, df.shape[0]):
        start_day = curr_day-(num_all_days+num_days_before)
        end_day = start_day+num_all_days-1

        for col in df.columns:
            # station, month, year, day columns unchanged
            if col in [STATION_COL, MONTH_COL, YEAR_COL, DAY_COL, RAINFALL_COL]:
                new_col_dict[col].append(curr_col_dict[col][curr_day])
                if col != RAINFALL_COL: 
                    continue

            idx=0
            for day in range(start_day, end_day+1):
                new_col_dict[col+str(idx)].append(curr_col_dict[col][day])
                idx+=1
            
    #form new dataframe from dict and return
    return pd.DataFrame.from_dict(new_col_dict)

In [10]:
gazipur_preProcessed_df = get_weekly_df(gazipur_preProcessed_df)
rangpur_preProcessed_df = get_weekly_df(rangpur_preProcessed_df)
barisal_preProcessed_df = get_weekly_df(barisal_preProcessed_df)
habiganj_preProcessed_df = get_weekly_df(habiganj_preProcessed_df)

In [11]:
barisal_preProcessed_df.sample(5)

Unnamed: 0,Station,Year,Month,Day,Max Temp. (degree Celcius)0,Max Temp. (degree Celcius)1,Max Temp. (degree Celcius)2,Max Temp. (degree Celcius)3,Max Temp. (degree Celcius)4,Max Temp. (degree Celcius)5,...,Cloudy (hour/day)4,Cloudy (hour/day)5,Cloudy (hour/day)6,Solar Radiation (cal/cm^2/day)0,Solar Radiation (cal/cm^2/day)1,Solar Radiation (cal/cm^2/day)2,Solar Radiation (cal/cm^2/day)3,Solar Radiation (cal/cm^2/day)4,Solar Radiation (cal/cm^2/day)5,Solar Radiation (cal/cm^2/day)6
390,Barisal,2018,2,13,27.0,29.4,28.8,28.2,29.7,27.0,...,6.5,7.5,4.7,321.13,275.13,280.88,243.5,275.13,246.38,326.88
1354,Barisal,2020,10,4,28.0,28.6,29.6,31.6,32.0,33.0,...,4.7,8.0,9.1,162.6,193.25,248.43,300.54,392.51,291.35,257.63
101,Barisal,2017,4,22,34.6,35.2,35.4,34.4,32.4,34.0,...,6.6,3.8,3.7,442.44,468.31,452.14,465.05,371.28,461.85,465.08
356,Barisal,2018,1,10,27.4,24.0,25.6,23.8,21.2,20.6,...,4.2,6.0,4.1,276.717196,181.74,239.84,290.35,285.3,239.84,287.82
619,Barisal,2018,9,30,32.4,29.0,32.2,32.8,32.2,34.0,...,9.9,5.8,8.2,401.7,322.0,340.39,404.77,233.1,358.79,285.22


## 4. Drop missing values

In [12]:
gazipur_preProcessed_df.dropna(inplace=True)
habiganj_preProcessed_df.dropna(inplace=True)
rangpur_preProcessed_df.dropna(inplace=True)
barisal_preProcessed_df.dropna(inplace=True)

show_missing_data(gazipur_preProcessed_df)
show_missing_data(habiganj_preProcessed_df)
show_missing_data(rangpur_preProcessed_df)
show_missing_data(barisal_preProcessed_df)

print()

print(gazipur_preProcessed_df.shape, rangpur_preProcessed_df.shape,
      barisal_preProcessed_df.shape, habiganj_preProcessed_df.shape) 

Total instances=1817, missing=0(0.0%)
Total instances=315, missing=0(0.0%)
Total instances=1117, missing=0(0.0%)
Total instances=1443, missing=0(0.0%)

(1817, 68) (1117, 68) (1443, 68) (315, 68)


## 5. Merge stationwise separate datasets into a single dataset

In [13]:
merged_preProcessedWeekly_df = pd.concat([gazipur_preProcessed_df, habiganj_preProcessed_df, 
                                   rangpur_preProcessed_df, barisal_preProcessed_df])

merged_preProcessedWeekly_df.sample(5)

Unnamed: 0,Station,Year,Month,Day,Max Temp. (degree Celcius)0,Max Temp. (degree Celcius)1,Max Temp. (degree Celcius)2,Max Temp. (degree Celcius)3,Max Temp. (degree Celcius)4,Max Temp. (degree Celcius)5,...,Cloudy (hour/day)4,Cloudy (hour/day)5,Cloudy (hour/day)6,Solar Radiation (cal/cm^2/day)0,Solar Radiation (cal/cm^2/day)1,Solar Radiation (cal/cm^2/day)2,Solar Radiation (cal/cm^2/day)3,Solar Radiation (cal/cm^2/day)4,Solar Radiation (cal/cm^2/day)5,Solar Radiation (cal/cm^2/day)6
277,Rangpur,2018,9,14,31.2,32.2,32.5,30.5,31.2,30.5,...,7.9,9.9,12.2,341.4,350.47,432.01,401.81,290.06,229.66,160.2
978,Gazipur,2018,9,15,35.0,33.5,33.6,33.8,35.5,34.5,...,4.8,7.9,9.8,482.73,473.63,385.6,334.0,385.6,291.51,233.84
525,Rangpur,2019,5,20,32.3,32.7,33.5,30.5,30.2,30.8,...,5.3,7.0,8.3,359.54,317.36,294.65,262.21,447.13,391.98,349.81
1426,Gazipur,2019,12,7,30.5,31.5,31.2,31.0,29.8,28.5,...,4.2,4.4,2.8,318.12,333.46,330.9,333.46,263.23,258.45,296.7
685,Rangpur,2019,10,27,30.5,31.2,31.5,32.2,30.5,27.2,...,10.5,5.2,10.2,380.91,372.31,315.03,266.35,171.83,323.63,171.83


## 6. Drop Year, Day columns

In [14]:
merged_preProcessedWeekly_df.drop(columns=['Year', 'Day'], inplace=True)

## 7. Convert categorical 'Station' column to numeric with One-Hot-Encoding

In [15]:
merged_preProcessedWeekly_df = pd.get_dummies(merged_preProcessedWeekly_df, columns=['Station'])

merged_preProcessedWeekly_df.sample(5)

Unnamed: 0,Month,Max Temp. (degree Celcius)0,Max Temp. (degree Celcius)1,Max Temp. (degree Celcius)2,Max Temp. (degree Celcius)3,Max Temp. (degree Celcius)4,Max Temp. (degree Celcius)5,Max Temp. (degree Celcius)6,Min Temp. (degree Celcius)0,Min Temp. (degree Celcius)1,...,Solar Radiation (cal/cm^2/day)1,Solar Radiation (cal/cm^2/day)2,Solar Radiation (cal/cm^2/day)3,Solar Radiation (cal/cm^2/day)4,Solar Radiation (cal/cm^2/day)5,Solar Radiation (cal/cm^2/day)6,Station_Barisal,Station_Gazipur,Station_Habiganj,Station_Rangpur
1008,9,33.5,33.5,32.8,29.2,27.5,31.2,33.2,27.2,27.7,...,250.8,160.2,160.2,119.46,311.2,281.0,0,0,0,1
575,8,29.4,29.0,31.8,33.6,32.6,32.0,32.0,25.4,25.4,...,305.33,470.11,509.65,381.13,447.04,512.95,1,0,0,0
190,6,31.2,34.2,34.5,32.3,30.5,31.2,32.5,26.2,28.2,...,319.53,384.0,287.3,287.3,351.76,329.2,0,0,0,1
1179,4,37.6,33.8,36.8,33.0,34.0,35.6,31.0,20.0,20.4,...,410.09,423.03,261.3,426.26,474.78,248.36,1,0,0,0
848,5,34.4,25.5,26.5,22.8,31.5,31.8,30.6,23.4,19.8,...,176.6,423.97,255.18,301.21,387.91,336.11,0,1,0,0


## 8. Create the classification dataset

In [16]:
def rain_classify(_df):
    df = _df.copy()
    
    # todo: implement
    
    return df

In [17]:
# merged_preProcessedWeekly_clf_df = rain_classify(merged_preProcessedWeekly_df)
# merged_preProcessedWeekly_clf_df.sample(5)

## Save the pre-processed and merged datasets

In [18]:
merged_preProcessedWeekly_df.to_csv(os.path.join('..', '..', 'Datasets', 'brri-datasets', 'pre-processed', 'brri-weather_preprocessedWeeklyAll_regression.csv'), index=False)
# merged_preProcessedWeekly_clf_df.to_csv(os.path.join('..', '..', 'Datasets', 'brri-datasets', 'pre-processed', 'brri-weather_preprocessedWeeklyAll_classification.csv'), index=False)

## 9. Train-Test split in 80:20 ratio and scale both datasets using train set

In [19]:
def splitTrainTest_and_scale(_df, class_label, is_regression=False):
    from sklearn.model_selection import train_test_split
    from sklearn.preprocessing import MinMaxScaler
    
    df = _df.copy()
    
    X_all = df.drop(columns=class_label)
    y_all = df[class_label]

    if(is_regression):
        X_train, X_test, y_train, y_test = train_test_split(X_all, y_all, test_size=0.2, random_state=42, shuffle=True)
    else:    
        X_train, X_test, y_train, y_test = train_test_split(X_all, y_all, test_size=0.2, random_state=42, shuffle=True, stratify=y_all)

    # scale the dataset
    scaler = MinMaxScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_train = pd.DataFrame(X_train_scaled, index=X_train.index, columns=X_train.columns)
    X_test_scaled = scaler.transform(X_test)
    X_test = pd.DataFrame(X_test_scaled, index=X_test.index, columns=X_test.columns)
    
    # concat X, y
    train_df = pd.concat([X_train, y_train], axis=1).reset_index(drop=True)
    test_df = pd.concat([X_test, y_test], axis=1).reset_index(drop=True)
    
    return train_df, test_df

In [20]:
merged_preProcessed_train_df, merged_preProcessed_test_df = splitTrainTest_and_scale(merged_preProcessedWeekly_df, \
                                                               class_label='Rainfall (mm)', \
                                                               is_regression=True)

merged_preProcessed_train_df.sample(5)

Unnamed: 0,Month,Max Temp. (degree Celcius)0,Max Temp. (degree Celcius)1,Max Temp. (degree Celcius)2,Max Temp. (degree Celcius)3,Max Temp. (degree Celcius)4,Max Temp. (degree Celcius)5,Max Temp. (degree Celcius)6,Min Temp. (degree Celcius)0,Min Temp. (degree Celcius)1,...,Solar Radiation (cal/cm^2/day)2,Solar Radiation (cal/cm^2/day)3,Solar Radiation (cal/cm^2/day)4,Solar Radiation (cal/cm^2/day)5,Solar Radiation (cal/cm^2/day)6,Station_Barisal,Station_Gazipur,Station_Habiganj,Station_Rangpur,Rainfall (mm)
1207,0.727273,0.798046,0.754153,0.739414,0.74744,0.71987,0.72093,0.687296,0.85466,0.828767,...,0.23648,0.317039,0.32358,0.287597,0.356298,1.0,0.0,0.0,0.0,55.0
2079,0.545455,0.700326,0.554817,0.583062,0.56314,0.654723,0.641196,0.641694,0.848341,0.80137,...,0.222683,0.175326,0.354195,0.388588,0.426428,1.0,0.0,0.0,0.0,10.8
3686,0.636364,0.80456,0.621262,0.771987,0.617747,0.70684,0.840532,0.693811,0.873618,0.821918,...,0.33988,0.176169,0.348486,0.337933,0.443438,1.0,0.0,0.0,0.0,6.0
642,0.181818,0.674267,0.700997,0.700326,0.733788,0.732899,0.787375,0.674267,0.557662,0.520548,...,0.436556,0.300765,0.355988,0.34679,0.285431,1.0,0.0,0.0,0.0,1.402105
3232,0.272727,0.837134,0.774086,0.863192,0.849829,0.771987,0.641196,0.583062,0.734597,0.705479,...,0.615177,0.345658,0.272839,0.276307,0.161881,0.0,1.0,0.0,0.0,45.2


In [21]:
merged_preProcessed_train_df.to_csv(os.path.join('..', '..', 'Datasets', 'brri-datasets', 'final-dataset', 'train', 'brri-weather_weekly-all_train_regression.csv'), index=False)
merged_preProcessed_test_df.to_csv(os.path.join('..', '..', 'Datasets', 'brri-datasets', 'final-dataset', 'test', 'brri-weather_weekly-all_test_regression.csv'), index=False)