In [1]:
# required for jupyter notebook
%matplotlib inline 

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

import seaborn as sns
sns.set(rc={'figure.figsize':(8,6)}) # set sns figure size

import os
import math

## 1. Read station-wise datasets separately

In [2]:
# Strings that must be read as missing (NaN) when parsing the raw station CSVs
missing_values = ['NIL', 'nil', '']

# Load the raw Gazipur station file
gazipur_csv_path = os.path.join(
    '..', '..', 'Datasets', 'brri-datasets', 'gazipur_2016-2020', 'gazipur.csv'
)
gazipur_df = pd.read_csv(gazipur_csv_path, na_values=missing_values)

# Preview five random rows
gazipur_df.sample(5)

Unnamed: 0,Station,Year,Month,Day,Max Temp. (degree Celcius),Min Temp. (degree Celcius),Rainfall (mm),Actual Evaporation (mm),"Relative Humidity (morning, %)","Relative Humidity (afternoon, %)",Sunshine (hour/day),Cloudy (hour/day),Solar Radiation (cal/cm^2/day)
1595,Gazipur,2020,5,14,34.8,24.2,8.6,5.6,80.0,62.0,7.0,6.3,413.81
32,Gazipur,2016,2,2,28.4,15.8,0.0,3.0,100.0,59.0,5.2,,284.12
1686,Gazipur,2020,8,13,34.8,27.4,0.0,4.0,77.0,65.0,8.4,4.5,458.57
344,Gazipur,2016,12,10,28.4,14.8,0.0,2.0,100.0,49.0,8.0,2.6,301.49
989,Gazipur,2018,9,16,35.3,26.4,0.0,4.0,80.0,67.0,9.4,2.8,446.31


In [3]:
# Load the raw Rangpur station file, parsing the `missing_values` markers as NaN
rangpur_csv_path = os.path.join(
    '..', '..', 'Datasets', 'brri-datasets', 'rangpur_mid2017-2020', 'rangpur.csv'
)
rangpur_df = pd.read_csv(rangpur_csv_path, na_values=missing_values)

# Preview five random rows
rangpur_df.sample(5)

Unnamed: 0,Station,Year,Month,Day,Max Temp. (degree Celcius),Min Temp. (degree Celcius),Rainfall (mm),Actual Evaporation (mm),"Relative Humidity (morning, %)","Relative Humidity (afternoon, %)",Sunshine (hour/day),Cloudy (hour/day),Solar Radiation (cal/cm^2/day)
719,Rangpur,2019,11,20,28.2,14.8,0.0,2.0,80.0,53,6.3,4.6,278.59
1000,Rangpur,2020,8,27,32.6,26.5,0.0,6.0,83.0,70,9.3,3.6,488.56
984,Rangpur,2020,8,11,32.5,26.2,5.5,3.0,96.0,87,0.0,12.9,183.8
1066,Rangpur,2020,11,1,32.2,21.6,0.0,3.0,82.0,63,7.3,3.6,303.82
614,Rangpur,2019,8,7,34.2,29.8,0.0,5.0,79.0,61,5.3,7.6,357.48


In [4]:
# Load the raw Barisal station file, parsing the `missing_values` markers as NaN
barisal_csv_path = os.path.join(
    '..', '..', 'Datasets', 'brri-datasets', 'barisal_2017-2020', 'barisal.csv'
)
barisal_df = pd.read_csv(barisal_csv_path, na_values=missing_values)

# Preview five random rows
barisal_df.sample(5)

Unnamed: 0,Station,Year,Month,Day,Max Temp. (degree Celcius),Min Temp. (degree Celcius),Rainfall (mm),Actual Evaporation (mm),"Relative Humidity (morning, %)","Relative Humidity (afternoon, %)",Sunshine (hour/day),Cloudy (hour/day),Solar Radiation (cal/cm^2/day)
451,Barisal,2018,4,5,35.0,21.0,0.2,2.2,95,57.0,9.1,3.5,471.55
1419,Barisal,2020,11,28,28.8,15.8,0.0,2.0,89,56.0,7.1,3.9,308.58
1382,Barisal,2020,10,22,26.4,24.6,145.2,0.0,91,91.0,0.0,11.6,146.8
776,Barisal,2019,2,24,31.0,21.0,0.0,2.0,95,48.0,8.1,3.1,372.88
453,Barisal,2018,4,7,34.0,22.0,22.8,1.8,95,47.0,3.8,8.8,300.11


In [5]:
# Load the raw Habiganj station file, parsing the `missing_values` markers as NaN
habiganj_csv_path = os.path.join(
    '..', '..', 'Datasets', 'brri-datasets', 'habiganj_2019-2020', 'habiganj.csv'
)
habiganj_df = pd.read_csv(habiganj_csv_path, na_values=missing_values)

# Preview five random rows
habiganj_df.sample(5)

Unnamed: 0,Station,Year,Month,Day,Max Temp. (degree Celcius),Min Temp. (degree Celcius),Rainfall (mm),Actual Evaporation (mm),"Relative Humidity (morning, %)","Relative Humidity (afternoon, %)",Sunshine (hour/day),Cloudy (hour/day),Solar Radiation (cal/cm^2/day)
399,Habiganj,2020,2,4,20.8,12.8,0.0,,94,78,8.1,3.1,318.94
122,Habiganj,2019,5,3,26.5,23.2,2.8,,100,95,3.0,10.3,284.62
475,Habiganj,2020,4,20,20.8,22.5,0.0,,88,63,8.5,4.0,452.29
217,Habiganj,2019,8,6,33.2,27.8,0.0,5.0,81,75,9.9,3.0,507.67
424,Habiganj,2020,2,29,29.5,18.2,0.0,,78,79,,1.4,318.94


## 2. Pre-process each station's dataset with the techniques used in 'brri-dataset_pre-process.ipynb' notebook

### 2.1. Replace invalid values with NaN

- Max/Min Temp. (degree Celcius) > 50 
- Relative Humidity (afternoon, %) > 100
- Sunshine/Cloudy (hour/day) > 24
- Solar Radiation (cal/cm^2/day) > 1000 (from the box plot)

### 2.2. Fill missing values with the per-station monthly average (DO NOT drop rows that are still missing after the fill-up)

In [6]:
def pre_process(_df):
    """
    Clean one station's daily dataframe (steps 2.1 and 2.2).

    2.1 - readings above the per-column upper limits listed below are
          replaced with NaN.
    2.2 - NaNs in every column except Station/Year/Month/Day are filled with
          the mean of that column within the same (Station, Month) group.

    The input frame is left untouched; a cleaned copy is returned. Rows that
    still contain NaN after the fill-up are kept on purpose (no dropna here).
    """
    # step 2.1: upper validity limit per column
    upper_limits = {
        'Max Temp. (degree Celcius)': 50,
        'Min Temp. (degree Celcius)': 50,
        'Relative Humidity (afternoon, %)': 100,
        'Sunshine (hour/day)': 24,
        'Cloudy (hour/day)': 24,
        'Solar Radiation (cal/cm^2/day)': 1000,
    }
    # columns that identify a row and are never filled
    key_columns = ('Station', 'Year', 'Month', 'Day')

    cleaned = _df.copy()

    for name, limit in upper_limits.items():
        cleaned.loc[cleaned[name] > limit, name] = math.nan

    # step 2.2: group-wise mean fill, one column at a time
    for name in cleaned.columns:
        if name not in key_columns:
            cleaned[name] = cleaned.groupby(['Station', 'Month'])[name].transform(
                lambda values: values.fillna(np.mean(values))
            )

    return cleaned

def show_missing_data(_df):
    """
    Print how many rows of `_df` contain at least one missing value.

    The printed line has the form
        'Total instances=<rows>, missing=<rows with NaN>(<percent>%)'
    where the percentage is rounded to two decimals. An empty dataframe is
    reported as 0.0% missing instead of raising ZeroDivisionError.

    Nothing is returned and `_df` is not modified.
    """
    total_cnt = len(_df)
    # dropna() already returns a new frame, so no defensive copy is required
    missing_cnt = total_cnt - len(_df.dropna())
    # guard the percentage against a frame with zero rows
    missing_pct = round(missing_cnt * 100.0 / total_cnt, 2) if total_cnt else 0.0
    print(f'Total instances={total_cnt}, missing={missing_cnt}({missing_pct}%)')

In [7]:
# Clean every station's raw frame with the same routine (steps 2.1 and 2.2)
(gazipur_preProcessed_df,
 habiganj_preProcessed_df,
 barisal_preProcessed_df,
 rangpur_preProcessed_df) = [
    pre_process(raw_station_df)
    for raw_station_df in (gazipur_df, habiganj_df, barisal_df, rangpur_df)
]

In [8]:
# One report line per station: rows that still contain NaN after the fill-up
for station_df in (gazipur_preProcessed_df, habiganj_preProcessed_df,
                   rangpur_preProcessed_df, barisal_preProcessed_df):
    show_missing_data(station_df)

print()

# Shapes, printed in the order Gazipur, Rangpur, Barisal, Habiganj
station_shapes = [
    frame.shape
    for frame in (gazipur_preProcessed_df, rangpur_preProcessed_df,
                  barisal_preProcessed_df, habiganj_preProcessed_df)
]
print(*station_shapes)

Total instances=1827, missing=0(0.0%)
Total instances=547, missing=210(38.39%)
Total instances=1127, missing=0(0.0%)
Total instances=1453, missing=0(0.0%)

(1827, 13) (1127, 13) (1453, 13) (547, 13)


## 3. Form the weekly-all dataset for each station

In [9]:
def get_weekly_df(_df, num_all_days=7, num_days_before=3):
    '''
    Build the lagged "weekly" feature table from one STATION-WISE dataframe.

    For every target row (from position num_all_days + num_days_before on)
    the result keeps that row's Station, Year, Month, Day and 'Rainfall (mm)'
    values unchanged. Every other column -- and 'Rainfall (mm)' as well -- is
    expanded into `num_all_days` columns named '<column>0', '<column>1', ...
    holding the values of the window that starts
    num_all_days + num_days_before rows before the target row.

    example: num_all_days=7, num_days_before=3
        the row for January 11 keeps the rainfall, year, month and day of
        Jan 11, while the numbered columns hold the values of Jan 1 to Jan 7

    NOTE: windows are taken purely by row position, so the input presumably
    has to hold one row per day in chronological order -- TODO confirm.
    '''
    STATION_COL = 'Station'
    MONTH_COL = 'Month'
    YEAR_COL = 'Year'
    DAY_COL = 'Day'
    RAINFALL_COL = 'Rainfall (mm)'

    # columns copied as-is from the target row
    passthrough_cols = [STATION_COL, MONTH_COL, YEAR_COL, DAY_COL, RAINFALL_COL]

    # distance (in rows) between the first window row and the target row
    lookback = num_all_days + num_days_before
    target_positions = range(lookback, _df.shape[0])

    weekly = {}
    for col in _df.columns:
        values = list(_df[col])

        if col in passthrough_cols:
            weekly[col] = [values[pos] for pos in target_positions]
            # rainfall is the only passthrough column that is also lagged
            if col != RAINFALL_COL:
                continue

        for offset in range(num_all_days):
            weekly[col + str(offset)] = [
                values[pos - lookback + offset] for pos in target_positions
            ]

    # dict insertion order fixes the column order of the result
    return pd.DataFrame.from_dict(weekly)

In [10]:
# Replace each station's cleaned daily frame with its weekly-window version.
# NOTE: every name is re-bound to a result computed from itself, so running this
# cell a second time would feed the already-windowed frame back into get_weekly_df.
gazipur_preProcessed_df = get_weekly_df(gazipur_preProcessed_df)
rangpur_preProcessed_df = get_weekly_df(rangpur_preProcessed_df)
barisal_preProcessed_df = get_weekly_df(barisal_preProcessed_df)
habiganj_preProcessed_df = get_weekly_df(habiganj_preProcessed_df)

In [11]:
# Preview five random rows of the Barisal frame returned by get_weekly_df
barisal_preProcessed_df.sample(5)

Unnamed: 0,Station,Year,Month,Day,Max Temp. (degree Celcius)0,Max Temp. (degree Celcius)1,Max Temp. (degree Celcius)2,Max Temp. (degree Celcius)3,Max Temp. (degree Celcius)4,Max Temp. (degree Celcius)5,...,Cloudy (hour/day)4,Cloudy (hour/day)5,Cloudy (hour/day)6,Solar Radiation (cal/cm^2/day)0,Solar Radiation (cal/cm^2/day)1,Solar Radiation (cal/cm^2/day)2,Solar Radiation (cal/cm^2/day)3,Solar Radiation (cal/cm^2/day)4,Solar Radiation (cal/cm^2/day)5,Solar Radiation (cal/cm^2/day)6
243,Barisal,2017,9,15,33.0,34.0,34.6,30.2,32.0,31.8,...,9.3,5.3,9.8,306.674262,389.440328,432.356066,171.79623,251.496885,374.113279,236.169836
1344,Barisal,2020,9,24,32.2,31.6,32.2,32.8,33.8,33.8,...,3.7,3.7,3.8,294.41,358.79,401.7,413.96,423.16,423.16,420.09
503,Barisal,2018,6,6,34.8,35.2,34.2,35.6,32.6,33.4,...,11.3,9.0,8.3,499.27,460.21,447.19,443.93,248.64,334.25,356.82
40,Barisal,2017,2,20,30.4,29.8,30.0,29.0,29.8,29.8,...,4.3,3.9,3.7,347.0,321.13,341.25,321.13,338.38,349.88,355.63
294,Barisal,2017,11,5,31.8,31.0,30.6,28.6,21.0,28.2,...,11.16,4.36,3.7,327.262759,344.726897,341.816207,231.21,146.8,344.726897,313.774364


## 4. Drop missing values

In [12]:
station_weekly_frames = (gazipur_preProcessed_df, habiganj_preProcessed_df,
                         rangpur_preProcessed_df, barisal_preProcessed_df)

# Remove, in place, every row that still holds a NaN in any column
for weekly_frame in station_weekly_frames:
    weekly_frame.dropna(inplace=True)

# Each report line should now show missing=0
for weekly_frame in station_weekly_frames:
    show_missing_data(weekly_frame)

print()

# Shapes, printed in the order Gazipur, Rangpur, Barisal, Habiganj
print(*(frame.shape for frame in (gazipur_preProcessed_df, rangpur_preProcessed_df,
                                  barisal_preProcessed_df, habiganj_preProcessed_df)))

Total instances=1817, missing=0(0.0%)
Total instances=315, missing=0(0.0%)
Total instances=1117, missing=0(0.0%)
Total instances=1443, missing=0(0.0%)

(1817, 68) (1117, 68) (1443, 68) (315, 68)


## 5. Merge the separate station-wise datasets into a single dataset

In [13]:
# Stack the four station tables row-wise (Gazipur, Habiganj, Rangpur, Barisal).
# concat is called without ignore_index, so each table keeps its own row labels.
frames_to_merge = [
    gazipur_preProcessed_df,
    habiganj_preProcessed_df,
    rangpur_preProcessed_df,
    barisal_preProcessed_df,
]
merged_preProcessedWeekly_df = pd.concat(frames_to_merge)

# Preview five random rows
merged_preProcessedWeekly_df.sample(5)

Unnamed: 0,Station,Year,Month,Day,Max Temp. (degree Celcius)0,Max Temp. (degree Celcius)1,Max Temp. (degree Celcius)2,Max Temp. (degree Celcius)3,Max Temp. (degree Celcius)4,Max Temp. (degree Celcius)5,...,Cloudy (hour/day)4,Cloudy (hour/day)5,Cloudy (hour/day)6,Solar Radiation (cal/cm^2/day)0,Solar Radiation (cal/cm^2/day)1,Solar Radiation (cal/cm^2/day)2,Solar Radiation (cal/cm^2/day)3,Solar Radiation (cal/cm^2/day)4,Solar Radiation (cal/cm^2/day)5,Solar Radiation (cal/cm^2/day)6
778,Barisal,2019,3,8,21.8,25.0,24.8,25.6,25.8,26.4,...,2.2,2.7,8.1,232.0,232.0,252.13,463.65,473.19,457.28,285.49
1011,Rangpur,2020,9,17,29.2,27.5,31.2,33.2,33.5,31.5,...,10.2,12.2,10.9,160.2,119.46,311.2,281.0,220.6,160.2,119.46
429,Habiganj,2020,3,15,29.8,24.2,28.8,29.2,28.5,29.3,...,2.3,1.8,3.1,400.3,406.64,162.6,349.59,463.68,479.59,438.33
398,Gazipur,2017,2,12,27.3,27.2,29.2,32.3,31.6,31.2,...,3.7,2.6,1.9,346.19,380.05,396.98,374.41,349.02,380.05,399.8
1417,Gazipur,2019,11,28,30.5,30.8,30.5,30.5,29.6,29.8,...,4.2,3.2,5.2,254.18,256.74,292.54,279.76,292.54,318.12,266.97


## 6. Drop Year, Day columns

In [14]:
# Remove the Year and Day columns.
# Re-binding (instead of inplace=True) together with errors='ignore' keeps this cell
# re-runnable: a second run is a no-op instead of a KeyError for the missing columns.
merged_preProcessedWeekly_df = merged_preProcessedWeekly_df.drop(columns=['Year', 'Day'], errors='ignore')

## 7. Convert categorical 'Station' column to numeric with One-Hot-Encoding

In [15]:
# One-hot encode the station name into Station_<name> indicator columns.
# dtype=int pins the indicators to 0/1 integers; without it the dtype depends on the
# pandas version (uint8 before 2.0, bool from 2.0 on), which changes what the CSVs
# written later contain (0/1 vs. True/False).
merged_preProcessedWeekly_df = pd.get_dummies(merged_preProcessedWeekly_df, columns=['Station'], dtype=int)

# Preview five random rows
merged_preProcessedWeekly_df.sample(5)

Unnamed: 0,Month,Max Temp. (degree Celcius)0,Max Temp. (degree Celcius)1,Max Temp. (degree Celcius)2,Max Temp. (degree Celcius)3,Max Temp. (degree Celcius)4,Max Temp. (degree Celcius)5,Max Temp. (degree Celcius)6,Min Temp. (degree Celcius)0,Min Temp. (degree Celcius)1,...,Solar Radiation (cal/cm^2/day)1,Solar Radiation (cal/cm^2/day)2,Solar Radiation (cal/cm^2/day)3,Solar Radiation (cal/cm^2/day)4,Solar Radiation (cal/cm^2/day)5,Solar Radiation (cal/cm^2/day)6,Station_Barisal,Station_Gazipur,Station_Habiganj,Station_Rangpur
1241,6,32.8,29.0,32.6,32.2,32.8,33.0,35.0,24.2,25.2,...,234.33,430.96,331.03,363.26,366.49,469.64,1,0,0,0
469,4,30.5,30.2,31.1,31.5,30.1,30.5,20.8,20.1,20.5,...,406.85,397.11,423.08,419.83,387.37,452.29,0,0,1,0
1019,10,32.4,32.5,31.8,33.0,33.8,33.0,33.5,22.8,20.8,...,352.33,311.9,326.34,363.88,384.1,392.76,0,1,0,0
1459,1,26.2,27.5,28.8,28.0,23.2,24.8,23.5,11.8,12.6,...,260.84,296.48,259.27,172.45,209.66,160.05,0,1,0,0
649,10,32.2,32.0,31.8,32.0,31.6,30.4,31.2,23.0,23.2,...,335.99,335.99,350.55,344.73,338.91,362.19,1,0,0,0


## 8. Create the classification dataset

In [16]:
def rain_classifier(_df, heavy_threshold=22):
    """
    Turn the 'Rainfall (mm)' regression target into a 3-class label column.

    A column 'Rainfall' is added with the labels
        - rainfall == 0                    -> 0 (no rain)
        - 0 < rainfall <= heavy_threshold  -> 1 (light to moderate rain)
        - rainfall > heavy_threshold       -> 2 (heavy to very heavy rain)
    and the 'Rainfall (mm)' regression column is dropped.

    Parameters
    ----------
    _df : pandas.DataFrame
        Must contain a 'Rainfall (mm)' column. It is not modified.
    heavy_threshold : number, default 22
        Upper bound (inclusive) of the "light to moderate" class.

    Returns
    -------
    pandas.DataFrame
        Copy of `_df` with 'Rainfall' added and 'Rainfall (mm)' removed.

    Raises
    ------
    ValueError
        If any rainfall value fits none of the classes (NaN or negative).
        Previously such values were only printed and left without a label,
        so the column assignment failed with an unrelated length-mismatch
        ValueError.
    """
    df = _df.copy()

    rainfall_labels = []
    invalid_values = []

    for rainfall in df['Rainfall (mm)']:
        if rainfall == 0:
            rainfall_labels.append(0)  # no rain
        elif 0 < rainfall <= heavy_threshold:
            rainfall_labels.append(1)  # light to moderate
        elif rainfall > heavy_threshold:
            rainfall_labels.append(2)  # heavy
        else:
            # NaN or negative: every comparison above is False
            invalid_values.append(rainfall)

    if invalid_values:
        raise ValueError(f'outside rainfall margins -> {invalid_values}')

    # insert classification column (a plain list is assigned by position)
    df['Rainfall'] = rainfall_labels
    # drop regression column
    df.drop(columns=['Rainfall (mm)'], inplace=True)

    return df

In [17]:
# Derive the classification dataset. rain_classifier works on a copy, so
# merged_preProcessedWeekly_df keeps its 'Rainfall (mm)' regression target.
merged_preProcessedWeekly_clf_df = rain_classifier(merged_preProcessedWeekly_df)
merged_preProcessedWeekly_clf_df.sample(5)

Unnamed: 0,Month,Max Temp. (degree Celcius)0,Max Temp. (degree Celcius)1,Max Temp. (degree Celcius)2,Max Temp. (degree Celcius)3,Max Temp. (degree Celcius)4,Max Temp. (degree Celcius)5,Max Temp. (degree Celcius)6,Min Temp. (degree Celcius)0,Min Temp. (degree Celcius)1,...,Solar Radiation (cal/cm^2/day)2,Solar Radiation (cal/cm^2/day)3,Solar Radiation (cal/cm^2/day)4,Solar Radiation (cal/cm^2/day)5,Solar Radiation (cal/cm^2/day)6,Station_Barisal,Station_Gazipur,Station_Habiganj,Station_Rangpur,Rainfall
629,10,34.8,35.0,35.0,35.2,34.4,34.4,34.6,27.0,26.8,...,373.83,376.74,370.92,338.91,376.74,1,0,0,0,2
5,7,30.5,31.3,32.6,33.3,29.5,30.3,26.5,26.3,27.2,...,284.99,220.73,188.6,284.99,252.86,0,0,0,1,0
194,7,29.0,32.8,33.8,34.6,32.8,26.8,26.0,25.2,26.0,...,445.2,470.96,422.66,187.6,187.6,1,0,0,0,2
159,5,27.5,29.2,27.2,27.2,30.2,31.2,29.8,20.2,20.2,...,317.36,447.13,414.69,382.25,414.69,0,0,0,1,0
102,3,28.2,26.5,28.2,30.2,31.2,27.2,30.2,19.2,16.5,...,329.76,415.12,162.2,424.6,383.5,0,0,0,1,0


## Save the pre-processed and merged datasets

In [18]:
# Both merged tables go to the same 'pre-processed' folder, without the row index
preprocessed_dir = os.path.join('..', '..', 'Datasets', 'brri-datasets', 'pre-processed')

regression_csv_path = os.path.join(preprocessed_dir, 'brri-weather_preprocessedWeeklyAll_regression.csv')
merged_preProcessedWeekly_df.to_csv(regression_csv_path, index=False)

classification_csv_path = os.path.join(preprocessed_dir, 'brri-weather_preprocessedWeeklyAll_classification.csv')
merged_preProcessedWeekly_clf_df.to_csv(classification_csv_path, index=False)

## 9. Train-Test split in 80:20 ratio and scale both datasets using train set

In [19]:
def splitTrainTest_and_scale(_df, class_label, is_regression=False, test_size=0.2, random_state=42):
    """
    Split a dataset into train/test parts and min-max scale the features.

    The scaler is fitted on the training features only and then applied to
    both parts, so no information from the test part is used for scaling.

    Parameters
    ----------
    _df : pandas.DataFrame
        Full dataset (features plus target). It is not modified.
    class_label : str
        Name of the target column; every other column is treated as a feature.
    is_regression : bool, default False
        When False the split is stratified on the target; when True it is a
        plain shuffled split.
    test_size : float, default 0.2
        Passed to train_test_split (previously hard-coded).
    random_state : int, default 42
        Passed to train_test_split (previously hard-coded).

    Returns
    -------
    (train_df, test_df) : tuple of pandas.DataFrame
        Scaled feature columns followed by the unscaled target, each with a
        fresh 0..n-1 index.
    """
    from sklearn.model_selection import train_test_split
    from sklearn.preprocessing import MinMaxScaler

    X_all = _df.drop(columns=class_label)
    y_all = _df[class_label]

    # stratify=None is train_test_split's default, i.e. a plain shuffled split
    X_train, X_test, y_train, y_test = train_test_split(
        X_all,
        y_all,
        test_size=test_size,
        random_state=random_state,
        shuffle=True,
        stratify=None if is_regression else y_all,
    )

    # fit the scaler on the training features only
    scaler = MinMaxScaler().fit(X_train)

    def _scaled_with_target(X_part, y_part):
        # Both pieces get a fresh 0..n-1 index before being joined, so the join is
        # positional and does not depend on aligning the incoming row labels.
        scaled = pd.DataFrame(scaler.transform(X_part), columns=X_part.columns)
        return pd.concat([scaled, y_part.reset_index(drop=True)], axis=1)

    train_df = _scaled_with_target(X_train, y_train)
    test_df = _scaled_with_target(X_test, y_test)

    return train_df, test_df

In [20]:
# Regression dataset: target is the rainfall amount, split without stratification
merged_preProcessed_train_df, merged_preProcessed_test_df = splitTrainTest_and_scale(
    merged_preProcessedWeekly_df,
    class_label='Rainfall (mm)',
    is_regression=True,
)

# Classification dataset: target is the 'Rainfall' class label, stratified split
preProcessed_clf_train_df, preProcessed_clf_test_df = splitTrainTest_and_scale(
    merged_preProcessedWeekly_clf_df,
    class_label='Rainfall',
)

In [21]:
# Final train/test files live in sibling 'train' and 'test' folders
final_dataset_dir = os.path.join('..', '..', 'Datasets', 'brri-datasets', 'final-dataset')
train_dir = os.path.join(final_dataset_dir, 'train')
test_dir = os.path.join(final_dataset_dir, 'test')

# regression splits
merged_preProcessed_train_df.to_csv(
    os.path.join(train_dir, 'brri-weather_weekly-all_train_regression.csv'), index=False)
merged_preProcessed_test_df.to_csv(
    os.path.join(test_dir, 'brri-weather_weekly-all_test_regression.csv'), index=False)

# classification splits
preProcessed_clf_train_df.to_csv(
    os.path.join(train_dir, 'brri-weather_weekly-all_train_classification.csv'), index=False)
preProcessed_clf_test_df.to_csv(
    os.path.join(test_dir, 'brri-weather_weekly-all_test_classification.csv'), index=False)