In [59]:
# required for jupyter notebook
%matplotlib inline 

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

import seaborn as sns
sns.set(rc={'figure.figsize':(8,6)}) # set sns figure size

import os
import math

## 1. Read each station dataset separately

In [91]:
# read Gazipur raw csv by marking missing values as NaN
missing_values = ['NIL', 'nil', '']
gazipur_df = pd.read_csv(os.path.join('..', '..', 'Datasets', 'brri-datasets', 'gazipur_2016-2020', 'gazipur.csv'), 
                     na_values=missing_values)

gazipur_df.sample(5)

Unnamed: 0,Station,Year,Month,Day,Max Temp. (degree Celcius),Min Temp. (degree Celcius),Rainfall (mm),Actual Evaporation (mm),"Relative Humidity (morning, %)","Relative Humidity (afternoon, %)",Sunshine (hour/day),Cloudy (hour/day),Solar Radiation (cal/cm^2/day)
982,Gazipur,2018,9,9,35.5,26.4,2.0,6.0,76.0,67.0,7.4,4.8,385.6
244,Gazipur,2016,9,1,34.0,27.2,4.4,3.4,76.0,76.0,3.6,8.6,270.26
1595,Gazipur,2020,5,14,34.8,24.2,8.6,5.6,80.0,62.0,7.0,6.3,413.81
18,Gazipur,2016,1,19,24.6,13.8,0.0,1.0,79.0,56.0,2.6,,179.89
181,Gazipur,2016,6,30,34.5,28.4,0.0,5.0,77.0,64.0,5.4,,363.89


In [92]:
rangpur_df = pd.read_csv(os.path.join('..', '..', 'Datasets', 'brri-datasets', 'rangpur_mid2017-2020', 'rangpur.csv'), 
                     na_values=missing_values)

rangpur_df.sample(5)

Unnamed: 0,Station,Year,Month,Day,Max Temp. (degree Celcius),Min Temp. (degree Celcius),Rainfall (mm),Actual Evaporation (mm),"Relative Humidity (morning, %)","Relative Humidity (afternoon, %)",Sunshine (hour/day),Cloudy (hour/day),Solar Radiation (cal/cm^2/day)
192,Rangpur,2018,6,11,34.5,25.4,16.0,5.0,87.0,73,6.0,7.6,384.0
931,Rangpur,2020,6,19,29.9,26.2,2.0,4.0,91.0,80,3.3,10.3,296.97
272,Rangpur,2018,8,30,31.2,27.2,0.0,4.0,87.0,74,7.0,5.9,413.19
1090,Rangpur,2020,11,25,25.5,13.8,0.0,3.0,75.0,58,8.0,2.9,321.49
555,Rangpur,2019,6,9,33.5,25.8,0.0,4.0,83.0,65,6.3,7.3,393.67


In [93]:
barisal_df = pd.read_csv(os.path.join('..', '..', 'Datasets', 'brri-datasets', 'barisal_2017-2020', 'barisal.csv'), 
                     na_values=missing_values)

barisal_df.sample(5)

Unnamed: 0,Station,Year,Month,Day,Max Temp. (degree Celcius),Min Temp. (degree Celcius),Rainfall (mm),Actual Evaporation (mm),"Relative Humidity (morning, %)","Relative Humidity (afternoon, %)",Sunshine (hour/day),Cloudy (hour/day),Solar Radiation (cal/cm^2/day)
543,Barisal,2018,7,6,30.4,25.6,0.0,3.0,96,77.0,6.0,7.4,380.8
109,Barisal,2017,4,20,32.4,21.4,7.2,4.2,91,70.0,6.0,6.6,371.28
1411,Barisal,2020,11,20,31.0,20.2,0.0,1.0,90,66.0,3.8,7.2,222.88
387,Barisal,2018,1,31,26.0,12.0,0.0,2.0,94,45.0,7.4,3.4,305.5
1301,Barisal,2020,8,2,32.8,27.4,0.0,2.0,92,69.0,2.8,10.0,275.67


In [94]:
habiganj_df = pd.read_csv(os.path.join('..', '..', 'Datasets', 'brri-datasets', 'habiganj_2019-2020', 'habiganj.csv'), 
                     na_values=missing_values)

habiganj_df.sample(5)

Unnamed: 0,Station,Year,Month,Day,Max Temp. (degree Celcius),Min Temp. (degree Celcius),Rainfall (mm),Actual Evaporation (mm),"Relative Humidity (morning, %)","Relative Humidity (afternoon, %)",Sunshine (hour/day),Cloudy (hour/day),Solar Radiation (cal/cm^2/day)
192,Habiganj,2019,7,12,31.8,24.2,67.8,5.0,88,92,0.0,13.5,188.6
14,Habiganj,2019,1,15,26.5,14.5,0.0,,89,76,7.3,3.4,294.42
396,Habiganj,2020,2,1,22.2,11.2,0.0,,84,65,6.5,4.7,318.94
47,Habiganj,2019,2,17,25.8,16.5,13.8,,100,75,2.1,9.1,195.51
378,Habiganj,2020,1,14,21.5,12.8,0.0,,94,100,2.0,8.7,163.87


## 2. Pre-process each station's dataset with the techniques used in 'brri-dataset_pre-process.ipynb' notebook

### 2.1. Replace invalid values with NaN

- Max/Min Temp. (degree Celcius) > 50 
- Relative Humidity (afternoon, %) > 100, 
- Sunshine/Cloudy (hour/day) > 24, 
- Solar Radiation (cal/cm^2/day) > 1000 (from the box plot)

### 2.2. Fill up missing values with monthly average (DO NOT drop values that are still missing after fillup)

In [119]:
def pre_process(_df):
    df = _df.copy()
    
    # apply step 2.1
    df.loc[df['Max Temp. (degree Celcius)'] > 50, 'Max Temp. (degree Celcius)'] = math.nan
    df.loc[df['Min Temp. (degree Celcius)'] > 50, 'Min Temp. (degree Celcius)'] = math.nan
    df.loc[df['Relative Humidity (afternoon, %)'] > 100, 'Relative Humidity (afternoon, %)'] = math.nan
    df.loc[df['Sunshine (hour/day)'] > 24, 'Sunshine (hour/day)'] = math.nan
    df.loc[df['Cloudy (hour/day)'] > 24, 'Cloudy (hour/day)'] = math.nan
    df.loc[df['Solar Radiation (cal/cm^2/day)'] > 1000, 'Solar Radiation (cal/cm^2/day)'] = math.nan
    
    # apply step 2.2
    for column in df.columns:
        if column in ['Station', 'Year', 'Month', 'Day']:
            continue

        df[column] = df.groupby(['Station', 'Month'])[column].transform(
            lambda grp: grp.fillna(np.mean(grp))
        )
        
    # cannot drop missing values here it will mess up the average calculation
    # df.dropna(inplace=True)
    
    return df

def show_missing_data(_df):
    df = _df.copy()
    total_cnt = df.shape[0]
    missing_cnt = df.shape[0]-df.dropna().shape[0]
    print(f'Total instances={total_cnt}, missing={missing_cnt}({round(missing_cnt*100.0/total_cnt, 2)}%)')

In [120]:
gazipur_preProcessed_df = pre_process(gazipur_df)
# gazipur_df.sample(5)
habiganj_preProcessed_df = pre_process(habiganj_df)
barisal_preProcessed_df = pre_process(barisal_df)
rangpur_preProcessed_df = pre_process(rangpur_df)

In [123]:
show_missing_data(gazipur_preProcessed_df)
show_missing_data(habiganj_preProcessed_df)
show_missing_data(rangpur_preProcessed_df)
show_missing_data(barisal_preProcessed_df)

print()

print(gazipur_preProcessed_df.shape, rangpur_preProcessed_df.shape,
      barisal_preProcessed_df.shape, habiganj_preProcessed_df.shape) 

Total instances=1827, missing=0(0.0%)
Total instances=547, missing=210(38.39%)
Total instances=1127, missing=0(0.0%)
Total instances=1453, missing=0(0.0%)

(1827, 13) (1127, 13) (1453, 13) (547, 13)


## 3. Form each station wise datasets using weekly average

In [124]:
def get_avg_df(_df, num_avg_days=7, num_days_before=3):
    '''
    input STATION-WISE dataframe with all expected columns
    returns dataframe with rainfall columns unchanged 
        and average of 'num_avg_days' number of days worth other features 
        and starting from 'num_days_before' ago
        
    example: num_avg_days=7, num_days_before=3
        then row for January 10 will have rainfall data of Jan 10 
            and other columns will have average of values from Jan 1 to 7
    '''
    df=_df.copy()

    station = df['Station'].loc[0]
    
    MONTH_COL = 'Month'
    MAX_TEMP_COL = 'Max Temp. (degree Celcius)'
    MIN_TEMP_COL = 'Min Temp. (degree Celcius)'
    RAINFALL_COL = 'Rainfall (mm)'
    ACTUAL_EVA_COL = 'Actual Evaporation (mm)'
    REL_HUMIDITY_M_COL = 'Relative Humidity (morning, %)'
    REL_HUMIDITY_A_COL = 'Relative Humidity (afternoon, %)'
    SUNSHINE_COL = 'Sunshine (hour/day)'
    CLOUDY_COL = 'Cloudy (hour/day)'
    SOLAR_RAD_COL = 'Solar Radiation (cal/cm^2/day)'

    months, min_temps, max_temps, rainfalls, actual_evas, rhs_m, rhs_a, sunshines, cloudies, solar_rads = \
    [], [], [], [], [], [], [], [], [], [] 

    def get_list_with_col(df, col_name):
        # returns list of columns from dataframe
        vals = []
        for val in df[col_name]:
            vals.append(val);
        return vals;

    # populate list with daily features
    months = get_list_with_col(df, MONTH_COL)
    min_temps = get_list_with_col(df, MIN_TEMP_COL)
    max_temps = get_list_with_col(df, MAX_TEMP_COL)
    rainfalls = get_list_with_col(df, RAINFALL_COL)
    actual_evas = get_list_with_col(df, ACTUAL_EVA_COL)
    rhs_m = get_list_with_col(df, REL_HUMIDITY_M_COL)
    rhs_a = get_list_with_col(df, REL_HUMIDITY_A_COL)
    sunshines = get_list_with_col(df, SUNSHINE_COL)
    cloudies = get_list_with_col(df, CLOUDY_COL)
    solar_rads = get_list_with_col(df, SOLAR_RAD_COL)

    def get_avg_in_range(vals, start, end):
        '''
        returns average of list values from start to end index 
        '''
        total = 0.0
        for i in range(start, end+1):
            total+=vals[i]
        return float(total/(end-start+1));

    new_months, new_min_temps, new_max_temps, new_rainfalls, new_actual_evas, \
    new_rhs_m, new_rhs_a, new_sunshines, new_cloudies, new_solar_rads = [], [], [], [], [], [], [], [], [], [] 

    output_rainfalls = []
    stations = []
    
    # populate new features with previous average values
    for curr_idx in range(num_avg_days+num_days_before, df.shape[0]):
        avg_start_idx = curr_idx-(num_avg_days+num_days_before)
        avg_end_idx = avg_start_idx+num_days_before-1
        
        new_min_temps.append(get_avg_in_range(min_temps, avg_start_idx, avg_end_idx))
        new_max_temps.append(get_avg_in_range(max_temps, avg_start_idx, avg_end_idx))
        new_actual_evas.append(get_avg_in_range(actual_evas, avg_start_idx, avg_end_idx))
        new_rhs_m.append(get_avg_in_range(rhs_m, avg_start_idx, avg_end_idx))
        new_rhs_a.append(get_avg_in_range(rhs_a, avg_start_idx, avg_end_idx))
        new_sunshines.append(get_avg_in_range(sunshines, avg_start_idx, avg_end_idx))
        new_cloudies.append(get_avg_in_range(cloudies, avg_start_idx, avg_end_idx))
        new_solar_rads.append(get_avg_in_range(solar_rads, avg_start_idx, avg_end_idx))
        new_rainfalls.append(get_avg_in_range(rainfalls, avg_start_idx, avg_end_idx))
        
        # in case days fall in two months, set the month that covers most days
        new_months.append(int(get_avg_in_range(months, avg_start_idx, avg_end_idx)))
        
        output_rainfalls.append(rainfalls[curr_idx])
        stations.append(station)

    return pd.DataFrame({'Station': stations,
                          MONTH_COL: new_months,
                         'Avg '+ MIN_TEMP_COL: new_min_temps,
                         'Avg '+ MAX_TEMP_COL: new_max_temps,
                         'Avg '+ RAINFALL_COL: new_rainfalls,
                         'Avg '+ ACTUAL_EVA_COL: new_actual_evas, 
                         'Avg '+ REL_HUMIDITY_M_COL: new_rhs_m,
                         'Avg '+ REL_HUMIDITY_A_COL: new_rhs_a,
                         'Avg '+ SUNSHINE_COL: new_sunshines,
                         'Avg '+ CLOUDY_COL: new_cloudies,
                         'Avg '+ SOLAR_RAD_COL: new_solar_rads,
                         RAINFALL_COL: output_rainfalls
                        })

In [125]:
gazipur_preProcessed_df = get_avg_df(gazipur_preProcessed_df)
# gazipur_preProcessed_df.sample(5)
rangpur_preProcessed_df = get_avg_df(rangpur_preProcessed_df)
barisal_preProcessed_df = get_avg_df(barisal_preProcessed_df)
habiganj_preProcessed_df = get_avg_df(habiganj_preProcessed_df)

## 4. Drop missing values

In [126]:
gazipur_preProcessed_df.dropna(inplace=True)
habiganj_preProcessed_df.dropna(inplace=True)
rangpur_preProcessed_df.dropna(inplace=True)
barisal_preProcessed_df.dropna(inplace=True)

In [127]:
show_missing_data(gazipur_preProcessed_df)
show_missing_data(habiganj_preProcessed_df)
show_missing_data(rangpur_preProcessed_df)
show_missing_data(barisal_preProcessed_df)

print()

print(gazipur_preProcessed_df.shape, rangpur_preProcessed_df.shape,
      barisal_preProcessed_df.shape, habiganj_preProcessed_df.shape) 

Total instances=1817, missing=0(0.0%)
Total instances=323, missing=0(0.0%)
Total instances=1117, missing=0(0.0%)
Total instances=1443, missing=0(0.0%)

(1817, 12) (1117, 12) (1443, 12) (323, 12)


## 5. Merge stationwise separate datasets into a single dataset

In [133]:
merged_preProcessedAvg_df = pd.concat([gazipur_preProcessed_df, habiganj_preProcessed_df, 
                                   rangpur_preProcessed_df, barisal_preProcessed_df])
merged_preProcessedAvg_df.sample(5)

Unnamed: 0,Station,Month,Avg Min Temp. (degree Celcius),Avg Max Temp. (degree Celcius),Avg Rainfall (mm),Avg Actual Evaporation (mm),"Avg Relative Humidity (morning, %)","Avg Relative Humidity (afternoon, %)",Avg Sunshine (hour/day),Avg Cloudy (hour/day),Avg Solar Radiation (cal/cm^2/day),Rainfall (mm)
1198,Barisal,4,22.666667,32.266667,6.5,2.333333,90.333333,61.666667,4.033333,8.566667,307.66,16.0
1481,Gazipur,1,12.1,24.433333,0.0,1.333333,82.666667,52.333333,5.5,5.2,251.823333,0.0
827,Barisal,4,24.733333,34.133333,0.0,4.666667,90.333333,60.333333,8.266667,4.333333,444.593333,0.0
462,Rangpur,3,13.2,26.466667,0.0,3.333333,63.666667,47.0,7.533333,4.266667,400.363333,0.0
968,Barisal,9,26.6,30.733333,4.333333,3.0,94.333333,83.0,6.3,5.9,355.72,0.0


## 6. Convert categorical 'Station' column to numeric with One-Hot-Encoding

In [135]:
merged_preProcessedAvg_df = pd.get_dummies(merged_preProcessedAvg_df, columns=['Station'])
merged_preProcessedAvg_df.sample(5)

Unnamed: 0,Month,Avg Min Temp. (degree Celcius),Avg Max Temp. (degree Celcius),Avg Rainfall (mm),Avg Actual Evaporation (mm),"Avg Relative Humidity (morning, %)","Avg Relative Humidity (afternoon, %)",Avg Sunshine (hour/day),Avg Cloudy (hour/day),Avg Solar Radiation (cal/cm^2/day),Rainfall (mm),Station_Barisal,Station_Gazipur,Station_Habiganj,Station_Rangpur
1371,10,26.266667,35.0,0.0,4.0,91.0,67.333333,7.4,4.2,362.19,51.0,1,0,0,0
778,2,15.433333,30.1,0.0,4.0,71.666667,36.0,8.1,3.1,365.946667,0.0,0,1,0,0
823,4,22.6,34.6,0.0,5.0,73.666667,49.0,7.8,4.7,430.053333,0.0,0,1,0,0
291,9,26.166667,33.833333,8.0,3.666667,83.333333,73.0,4.966667,7.233333,310.2,0.0,0,0,0,1
532,5,22.6,30.266667,3.0,2.666667,84.0,74.0,3.866667,9.433333,313.036667,0.0,0,0,0,1


## 7. Create the classification dataset

In [144]:
def rain_classify(_df):
    df = _df.copy()
    
    # todo: implement
    
    return df

In [145]:
# merged_preProcessedAvg_clf_df = rain_classify(merged_preProcessedAvg_df)

## Save the pre-processed and merged datasets

In [146]:
merged_preProcessedAvg_df.to_csv(os.path.join('..', '..', 'Datasets', 'brri-datasets', 'pre-processed', 'brri-weather_preprocessedAvg_regression.csv'), index=False)
# merged_preProcessedAvg_clf_df.to_csv(os.path.join('..', '..', 'Datasets', 'brri-datasets', 'pre-processed', 'brri-weather_preprocessedAvg_classification.csv'), index=False)

## 8. Train-Test split in 80:20 ratio

In [148]:
def splitTrainTest_and_scale(_df, class_label, is_regression=False):
    from sklearn.model_selection import train_test_split
    from sklearn.preprocessing import MinMaxScaler
    
    df = _df.copy()
    
    X_all = df.drop(columns=class_label)
    y_all = df[class_label]

    if(is_regression):
        X_train, X_test, y_train, y_test = train_test_split(X_all, y_all, test_size=0.2, random_state=42, shuffle=True)
    else:    
        X_train, X_test, y_train, y_test = train_test_split(X_all, y_all, test_size=0.2, random_state=42, shuffle=True, stratify=y_all)

    # scale the dataset
    scaler = MinMaxScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_train = pd.DataFrame(X_train_scaled, index=X_train.index, columns=X_train.columns)
    X_test_scaled = scaler.transform(X_test)
    X_test = pd.DataFrame(X_test_scaled, index=X_test.index, columns=X_test.columns)
    
    # concat X, y
    train_df = pd.concat([X_train, y_train], axis=1).reset_index(drop=True)
    test_df = pd.concat([X_test, y_test], axis=1).reset_index(drop=True)
    
    return train_df, test_df

In [149]:
merged_preProcessed_train_df, merged_preProcessed_test_df = splitTrainTest_and_scale(merged_preProcessed_df, \
                                                               class_label='Rainfall (mm)', \
                                                               is_regression=True)

# merged_preProcessed_train_df.sample(5)

Unnamed: 0,Month,Avg Min Temp. (degree Celcius),Avg Max Temp. (degree Celcius),Avg Rainfall (mm),Avg Actual Evaporation (mm),"Avg Relative Humidity (morning, %)","Avg Relative Humidity (afternoon, %)",Avg Sunshine (hour/day),Avg Cloudy (hour/day),Avg Solar Radiation (cal/cm^2/day),Station_Barisal,Station_Gazipur,Station_Habiganj,Station_Rangpur,Rainfall (mm)
811,0.636364,0.803618,0.769231,0.008376,0.234295,0.71875,0.629108,0.665625,0.376,0.674401,0.0,1.0,0.0,0.0,3.0
906,0.181818,0.452196,0.725641,0.0,0.135823,0.86875,0.233558,0.725,0.238933,0.662591,1.0,0.0,0.0,0.0,0.0
613,0.636364,0.815245,0.674359,0.081364,0.169779,0.825,0.769953,0.125,0.837333,0.261099,0.0,0.0,0.0,1.0,0.0
1155,0.454545,0.848837,0.797436,0.002393,0.190895,0.56875,0.58216,0.640625,0.528,0.586866,0.0,1.0,0.0,0.0,39.6
899,0.727273,0.75969,0.684615,0.145378,0.196944,0.6625,0.760563,0.3375,0.6,0.354867,0.0,1.0,0.0,0.0,0.2


In [152]:
merged_preProcessed_train_df.to_csv(os.path.join('..', '..', 'Datasets', 'brri-datasets', 'final-dataset', 'train', 'brri-weather_avg_train_regression.csv'), index=False)
merged_preProcessed_train_df.to_csv(os.path.join('..', '..', 'Datasets', 'brri-datasets', 'final-dataset', 'test', 'brri-weather_avg_test_regression.csv'), index=False)