In [1]:
# required for jupyter notebook
%matplotlib inline 

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

import seaborn as sns
sns.set(rc={'figure.figsize':(8,6)}) # set sns figure size

import os
import math

## 1. Read each station dataset separately

In [2]:
# read Gazipur raw csv by marking missing values as NaN
missing_values = ['NIL', 'nil', '']
gazipur_df = pd.read_csv(os.path.join('..', '..', 'Datasets', 'brri-datasets', 'gazipur_2016-2020', 'gazipur.csv'), 
                     na_values=missing_values)

gazipur_df.sample(5)

Unnamed: 0,Station,Year,Month,Day,Max Temp. (degree Celcius),Min Temp. (degree Celcius),Rainfall (mm),Actual Evaporation (mm),"Relative Humidity (morning, %)","Relative Humidity (afternoon, %)",Sunshine (hour/day),Cloudy (hour/day),Solar Radiation (cal/cm^2/day)
357,Gazipur,2016,12,23,28.6,17.2,0.0,2.0,79.0,60.0,4.2,6.4,210.62
726,Gazipur,2017,12,27,28.5,15.4,0.0,2.0,89.0,60.0,7.7,2.9,294.31
353,Gazipur,2016,12,19,27.0,15.6,0.0,1.0,89.0,66.0,4.8,5.8,224.97
1199,Gazipur,2019,4,14,34.0,21.3,0.0,0.0,74.0,58.0,9.2,3.3,475.54
884,Gazipur,2018,6,3,33.6,26.8,0.6,3.6,77.0,77.0,5.3,8.3,360.68


In [3]:
rangpur_df = pd.read_csv(os.path.join('..', '..', 'Datasets', 'brri-datasets', 'rangpur_mid2017-2020', 'rangpur.csv'), 
                     na_values=missing_values)

rangpur_df.sample(5)

Unnamed: 0,Station,Year,Month,Day,Max Temp. (degree Celcius),Min Temp. (degree Celcius),Rainfall (mm),Actual Evaporation (mm),"Relative Humidity (morning, %)","Relative Humidity (afternoon, %)",Sunshine (hour/day),Cloudy (hour/day),Solar Radiation (cal/cm^2/day)
300,Rangpur,2018,9,27,32.5,25.2,0.0,3.0,87.0,67,6.3,5.9,350.47
1008,Rangpur,2020,9,4,33.5,27.2,0.0,5.0,80.0,67,7.0,5.2,371.6
505,Rangpur,2019,4,20,32.5,20.2,0.0,5.0,70.0,51,8.3,4.2,444.28
134,Rangpur,2018,4,14,30.2,19.8,19.8,4.0,87.0,46,7.0,5.5,402.23
1013,Rangpur,2020,9,9,31.2,26.2,0.0,3.0,91.0,74,5.0,7.2,311.2


In [4]:
barisal_df = pd.read_csv(os.path.join('..', '..', 'Datasets', 'brri-datasets', 'barisal_2017-2020', 'barisal.csv'), 
                     na_values=missing_values)

barisal_df.sample(5)

Unnamed: 0,Station,Year,Month,Day,Max Temp. (degree Celcius),Min Temp. (degree Celcius),Rainfall (mm),Actual Evaporation (mm),"Relative Humidity (morning, %)","Relative Humidity (afternoon, %)",Sunshine (hour/day),Cloudy (hour/day),Solar Radiation (cal/cm^2/day)
317,Barisal,2017,11,20,29.6,20.8,0.0,1.0,95,57.0,6.8,3.7,300.789818
680,Barisal,2018,11,20,28.4,16.8,,1.0,94,55.0,6.2,4.8,285.21
56,Barisal,2017,2,26,31.6,14.6,0.0,1.0,94,44.0,8.6,2.6,387.25
1198,Barisal,2020,4,21,32.2,22.6,0.2,2.0,90,60.0,0.2,12.4,183.67
1382,Barisal,2020,10,22,26.4,24.6,145.2,0.0,91,91.0,0.0,11.6,146.8


In [5]:
habiganj_df = pd.read_csv(os.path.join('..', '..', 'Datasets', 'brri-datasets', 'habiganj_2019-2020', 'habiganj.csv'), 
                     na_values=missing_values)

habiganj_df.sample(5)

Unnamed: 0,Station,Year,Month,Day,Max Temp. (degree Celcius),Min Temp. (degree Celcius),Rainfall (mm),Actual Evaporation (mm),"Relative Humidity (morning, %)","Relative Humidity (afternoon, %)",Sunshine (hour/day),Cloudy (hour/day),Solar Radiation (cal/cm^2/day)
253,Habiganj,2019,9,11,34.5,24.8,0.0,,84,77,6.8,5.4,366.48
161,Habiganj,2019,6,11,36.2,27.8,0.0,4.0,80,71,10.1,3.5,315.62
241,Habiganj,2019,8,30,36.8,26.2,32.3,2.0,87,80,0.0,12.9,183.6
23,Habiganj,2019,1,24,28.5,15.8,0.0,,94,83,8.0,2.7,311.67
231,Habiganj,2019,8,20,36.8,27.5,0.0,4.0,92,73,9.2,3.7,484.76


## 2. Pre-process each station's dataset with the techniques used in 'brri-dataset_pre-process.ipynb' notebook

### 2.1. Replace invalid values with NaN

- Max/Min Temp. (degree Celcius) > 50 
- Relative Humidity (afternoon, %) > 100, 
- Sunshine/Cloudy (hour/day) > 24, 
- Solar Radiation (cal/cm^2/day) > 1000 (from the box plot)

### 2.2. Fill up missing values with monthly average (DO NOT drop values that are still missing after fillup)

In [6]:
def pre_process(_df):
    df = _df.copy()
    
    # apply step 2.1
    df.loc[df['Max Temp. (degree Celcius)'] > 50, 'Max Temp. (degree Celcius)'] = math.nan
    df.loc[df['Min Temp. (degree Celcius)'] > 50, 'Min Temp. (degree Celcius)'] = math.nan
    df.loc[df['Relative Humidity (afternoon, %)'] > 100, 'Relative Humidity (afternoon, %)'] = math.nan
    df.loc[df['Sunshine (hour/day)'] > 24, 'Sunshine (hour/day)'] = math.nan
    df.loc[df['Cloudy (hour/day)'] > 24, 'Cloudy (hour/day)'] = math.nan
    df.loc[df['Solar Radiation (cal/cm^2/day)'] > 1000, 'Solar Radiation (cal/cm^2/day)'] = math.nan
    
    # apply step 2.2
    for column in df.columns:
        if column in ['Station', 'Year', 'Month', 'Day']:
            continue

        df[column] = df.groupby(['Station', 'Month'])[column].transform(
            lambda grp: grp.fillna(np.mean(grp))
        )
        
    # cannot drop missing values here it will mess up the average calculation
    # df.dropna(inplace=True)
    
    return df

def show_missing_data(_df):
    df = _df.copy()
    total_cnt = df.shape[0]
    missing_cnt = df.shape[0]-df.dropna().shape[0]
    print(f'Total instances={total_cnt}, missing={missing_cnt}({round(missing_cnt*100.0/total_cnt, 2)}%)')

In [7]:
gazipur_preProcessed_df = pre_process(gazipur_df)
# gazipur_df.sample(5)
habiganj_preProcessed_df = pre_process(habiganj_df)
barisal_preProcessed_df = pre_process(barisal_df)
rangpur_preProcessed_df = pre_process(rangpur_df)

In [8]:
show_missing_data(gazipur_preProcessed_df)
show_missing_data(habiganj_preProcessed_df)
show_missing_data(rangpur_preProcessed_df)
show_missing_data(barisal_preProcessed_df)

print()

print(gazipur_preProcessed_df.shape, rangpur_preProcessed_df.shape,
      barisal_preProcessed_df.shape, habiganj_preProcessed_df.shape) 

Total instances=1827, missing=0(0.0%)
Total instances=547, missing=210(38.39%)
Total instances=1127, missing=0(0.0%)
Total instances=1453, missing=0(0.0%)

(1827, 13) (1127, 13) (1453, 13) (547, 13)


## 3. Form each station wise datasets using weekly average

In [9]:
def get_avg_df(_df, num_avg_days=7, num_days_before=3):
    '''
    input STATION-WISE dataframe with all expected columns
    returns dataframe with rainfall columns unchanged 
        and average of 'num_avg_days' number of days worth other features 
        and starting from 'num_days_before' ago
        
    example: num_avg_days=7, num_days_before=3
        then row for January 10 will have rainfall data of Jan 10 
            and other columns will have average of values from Jan 1 to 7
    '''
    df=_df.copy()

    station = df['Station'].loc[0]
    
    MONTH_COL = 'Month'
    MAX_TEMP_COL = 'Max Temp. (degree Celcius)'
    MIN_TEMP_COL = 'Min Temp. (degree Celcius)'
    RAINFALL_COL = 'Rainfall (mm)'
    ACTUAL_EVA_COL = 'Actual Evaporation (mm)'
    REL_HUMIDITY_M_COL = 'Relative Humidity (morning, %)'
    REL_HUMIDITY_A_COL = 'Relative Humidity (afternoon, %)'
    SUNSHINE_COL = 'Sunshine (hour/day)'
    CLOUDY_COL = 'Cloudy (hour/day)'
    SOLAR_RAD_COL = 'Solar Radiation (cal/cm^2/day)'

    months, min_temps, max_temps, rainfalls, actual_evas, rhs_m, rhs_a, sunshines, cloudies, solar_rads = \
    [], [], [], [], [], [], [], [], [], [] 

    def get_list_with_col(df, col_name):
        # returns list of columns from dataframe
        vals = []
        for val in df[col_name]:
            vals.append(val);
        return vals;

    # populate list with daily features
    months = get_list_with_col(df, MONTH_COL)
    min_temps = get_list_with_col(df, MIN_TEMP_COL)
    max_temps = get_list_with_col(df, MAX_TEMP_COL)
    rainfalls = get_list_with_col(df, RAINFALL_COL)
    actual_evas = get_list_with_col(df, ACTUAL_EVA_COL)
    rhs_m = get_list_with_col(df, REL_HUMIDITY_M_COL)
    rhs_a = get_list_with_col(df, REL_HUMIDITY_A_COL)
    sunshines = get_list_with_col(df, SUNSHINE_COL)
    cloudies = get_list_with_col(df, CLOUDY_COL)
    solar_rads = get_list_with_col(df, SOLAR_RAD_COL)

    def get_avg_in_range(vals, start, end):
        '''
        returns average of list values from start to end index 
        '''
        total = 0.0
        for i in range(start, end+1):
            total+=vals[i]
        return float(total/(end-start+1));

    new_months, new_min_temps, new_max_temps, new_rainfalls, new_actual_evas, \
    new_rhs_m, new_rhs_a, new_sunshines, new_cloudies, new_solar_rads = [], [], [], [], [], [], [], [], [], [] 

    output_rainfalls = []
    stations = []
    
    # populate new features with previous average values
    for curr_idx in range(num_avg_days+num_days_before, df.shape[0]):
        avg_start_idx = curr_idx-(num_avg_days+num_days_before)
        avg_end_idx = avg_start_idx+num_days_before-1
        
        new_min_temps.append(get_avg_in_range(min_temps, avg_start_idx, avg_end_idx))
        new_max_temps.append(get_avg_in_range(max_temps, avg_start_idx, avg_end_idx))
        new_actual_evas.append(get_avg_in_range(actual_evas, avg_start_idx, avg_end_idx))
        new_rhs_m.append(get_avg_in_range(rhs_m, avg_start_idx, avg_end_idx))
        new_rhs_a.append(get_avg_in_range(rhs_a, avg_start_idx, avg_end_idx))
        new_sunshines.append(get_avg_in_range(sunshines, avg_start_idx, avg_end_idx))
        new_cloudies.append(get_avg_in_range(cloudies, avg_start_idx, avg_end_idx))
        new_solar_rads.append(get_avg_in_range(solar_rads, avg_start_idx, avg_end_idx))
        new_rainfalls.append(get_avg_in_range(rainfalls, avg_start_idx, avg_end_idx))
        
        # in case days fall in two months, set the month that covers most days
        new_months.append(int(get_avg_in_range(months, avg_start_idx, avg_end_idx)))
        
        output_rainfalls.append(rainfalls[curr_idx])
        stations.append(station)

    return pd.DataFrame({'Station': stations,
                          MONTH_COL: new_months,
                         'Avg '+ MIN_TEMP_COL: new_min_temps,
                         'Avg '+ MAX_TEMP_COL: new_max_temps,
                         'Avg '+ RAINFALL_COL: new_rainfalls,
                         'Avg '+ ACTUAL_EVA_COL: new_actual_evas, 
                         'Avg '+ REL_HUMIDITY_M_COL: new_rhs_m,
                         'Avg '+ REL_HUMIDITY_A_COL: new_rhs_a,
                         'Avg '+ SUNSHINE_COL: new_sunshines,
                         'Avg '+ CLOUDY_COL: new_cloudies,
                         'Avg '+ SOLAR_RAD_COL: new_solar_rads,
                         RAINFALL_COL: output_rainfalls
                        })

In [10]:
gazipur_preProcessed_df = get_avg_df(gazipur_preProcessed_df)
# gazipur_preProcessed_df.sample(5)
rangpur_preProcessed_df = get_avg_df(rangpur_preProcessed_df)
barisal_preProcessed_df = get_avg_df(barisal_preProcessed_df)
habiganj_preProcessed_df = get_avg_df(habiganj_preProcessed_df)

## 4. Drop missing values

In [11]:
gazipur_preProcessed_df.dropna(inplace=True)
habiganj_preProcessed_df.dropna(inplace=True)
rangpur_preProcessed_df.dropna(inplace=True)
barisal_preProcessed_df.dropna(inplace=True)

In [12]:
show_missing_data(gazipur_preProcessed_df)
show_missing_data(habiganj_preProcessed_df)
show_missing_data(rangpur_preProcessed_df)
show_missing_data(barisal_preProcessed_df)

print()

print(gazipur_preProcessed_df.shape, rangpur_preProcessed_df.shape,
      barisal_preProcessed_df.shape, habiganj_preProcessed_df.shape) 

Total instances=1817, missing=0(0.0%)
Total instances=323, missing=0(0.0%)
Total instances=1117, missing=0(0.0%)
Total instances=1443, missing=0(0.0%)

(1817, 12) (1117, 12) (1443, 12) (323, 12)


## 5. Merge stationwise separate datasets into a single dataset

In [13]:
merged_preProcessedAvg_df = pd.concat([gazipur_preProcessed_df, habiganj_preProcessed_df, 
                                   rangpur_preProcessed_df, barisal_preProcessed_df])
merged_preProcessedAvg_df.sample(5)

Unnamed: 0,Station,Month,Avg Min Temp. (degree Celcius),Avg Max Temp. (degree Celcius),Avg Rainfall (mm),Avg Actual Evaporation (mm),"Avg Relative Humidity (morning, %)","Avg Relative Humidity (afternoon, %)",Avg Sunshine (hour/day),Avg Cloudy (hour/day),Avg Solar Radiation (cal/cm^2/day),Rainfall (mm)
383,Gazipur,1,12.733333,27.833333,0.0,2.0,76.333333,50.0,8.466667,2.233333,325.416667,0.0
203,Barisal,7,24.933333,26.6,32.933333,1.933333,97.333333,96.0,0.4,13.0,200.48,0.6
1067,Gazipur,12,16.133333,29.233333,0.0,2.0,77.0,52.333333,7.766667,2.833333,295.906667,0.0
770,Barisal,2,16.8,30.8,0.0,2.333333,90.666667,46.0,7.933333,3.266667,368.086667,16.0
1106,Gazipur,1,11.8,27.366667,0.0,1.666667,81.0,47.666667,7.366667,3.333333,364.8,0.0


## 6. Convert categorical 'Station' column to numeric with One-Hot-Encoding

In [14]:
merged_preProcessedAvg_df = pd.get_dummies(merged_preProcessedAvg_df, columns=['Station'])
merged_preProcessedAvg_df.sample(5)

Unnamed: 0,Month,Avg Min Temp. (degree Celcius),Avg Max Temp. (degree Celcius),Avg Rainfall (mm),Avg Actual Evaporation (mm),"Avg Relative Humidity (morning, %)","Avg Relative Humidity (afternoon, %)",Avg Sunshine (hour/day),Avg Cloudy (hour/day),Avg Solar Radiation (cal/cm^2/day),Rainfall (mm),Station_Barisal,Station_Gazipur,Station_Habiganj,Station_Rangpur
1165,3,22.133333,33.033333,0.866667,3.333333,86.666667,54.666667,6.966667,4.833333,384.333333,0.0,0,1,0,0
1203,4,21.733333,29.066667,12.666667,2.333333,90.0,83.333333,4.766667,7.833333,331.383333,4.0,1,0,0,0
57,1,9.266667,20.6,0.0,1.333333,90.666667,90.666667,5.1,5.6,238.543333,0.0,0,0,0,1
17,1,13.733333,24.2,2.4,2.066667,81.666667,60.0,5.230303,4.584677,196.43,0.0,0,1,0,0
1007,9,27.133333,33.266667,0.0,4.333333,82.666667,75.0,6.433333,5.766667,354.5,76.5,0,0,0,1


## 7. Create the classification dataset

In [15]:
def rain_classify(_df):
    df = _df.copy()
    
    # todo: implement
    
    return df

In [16]:
# merged_preProcessedAvg_clf_df = rain_classify(merged_preProcessedAvg_df)

## Save the pre-processed and merged datasets

In [17]:
merged_preProcessedAvg_df.to_csv(os.path.join('..', '..', 'Datasets', 'brri-datasets', 'pre-processed', 'brri-weather_preprocessedAvg_regression.csv'), index=False)
# merged_preProcessedAvg_clf_df.to_csv(os.path.join('..', '..', 'Datasets', 'brri-datasets', 'pre-processed', 'brri-weather_preprocessedAvg_classification.csv'), index=False)

## 8. Train-Test split in 80:20 ratio

In [18]:
def splitTrainTest_and_scale(_df, class_label, is_regression=False):
    from sklearn.model_selection import train_test_split
    from sklearn.preprocessing import MinMaxScaler
    
    df = _df.copy()
    
    X_all = df.drop(columns=class_label)
    y_all = df[class_label]

    if(is_regression):
        X_train, X_test, y_train, y_test = train_test_split(X_all, y_all, test_size=0.2, random_state=42, shuffle=True)
    else:    
        X_train, X_test, y_train, y_test = train_test_split(X_all, y_all, test_size=0.2, random_state=42, shuffle=True, stratify=y_all)

    # scale the dataset
    scaler = MinMaxScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_train = pd.DataFrame(X_train_scaled, index=X_train.index, columns=X_train.columns)
    X_test_scaled = scaler.transform(X_test)
    X_test = pd.DataFrame(X_test_scaled, index=X_test.index, columns=X_test.columns)
    
    # concat X, y
    train_df = pd.concat([X_train, y_train], axis=1).reset_index(drop=True)
    test_df = pd.concat([X_test, y_test], axis=1).reset_index(drop=True)
    
    return train_df, test_df

In [19]:
merged_preProcessed_train_df, merged_preProcessed_test_df = splitTrainTest_and_scale(merged_preProcessedAvg_df, \
                                                               class_label='Rainfall (mm)', \
                                                               is_regression=True)

# merged_preProcessed_train_df.sample(5)

In [20]:
merged_preProcessed_train_df.to_csv(os.path.join('..', '..', 'Datasets', 'brri-datasets', 'final-dataset', 'train', 'brri-weather_avg_train_regression.csv'), index=False)
merged_preProcessed_test_df.to_csv(os.path.join('..', '..', 'Datasets', 'brri-datasets', 'final-dataset', 'test', 'brri-weather_avg_test_regression.csv'), index=False)