In [1]:
# required for jupyter notebook
%matplotlib inline 

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

import seaborn as sns
sns.set(rc={'figure.figsize':(8,6)}) # set sns figure size

import os
import math

In [2]:
# read raw csv by marking dropping missing values
missing_values = ['NIL', 'nil', '']
raw_df = pd.read_csv(os.path.join('..', '..', 'Datasets', 'brri-datasets', 'all-station_raw.csv'), 
                     na_values=missing_values)

raw_df.sample(5)

Unnamed: 0,Station,Year,Month,Day,Max Temp. (degree Celcius),Min Temp. (degree Celcius),Rainfall (mm),Actual Evaporation (mm),"Relative Humidity (morning, %)","Relative Humidity (afternoon, %)",Sunshine (hour/day),Cloudy (hour/day),Solar Radiation (cal/cm^2/day)
902,Barisal,2019,6,30,33.2,27.0,4.0,20.1,92.0,68.0,6.2,7.3,389.05
1569,Gazipur,2016,4,26,37.6,28.4,0.0,6.0,69.0,53.0,10.2,,508.4
4503,Habiganj,2019,4,7,28.2,19.5,11.8,,95.0,86.0,4.6,7.9,325.7
3535,Rangpur,2018,8,13,32.3,27.2,16.5,5.0,91.0,70.0,5.0,7.9,347.65
2757,Gazipur,2019,7,28,32.8,26.8,0.0,5.0,79.0,83.0,11.2,2.3,547.31


In [3]:
preProcessed_df = raw_df.copy()

## 1. Replace invalid values with NaN

- Max/Min Temp. (degree Celcius) > 50 
- Relative Humidity (afternoon, %) > 100, 
- Sunshine/Cloudy (hour/day) > 24, 
- Solar Radiation (cal/cm^2/day) > 1000 (from the box plot)

In [4]:
preProcessed_df.loc[preProcessed_df['Max Temp. (degree Celcius)'] > 50, 'Max Temp. (degree Celcius)'] = math.nan
preProcessed_df.loc[preProcessed_df['Min Temp. (degree Celcius)'] > 50, 'Min Temp. (degree Celcius)'] = math.nan
preProcessed_df.loc[preProcessed_df['Relative Humidity (afternoon, %)'] > 100, 'Relative Humidity (afternoon, %)'] = math.nan
preProcessed_df.loc[preProcessed_df['Sunshine (hour/day)'] > 24, 'Sunshine (hour/day)'] = math.nan
preProcessed_df.loc[preProcessed_df['Cloudy (hour/day)'] > 24, 'Cloudy (hour/day)'] = math.nan
preProcessed_df.loc[preProcessed_df['Solar Radiation (cal/cm^2/day)'] > 1000, 'Solar Radiation (cal/cm^2/day)'] = math.nan

In [5]:
# show_missing_data(preProcessed_df)

## 2. Fill up (or drop) the missing values

In [6]:
def show_missing_data(_df):
    df = _df.copy()
    total_cnt = df.shape[0]
    missing_cnt = df.shape[0]-df.dropna().shape[0]
    print(f'Total instances={total_cnt}, missing={missing_cnt}({round(missing_cnt*100.0/total_cnt, 2)}%)')

show_missing_data(preProcessed_df)

Total instances=4954, missing=880(17.76%)


**fillup with monthly average (of all years) for each station**

In [7]:
for column in preProcessed_df.columns:
    if column in ['Station', 'Year', 'Month', 'Day']:
        continue
        
    preProcessed_df[column] = preProcessed_df.groupby(['Station', 'Month'])[column].transform(
        lambda grp: grp.fillna(np.mean(grp))
    )
    
show_missing_data(preProcessed_df)

Total instances=4954, missing=210(4.24%)


In [8]:
# missing_df = preProcessed_df[(preProcessed_df['Station']=='Habiganj') & (preProcessed_df.isna().any(axis=1))]
# missing_df['Month'].value_counts()

In [9]:
# preProcessed_df[preProcessed_df['Station']=='Habiganj']['Month'].value_counts()

**Whole month data is missing (all years) for Habiganj, have to drop them**

In [10]:
preProcessed_df.dropna(inplace=True)
show_missing_data(preProcessed_df)

Total instances=4744, missing=0(0.0%)


In [11]:
preProcessed_df['Station'].value_counts()

Gazipur     1827
Barisal     1453
Rangpur     1127
Habiganj     337
Name: Station, dtype: int64

## 3. Drop unnecessary columns- 'Year', 'Day'

In [12]:
preProcessed_df.drop(columns=['Year', 'Day'], inplace=True)

## 4. Convert categorical column- 'Station' to numerical using One Hot Encoding

In [13]:
preProcessed_df = pd.get_dummies(preProcessed_df, columns=['Station'])
preProcessed_df.sample(5)

Unnamed: 0,Month,Max Temp. (degree Celcius),Min Temp. (degree Celcius),Rainfall (mm),Actual Evaporation (mm),"Relative Humidity (morning, %)","Relative Humidity (afternoon, %)",Sunshine (hour/day),Cloudy (hour/day),Solar Radiation (cal/cm^2/day),Station_Barisal,Station_Gazipur,Station_Habiganj,Station_Rangpur
3238,11,26.7,21.0,0.0,1.0,74.0,68.0,2.6,8.3,187.69,0,1,0,0
1229,5,32.8,25.0,0.0,3.0,91.0,76.0,5.6,7.6,369.07,1,0,0,0
3552,8,31.2,27.2,0.0,4.0,87.0,74.0,7.0,5.9,413.19,0,0,0,1
3710,2,26.3,10.4,0.0,2.0,82.0,42.0,7.0,4.2,331.5,0,0,0,1
2129,11,33.5,20.8,0.0,4.0,71.0,54.0,9.4,1.5,361.59,0,1,0,0


## Save the pre-processed dataset

In [14]:
preProcessed_df.to_csv(os.path.join('..', '..', 'Datasets', 'brri-datasets', 'pre-processed', 'brri-weather_preprocessed_regression.csv'), index=False)

## Form the classification dataset and save
- class0: 0
- class1: 0 to 20 
- class2: greather than 20

In [15]:
def rain_classifier(_df):
    df = _df.copy()
    
    data = df['Rainfall (mm)']
    arr = []
    for x in data:
        if x=='NIL':
            arr.append('NIL')
        else:
            if float(x)==0.0:
                arr.append(0)
            elif float(x)>0.0 and float(x)<=20.0:
                arr.append(1)
            elif float(x)>20.0:
                arr.append(2)
            else: 
                # add NaN values as is
                arr.append(x)
        
    df['Rainfall'] = arr

    return df

In [16]:
preProcessed_classification_df = rain_classifier(preProcessed_df)
preProcessed_classification_df.drop(columns=['Rainfall (mm)'])
preProcessed_classification_df.sample(5)

Unnamed: 0,Month,Max Temp. (degree Celcius),Min Temp. (degree Celcius),Rainfall (mm),Actual Evaporation (mm),"Relative Humidity (morning, %)","Relative Humidity (afternoon, %)",Sunshine (hour/day),Cloudy (hour/day),Solar Radiation (cal/cm^2/day),Station_Barisal,Station_Gazipur,Station_Habiganj,Station_Rangpur,Rainfall
2945,2,25.5,13.4,0.0,4.0,88.0,53.0,6.4,4.8,317.98,0,1,0,0,0
2000,7,32.4,26.4,15.4,5.4,84.0,91.0,0.0,13.5,188.2,0,1,0,0,1
1687,8,31.0,23.6,0.0,5.0,72.0,77.0,8.7,4.2,468.39,0,1,0,0,0
479,5,31.4,21.0,6.702752,3.71087,100.0,74.0,7.9,5.3,443.93,1,0,0,0,1
3947,9,26.5,22.8,10.7,2.0,95.0,91.0,0.0,12.2,160.2,0,0,0,1,1


In [17]:
preProcessed_classification_df.to_csv(os.path.join('..', '..', 'Datasets', 'brri-datasets', 'pre-processed', 'brri-weather_preprocessed_classification.csv'), index=False)

## Create separate train test datasets (ratio=80:20) and save

In [18]:
def splitTrainTest_and_scale(_df, class_label, is_regression=False):
    from sklearn.model_selection import train_test_split
    from sklearn.preprocessing import MinMaxScaler
    
    df = _df.copy()
    
    X_all = df.drop(columns=class_label)
    y_all = df[class_label]

    if(is_regression):
        X_train, X_test, y_train, y_test = train_test_split(X_all, y_all, test_size=0.2, random_state=42, shuffle=True)
    else:    
        X_train, X_test, y_train, y_test = train_test_split(X_all, y_all, test_size=0.2, random_state=42, stratify=y_all)

    # scale the dataset
    scaler = MinMaxScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_train = pd.DataFrame(X_train_scaled, index=X_train.index, columns=X_train.columns)
    X_test_scaled = scaler.transform(X_test)
    X_test = pd.DataFrame(X_test_scaled, index=X_test.index, columns=X_test.columns)
    
    # concat X, y
    train_df = pd.concat([X_train, y_train], axis=1).reset_index(drop=True)
    test_df = pd.concat([X_test, y_test], axis=1).reset_index(drop=True)
    
    return train_df, test_df

In [19]:
preProcessed_train_df, preProcessed_test_df = splitTrainTest_and_scale(preProcessed_df, \
                                                               class_label='Rainfall (mm)', \
                                                               is_regression=True)

preProcessed_classification_train_df, preProcessed_classification_test_df \
= splitTrainTest_and_scale(preProcessed_classification_df, class_label='Rainfall')

In [20]:
preProcessed_train_df.to_csv(os.path.join('..', '..', 'Datasets', 'brri-datasets', 'final-dataset', 'train', 'brri-weather_train_regression.csv'), index=False)
preProcessed_classification_train_df.to_csv(os.path.join('..', '..', 'Datasets', 'brri-datasets', 'final-dataset', 'train', 'brri-weather_train_classification.csv'), index=False)

preProcessed_test_df.to_csv(os.path.join('..', '..', 'Datasets', 'brri-datasets', 'final-dataset', 'test', 'brri-weather_test_regression.csv'), index=False)
preProcessed_classification_test_df.to_csv(os.path.join('..', '..', 'Datasets', 'brri-datasets', 'final-dataset', 'test', 'brri-weather_test_classification.csv'), index=False)