In [1]:
# required for jupyter notebook
%matplotlib inline 

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

import seaborn as sns
sns.set(rc={'figure.figsize':(8,6)}) # set sns figure size

import os
import math

In [2]:
# read raw csv by marking dropping missing values
missing_values = ['NIL', 'nil', '']
raw_df = pd.read_csv(os.path.join('..', '..', 'Datasets', 'brri-datasets', 'all-station_raw.csv'), 
                     na_values=missing_values)

raw_df.sample(5)

Unnamed: 0,Station,Year,Month,Day,Max Temp. (degree Celcius),Min Temp. (degree Celcius),Rainfall (mm),Actual Evaporation (mm),"Relative Humidity (morning, %)","Relative Humidity (afternoon, %)",Sunshine (hour/day),Cloudy (hour/day),Solar Radiation (cal/cm^2/day)
1627,Gazipur,2016,6,23,32.6,25.8,0.2,2.2,91.0,77.0,2.4,,267.39
2330,Gazipur,2018,5,27,35.6,27.4,0.0,3.0,70.0,65.0,8.8,4.5,472.08
2200,Gazipur,2018,1,17,24.9,12.4,0.0,1.0,83.0,65.0,4.6,6.1,229.5
787,Barisal,2019,3,7,28.6,21.0,0.0,4.0,90.0,53.0,8.5,3.4,435.01
4769,Habiganj,2019,12,29,26.8,17.8,0.0,,90.0,77.0,8.2,2.4,304.05


In [3]:
preProcessed_df = raw_df.copy()

## 1. Replace invalid values with NaN

- Max/Min Temp. (degree Celcius) > 50 
- Relative Humidity (afternoon, %) > 100, 
- Sunshine/Cloudy (hour/day) > 24, 
- Solar Radiation (cal/cm^2/day) > 1000 (from the box plot)

In [4]:
preProcessed_df.loc[preProcessed_df['Max Temp. (degree Celcius)'] > 50, 'Max Temp. (degree Celcius)'] = math.nan
preProcessed_df.loc[preProcessed_df['Min Temp. (degree Celcius)'] > 50, 'Min Temp. (degree Celcius)'] = math.nan
preProcessed_df.loc[preProcessed_df['Relative Humidity (afternoon, %)'] > 100, 'Relative Humidity (afternoon, %)'] = math.nan
preProcessed_df.loc[preProcessed_df['Sunshine (hour/day)'] > 24, 'Sunshine (hour/day)'] = math.nan
preProcessed_df.loc[preProcessed_df['Cloudy (hour/day)'] > 24, 'Cloudy (hour/day)'] = math.nan
preProcessed_df.loc[preProcessed_df['Solar Radiation (cal/cm^2/day)'] > 1000, 'Solar Radiation (cal/cm^2/day)'] = math.nan

In [5]:
# show_missing_data(preProcessed_df)

## 2. Fill up (or drop) the missing values

In [6]:
def show_missing_data(_df):
    df = _df.copy()
    total_cnt = df.shape[0]
    missing_cnt = df.shape[0]-df.dropna().shape[0]
    print(f'Total instances={total_cnt}, missing={missing_cnt}({round(missing_cnt*100.0/total_cnt, 2)}%)')

show_missing_data(preProcessed_df)

Total instances=4954, missing=880(17.76%)


**fillup with monthly average (of all years) for each station**

In [7]:
for column in preProcessed_df.columns:
    if column in ['Station', 'Year', 'Month', 'Day']:
        continue
        
    preProcessed_df[column] = preProcessed_df.groupby(['Station', 'Month'])[column].transform(
        lambda grp: grp.fillna(np.mean(grp))
    )
    
show_missing_data(preProcessed_df)

Total instances=4954, missing=210(4.24%)


In [8]:
# missing_df = preProcessed_df[(preProcessed_df['Station']=='Habiganj') & (preProcessed_df.isna().any(axis=1))]
# missing_df['Month'].value_counts()

In [9]:
# preProcessed_df[preProcessed_df['Station']=='Habiganj']['Month'].value_counts()

**Whole month data is missing (all years) for Habiganj, have to drop them**

In [10]:
preProcessed_df.dropna(inplace=True)
show_missing_data(preProcessed_df)

Total instances=4744, missing=0(0.0%)


In [11]:
preProcessed_df['Station'].value_counts()

Gazipur     1827
Barisal     1453
Rangpur     1127
Habiganj     337
Name: Station, dtype: int64

## 3. Drop unnecessary columns- 'Year', 'Day'

In [12]:
preProcessed_df.drop(columns=['Year', 'Day'], inplace=True)

## 4. Convert categorical column- 'Station' to numerical using One Hot Encoding

In [13]:
preProcessed_df = pd.get_dummies(preProcessed_df, columns=['Station'])
preProcessed_df.sample(5)

Unnamed: 0,Month,Max Temp. (degree Celcius),Min Temp. (degree Celcius),Rainfall (mm),Actual Evaporation (mm),"Relative Humidity (morning, %)","Relative Humidity (afternoon, %)",Sunshine (hour/day),Cloudy (hour/day),Solar Radiation (cal/cm^2/day),Station_Barisal,Station_Gazipur,Station_Habiganj,Station_Rangpur
3516,7,29.2,25.8,14.3,2.0,87.0,93.0,2.0,11.5,252.86,0,0,0,1
3463,6,27.8,24.2,7.2,2.0,83.0,67.0,6.0,7.6,384.0,0,0,0,1
370,1,19.0,11.6,0.0,1.0,93.0,74.0,0.2,10.6,123.65,1,0,0,0
864,5,32.6,21.8,0.0,4.0,90.0,76.0,6.2,7.0,388.6,1,0,0,0
2650,4,33.2,23.0,9.6,0.0,86.0,69.0,4.1,8.4,309.82,0,1,0,0


## Save the pre-processed dataset

In [14]:
preProcessed_df.to_csv(os.path.join('..', '..', 'Datasets', 'brri-datasets', 'pre-processed', 'brri-weather_preprocessed_regression.csv'), index=False)

## Form the classification dataset and save
- class0: 0
- class1: 0 to 20 
- class2: greather than 20

In [17]:
def rain_classifier(_df):
    df = _df.copy()
    
    data = df['Rainfall (mm)']
    arr = []
    for x in data:
        if x=='NIL':
            arr.append('NIL')
        else:
            if float(x)==0.0:
                arr.append(0)
            elif float(x)>0.0 and float(x)<=20.0:
                arr.append(1)
            elif float(x)>20.0:
                arr.append(2)
            else: 
                # add NaN values as is
                arr.append(x)
        
    df['Rainfall'] = arr

    return df

In [20]:
preProcessed_classification_df = rain_classifier(preProcessed_df)
preProcessed_classification_df.drop(columns=['Rainfall (mm)'])
preProcessed_classification_df.sample(5)

Unnamed: 0,Month,Max Temp. (degree Celcius),Min Temp. (degree Celcius),Rainfall (mm),Actual Evaporation (mm),"Relative Humidity (morning, %)","Relative Humidity (afternoon, %)",Sunshine (hour/day),Cloudy (hour/day),Solar Radiation (cal/cm^2/day),Station_Barisal,Station_Gazipur,Station_Habiganj,Station_Rangpur,Rainfall
1055,11,26.0,17.8,0.0,2.0,88.0,58.0,7.4,3.6,217.69,1,0,0,0,0
246,9,30.2,27.2,2.6,2.6,96.0,88.0,0.3,11.9,171.79623,1,0,0,0,1
4154,4,28.5,20.7,0.0,2.0,86.0,83.0,2.0,10.5,240.49,0,0,0,1,0
1981,6,29.8,24.8,105.0,3.969173,100.0,100.0,0.0,13.6,190.2,0,1,0,0,2
2830,10,33.4,25.8,5.4,1.4,80.0,73.0,2.8,8.7,225.26,0,1,0,0,1


In [21]:
preProcessed_classification_df.to_csv(os.path.join('..', '..', 'Datasets', 'brri-datasets', 'pre-processed', 'brri-weather_preprocessed_classification.csv'), index=False)