In [1]:
# required for jupyter notebook
%matplotlib inline 

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

import seaborn as sns
sns.set(rc={'figure.figsize':(8,6)}) # set sns figure size

import os
import math

In [2]:
# Load the raw all-station CSV. The placeholders 'NIL' / 'nil' and empty
# fields are parsed as NaN so that they count as missing values later on.
missing_values = ['NIL', 'nil', '']
raw_csv_path = os.path.join('..', '..', 'Datasets', 'brri-datasets', 'all-station_raw.csv')
raw_df = pd.read_csv(raw_csv_path, na_values=missing_values)

# Show five randomly chosen rows as a quick sanity check of the parsed columns.
raw_df.sample(5)

Unnamed: 0,Station,Year,Month,Day,Max Temp. (degree Celcius),Min Temp. (degree Celcius),Rainfall (mm),Actual Evaporation (mm),"Relative Humidity (morning, %)","Relative Humidity (afternoon, %)",Sunshine (hour/day),Cloudy (hour/day),Solar Radiation (cal/cm^2/day)
3211,Gazipur,2020,10,24,28.4,24.2,1.2,0.0,95.0,95.0,0.0,11.5,144.4
636,Barisal,2018,10,7,34.6,25.4,0.0,3.0,96.0,59.0,8.0,3.6,379.66
1862,Gazipur,2017,2,13,30.7,18.2,0.0,4.0,60.0,39.0,7.3,3.9,343.37
4477,Habiganj,2019,3,12,32.2,20.8,0.0,3.0,86.0,83.0,3.7,8.1,279.86
3197,Gazipur,2020,10,10,36.2,27.2,0.0,5.0,71.0,62.0,6.3,5.2,326.34


In [3]:
# Work on an independent (deep) copy so that raw_df keeps the values exactly as loaded.
preProcessed_df = raw_df.copy(deep=True)

## 1. Replace invalid values with NaN

- Max/Min Temp. (degree Celcius) > 50
- Relative Humidity (afternoon, %) > 100
- Sunshine/Cloudy (hour/day) > 24
- Solar Radiation (cal/cm^2/day) > 1000 (threshold chosen from the box plot)

In [4]:
# Upper plausibility bound for each measurement column. A reading strictly
# above its bound is treated as invalid and replaced with NaN, so it is
# handled together with the other missing values in the following steps.
# NOTE(review): 'Relative Humidity (morning, %)' has no bound here - confirm
# that this is intentional.
upper_limits = {
    'Max Temp. (degree Celcius)': 50,
    'Min Temp. (degree Celcius)': 50,
    'Relative Humidity (afternoon, %)': 100,
    'Sunshine (hour/day)': 24,
    'Cloudy (hour/day)': 24,
    'Solar Radiation (cal/cm^2/day)': 1000,
}

for column_name, upper_limit in upper_limits.items():
    out_of_range = preProcessed_df[column_name] > upper_limit
    preProcessed_df.loc[out_of_range, column_name] = math.nan

In [5]:
# show_missing_data(preProcessed_df)

## 2. Fill in (or drop) the missing values

In [6]:
def show_missing_data(_df):
    """Print how many rows of ``_df`` contain at least one missing value.

    Parameters
    ----------
    _df : pandas.DataFrame
        Frame to inspect. It is only read, never modified.

    Returns
    -------
    None
        The summary is printed in the form
        ``Total instances=<rows>, missing=<rows with a NaN>(<percent>%)``.
    """
    total_cnt = _df.shape[0]
    # A row counts as missing when any of its cells is NaN/None - the same
    # rows that dropna() would remove - but without copying the whole frame.
    missing_cnt = int(_df.isna().any(axis=1).sum())
    # Guard the division so that an empty frame reports 0.0% instead of
    # raising ZeroDivisionError.
    missing_pct = round(missing_cnt * 100.0 / total_cnt, 2) if total_cnt else 0.0
    print(f'Total instances={total_cnt}, missing={missing_cnt}({missing_pct}%)')

# Report how many rows contain a missing value before any filling is done.
show_missing_data(preProcessed_df)

Total instances=4954, missing=880(17.76%)


**Fill missing values with the monthly average (across all years) for each station**

In [7]:
# Every column except the identifying / date columns holds a measurement
# that may need to be filled.
value_columns = [
    column for column in preProcessed_df.columns
    if column not in ['Station', 'Year', 'Month', 'Day']
]

# Mean of every measurement per (Station, Month), i.e. pooled over all years,
# broadcast back onto the original rows. The groupby is evaluated once for
# all columns and uses pandas' built-in 'mean' (NaN values are skipped)
# instead of calling a Python lambda per group and per column.
monthly_means = preProcessed_df.groupby(['Station', 'Month'])[value_columns].transform('mean')

for column in value_columns:
    # Only NaN cells are replaced. A cell stays NaN when its whole
    # (Station, Month) group has no valid value, because the group mean is
    # then NaN as well.
    preProcessed_df[column] = preProcessed_df[column].fillna(monthly_means[column])

show_missing_data(preProcessed_df)

Total instances=4954, missing=210(4.24%)


In [8]:
# missing_df = preProcessed_df[(preProcessed_df['Station']=='Habiganj') & (preProcessed_df.isna().any(axis=1))]
# missing_df['Month'].value_counts()

In [9]:
# preProcessed_df[preProcessed_df['Station']=='Habiganj']['Month'].value_counts()

**For Habiganj, data for entire months is missing in every year, so there is no monthly average to fill them with; these rows have to be dropped**

In [10]:
# Discard every row that still has at least one NaN after the fill step,
# then print the missing-row summary again to verify the result.
preProcessed_df = preProcessed_df.dropna()
show_missing_data(preProcessed_df)

Total instances=4744, missing=0(0.0%)


In [11]:
# Number of rows per station in the current (cleaned) frame.
preProcessed_df['Station'].value_counts()

Gazipur     1827
Barisal     1453
Rangpur     1127
Habiganj     337
Name: Station, dtype: int64

## Save the pre-processed dataset

In [12]:
# Write the cleaned dataset next to the raw data. to_csv does not create
# missing parent directories, so make sure the target folder exists first.
output_dir = os.path.join('..', '..', 'Datasets', 'brri-datasets', 'pre-processed')
os.makedirs(output_dir, exist_ok=True)

# `index` is left at its default, so the DataFrame index is written as the
# first (unnamed) column of the file, exactly as before.
preProcessed_df.to_csv(os.path.join(output_dir, 'brri-weather_preprocessed.csv'))