In [1]:
# required for jupyter notebook
%matplotlib inline 

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

import seaborn as sns
sns.set(rc={'figure.figsize':(8,6)}) # set sns figure size

import os
import math

In [2]:
# Read the merged weather/water-level CSV; 'NIL', 'nil' and empty cells are parsed as NaN.
missing_values = ['NIL', 'nil', '']
raw_csv_path = os.path.join(
    '..', '..', 'Datasets', 'water-level', 'barisal-weather-waterlevel_merged.csv'
)
raw_df = pd.read_csv(raw_csv_path, na_values=missing_values)

raw_df.head()

Unnamed: 0,Station,Year,Month,Day,Max Temp. (degree Celcius),Min Temp. (degree Celcius),Rainfall (mm),Actual Evaporation (mm),"Relative Humidity (morning, %)","Relative Humidity (afternoon, %)",Sunshine (hour/day),Cloudy (hour/day),Solar Radiation (cal/cm^2/day),MAX_WL(m),MIN_WL(m),AVE_WL(m)
0,Barisal,2017,1,1,25.2,13.8,0.0,1.0,94,62.0,6.3,4.5,277.72,1.17,0.13,0.65
1,Barisal,2017,1,2,25.4,13.8,0.0,1.0,94,62.0,5.5,5.3,257.52,1.1,0.11,0.65
2,Barisal,2017,1,3,25.6,13.4,0.0,1.0,94,61.0,6.2,4.6,275.2,1.05,0.08,0.65
3,Barisal,2017,1,4,25.4,14.0,0.0,1.0,94,64.0,6.3,4.5,277.72,0.99,0.04,0.6
4,Barisal,2017,1,5,23.0,13.0,0.0,1.0,94,68.0,4.0,6.8,219.63,0.95,0.02,0.55


## Pre-process the daily dataset

In [3]:
# Work on an independent deep copy so that raw_df stays untouched for the later sections.
preProcessed_df = raw_df.copy(deep=True)

### 1. Put NaN for invalid column values

**The rows for 24 June 2017 and for the whole of December 2019 contain invalid `Cloudy (hour/day)` values (greater than 24.0 hours).**

In [4]:
# The column is in hours per day, so any reading above 24.0 is invalid and is blanked to NaN.
cloudy_column = 'Cloudy (hour/day)'
invalid_cloudy_rows = preProcessed_df[cloudy_column] > 24.0
preProcessed_df.loc[invalid_cloudy_rows, cloudy_column] = math.nan

### 2. Fill missing values with the monthly mean (computed per month and year)

In [5]:
def show_missing_data(_df):
    """Print how many rows of ``_df`` contain at least one missing value.

    Parameters
    ----------
    _df : pandas.DataFrame
        Frame to inspect; it is only read, never modified.

    Returns
    -------
    None
        The summary ``Total instances=<rows>, missing=<rows with NaN>(<percent>%)``
        is written to stdout.
    """
    total_cnt = _df.shape[0]
    # Count the rows with any NaN directly instead of copying the frame and
    # materialising a dropna() result just to compare lengths.
    missing_cnt = int(_df.isna().any(axis=1).sum())
    # An empty frame has nothing missing; guard the division so it reports 0.0%
    # rather than raising ZeroDivisionError.
    missing_pct = round(missing_cnt * 100.0 / total_cnt, 2) if total_cnt else 0.0
    print(f'Total instances={total_cnt}, missing={missing_cnt}({missing_pct}%)')

# Baseline before any imputation: number of rows with at least one NaN.
show_missing_data(preProcessed_df)

Total instances=1453, missing=212(14.59%)


In [6]:
# Impute every measurement column with the mean of its own (Month, Year) group;
# the row-identifying columns are left as they are.
identifier_columns = ('Station', 'Year', 'Month', 'Day')
measurement_columns = [name for name in preProcessed_df.columns if name not in identifier_columns]

for measurement in measurement_columns:
    monthly_groups = preProcessed_df.groupby(['Month', 'Year'])[measurement]
    preProcessed_df[measurement] = monthly_groups.transform(
        lambda values: values.fillna(np.mean(values))
    )

show_missing_data(preProcessed_df)

Total instances=1453, missing=90(6.19%)


In [7]:
# Uncomment to list the rows that still contain NaN after the monthly-mean fill:
# preProcessed_df[preProcessed_df.isna().any(axis=1)]

**Some weather data are missing for the whole months of February 2018, May 2018 and December 2019, so the monthly mean for those months is itself NaN and cannot fill the gaps. These rows are dropped.**  

In [8]:
# Discard every row that still has a NaN, then confirm that none remain.
preProcessed_df = preProcessed_df.dropna()
show_missing_data(preProcessed_df)

Total instances=1363, missing=0(0.0%)


### 3. Drop unnecessary columns: Station, Year, Day

In [9]:
# Station, Year and Day are not kept in the pre-processed output; Month is retained.
columns_to_drop = ['Station', 'Year', 'Day']
preProcessed_df = preProcessed_df.drop(columns=columns_to_drop)
preProcessed_df.head()

Unnamed: 0,Month,Max Temp. (degree Celcius),Min Temp. (degree Celcius),Rainfall (mm),Actual Evaporation (mm),"Relative Humidity (morning, %)","Relative Humidity (afternoon, %)",Sunshine (hour/day),Cloudy (hour/day),Solar Radiation (cal/cm^2/day),MAX_WL(m),MIN_WL(m),AVE_WL(m)
0,1,25.2,13.8,0.0,1.0,94,62.0,6.3,4.5,277.72,1.17,0.13,0.65
1,1,25.4,13.8,0.0,1.0,94,62.0,5.5,5.3,257.52,1.1,0.11,0.65
2,1,25.6,13.4,0.0,1.0,94,61.0,6.2,4.6,275.2,1.05,0.08,0.65
3,1,25.4,14.0,0.0,1.0,94,64.0,6.3,4.5,277.72,0.99,0.04,0.6
4,1,23.0,13.0,0.0,1.0,94,68.0,4.0,6.8,219.63,0.95,0.02,0.55


## Pre-process the forecast-with-avg dataset

In [10]:
from custom_utils import get_avg_df

In [11]:
# Start again from the raw frame and apply the same cleaning as for the daily dataset.
forecast_input_df = raw_df.copy()

# Cloudy readings above 24.0 are invalid for this column, so blank them to NaN.
cloudy_column = 'Cloudy (hour/day)'
forecast_input_df.loc[forecast_input_df[cloudy_column] > 24.0, cloudy_column] = math.nan

# Fill the remaining NaN values with the (Month, Year) mean before forming the
# forecast-with-avg dataset; the row-identifying columns are skipped.
identifier_columns = ('Station', 'Year', 'Month', 'Day')
forecast_measurements = [name for name in forecast_input_df.columns if name not in identifier_columns]
for measurement in forecast_measurements:
    forecast_input_df[measurement] = (
        forecast_input_df
        .groupby(['Month', 'Year'])[measurement]
        .transform(lambda values: values.fillna(np.mean(values)))
    )

# get_avg_df is imported from custom_utils; its result is the forecast-with-avg frame.
preProcessed_forecastAvg_df = get_avg_df(forecast_input_df)

In [12]:
# Number of rows in the frame returned by get_avg_df that still contain at least one NaN.
show_missing_data(preProcessed_forecastAvg_df)

Total instances=1443, missing=84(5.82%)


In [13]:
# Remove the rows that still contain NaN in the averaged frame, then confirm none remain.
preProcessed_forecastAvg_df = preProcessed_forecastAvg_df.dropna()
show_missing_data(preProcessed_forecastAvg_df)

Total instances=1359, missing=0(0.0%)


In [14]:
# Sanity check: column names, dtypes and non-null counts of the averaged frame.
preProcessed_forecastAvg_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1359 entries, 0 to 1442
Data columns (total 12 columns):
 #   Column                                Non-Null Count  Dtype  
---  ------                                --------------  -----  
 0   Month                                 1359 non-null   int64  
 1   Avg Max Temp. (degree Celcius)        1359 non-null   float64
 2   Avg Min Temp. (degree Celcius)        1359 non-null   float64
 3   Avg Rainfall (mm)                     1359 non-null   float64
 4   Avg Actual Evaporation (mm)           1359 non-null   float64
 5   Avg Relative Humidity (morning, %)    1359 non-null   float64
 6   Avg Relative Humidity (afternoon, %)  1359 non-null   float64
 7   Avg Sunshine (hour/day)               1359 non-null   float64
 8   Avg Cloudy (hour/day)                 1359 non-null   float64
 9   Avg Solar Radiation (cal/cm^2/day)    1359 non-null   float64
 10  MAX_WL(m)                             1359 non-null   float64
 11  AVE_WL(m)        

## Save the pre-processed datasets

In [15]:
# Both pre-processed frames are written to the same folder, without the index column.
output_dir = os.path.join('..', '..', 'Datasets', 'water-level', 'pre-processed')
# to_csv does not create missing parent folders, so make sure the target exists first
# (otherwise a fresh checkout fails on this cell).
os.makedirs(output_dir, exist_ok=True)

preProcessed_df.to_csv(
    os.path.join(output_dir, 'weather-waterlevel_preprocessed.csv'), index=False
)
preProcessed_forecastAvg_df.to_csv(
    os.path.join(output_dir, 'weather-waterlevel-forecast-avg_preprocessed.csv'), index=False
)