In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os
import random
from datetime import datetime
from sklearn.impute import SimpleImputer
from sklearn.metrics import mean_squared_error
pd.set_option('expand_frame_repr', False)
import warnings
warnings.filterwarnings('ignore')

In [2]:
def MissingValueImputation(data):
    mse_df = pd.DataFrame(index=data.columns[1:], columns=['Median','Mean','Forward Fill','Backward Fill','Linear Interpolation'])
    
    for i in range(len(data.columns)-1):
        
        #Get the feature name and store a copy of that
        col = data.columns[i+1]
        df = data[['Date',col]].copy()
        
        #indices on non empty rows
        non_na = np.where(np.isnan(data[col]) == False)[0]
        
        #In there are no non empty rows -- Continue
        if len(non_na) == 0:
            continue
        
        random.seed(42)
        non_na_masked = random.sample(list(non_na), int(0.1*len(non_na)))
        if len(non_na_masked) == 0:
            continue
            
        df_masked = df.copy()
        df_masked.loc[non_na_masked, col] = np.NaN
        
        df_imputed = df_masked.copy()
        #print('\n *************** {} ***************'.format(col))
        
        #Median
        simpleImputation = SimpleImputer(missing_values=np.nan, strategy='median')
        feature_col = df_masked[col].values.reshape(-1, 1)
        imputed_col = simpleImputation.fit_transform(feature_col)
        df_imputed.loc[:,col] = imputed_col
        
        x1 = df.loc[non_na_masked,col]
        x2 = df_imputed.loc[non_na_masked,col]
        mse_median = mean_squared_error(x1,x2)
        #print('MSE MEDIAN: {}'.format(mse_median))
        
        #Mean
        simpleImputation = SimpleImputer(missing_values=np.nan, strategy='mean')
        feature_col = df_masked[col].values.reshape(-1, 1)
        imputed_col = simpleImputation.fit_transform(feature_col)
        df_imputed.loc[:,col] = imputed_col
        
        x1 = df.loc[non_na_masked,col]
        x2 = df_imputed.loc[non_na_masked,col]
        mse_mean = mean_squared_error(x1,x2)
        #print('MSE MEAN: {}'.format(mse_mean))
        
        #Forward fill
        df_imputed = df_masked.ffill()
        df_imputed = df_imputed.replace(np.NaN, 0)
        x1 = df.loc[non_na_masked,col]
        x2 = df_imputed.loc[non_na_masked,col]
        mse_ffill = mean_squared_error(x1,x2)
        #print('FFILL: {}'.format(mse_ffill))
        
        #Backward fill
        df_imputed = df_masked.bfill()
        df_imputed = df_imputed.replace(np.NaN, 0)
        x1 = df.loc[non_na_masked,col]
        x2 = df_imputed.loc[non_na_masked,col]
        mse_bfill = mean_squared_error(x1,x2)
        #print('BFILL: {}'.format(mse_bfill))
        
        #Linear Interpolation
        li_masked = pd.Series(df_masked[col])
        li_interpolated = li_masked.interpolate(method='linear', limit_direction='forward')
        df_imputed.loc[:,col] = li_interpolated
        
        x1 = df.loc[non_na_masked, col]
        x2 = df_imputed.loc[non_na_masked, col]
        mse_interpolate = mean_squared_error(x1,x2)
        #print('MSE INTERPOLATE: {}'.format(mse_interpolate))
        
        #Appending all mse values to the data frame
        mse_df.loc[col,:]  = [mse_median,mse_mean,mse_ffill,mse_bfill,mse_interpolate]
        
    print(mse_df,'\n')
    return mse_df

In [3]:
def PreprocessDataframe(path):
    df = pd.read_csv(path)
    df.rename(columns = {'From Date':'Date'}, inplace = True)
    available_imp=[]
    imp_cols = ['Date','PM2.5','PM10','NO','NO2','NOx','NH3','SO2','CO','Ozone']
    for c in imp_cols:
        if c in list(df.columns):
            available_imp.append(c)
    #print(available_imp)
    df_imp = df[available_imp]
    df_imp['Date'] = pd.to_datetime(df_imp['Date'])
    df_imp = df_imp.replace('None', np.NaN)
    df_imp[df_imp.columns[1:]] = df_imp[df_imp.columns[1:]].astype(float)
    
    MissingValueImputation(df_imp)

In [4]:
directory = '../../data/Raw'
files = os.listdir(directory)
csv_files = [file for file in files if file.endswith('.csv')]
csv_files

['KhindipadaBhandupWestMumbaiIITM.csv',
 'KandivaliEastMumbaiMPCB.csv',
 'BorivaliEastMumbaiIITM.csv',
 'BandraKurlaComplexMumbaiIITM.csv',
 'BandraMumbaiMPCB.csv',
 'VasaiWestMumbaiMPCB.csv',
 'VileParleWestMumbaiMPCB.csv',
 'DeonarMumbaiIITM.csv',
 'ColabaMumbaiMPCB.csv',
 'WorliMumbaiMPCB.csv',
 'NavyNagarColabaMumbaiIITM.csv',
 'MazgaonMumbaiIITM.csv',
 'SiddharthNagarWorliMumbaiIITM.csv',
 'MaladWestMumbaiIITM.csv',
 'MulundWestMumbaiMPCB.csv',
 'KurlaMumbaiMPCB.csv',
 'ChakalaAndheriEastMumbaiIITM.csv',
 'SionMumbaiMPCB.csv',
 'ChhatrapatiShivajiIntlAirportT2MumbaiMPCB.csv',
 'PowaiMumbaiMPCB.csv',
 'BorivaliEastMumbaiMPCB.csv']

In [5]:
for f in csv_files:
    path = os.path.join(directory,f)
    print('\n',f)
    PreprocessDataframe(path)


 KhindipadaBhandupWestMumbaiIITM.csv
            Median         Mean Forward Fill Backward Fill Linear Interpolation
PM2.5  1956.876682  1876.610962   176.178065    166.791252            36.583596
PM10    6010.46335  5784.350477   525.591023     449.53413           188.634023
NO      308.102585   306.729736   159.481665    163.592624           125.601536
NO2     427.523956   406.874046    118.54681    131.179634            77.795988
NOx     355.313939   334.313422    63.141288     90.678249            45.822649
NH3     203.410088     193.2407    95.403106     94.794526            84.616855
SO2      38.990591    34.718429     7.034394      8.379434             4.679459
CO        0.814931      0.79496     0.005536       0.00879             0.002346
Ozone  1585.392773  1427.932905   187.710322    186.911345           119.620371 


 KandivaliEastMumbaiMPCB.csv
            Median         Mean Forward Fill Backward Fill Linear Interpolation
PM2.5  1606.618766  1483.319995   144.677486    14

            Median         Mean Forward Fill Backward Fill Linear Interpolation
PM2.5   3639.44359  3471.490408   309.613107    200.443421            75.979682
PM10   8710.351884  8389.904228   650.401122    518.113254           210.427548
NO       93.206795    91.877004    47.849507     57.521875            46.283342
NO2    1565.274959  1413.503665   123.240058    136.494326             58.28769
NOx     212.388127      202.882    34.359865     22.142697            13.301801
NH3     850.618092   779.906944   107.393877     89.958428            51.128797
SO2     534.354113      474.558    32.306691     33.766049             19.11103
CO         0.40846     0.399634     0.012781      0.012215             0.005878
Ozone  2323.370415  2000.318638   137.044981    136.413434            77.286109 


 MazgaonMumbaiIITM.csv
             Median          Mean Forward Fill Backward Fill Linear Interpolation
PM2.5   6744.261153   6189.159652   686.235093    988.040634           342.879505
PM10   114

            Median         Mean Forward Fill Backward Fill Linear Interpolation
PM2.5  1240.612725  1175.899754    31.902206     29.400964             8.963598
PM10   4293.330853   4126.82408   153.215951    124.918633            46.571406
NO       43.514627    42.770374     3.173579      2.446548             1.038421
NO2      45.044913    42.556344     4.581871       5.74059              1.07214
NOx      72.178581    69.682778     10.04926      7.040482             2.463288
NH3     275.941211   251.241926     3.997168      3.919484             1.446774
SO2     241.323378   214.169087     4.602344      4.116655             1.071413
CO        0.035838     0.035604     0.001723      0.001312             0.000493
Ozone   673.175375   561.406043    28.821393     31.386937             16.18151 

