In [1]:
# Median imputation of the missing values in Step-1.csv; the imputed table is written to Step-2.csv.
# Only `pd` and `time` are referenced in the cells visible here; the other imports may be leftovers.
import pandas as pd
import numpy as np
import random
from sklearn import preprocessing
# NOTE(review): the alias 'msno8u' looks like a typo for the usual 'msno' — confirm nothing uses it before renaming.
import missingno as msno8u

import matplotlib.pyplot as plt
import seaborn as sns
import time

In [2]:
# Load the Step-1 air-quality table from disk into the DataFrame `file`.
file = pd.read_csv(
    filepath_or_buffer=r'C:\Users\tvams\OneDrive\Desktop\Imputation methods\Step-1.csv',
)

In [3]:
# Preview the first five rows of `file` to check its columns and values.
file.head(n=5)

Unnamed: 0,Date,Day,Month,Year,PM25,PM10,NO2,NH3,SO2,CO,Ozone,RH,WS,WD,SR,Temp,Dew,Pre,Season,IsWeekday
0,2017-01-01,1,1,2017,,326.82,56.41,55.61,28.14,3.030469,21.92,75.23,0.87,192.24,74.27,14.83,4.92,0.0,0,0
1,2017-01-02,2,1,2017,,323.0,69.2,53.96,21.63,3.030469,24.35,72.57,0.6,201.23,76.35,15.09,6.27,0.01,0,1
2,2017-01-03,3,1,2017,222.0,361.72,73.33,53.05,28.63,3.030469,24.44,74.1,0.51,228.49,76.33,16.32,3.85,0.0,0,1
3,2017-01-04,4,1,2017,206.84,303.91,51.46,47.04,22.16,3.030469,32.17,69.01,0.64,157.43,80.48,16.91,3.27,0.0,0,1
4,2017-01-05,5,1,2017,278.41,341.33,64.21,47.16,27.88,3.030469,32.03,70.46,0.52,191.89,75.36,18.0,1.26,0.0,0,1


In [4]:
# Before imputing: count the missing (NaN) entries in every column of `file`.
# These are the gaps that the median imputation further down fills in.
file.isna().sum()

Date           0
Day            0
Month          0
Year           0
PM25          52
PM10          91
NO2           48
NH3          173
SO2          104
CO           120
Ozone         93
RH            40
WS            73
WD            40
SR            36
Temp           0
Dew            0
Pre            0
Season         0
IsWeekday      0
dtype: int64

In [5]:
# Grand total of missing cells: per-column null counts summed over all columns.
print("total missing data=", file.isna().sum().sum())

total missing data= 870


In [6]:
# Percentage of missing values per column, largest first:
# null count per column, sorted descending, divided by the row count and scaled to percent.
file.isna().sum().sort_values(ascending=False).div(len(file)).mul(100)

NH3          6.768388
CO           4.694836
SO2          4.068858
Ozone        3.638498
PM10         3.560250
WS           2.856025
PM25         2.034429
NO2          1.877934
RH           1.564945
WD           1.564945
SR           1.408451
Temp         0.000000
Season       0.000000
Pre          0.000000
Dew          0.000000
Date         0.000000
Day          0.000000
Year         0.000000
Month        0.000000
IsWeekday    0.000000
dtype: float64

In [7]:
# Overall share of missing cells in `file`, as a percentage of all cells (rows x columns).
n_rows, n_cols = file.shape
Missing_data = (file.isna().sum().sum() / (n_rows * n_cols)) * 100

# Show the row count first, then the overall missing percentage.
print(n_rows)
print(Missing_data)

2556
1.7018779342723005


In [8]:
# Median imputation of every numeric column of `file`.
# According to the null counts shown earlier, only 'PM25', 'PM10', 'NO2', 'NH3', 'SO2', 'CO',
# 'Ozone', 'RH', 'WS', 'WD' and 'SR' contain gaps; numeric columns without gaps pass through unchanged.
# NOTE(review): 'WD' looks like a wind direction in degrees — a plain median may not suit circular
# data; confirm this is acceptable.

# Keep only the numeric columns (the non-numeric 'Date' column is left out here).
cols_number = file.select_dtypes(include='number')
print(cols_number)

# Independent working copy of the numeric columns, so the imputation never touches `file`.
df_cols_number = cols_number.copy()

# Time the imputation with perf_counter: it is a monotonic, high-resolution clock meant for
# measuring short durations, whereas time.time() can report 0.0 for an operation this fast.
start_time = time.perf_counter()

# Fill each column's NaNs with that column's own median (vectorised; fillna aligns the
# Series of medians to the columns by label).
df_imputed = df_cols_number.fillna(df_cols_number.median())

end_time = time.perf_counter()
Time_taken = end_time - start_time

# Report the elapsed time in seconds.
print("Time spent for median imputation=", Time_taken)

      Day  Month  Year        PM25        PM10        NO2        NH3  \
0       1      1  2017         NaN  326.820000  56.410000  55.610000   
1       2      1  2017         NaN  323.000000  69.200000  53.960000   
2       3      1  2017  222.000000  361.720000  73.330000  53.050000   
3       4      1  2017  206.840000  303.910000  51.460000  47.040000   
4       5      1  2017  278.410000  341.330000  64.210000  47.160000   
...   ...    ...   ...         ...         ...        ...        ...   
2551   27     12  2023  281.364583  395.166667  68.088542  47.009375   
2552   28     12  2023  220.322917  336.229167  64.043750  43.350000   
2553   29     12  2023  270.114583  407.229167  71.351042  43.095833   
2554   30     12  2023  302.565781  455.343750  91.208333  38.090625   
2555   31     12  2023  266.760417  395.239583  78.645833  30.858333   

            SO2        CO      Ozone         RH        WS          WD  \
0     28.140000  3.030469  21.920000  75.230000  0.870000  192

In [9]:
# Sanity check after imputation: percentage of missing values per column of `df_imputed`,
# sorted descending (null count per column divided by the row count, times 100).
df_imputed.isna().sum().sort_values(ascending=False).div(len(df_imputed)).mul(100)

Day          0.0
RH           0.0
Season       0.0
Pre          0.0
Dew          0.0
Temp         0.0
SR           0.0
WD           0.0
WS           0.0
Ozone        0.0
Month        0.0
CO           0.0
SO2          0.0
NH3          0.0
NO2          0.0
PM10         0.0
PM25         0.0
Year         0.0
IsWeekday    0.0
dtype: float64

In [10]:
# Preview the first five rows of the imputed frame `df_imputed`.
df_imputed.head(n=5)

Unnamed: 0,Day,Month,Year,PM25,PM10,NO2,NH3,SO2,CO,Ozone,RH,WS,WD,SR,Temp,Dew,Pre,Season,IsWeekday
0,1,1,2017,80.47,326.82,56.41,55.61,28.14,3.030469,21.92,75.23,0.87,192.24,74.27,14.83,4.92,0.0,0,0
1,2,1,2017,80.47,323.0,69.2,53.96,21.63,3.030469,24.35,72.57,0.6,201.23,76.35,15.09,6.27,0.01,0,1
2,3,1,2017,222.0,361.72,73.33,53.05,28.63,3.030469,24.44,74.1,0.51,228.49,76.33,16.32,3.85,0.0,0,1
3,4,1,2017,206.84,303.91,51.46,47.04,22.16,3.030469,32.17,69.01,0.64,157.43,80.48,16.91,3.27,0.0,0,1
4,5,1,2017,278.41,341.33,64.21,47.16,27.88,3.030469,32.03,70.46,0.52,191.89,75.36,18.0,1.26,0.0,0,1


In [11]:
# Take the "Date" column from the original frame `file`; it was left out of `df_imputed`
# by the numeric-only column selection.
Date_col = file["Date"]

# Put "Date" back as the first column of `df_imputed`.
# DataFrame.insert raises ValueError when the column already exists, so the guard keeps
# this cell safe to re-run without restarting the kernel.
if "Date" not in df_imputed.columns:
    df_imputed.insert(0, "Date", Date_col)

In [12]:
# Preview the first five rows of `df_imputed` now that "Date" is its first column.
df_imputed.head(n=5)

Unnamed: 0,Date,Day,Month,Year,PM25,PM10,NO2,NH3,SO2,CO,Ozone,RH,WS,WD,SR,Temp,Dew,Pre,Season,IsWeekday
0,2017-01-01,1,1,2017,80.47,326.82,56.41,55.61,28.14,3.030469,21.92,75.23,0.87,192.24,74.27,14.83,4.92,0.0,0,0
1,2017-01-02,2,1,2017,80.47,323.0,69.2,53.96,21.63,3.030469,24.35,72.57,0.6,201.23,76.35,15.09,6.27,0.01,0,1
2,2017-01-03,3,1,2017,222.0,361.72,73.33,53.05,28.63,3.030469,24.44,74.1,0.51,228.49,76.33,16.32,3.85,0.0,0,1
3,2017-01-04,4,1,2017,206.84,303.91,51.46,47.04,22.16,3.030469,32.17,69.01,0.64,157.43,80.48,16.91,3.27,0.0,0,1
4,2017-01-05,5,1,2017,278.41,341.33,64.21,47.16,27.88,3.030469,32.03,70.46,0.52,191.89,75.36,18.0,1.26,0.0,0,1


In [13]:
# Persist the imputed table as Step-2.csv; the row index is not written.
df_imputed.to_csv(
    path_or_buf=r'C:\Users\tvams\OneDrive\Desktop\Imputation methods\Step-2.csv',
    index=False,
)