In [1]:
# Data Preprocessing and EDA

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import MinMaxScaler

In [2]:
#Load & Download Data

In [3]:
air_aware_data = pd.read_csv(r"C:\Users\Dell\Downloads\air+quality\AirQualityUCI.csv", sep =';',decimal = ',')

In [4]:
# remove the last 2 columns from the dataframe
air_aware_data = air_aware_data.iloc[:,:-2]

In [5]:
# air_aware_data.loc[[9356]] represents the last datapoint in the dataframe, and the remaining rows are 
 #null values    (9357th row)(9356 index)

In [6]:
air_aware_data = air_aware_data.head(9357)

In [7]:
air_aware_data.head()

Unnamed: 0,Date,Time,CO(GT),PT08.S1(CO),NMHC(GT),C6H6(GT),PT08.S2(NMHC),NOx(GT),PT08.S3(NOx),NO2(GT),PT08.S4(NO2),PT08.S5(O3),T,RH,AH
0,10/03/2004,18.00.00,2.6,1360.0,150.0,11.9,1046.0,166.0,1056.0,113.0,1692.0,1268.0,13.6,48.9,0.7578
1,10/03/2004,19.00.00,2.0,1292.0,112.0,9.4,955.0,103.0,1174.0,92.0,1559.0,972.0,13.3,47.7,0.7255
2,10/03/2004,20.00.00,2.2,1402.0,88.0,9.0,939.0,131.0,1140.0,114.0,1555.0,1074.0,11.9,54.0,0.7502
3,10/03/2004,21.00.00,2.2,1376.0,80.0,9.2,948.0,172.0,1092.0,122.0,1584.0,1203.0,11.0,60.0,0.7867
4,10/03/2004,22.00.00,1.6,1272.0,51.0,6.5,836.0,131.0,1205.0,116.0,1490.0,1110.0,11.2,59.6,0.7888


In [8]:
air_aware_data.tail()

Unnamed: 0,Date,Time,CO(GT),PT08.S1(CO),NMHC(GT),C6H6(GT),PT08.S2(NMHC),NOx(GT),PT08.S3(NOx),NO2(GT),PT08.S4(NO2),PT08.S5(O3),T,RH,AH
9352,04/04/2005,10.00.00,3.1,1314.0,-200.0,13.5,1101.0,472.0,539.0,190.0,1374.0,1729.0,21.9,29.3,0.7568
9353,04/04/2005,11.00.00,2.4,1163.0,-200.0,11.4,1027.0,353.0,604.0,179.0,1264.0,1269.0,24.3,23.7,0.7119
9354,04/04/2005,12.00.00,2.4,1142.0,-200.0,12.4,1063.0,293.0,603.0,175.0,1241.0,1092.0,26.9,18.3,0.6406
9355,04/04/2005,13.00.00,2.1,1003.0,-200.0,9.5,961.0,235.0,702.0,156.0,1041.0,770.0,28.3,13.5,0.5139
9356,04/04/2005,14.00.00,2.2,1071.0,-200.0,11.9,1047.0,265.0,654.0,168.0,1129.0,816.0,28.5,13.1,0.5028


In [9]:
# Data Cleaning


In [10]:
air_aware_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9357 entries, 0 to 9356
Data columns (total 15 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Date           9357 non-null   object 
 1   Time           9357 non-null   object 
 2   CO(GT)         9357 non-null   float64
 3   PT08.S1(CO)    9357 non-null   float64
 4   NMHC(GT)       9357 non-null   float64
 5   C6H6(GT)       9357 non-null   float64
 6   PT08.S2(NMHC)  9357 non-null   float64
 7   NOx(GT)        9357 non-null   float64
 8   PT08.S3(NOx)   9357 non-null   float64
 9   NO2(GT)        9357 non-null   float64
 10  PT08.S4(NO2)   9357 non-null   float64
 11  PT08.S5(O3)    9357 non-null   float64
 12  T              9357 non-null   float64
 13  RH             9357 non-null   float64
 14  AH             9357 non-null   float64
dtypes: float64(13), object(2)
memory usage: 1.1+ MB


In [11]:
# checking the missing value in the data frame
air_aware_data.isnull().sum()

Date             0
Time             0
CO(GT)           0
PT08.S1(CO)      0
NMHC(GT)         0
C6H6(GT)         0
PT08.S2(NMHC)    0
NOx(GT)          0
PT08.S3(NOx)     0
NO2(GT)          0
PT08.S4(NO2)     0
PT08.S5(O3)      0
T                0
RH               0
AH               0
dtype: int64

In [50]:
# This shows that there are no missing values in the dataset. But the actual missing values are tagged with the
# value "-200". Described in the UCI repository documentation

In [51]:
# counting the number of times -200 appears in the data
#air_aware_data.isin([-200]).sum(axis = 0)

Date                0
Time                0
CO(GT)           1683
PT08.S1(CO)       366
NMHC(GT)         8443
C6H6(GT)          366
PT08.S2(NMHC)     366
NOx(GT)          1639
PT08.S3(NOx)      366
NO2(GT)          1642
PT08.S4(NO2)      366
PT08.S5(O3)       366
T                 366
RH                366
AH                366
dtype: int64

In [54]:
# Handle the missing values
#Convert all -200 to NaN
#Replace all NaN values with the mean of that specific column

In [55]:
#air_aware_data = air_aware_data.replace(to_replace = -200 , value = np.NaN)

In [56]:
#air_aware_data.isnull().sum()

Date                0
Time                0
CO(GT)           1683
PT08.S1(CO)       366
NMHC(GT)         8443
C6H6(GT)          366
PT08.S2(NMHC)     366
NOx(GT)          1639
PT08.S3(NOx)      366
NO2(GT)          1642
PT08.S4(NO2)      366
PT08.S5(O3)       366
T                 366
RH                366
AH                366
dtype: int64

In [57]:
#air_aware_data.tail()

Unnamed: 0,Date,Time,CO(GT),PT08.S1(CO),NMHC(GT),C6H6(GT),PT08.S2(NMHC),NOx(GT),PT08.S3(NOx),NO2(GT),PT08.S4(NO2),PT08.S5(O3),T,RH,AH
9352,04/04/2005,10.00.00,3.1,1314.0,,13.5,1101.0,472.0,539.0,190.0,1374.0,1729.0,21.9,29.3,0.7568
9353,04/04/2005,11.00.00,2.4,1163.0,,11.4,1027.0,353.0,604.0,179.0,1264.0,1269.0,24.3,23.7,0.7119
9354,04/04/2005,12.00.00,2.4,1142.0,,12.4,1063.0,293.0,603.0,175.0,1241.0,1092.0,26.9,18.3,0.6406
9355,04/04/2005,13.00.00,2.1,1003.0,,9.5,961.0,235.0,702.0,156.0,1041.0,770.0,28.3,13.5,0.5139
9356,04/04/2005,14.00.00,2.2,1071.0,,11.9,1047.0,265.0,654.0,168.0,1129.0,816.0,28.5,13.1,0.5028


In [12]:
# Drop completely empty columns
air_aware_data = air_aware_data.dropna(axis=1, how='all')

In [13]:
# Combine Date + Time into a timestamp column
air_aware_data['timestamp'] = pd.to_datetime(air_aware_data['Date'] + ' ' + air_aware_data['Time'],
                                              dayfirst=True, errors='coerce')

  air_aware_data['timestamp'] = pd.to_datetime(air_aware_data['Date'] + ' ' + air_aware_data['Time'],


In [14]:
# Drop original Date and Time columns
air_aware_data.drop(columns=['Date','Time'], inplace=True)

In [15]:
# Drop rows where the timestamp could not be parsed
air_aware_data.dropna(subset=['timestamp'], inplace=True)

# Sort by timestamp
air_aware_data.sort_values('timestamp', inplace=True)
air_aware_data.reset_index(drop=True, inplace=True)


In [16]:
air_aware_data.head()

Unnamed: 0,CO(GT),PT08.S1(CO),NMHC(GT),C6H6(GT),PT08.S2(NMHC),NOx(GT),PT08.S3(NOx),NO2(GT),PT08.S4(NO2),PT08.S5(O3),T,RH,AH,timestamp


In [63]:
#Data Cleaning

In [18]:
# Remove duplicates
air_aware_data.drop_duplicates(inplace=True)

# Fill missing values (forward fill, then backwards fill)
air_aware_data.fillna(method='ffill', inplace=True)
air_aware_data.fillna(method='bfill', inplace=True)

# Replace -200 with NaN for pollutants
pollutants = ['CO(GT)','PT08.S1(CO)','NMHC(GT)','C6H6(GT)','NOx(GT)','NO2(GT)']

for col in pollutants:
    if col in air_aware_data.columns:
        air_aware_data[col] = air_aware_data[col].replace(-200, np.nan)

# Fill missing values (forward fill then backward fill)
air_aware_data[pollutants] = air_aware_data[pollutants].ffill().bfill()

# Scale features between 0-1
scaler = MinMaxScaler()
air_aware_data[pollutants] = scaler.fit_transform(air_aware_data[pollutants])


  air_aware_data.fillna(method='ffill', inplace=True)
  air_aware_data.fillna(method='bfill', inplace=True)


ValueError: Found array with 0 sample(s) (shape=(0, 6)) while a minimum of 1 is required by MinMaxScaler.