In [None]:
Perform the following operations using Python on the Air quality and Heart Diseases data sets
a. Data cleaning
b. Data integration
c. Data transformation
d. Error correcting
e. Data model building

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns


from sklearn.model_selection import train_test_split
from sklearn.preprocessing import PowerTransformer
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import r2_score,mean_absolute_error,mean_squared_error
from sklearn.ensemble import RandomForestRegressor

In [2]:
# Load the air-quality CSV; the file is semicolon-delimited.
# Columns written with a decimal comma arrive as strings and are converted
# to float in a later cell.
df = pd.read_csv('AirQuality.csv', sep=';')

In [3]:
# Display the frame exactly as loaded, before any cleaning.
df

Unnamed: 0,Date,Time,CO(GT),PT08.S1(CO),NMHC(GT),C6H6(GT),PT08.S2(NMHC),NOx(GT),PT08.S3(NOx),NO2(GT),PT08.S4(NO2),PT08.S5(O3),T,RH,AH,Unnamed: 15,Unnamed: 16
0,10/03/2004,18.00.00,26,1360.0,150.0,119,1046.0,166.0,1056.0,113.0,1692.0,1268.0,136,489,07578,,
1,10/03/2004,19.00.00,2,1292.0,112.0,94,955.0,103.0,1174.0,92.0,1559.0,972.0,133,477,07255,,
2,10/03/2004,20.00.00,22,1402.0,88.0,90,939.0,131.0,1140.0,114.0,1555.0,1074.0,119,540,07502,,
3,10/03/2004,21.00.00,22,1376.0,80.0,92,948.0,172.0,1092.0,122.0,1584.0,1203.0,110,600,07867,,
4,10/03/2004,22.00.00,16,1272.0,51.0,65,836.0,131.0,1205.0,116.0,1490.0,1110.0,112,596,07888,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9466,,,,,,,,,,,,,,,,,
9467,,,,,,,,,,,,,,,,,
9468,,,,,,,,,,,,,,,,,
9469,,,,,,,,,,,,,,,,,


In [4]:
# Data cleaning: discard the two unnamed columns and the Date/Time columns,
# which the later cells shown in this notebook do not use.
unused_columns = ['Unnamed: 15', 'Unnamed: 16', 'Date', 'Time']
df = df.drop(columns=unused_columns)

In [5]:
 # Count the missing values per column after the unused columns were dropped.
 print("Missing values in each column:\n", df.isnull().sum())

Missing values in each column:
 CO(GT)           114
PT08.S1(CO)      114
NMHC(GT)         114
C6H6(GT)         114
PT08.S2(NMHC)    114
NOx(GT)          114
PT08.S3(NOx)     114
NO2(GT)          114
PT08.S4(NO2)     114
PT08.S5(O3)      114
T                114
RH               114
AH               114
dtype: int64


In [6]:
# Display the frame after the unused columns were dropped.
df



Unnamed: 0,CO(GT),PT08.S1(CO),NMHC(GT),C6H6(GT),PT08.S2(NMHC),NOx(GT),PT08.S3(NOx),NO2(GT),PT08.S4(NO2),PT08.S5(O3),T,RH,AH
0,26,1360.0,150.0,119,1046.0,166.0,1056.0,113.0,1692.0,1268.0,136,489,07578
1,2,1292.0,112.0,94,955.0,103.0,1174.0,92.0,1559.0,972.0,133,477,07255
2,22,1402.0,88.0,90,939.0,131.0,1140.0,114.0,1555.0,1074.0,119,540,07502
3,22,1376.0,80.0,92,948.0,172.0,1092.0,122.0,1584.0,1203.0,110,600,07867
4,16,1272.0,51.0,65,836.0,131.0,1205.0,116.0,1490.0,1110.0,112,596,07888
...,...,...,...,...,...,...,...,...,...,...,...,...,...
9466,,,,,,,,,,,,,
9467,,,,,,,,,,,,,
9468,,,,,,,,,,,,,
9469,,,,,,,,,,,,,


In [7]:
# Drop rows containing any missing value, then exact duplicate rows;
# ignore_index=True renumbers the remaining rows from 0.
df = df.dropna().drop_duplicates(ignore_index=True)
df

Unnamed: 0,CO(GT),PT08.S1(CO),NMHC(GT),C6H6(GT),PT08.S2(NMHC),NOx(GT),PT08.S3(NOx),NO2(GT),PT08.S4(NO2),PT08.S5(O3),T,RH,AH
0,26,1360.0,150.0,119,1046.0,166.0,1056.0,113.0,1692.0,1268.0,136,489,07578
1,2,1292.0,112.0,94,955.0,103.0,1174.0,92.0,1559.0,972.0,133,477,07255
2,22,1402.0,88.0,90,939.0,131.0,1140.0,114.0,1555.0,1074.0,119,540,07502
3,22,1376.0,80.0,92,948.0,172.0,1092.0,122.0,1584.0,1203.0,110,600,07867
4,16,1272.0,51.0,65,836.0,131.0,1205.0,116.0,1490.0,1110.0,112,596,07888
...,...,...,...,...,...,...,...,...,...,...,...,...,...
9321,31,1314.0,-200.0,135,1101.0,472.0,539.0,190.0,1374.0,1729.0,219,293,07568
9322,24,1163.0,-200.0,114,1027.0,353.0,604.0,179.0,1264.0,1269.0,243,237,07119
9323,24,1142.0,-200.0,124,1063.0,293.0,603.0,175.0,1241.0,1092.0,269,183,06406
9324,21,1003.0,-200.0,95,961.0,235.0,702.0,156.0,1041.0,770.0,283,135,05139


In [8]:
# Convert the decimal-comma string columns to real floats.
# regex=False makes the comma a literal match regardless of the installed
# pandas version's default for `regex` (older releases default to regex=True
# and warn on single-character patterns).
cols_to_fix = ['CO(GT)', 'C6H6(GT)', 'T', 'RH', 'AH']
for col in cols_to_fix:
    df[col] = df[col].str.replace(',', '.', regex=False).astype(float)

In [9]:
# Display the frame after the decimal-comma columns were converted to float.
df

Unnamed: 0,CO(GT),PT08.S1(CO),NMHC(GT),C6H6(GT),PT08.S2(NMHC),NOx(GT),PT08.S3(NOx),NO2(GT),PT08.S4(NO2),PT08.S5(O3),T,RH,AH
0,2.6,1360.0,150.0,11.9,1046.0,166.0,1056.0,113.0,1692.0,1268.0,13.6,48.9,0.7578
1,2.0,1292.0,112.0,9.4,955.0,103.0,1174.0,92.0,1559.0,972.0,13.3,47.7,0.7255
2,2.2,1402.0,88.0,9.0,939.0,131.0,1140.0,114.0,1555.0,1074.0,11.9,54.0,0.7502
3,2.2,1376.0,80.0,9.2,948.0,172.0,1092.0,122.0,1584.0,1203.0,11.0,60.0,0.7867
4,1.6,1272.0,51.0,6.5,836.0,131.0,1205.0,116.0,1490.0,1110.0,11.2,59.6,0.7888
...,...,...,...,...,...,...,...,...,...,...,...,...,...
9321,3.1,1314.0,-200.0,13.5,1101.0,472.0,539.0,190.0,1374.0,1729.0,21.9,29.3,0.7568
9322,2.4,1163.0,-200.0,11.4,1027.0,353.0,604.0,179.0,1264.0,1269.0,24.3,23.7,0.7119
9323,2.4,1142.0,-200.0,12.4,1063.0,293.0,603.0,175.0,1241.0,1092.0,26.9,18.3,0.6406
9324,2.1,1003.0,-200.0,9.5,961.0,235.0,702.0,156.0,1041.0,770.0,28.3,13.5,0.5139


In [10]:
# Pin the converted columns to float64 explicitly so that later numeric
# analysis does not run into dtype problems.
float_columns = ['CO(GT)', 'C6H6(GT)', 'T', 'RH', 'AH']
df = df.astype({name: 'float64' for name in float_columns})


In [11]:
# DATA INTEGRATION - remove missing and duplicate rows once more, now that
# the converted columns are numeric, before the column subsets are combined.
# NOTE(review): the displayed rows contain -200.0 in NMHC(GT); presumably a
# missing-value sentinel that dropna() does not catch - confirm against the
# dataset documentation.
df = df.dropna().drop_duplicates(ignore_index=True)
df

Unnamed: 0,CO(GT),PT08.S1(CO),NMHC(GT),C6H6(GT),PT08.S2(NMHC),NOx(GT),PT08.S3(NOx),NO2(GT),PT08.S4(NO2),PT08.S5(O3),T,RH,AH
0,2.6,1360.0,150.0,11.9,1046.0,166.0,1056.0,113.0,1692.0,1268.0,13.6,48.9,0.7578
1,2.0,1292.0,112.0,9.4,955.0,103.0,1174.0,92.0,1559.0,972.0,13.3,47.7,0.7255
2,2.2,1402.0,88.0,9.0,939.0,131.0,1140.0,114.0,1555.0,1074.0,11.9,54.0,0.7502
3,2.2,1376.0,80.0,9.2,948.0,172.0,1092.0,122.0,1584.0,1203.0,11.0,60.0,0.7867
4,1.6,1272.0,51.0,6.5,836.0,131.0,1205.0,116.0,1490.0,1110.0,11.2,59.6,0.7888
...,...,...,...,...,...,...,...,...,...,...,...,...,...
9321,3.1,1314.0,-200.0,13.5,1101.0,472.0,539.0,190.0,1374.0,1729.0,21.9,29.3,0.7568
9322,2.4,1163.0,-200.0,11.4,1027.0,353.0,604.0,179.0,1264.0,1269.0,24.3,23.7,0.7119
9323,2.4,1142.0,-200.0,12.4,1063.0,293.0,603.0,175.0,1241.0,1092.0,26.9,18.3,0.6406
9324,2.1,1003.0,-200.0,9.5,961.0,235.0,702.0,156.0,1041.0,770.0,28.3,13.5,0.5139


In [12]:
# Build two column subsets of the cleaned frame: the CO/C6H6/NO2 columns
# and the T/RH/AH columns.
gt_columns = ['CO(GT)', 'C6H6(GT)', 'NO2(GT)']
t_rh_ah_columns = ['T', 'RH', 'AH']
df1 = df[gt_columns]
df2 = df[t_rh_ah_columns]

# Integrate the subsets side by side (column-wise); both carry df's index.
merged_df = pd.concat([df1, df2], axis=1)

# Report the shape and the column names of the combined frame.
print("Shape of Merged DataFrame:", merged_df.shape)
print("Columns:", merged_df.columns.tolist())

Shape of Merged DataFrame: (9326, 6)
Columns: ['CO(GT)', 'C6H6(GT)', 'NO2(GT)', 'T', 'RH', 'AH']


In [13]:
# DATA TRANSFORMATION - trim outliers, then normalise the data
def remove_outliers(df, column, lower=0.01, upper=0.99):
    """Return the rows of ``df`` whose ``column`` value lies between two quantiles.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to filter; it is not modified.
    column : label
        Column whose values decide which rows are kept.
    lower, upper : float
        Quantile levels passed to ``Series.quantile``. Both resulting bounds
        are inclusive.

    Returns
    -------
    pandas.DataFrame
        The rows of ``df`` inside the bounds, keeping their index labels.
    """
    values = df[column]
    lower_bound = values.quantile(lower)
    upper_bound = values.quantile(upper)
    within_bounds = (values >= lower_bound) & (values <= upper_bound)
    return df[within_bounds]

In [14]:
# Work on a copy so the cleaned frame `df` stays untouched.
data = df.copy()

# Trim every column in turn. Each pass filters the frame left by the previous
# pass, so the quantile bounds of later columns are computed on the already
# reduced data.
for col in list(data.columns):
    data = remove_outliers(data, col)




In [15]:
# Renumber the rows from 0 after filtering and discard the old index labels.
data = data.reset_index(drop=True)

In [16]:
# Display the outlier-trimmed frame after its index was reset.
data

Unnamed: 0,CO(GT),PT08.S1(CO),NMHC(GT),C6H6(GT),PT08.S2(NMHC),NOx(GT),PT08.S3(NOx),NO2(GT),PT08.S4(NO2),PT08.S5(O3),T,RH,AH
0,2.6,1360.0,150.0,11.9,1046.0,166.0,1056.0,113.0,1692.0,1268.0,13.6,48.9,0.7578
1,2.0,1292.0,112.0,9.4,955.0,103.0,1174.0,92.0,1559.0,972.0,13.3,47.7,0.7255
2,2.2,1402.0,88.0,9.0,939.0,131.0,1140.0,114.0,1555.0,1074.0,11.9,54.0,0.7502
3,2.2,1376.0,80.0,9.2,948.0,172.0,1092.0,122.0,1584.0,1203.0,11.0,60.0,0.7867
4,1.6,1272.0,51.0,6.5,836.0,131.0,1205.0,116.0,1490.0,1110.0,11.2,59.6,0.7888
...,...,...,...,...,...,...,...,...,...,...,...,...,...
8186,3.1,1314.0,-200.0,13.5,1101.0,472.0,539.0,190.0,1374.0,1729.0,21.9,29.3,0.7568
8187,2.4,1163.0,-200.0,11.4,1027.0,353.0,604.0,179.0,1264.0,1269.0,24.3,23.7,0.7119
8188,2.4,1142.0,-200.0,12.4,1063.0,293.0,603.0,175.0,1241.0,1092.0,26.9,18.3,0.6406
8189,2.1,1003.0,-200.0,9.5,961.0,235.0,702.0,156.0,1041.0,770.0,28.3,13.5,0.5139


In [17]:
# Normalise: apply a Yeo-Johnson power transform to every column and rebuild
# a DataFrame carrying the original column names.
# NOTE(review): the transformer is fitted on the full frame, target column
# included, before the train/test split made in the modelling cell.
pt = PowerTransformer(method='yeo-johnson')
transformed_values = pt.fit_transform(data)
data = pd.DataFrame(transformed_values, columns=data.columns)

In [18]:
# Display the frame after the Yeo-Johnson power transform.
data

Unnamed: 0,CO(GT),PT08.S1(CO),NMHC(GT),C6H6(GT),PT08.S2(NMHC),NOx(GT),PT08.S3(NOx),NO2(GT),PT08.S4(NO2),PT08.S5(O3),T,RH,AH
0,0.717657,1.359569,3.144706,0.469115,0.650631,0.103915,0.852179,0.445186,0.842510,0.904265,-0.486426,0.029489,-0.528643
1,0.327523,1.066370,3.129000,0.061870,0.307194,-0.198976,1.302519,0.114069,0.486934,0.113039,-0.514202,-0.035260,-0.582953
2,0.449867,1.542256,3.117976,0.000834,0.247243,-0.064695,1.172217,0.461262,0.476305,0.384448,-0.640226,0.311461,-0.541562
3,0.449867,1.429023,3.114045,0.031204,0.280949,0.132892,0.989002,0.590807,0.553454,0.729606,-0.718001,0.656632,-0.478720
4,0.105996,0.980758,3.098274,-0.352444,-0.135384,-0.064695,1.421691,0.493493,0.304127,0.480566,-0.700944,0.633185,-0.475044
...,...,...,...,...,...,...,...,...,...,...,...,...,...
8186,1.095609,1.160873,-0.318385,0.751635,0.860143,1.598481,-1.045113,1.749857,-0.000479,2.155206,0.378672,-0.942879,-0.530348
8187,0.579915,0.519354,-0.318385,0.384235,0.578587,1.014008,-0.814888,1.556120,-0.285985,0.906956,0.660166,-1.182723,-0.605355
8188,0.579915,0.431499,-0.318385,0.555639,0.715238,0.720670,-0.818452,1.486217,-0.345247,0.432487,0.979363,-1.394451,-0.718341
8189,0.387731,-0.141017,-0.318385,0.077313,0.329710,0.438196,-0.462590,1.158373,-0.853732,-0.419952,1.157113,-1.564000,-0.901072


In [19]:
# ERROR CORRECTION
# Re-check for missing values after the transformation steps.
print("Missing Values:\n", data.isnull().sum())

Missing Values:
 CO(GT)           0
PT08.S1(CO)      0
NMHC(GT)         0
C6H6(GT)         0
PT08.S2(NMHC)    0
NOx(GT)          0
PT08.S3(NOx)     0
NO2(GT)          0
PT08.S4(NO2)     0
PT08.S5(O3)      0
T                0
RH               0
AH               0
dtype: int64


In [20]:
# Confirm that every column has a numeric dtype.
print("\nData Types:\n", data.dtypes)


Data Types:
 CO(GT)           float64
PT08.S1(CO)      float64
NMHC(GT)         float64
C6H6(GT)         float64
PT08.S2(NMHC)    float64
NOx(GT)          float64
PT08.S3(NOx)     float64
NO2(GT)          float64
PT08.S4(NO2)     float64
PT08.S5(O3)      float64
T                float64
RH               float64
AH               float64
dtype: object


In [21]:
# Count the values lying more than 3 standard deviations from the column mean.
# The parentheses matter: `/` binds tighter than `-`, so without them the
# expression is data - (mean / std), which is not a z-score.
z_scores = (data - data.mean()) / data.std()
extreme = (np.abs(z_scores) > 3).sum()
print("Extreme values(>3 std dev:\n", extreme)


Extreme values(>3 std dev:
 CO(GT)            36
PT08.S1(CO)      284
NMHC(GT)         754
C6H6(GT)          85
PT08.S2(NMHC)    284
NOx(GT)           43
PT08.S3(NOx)      20
NO2(GT)            0
PT08.S4(NO2)     284
PT08.S5(O3)        0
T                  0
RH                 0
AH                 0
dtype: int64


In [22]:
# MODEL BUILDING
# Split the data into train and test sets and fit three regressors.

# One seed shared by the split and the tree-based models so that a re-run
# reproduces the same metrics.
RANDOM_STATE = 6

# 'X' holds the features; 'y' is the target variable 'AH'.
X = data.drop('AH', axis=1)
y = data['AH']

# train_size=0.7 -> 70% of the rows for training, 30% held out for testing.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, train_size=0.7, random_state=RANDOM_STATE, shuffle=True
)

# Linear Regression model
lr_model = LinearRegression()
lr_model.fit(X_train, y_train)      # train the model
lr_pred = lr_model.predict(X_test)  # predict on the test data

# Decision Tree Regressor model (seeded: tree construction is randomised)
dt_model = DecisionTreeRegressor(random_state=RANDOM_STATE)
dt_model.fit(X_train, y_train)      # train the model
dt_pred = dt_model.predict(X_test)  # predict on the test data

# Random Forest Regressor model (seeded: bootstrapping is randomised)
rr_model = RandomForestRegressor(random_state=RANDOM_STATE)
rr_model.fit(X_train, y_train)      # train the model
rr_pred = rr_model.predict(X_test)  # predict on the test data

# Evaluate the performance of a model
def evaluate_model(name, y_true, y_pred):
    """Print R², MAE and MSE for one model's predictions.

    Parameters
    ----------
    name : str
        Label printed in the report header.
    y_true : array-like
        Ground-truth target values.
    y_pred : array-like
        Predicted target values, aligned with ``y_true``.

    Returns
    -------
    None
        The metrics are printed, not returned.
    """
    r2 = r2_score(y_true, y_pred)              # R-squared score
    mae = mean_absolute_error(y_true, y_pred)  # mean absolute error
    mse = mean_squared_error(y_true, y_pred)   # mean squared error
    print(f"\n{name} Model:")
    print(f" R² Score     : {r2:.4f}")
    print(f" MAE          : {mae:.4f}")
    print(f" MSE          : {mse:.4f}")
    # R² is a goodness-of-fit score for regression, not an accuracy (it can
    # even be negative), so the percentage line is labelled as R².
    print(f" R² as %      : {r2*100:.2f}%")

# Report the metrics of each fitted model on the held-out test set.
model_predictions = [
    ("Linear Regression", lr_pred),
    ("Decision Tree Regressor", dt_pred),
    ("Random Forest", rr_pred),
]
for model_name, predictions in model_predictions:
    evaluate_model(model_name, y_test, predictions)




Linear Regression Model:
 R² Score     : 0.8566
 MAE          : 0.2882
 MSE          : 0.1392
 Accuracy (%) : 85.66%

Decision Tree Regressor Model:
 R² Score     : 0.9936
 MAE          : 0.0486
 MSE          : 0.0062
 Accuracy (%) : 99.36%

Random Forest Model:
 R² Score     : 0.9984
 MAE          : 0.0226
 MSE          : 0.0016
 Accuracy (%) : 99.84%
