In [3]:
# Step 1: Import libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error


In [4]:
df = pd.read_csv('/airquality_data - airquality_data.csv')

# View the first few rows
print("First 5 rows of the dataset:")
print(df.head())

First 5 rows of the dataset:
  stn_code       sampling_date           state   location agency  \
0    150.0  February - M021990  Andhra Pradesh  Hyderabad    NaN   
1    151.0  February - M021990  Andhra Pradesh  Hyderabad    NaN   
2    152.0  February - M021990  Andhra Pradesh  Hyderabad    NaN   
3    150.0     March - M031990  Andhra Pradesh  Hyderabad    NaN   
4    151.0     March - M031990  Andhra Pradesh  Hyderabad    NaN   

                                 type  so2   no2  rspm  spm  \
0  Residential, Rural and other Areas  4.8  17.4   NaN  NaN   
1                     Industrial Area  3.1   7.0   NaN  NaN   
2  Residential, Rural and other Areas  6.2  28.5   NaN  NaN   
3  Residential, Rural and other Areas  6.3  14.7   NaN  NaN   
4                     Industrial Area  4.7   7.5   NaN  NaN   

  location_monitoring_station  pm2_5        date  
0                         NaN    NaN  1990-02-01  
1                         NaN    NaN  1990-02-01  
2                         NaN 

  df = pd.read_csv('/airquality_data - airquality_data.csv')


In [5]:
df.tail()

Unnamed: 0,stn_code,sampling_date,state,location,agency,type,so2,no2,rspm,spm,location_monitoring_station,pm2_5,date
435737,SAMP,24-12-15,West Bengal,ULUBERIA,West Bengal State Pollution Control Board,RIRUO,22.0,50.0,143.0,,"Inside Rampal Industries,ULUBERIA",,2015-12-24
435738,SAMP,29-12-15,West Bengal,ULUBERIA,West Bengal State Pollution Control Board,RIRUO,20.0,46.0,171.0,,"Inside Rampal Industries,ULUBERIA",,2015-12-29
435739,,,andaman-and-nicobar-islands,,,,,,,,,,
435740,,,Lakshadweep,,,,,,,,,,
435741,,,Tripura,,,,,,,,,,


In [6]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 435742 entries, 0 to 435741
Data columns (total 13 columns):
 #   Column                       Non-Null Count   Dtype  
---  ------                       --------------   -----  
 0   stn_code                     291665 non-null  object 
 1   sampling_date                435739 non-null  object 
 2   state                        435742 non-null  object 
 3   location                     435739 non-null  object 
 4   agency                       286261 non-null  object 
 5   type                         430349 non-null  object 
 6   so2                          401096 non-null  float64
 7   no2                          419509 non-null  float64
 8   rspm                         395520 non-null  float64
 9   spm                          198355 non-null  float64
 10  location_monitoring_station  408251 non-null  object 
 11  pm2_5                        9314 non-null    float64
 12  date                         435735 non-null  object 
dtyp

In [7]:
df.describe()

Unnamed: 0,so2,no2,rspm,spm,pm2_5
count,401096.0,419509.0,395520.0,198355.0,9314.0
mean,10.829414,25.809623,108.832784,220.78348,40.791467
std,11.177187,18.503086,74.87243,151.395457,30.832525
min,0.0,0.0,0.0,0.0,3.0
25%,5.0,14.0,56.0,111.0,24.0
50%,8.0,22.0,90.0,187.0,32.0
75%,13.7,32.2,142.0,296.0,46.0
max,909.0,876.0,6307.033333,3380.0,504.0


In [8]:
df.isnull()

Unnamed: 0,stn_code,sampling_date,state,location,agency,type,so2,no2,rspm,spm,location_monitoring_station,pm2_5,date
0,False,False,False,False,True,False,False,False,True,True,True,True,False
1,False,False,False,False,True,False,False,False,True,True,True,True,False
2,False,False,False,False,True,False,False,False,True,True,True,True,False
3,False,False,False,False,True,False,False,False,True,True,True,True,False
4,False,False,False,False,True,False,False,False,True,True,True,True,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...
435737,False,False,False,False,False,False,False,False,False,True,False,True,False
435738,False,False,False,False,False,False,False,False,False,True,False,True,False
435739,True,True,False,True,True,True,True,True,True,True,True,True,True
435740,True,True,False,True,True,True,True,True,True,True,True,True,True


In [9]:
df.isnull().sum()

Unnamed: 0,0
stn_code,144077
sampling_date,3
state,0
location,3
agency,149481
type,5393
so2,34646
no2,16233
rspm,40222
spm,237387


In [10]:
# Step 3: Data Cleaning
# Drop duplicates
df = df.drop_duplicates()

# Drop unnecessary columns
df = df.drop(['stn_code', 'agency', 'sampling_date', 'location_monitoring_station', 'pm2_5'], axis=1)

# Convert numerical data types to float32 for memory efficiency
df['so2'] = df['so2'].astype('float32')
df['no2'] = df['no2'].astype('float32')
df['rspm'] = df['rspm'].astype('float32')
df['spm'] = df['spm'].astype('float32')

# Convert 'date' to string (or datetime if you want to use it)
df['date'] = df['date'].astype('string')


In [None]:
#from google.colab import drive
#drive.mount('/content/drive')

In [11]:
df.isnull().sum()

Unnamed: 0,0
state,0
location,3
type,5357
so2,34632
no2,16222
rspm,40035
spm,236908
date,7


In [12]:
# Step 4: Error Correcting - Fill missing values
for col in df.columns:
    if df[col].dtype == 'object' or df[col].dtype == 'string':
        df[col] = df[col].fillna(df[col].mode()[0])  # Categorical: use mode
    else:
        df[col] = df[col].fillna(df[col].mean())     # Numeric: use mean


In [13]:
# Step 5: Data Integration - Example by combining two subsets
subSet1 = df[['state', 'type']]
subSet2 = df[['state', 'location']]
concatenated_df = pd.concat([subSet1, subSet2], axis=1)
print("\nIntegrated Data (Sample):")
print(concatenated_df.head())



Integrated Data (Sample):
            state                                type           state  \
0  Andhra Pradesh  Residential, Rural and other Areas  Andhra Pradesh   
1  Andhra Pradesh                     Industrial Area  Andhra Pradesh   
2  Andhra Pradesh  Residential, Rural and other Areas  Andhra Pradesh   
3  Andhra Pradesh  Residential, Rural and other Areas  Andhra Pradesh   
4  Andhra Pradesh                     Industrial Area  Andhra Pradesh   

    location  
0  Hyderabad  
1  Hyderabad  
2  Hyderabad  
3  Hyderabad  
4  Hyderabad  


In [14]:
# Step 6: Data Transformation - Remove outliers
def remove_outliers(column):
    Q1 = column.quantile(0.25)
    Q3 = column.quantile(0.75)
    IQR = Q3 - Q1
    lower = Q1 - 1.5 * IQR
    upper = Q3 + 1.5 * IQR
    return column.clip(lower, upper)  # Replaces outliers with limits

for col in ['so2', 'no2', 'rspm', 'spm']:
    df[col] = remove_outliers(df[col])


In [15]:
# Step 7: Data Model Building - Predict 'so2' using other numeric columns
X = df[['no2', 'rspm', 'spm']]  # Features
y = df['so2']                   # Target

# Split dataset
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train model
model = LinearRegression()
model.fit(X_train, y_train)

# Predict and evaluate
y_pred = model.predict(X_test)
print("\nModel Evaluation:")
print("Mean Squared Error:", mean_squared_error(y_test, y_pred))
print("Model Coefficients:", model.coef_)
print("Model Intercept:", model.intercept_)



Model Evaluation:
Mean Squared Error: 34.72865415865434
Model Coefficients: [ 0.18175344  0.00997868 -0.01391259]
Model Intercept: 7.3876700018083215
