# EXPLORATORY DATA ANALYSIS

## Import Libraries

In [16]:
import os

import pandas as pd
import plotly.express as px


## Load Datasets


In [17]:
# Both raw CSV files are read from the same source folder.
RAW_DATA_DIR = "Primitive Data"

# India air quality readings
india_data = pd.read_csv(f"{RAW_DATA_DIR}/air-quality-India.csv")

# Italy air quality readings
italy_data = pd.read_csv(f"{RAW_DATA_DIR}/air-quality-Italy.csv")

## DATASET 1: India Air Quality dataset

This dataset contains 29,531 air quality measurements from 26 Indian cities between 2015 and 2020, tracking 12 pollutants along with AQI scores and categories:
- **City**: Geographic identifier for 26 major Indian cities including Delhi, Mumbai, Bengaluru, Chennai, Kolkata, Hyderabad, Pune, Ahmedabad, and others  
- **Date**: Date on which measurements were recorded, spanning 2015-2020 at daily frequency  
- **PM2.5**: Ultra-fine particles ≤2.5 micrometers; penetrate deep into lungs/bloodstream causing cardiovascular disease, stroke, lung cancer (μg/m³)  
- **PM10**: Inhalable particles ≤10 micrometers including dust, pollen, mold; cause throat irritation, coughing, reduced lung function (μg/m³)  
- **NO**: Nitric oxide; primary pollutant from vehicle exhausts and high-temperature combustion processes (μg/m³)  
- **NO2**: Nitrogen dioxide; secondary pollutant formed when NO reacts with oxygen; causes respiratory inflammation, reduced immunity (μg/m³)  
- **NOx**: Combined nitrogen oxides (NO + NO2); indicator of traffic density and combustion sources (μg/m³)  
- **NH3**: Ammonia from agricultural fertilizers, livestock waste, vehicle catalytic converters; contributes to secondary PM2.5 formation (μg/m³)  
- **CO**: Carbon monoxide; colorless, odorless gas from incomplete fuel combustion; reduces oxygen delivery in blood, causes headaches, dizziness (mg/m³)  
- **SO2**: Sulfur dioxide from coal burning, oil refining, metal smelting; causes respiratory problems, acid rain formation (μg/m³)  
- **O3**: Ground-level ozone; not directly emitted but formed when NOx and VOCs react in sunlight; causes chest pain, coughing, asthma attacks (μg/m³)  
- **Benzene**: Known human carcinogen from gasoline, industrial solvents, cigarette smoke; causes leukemia, bone marrow damage (μg/m³)  
- **Toluene**: From paint thinners, nail polish, gasoline; affects central nervous system, causes fatigue, confusion (μg/m³)  
- **Xylene**: Industrial solvent in paints, rubber, leather; affects nervous system, liver, kidneys with prolonged exposure (μg/m³)  
- **AQI**: Air Quality Index; standardized 0-500+ scale combining multiple pollutants into single health-based number using worst pollutant (dimensionless)  
- **AQI_Bucket**: Categorical classification: Good (0-50), Satisfactory (51-100), Moderate (101-200), Poor (201-300), Very Poor (301-400), Severe (401-500+)  
- **CO2**: Carbon dioxide is not directly measured in this dataset, but is a major greenhouse gas from fossil fuel combustion, contributing to climate change and urban heat island effects (ppm)

In [18]:
# Preview the first five rows of the India data as loaded.
india_data.head(n=5)

Unnamed: 0,City,Date,PM2.5,PM10,NO,NO2,NOx,NH3,CO,SO2,O3,Benzene,Toluene,Xylene,AQI,AQI_Bucket
0,Ahmedabad,2015-01-01,,,0.92,18.22,17.15,,0.92,27.64,133.36,0.0,0.02,0.0,,
1,Ahmedabad,2015-01-02,,,0.97,15.69,16.46,,0.97,24.55,34.06,3.68,5.5,3.77,,
2,Ahmedabad,2015-01-03,,,17.4,19.3,29.7,,17.4,29.07,30.7,6.8,16.4,2.25,,
3,Ahmedabad,2015-01-04,,,1.7,18.48,17.97,,1.7,18.59,36.08,4.43,10.14,1.0,,
4,Ahmedabad,2015-01-05,,,22.1,21.42,37.76,,22.1,39.33,39.31,7.01,18.89,2.78,,


In [19]:
# Count missing entries per column (isna is the canonical spelling of isnull).
india_data.isna().sum()

City              0
Date              0
PM2.5          4598
PM10          11140
NO             3582
NO2            3585
NOx            4185
NH3           10328
CO             2059
SO2            3854
O3             4022
Benzene        5623
Toluene        8041
Xylene        18109
AQI            4681
AQI_Bucket     4681
dtype: int64

In [20]:
# Summarise column dtypes and non-null counts for the India data.
india_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 29531 entries, 0 to 29530
Data columns (total 16 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   City        29531 non-null  object 
 1   Date        29531 non-null  object 
 2   PM2.5       24933 non-null  float64
 3   PM10        18391 non-null  float64
 4   NO          25949 non-null  float64
 5   NO2         25946 non-null  float64
 6   NOx         25346 non-null  float64
 7   NH3         19203 non-null  float64
 8   CO          27472 non-null  float64
 9   SO2         25677 non-null  float64
 10  O3          25509 non-null  float64
 11  Benzene     23908 non-null  float64
 12  Toluene     21490 non-null  float64
 13  Xylene      11422 non-null  float64
 14  AQI         24850 non-null  float64
 15  AQI_Bucket  24850 non-null  object 
dtypes: float64(13), object(3)
memory usage: 3.6+ MB


In [21]:
# Drop every row that has at least one missing value; the frame is modified in place.
india_data.dropna(axis="index", how="any", inplace=True)

In [22]:
# Re-inspect dtypes and non-null counts now that incomplete rows have been dropped.
india_data.info()

<class 'pandas.core.frame.DataFrame'>
Index: 6236 entries, 2123 to 29529
Data columns (total 16 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   City        6236 non-null   object 
 1   Date        6236 non-null   object 
 2   PM2.5       6236 non-null   float64
 3   PM10        6236 non-null   float64
 4   NO          6236 non-null   float64
 5   NO2         6236 non-null   float64
 6   NOx         6236 non-null   float64
 7   NH3         6236 non-null   float64
 8   CO          6236 non-null   float64
 9   SO2         6236 non-null   float64
 10  O3          6236 non-null   float64
 11  Benzene     6236 non-null   float64
 12  Toluene     6236 non-null   float64
 13  Xylene      6236 non-null   float64
 14  AQI         6236 non-null   float64
 15  AQI_Bucket  6236 non-null   object 
dtypes: float64(13), object(3)
memory usage: 828.2+ KB


In [25]:
# Save the preprocessed India dataset to CSV.
# `os` is imported with the other libraries in the import cell at the top of the notebook.
india_output_dir = "Preprocessed Data"
india_output_path = f"{india_output_dir}/preprocessed_india_air_quality.csv"

# Create the output directory if it doesn't exist (no-op when it already does).
os.makedirs(india_output_dir, exist_ok=True)

# index=False keeps the DataFrame's row index out of the written file.
india_data.to_csv(india_output_path, index=False)

print(f"Preprocessed India dataset saved to: {india_output_path}")
print(f"Final dataset shape: {india_data.shape}")

Preprocessed India dataset saved to: Preprocessed Data/preprocessed_india_air_quality.csv
Final dataset shape: (6236, 16)


In [23]:
# Show the leading five rows of the India data in its current state.
india_data.head(n=5)

Unnamed: 0,City,Date,PM2.5,PM10,NO,NO2,NOx,NH3,CO,SO2,O3,Benzene,Toluene,Xylene,AQI,AQI_Bucket
2123,Amaravati,2017-11-25,81.4,124.5,1.44,20.5,12.08,10.72,0.12,15.24,127.09,0.2,6.5,0.06,184.0,Moderate
2124,Amaravati,2017-11-26,78.32,129.06,1.26,26.0,14.85,10.28,0.14,26.96,117.44,0.22,7.95,0.08,197.0,Moderate
2125,Amaravati,2017-11-27,88.76,135.32,6.6,30.85,21.77,12.91,0.11,33.59,111.81,0.29,7.63,0.12,198.0,Moderate
2126,Amaravati,2017-11-28,64.18,104.09,2.56,28.07,17.01,11.42,0.09,19.0,138.18,0.17,5.02,0.07,188.0,Moderate
2127,Amaravati,2017-11-29,72.47,114.84,5.23,23.2,16.59,12.25,0.16,10.55,109.74,0.21,4.71,0.08,173.0,Moderate


In [24]:

# Convert the Date column to datetime before using it as the x-axis.
india_data["Date"] = pd.to_datetime(india_data["Date"])

# Line chart: AQI against date, one coloured trace per city.
fig1 = px.line(
    india_data,
    x="Date",
    y="AQI",
    color="City",
    title="AQI Trend Over Time",
)
fig1.show()

# Box plot: AQI distribution for each city, with categories ordered by descending total.
fig2 = px.box(
    india_data,
    x="City",
    y="AQI",
    title="AQI Distribution by City",
)
fig2.update_layout(xaxis_categoryorder="total descending")
fig2.show()

# Scatter-plot matrix restricted to a handful of columns.
selected_features = ["PM2.5", "NO2", "CO", "O3", "AQI"]
fig3 = px.scatter_matrix(
    india_data[selected_features],
    title="Scatter Plot Matrix",
)
fig3.show()

## DATASET 2: Italy Air Quality dataset

This dataset contains 9,471 hourly measurements from March 2004 to February 2005, pairing 5 metal oxide chemical sensor responses (targeting CO, NMHC, NOx, NO2, and O3) with reference-analyzer concentrations of CO, NMHC, benzene, NOx, and NO2 at a roadside location in a polluted Italian city.
- **Date**: Measurement date in DD/MM/YYYY format, spanning March 2004 to February 2005  
- **Time**: Hourly timestamp in HH.MM.SS format for precise temporal tracking  
- **CO(GT)**: True carbon monoxide concentration from certified reference analyzer; toxic gas from incomplete combustion (mg/m³)  
- **NMHC(GT)**: True non-methane hydrocarbons concentration; volatile organic compounds excluding methane (μg/m³)  
- **C6H6(GT)**: True benzene concentration; known carcinogen from gasoline and industrial processes (μg/m³)  
- **NOx(GT)**: True nitrogen oxides concentration; traffic pollution indicator (ppb - parts per billion)  
- **NO2(GT)**: True nitrogen dioxide concentration; respiratory irritant from combustion (μg/m³)  
- **PT08.S1(CO)**: Tin oxide sensor response targeting CO detection; provides real-time CO monitoring capability (sensor units)  
- **PT08.S2(NMHC)**: Titania sensor response targeting non-methane hydrocarbons detection (sensor units)  
- **PT08.S3(NOx)**: Tungsten oxide sensor response targeting nitrogen oxides detection (sensor units)  
- **PT08.S4(NO2)**: Tungsten oxide sensor response targeting nitrogen dioxide detection (sensor units)  
- **PT08.S5(O3)**: Indium oxide sensor response targeting ozone detection; monitors photochemical pollution (sensor units)  
- **T**: Ambient temperature affecting sensor performance and pollutant behavior (°C)  
- **RH**: Relative humidity impacting sensor sensitivity and atmospheric chemistry (%)  
- **AH**: Absolute humidity providing complete moisture content measurement (g/m³)  


In [None]:
# Overview of the Italy dataset: its shape, followed by a preview of the leading rows.
overview_lines = (
    "Italy Air Quality Dataset - Basic Information:",
    f"Shape: {italy_data.shape}",
    "\nFirst few rows:",
)
print("\n".join(overview_lines))
italy_data.head(n=5)

In [None]:
# Per-column tally of missing entries in the Italy dataset.
print("Missing values in Italy dataset:")
italy_data.isna().sum()

In [None]:
# Display detailed info about the Italy dataset: column dtypes and non-null counts.
italy_data.info()

In [None]:
# Clean the Italy dataset by removing rows with missing values.
# Record the row count first so the number of dropped rows can be reported afterwards.
rows_before = italy_data.shape[0]
print("Before cleaning Italy dataset:")
print(f"Shape: {italy_data.shape}")

# Remove rows with any missing values (the frame is modified in place).
italy_data.dropna(axis=0, inplace=True)

# Rows removed = rows before - rows after; the remaining row count is NOT the number dropped.
rows_removed = rows_before - italy_data.shape[0]

print("After cleaning Italy dataset:")
print(f"Shape: {italy_data.shape}")
print(f"Removed {rows_removed} rows with missing values" if rows_removed > 0 else "No rows removed")

In [None]:
# Save the preprocessed Italy dataset to CSV, then print a summary of both datasets.
italy_output_dir = "Preprocessed Data"
italy_output_path = f"{italy_output_dir}/preprocessed_italy_air_quality.csv"

# Ensure the output folder exists so this cell does not depend on another cell having created it.
# `os` is imported with the other libraries in the import cell at the top of the notebook.
os.makedirs(italy_output_dir, exist_ok=True)

# index=False keeps the DataFrame's row index out of the written file.
italy_data.to_csv(italy_output_path, index=False)

print(f"Preprocessed Italy dataset saved to: {italy_output_path}")
print(f"Final dataset shape: {italy_data.shape}")

# Display summary of both preprocessed datasets
separator = "=" * 60
print("\n" + separator)
print("PREPROCESSING SUMMARY")
print(separator)
print(f"India dataset: {india_data.shape[0]} rows, {india_data.shape[1]} columns")
print(f"Italy dataset: {italy_data.shape[0]} rows, {italy_data.shape[1]} columns")
print(f"\nBoth datasets saved in: {italy_output_dir}/ folder")