In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import SVR
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

In [2]:
# Read the raw survey responses from the Excel export.
# The path points at a Kaggle input directory; adjust it when running elsewhere.
survey_path = '/kaggle/input/air-exposure/Air Exposure (Individual Response).xlsx'
df = pd.read_excel(survey_path)

# Preview the first rows of the raw frame.
df.head()

Unnamed: 0,Timestamp,Score,1. Gender:,2. Age:,3. Occupation:,4. Do you smoke?,"5. Do you have any lung diseases? (e.g., Asthma, Tuberculosis)",If Yes (Please specify the disease):,6. Which area do you live in inside Dhaka City?:,7. How many hours do you stay at home per day?,...,11. Does that place have air conditioning?,12. How many hours do you travel per day?,14. What is your primary mode of transport?,15. During what time do you usually travel?,16. How many days per week do you travel for work/school?,17. How do you perceive the air quality in your living area?,18. How do you perceive the air quality in your workplace?,19. Have you experienced any respiratory issues due to air pollution?,If Yes please specify:,20. Do you believe your profession increases your exposure to air pollution?
0,2025-02-28 15:56:59.729,0,Male,18 - 25,Student,No,No,,Shahinbag,6 -12 hours,...,Partially Air-conditioned,1-3 hours,Private Car (AC),"Morning, Noon, Afternoon",5,Poor,Poor,Yes,"Stuffy nose, dust allergy",No
1,2025-03-01 12:18:56.963,0,Female,18 - 25,Student,No,No,,Bashundhara,More than 12 hours,...,Partially Air-conditioned,Less than 1 hour,Rickshaw/ CNG,"Morning, Noon, Afternoon",4,Hazardous,Moderate,No,,No
2,2025-03-01 12:45:52.269,0,Male,18 - 25,Student,Yes,No,,Bashundhara,More than 12 hours,...,Fully Air-conditioned,1-3 hours,Walking/ Cycling,"Morning, Evening",6,Poor,Moderate,No,,Unsure
3,2025-03-01 12:51:02.368,0,Male,18 - 25,Student,No,No,,Shahbagh,6 -12 hours,...,Fully Air-conditioned,1-3 hours,Public Bus (Non-AC),Afternoon,5,Poor,Poor,No,,No
4,2025-03-01 12:53:03.665,0,Male,18 - 25,Student,No,No,,Bashundhara,6 -12 hours,...,Fully Air-conditioned,Less than 1 hour,Walking/ Cycling,Morning,5,Hazardous,Moderate,No,,No


In [3]:
# List the raw column labels exactly as loaded; several carry trailing spaces,
# which the column selection in the next cell has to reproduce verbatim.
df.columns

Index(['Timestamp', 'Score', '1. Gender:  ', '2. Age:  ', '3. Occupation: ',
       '4. Do you smoke?',
       '5. Do you have any lung diseases? (e.g., Asthma, Tuberculosis)  ',
       'If Yes (Please specify the disease):',
       '6. Which area do you live in inside Dhaka City?:',
       '7. How many hours do you stay at home per day?  ',
       '8. What type of house do you live in?  ',
       '9. Specify the location or address you spend most time at- NOT your home (e.g. Workplace location, University name, Restaurant/Park locations, Outing places etc)',
       '10. How many hours do you spend there per day?',
       '11. Does that place have air conditioning?  ',
       '12. How many hours do you travel per day? ',
       '14. What is your primary mode of transport?  ',
       '15. During what time do you usually travel?  ',
       '16. How many days per week do you travel for work/school?  ',
       '17. How do you perceive the air quality in your living area?  ',
       '18. Ho

In [4]:
# Survey questions used for the exposure calculation, keyed by the exact raw
# column label (trailing spaces included) and mapped to a short working name.
column_names = {
    '6. Which area do you live in inside Dhaka City?:': 'Home_Location',
    '7. How many hours do you stay at home per day?  ': 'T_home',
    '8. What type of house do you live in?  ': 'F_home',
    '9. Specify the location or address you spend most time at- NOT your home (e.g. Workplace location, University name, Restaurant/Park locations, Outing places etc)': 'Work_Location',
    '10. How many hours do you spend there per day?': 'T_work',
    '11. Does that place have air conditioning?  ': 'F_work',
    '12. How many hours do you travel per day? ': 'T_travel',
    '14. What is your primary mode of transport?  ': 'Transport_Mode',
}
columns_needed = list(column_names)

# Keep only those questions and switch to the short names in one step.
df = df[columns_needed].rename(columns=column_names)

df

Unnamed: 0,Home_Location,T_home,F_home,Work_Location,T_work,F_work,T_travel,Transport_Mode
0,Shahinbag,6 -12 hours,Partially Open (Do use AC when needed),Bashundhara,4 - 8 hours,Partially Air-conditioned,1-3 hours,Private Car (AC)
1,Bashundhara,More than 12 hours,Open House (No AC),Bashundhara,4 - 8 hours,Partially Air-conditioned,Less than 1 hour,Rickshaw/ CNG
2,Bashundhara,More than 12 hours,Open House (No AC),Bashundhara,4 - 8 hours,Fully Air-conditioned,1-3 hours,Walking/ Cycling
3,Shahbagh,6 -12 hours,Open House (No AC),Shahbagh,4 - 8 hours,Fully Air-conditioned,1-3 hours,Public Bus (Non-AC)
4,Bashundhara,6 -12 hours,Open House (No AC),Bashundhara,4 - 8 hours,Fully Air-conditioned,Less than 1 hour,Walking/ Cycling
...,...,...,...,...,...,...,...,...
141,Uttara,6 -12 hours,Partially Open (Do use AC when needed),Uttara,4 - 8 hours,Partially Air-conditioned,1-3 hours,Public Bus (Non-AC)
142,Dakshinkhan (Airport),6 -12 hours,Partially Open (Do use AC when needed),Uttara,4 - 8 hours,Partially Air-conditioned,1-3 hours,Public Bus (Non-AC)
143,Banani,Almost 24 hours,Partially Open (Do use AC when needed),Uttara,4 - 8 hours,Partially Air-conditioned,1-3 hours,Private Car (AC)
144,Badda,Less than 6 hours,Partially Open (Do use AC when needed),Uttara,Less than 4 hours,Fully Air-conditioned,1-3 hours,Motorcycle


In [5]:
# Show the distinct answers per column to spot typos and stray whitespace.
for column, answers in df.items():
    print(f"Unique values in {column}:")
    print(answers.unique())
    print("\n")

Unique values in Home_Location:
['Shahinbag' 'Bashundhara' 'Shahbagh' 'Basundhara' 'Uttara' 'Eskaton'
 'Bashundhara ' 'Mirpur Cantonment' 'Adabor' 'Savar' 'Bashudhara '
 'Farmgate ' 'Mohammadpur' 'Paltan' 'Jatrabari ' 'Kafrul' 'Kawlar Bazar'
 'Khilgaon' 'Mirpur ' 'Mogbazar' 'Lalmatia' 'Dhanmondi' 'Mirpur'
 'Gulshan-2' 'Motijheel' 'Rampura' 'Demra' 'Gulshan-1' 'Puran Dhaka'
 'Ramna' 'Shahinbhag' 'Baridhara' 'Banasree' 'New Market' 'Agargaon'
 'Jatrabari' 'Abdullahpur' 'Banani' 'Badda' 'Khilkhet' 'Wari'
 'Dakshinkhan (Airport)']


Unique values in T_home:
['6 -12 hours' 'More than 12 hours' 'Less than 6 hours' 'Almost 24 hours']


Unique values in F_home:
['Partially Open (Do use AC when needed)' 'Open House (No AC)'
 'Fully Closed House (Usage of AC all the time)']


Unique values in Work_Location:
['Bashundhara' 'Shahbagh' 'Basundhara' 'Paltan' 'Mirpur' 'Adabor'
 'Bashudhara ' 'Savar' 'Farmgate ' 'Mohammadpur' 'Motijheel'
 'Kawlar Bazar' 'Uttara' 'Mogbazar' 'Agargaon' 'Lalmatia' 'Dhanm

In [6]:
# Lookup table: location name -> AQI value used for that location.
# NOTE(review): the source and date of these readings are not recorded in the
# notebook — TODO document their provenance.
location_aqi = {
    'Agargaon': 218,
    'Adabor': 240,
    'Baridhara': 187,
    'Banani': 220,
    'Banasree': 117,
    'Bashundhara': 186,
    'Baily Road': 240,
    'Bangshal': 233,
    'Dhanmondi': 143,
    'Demra': 277,
    'Dhamrai': 287,
    'Dakshinkhan (Airport)': 298,
    'Elephant Road': 289,
    'Eskaton': 301,
    'Gulshan-1': 283,
    'Gulshan-2': 322,
    'Gabtali': 311,
    'Gulistan': 391,
    'Hatirjheel': 175,
    'Hazaribagh': 351,
    'Jatrabari': 309,
    'Kadamtali': 322,
    'Khilkhet': 289,
    'Katabon': 366,
    'Kalabagan': 289,
    'Khilgaon': 221,
    'Kafrul': 278,
    'Kawlar Bazar': 312,
    'Keraniganj': 399,
    'Lalmatia': 286,
    'Lalbagh': 265,
    'Mirpur': 242,
    'Mirpur Cantonment': 233,
    'Mohammadpur': 230,
    'Mohakhali': 271,
    'Motijheel': 155,
    'Middle Badda': 276,
    'North Badda': 278,
    'Niketan': 260,
    'Nikunja': 254,
    'New Market': 380,
    'Puran Dhaka': 389,
    'Paltan': 386,
    'Paribagh': 340,
    'Polashi': 360,
    'Pallabi': 256,
    'Rayerbazar': 381,
    'Rampura': 270,
    'Ramna': 155,
    'Shyamoli': 250,
    'Shahinbhag': 230,
    'Shyampur': 256,
    'Savar': 289,
    'Shahbagh': 250,
    'Tejgaon': 240,
    'Turag': 398,
    'Uttara': 183,
    'Wari': 260,
}

In [7]:
# Work on an independent copy so the edits below never touch a slice of `df`.
df_clean = df.copy()

# Known misspellings -> the spelling used as a key in `location_aqi`.
spelling_fixes = {
    'Bashudhara': 'Bashundhara',
    'Basundhara': 'Bashundhara',
    'Shahinbag': 'Shahinbhag',
}

# Trim surrounding whitespace, then apply the same corrections to both columns.
for location_col in ('Home_Location', 'Work_Location'):
    df_clean[location_col] = df_clean[location_col].str.strip().replace(spelling_fixes)

# Report the locations that still have no entry in the AQI lookup.
home_has_aqi = df_clean['Home_Location'].isin(location_aqi.keys())
work_has_aqi = df_clean['Work_Location'].isin(location_aqi.keys())
unmapped_home = df_clean.loc[~home_has_aqi, 'Home_Location'].unique()
unmapped_work = df_clean.loc[~work_has_aqi, 'Work_Location'].unique()
print("Remaining Unmapped Home Locations:", unmapped_home)
print("Remaining Unmapped Work Locations:", unmapped_work)

Remaining Unmapped Home Locations: ['Farmgate' 'Mogbazar' 'Abdullahpur' 'Badda']
Remaining Unmapped Work Locations: ['Farmgate' 'Mogbazar']


In [8]:
# Keep only respondents whose home AND work locations both have an AQI entry.
home_is_mapped = df_clean['Home_Location'].isin(location_aqi.keys())
work_is_mapped = df_clean['Work_Location'].isin(location_aqi.keys())
df_filtered = df_clean.loc[home_is_mapped & work_is_mapped].copy()

print(f"Rows after removing unmapped locations: {len(df_filtered)}")

Rows after removing unmapped locations: 139


In [9]:
# Attach the AQI lookup value for each respondent's home and work location.
for aqi_col, location_col in (('AQI_home', 'Home_Location'),
                              ('AQI_work', 'Work_Location')):
    df_filtered[aqi_col] = df_filtered[location_col].map(location_aqi)

# From here on `df` refers to the filtered, AQI-annotated data.
df = df_filtered.copy()

# Preview the result.
df.head()

Unnamed: 0,Home_Location,T_home,F_home,Work_Location,T_work,F_work,T_travel,Transport_Mode,AQI_home,AQI_work
0,Shahinbhag,6 -12 hours,Partially Open (Do use AC when needed),Bashundhara,4 - 8 hours,Partially Air-conditioned,1-3 hours,Private Car (AC),230,186
1,Bashundhara,More than 12 hours,Open House (No AC),Bashundhara,4 - 8 hours,Partially Air-conditioned,Less than 1 hour,Rickshaw/ CNG,186,186
2,Bashundhara,More than 12 hours,Open House (No AC),Bashundhara,4 - 8 hours,Fully Air-conditioned,1-3 hours,Walking/ Cycling,186,186
3,Shahbagh,6 -12 hours,Open House (No AC),Shahbagh,4 - 8 hours,Fully Air-conditioned,1-3 hours,Public Bus (Non-AC),250,250
4,Bashundhara,6 -12 hours,Open House (No AC),Bashundhara,4 - 8 hours,Fully Air-conditioned,Less than 1 hour,Walking/ Cycling,186,186


In [10]:
# Sanity check: count rows where the AQI lookup produced no value.
aqi_null_counts = df[['AQI_home', 'AQI_work']].isna().sum()
print("Missing AQI values:")
print(aqi_null_counts)

Missing AQI values:
AQI_home    0
AQI_work    0
dtype: int64


In [11]:
# The location names have been replaced by AQI_home / AQI_work, so drop them.
df = df.drop(columns=['Home_Location', 'Work_Location'])
df.head()

Unnamed: 0,T_home,F_home,T_work,F_work,T_travel,Transport_Mode,AQI_home,AQI_work
0,6 -12 hours,Partially Open (Do use AC when needed),4 - 8 hours,Partially Air-conditioned,1-3 hours,Private Car (AC),230,186
1,More than 12 hours,Open House (No AC),4 - 8 hours,Partially Air-conditioned,Less than 1 hour,Rickshaw/ CNG,186,186
2,More than 12 hours,Open House (No AC),4 - 8 hours,Fully Air-conditioned,1-3 hours,Walking/ Cycling,186,186
3,6 -12 hours,Open House (No AC),4 - 8 hours,Fully Air-conditioned,1-3 hours,Public Bus (Non-AC),250,250
4,6 -12 hours,Open House (No AC),4 - 8 hours,Fully Air-conditioned,Less than 1 hour,Walking/ Cycling,186,186


In [12]:
# Representative number of hours for each answer bracket in the survey.
# One shared table serves all three time questions; each label appears once
# (the original literal repeated 'More than 12 hours' three times, and in a
# dict literal a repeated key silently overwrites the earlier entry).
hours_mapping = {
    # T_home
    'Less than 6 hours': 3,
    '6 -12 hours': 9,
    'More than 12 hours': 18,  # also a valid answer for T_work and T_travel
    'Almost 24 hours': 24,
    # T_work
    'Less than 4 hours': 2,
    '4 - 8 hours': 6,
    '8 -12 hours': 10,
    # T_travel
    'Less than 1 hour': 0.5,
    '1-3 hours': 2,
    '4 - 6 hours': 5,
    '7 - 12 hours': 9.5,
}

# Convert each time column to numbers. The unmapped check must look at the
# RAW labels: once the column is overwritten, an unmapped answer is just NaN
# and the label that caused it can no longer be reported.
for time_col in ('T_home', 'T_work', 'T_travel'):
    raw_labels = df[time_col]
    mapped_hours = raw_labels.map(hours_mapping)
    print(f"Unmapped {time_col}:", raw_labels[mapped_hours.isna()].unique())
    df[time_col] = mapped_hours

df

Unmapped T_home: []
Unmapped T_work: []
Unmapped T_travel: []


Unnamed: 0,T_home,F_home,T_work,F_work,T_travel,Transport_Mode,AQI_home,AQI_work
0,9.0,Partially Open (Do use AC when needed),6.0,Partially Air-conditioned,2.0,Private Car (AC),230,186
1,18.0,Open House (No AC),6.0,Partially Air-conditioned,0.5,Rickshaw/ CNG,186,186
2,18.0,Open House (No AC),6.0,Fully Air-conditioned,2.0,Walking/ Cycling,186,186
3,9.0,Open House (No AC),6.0,Fully Air-conditioned,2.0,Public Bus (Non-AC),250,250
4,9.0,Open House (No AC),6.0,Fully Air-conditioned,0.5,Walking/ Cycling,186,186
...,...,...,...,...,...,...,...,...
140,9.0,Partially Open (Do use AC when needed),6.0,Fully Air-conditioned,2.0,Motorcycle,183,186
141,9.0,Partially Open (Do use AC when needed),6.0,Partially Air-conditioned,2.0,Public Bus (Non-AC),183,183
142,9.0,Partially Open (Do use AC when needed),6.0,Partially Air-conditioned,2.0,Public Bus (Non-AC),298,183
143,24.0,Partially Open (Do use AC when needed),6.0,Partially Air-conditioned,2.0,Private Car (AC),220,183


In [13]:
# Exposure factors for F_home (house type): share of outdoor air assumed to
# reach the respondent indoors, from 0.1 (always on AC) to 1.0 (open house).
f_home_mapping = {
    'Fully Closed House (Usage of AC all the time)': 0.1,
    'Partially Open (Do use AC when needed)': 0.5,
    'Open House (No AC)': 1.0
}

# Exposure factors for F_work (air conditioning at the second location).
f_work_mapping = {
    'Fully Air-conditioned': 0.1,
    'Partially Air-conditioned': 0.5,
    'Fully Open place': 1.0
}

# Map the labels to factors. The unmapped check reads the RAW labels before
# the column is overwritten; checking the mapped column afterwards could only
# ever show NaN, not the answer that failed to map.
for factor_col, factor_map in (('F_home', f_home_mapping),
                               ('F_work', f_work_mapping)):
    raw_labels = df[factor_col]
    factors = raw_labels.map(factor_map)
    print(f"Unmapped {factor_col}:", raw_labels[factors.isna()].unique())
    df[factor_col] = factors

Unmapped F_home: []
Unmapped F_work: []


In [14]:
# Transport exposure factors: 0.1 for an air-conditioned car up to 1.0 for
# walking or cycling.
transport_factor = {
    'Private Car (AC)': 0.1,
    'Public Bus (AC)': 0.2,
    'Metro Rail': 0.2,
    'Bus Rapid Transit': 0.3,
    'Private Car (No AC)': 0.5,
    'Rickshaw/ CNG': 0.7,
    'Motorcycle': 0.8,
    'Public Bus (Non-AC)': 0.9,
    'Walking/ Cycling': 1.0
}

# Normalise the labels: trim stray whitespace and fold the free-text answer
# 'Walking/rickshaw' into the 'Walking/ Cycling' category.
df['Transport_Mode'] = df['Transport_Mode'].str.strip().replace('Walking/rickshaw', 'Walking/ Cycling')

# Look up the factor for every respondent.
df['Transport_factor'] = df['Transport_Mode'].map(transport_factor)

# Travel AQI = mean of the home and work AQI, scaled by the transport factor.
mean_endpoint_aqi = (df['AQI_home'] + df['AQI_work']) / 2
df['AQI_travel'] = mean_endpoint_aqi * df['Transport_factor']

# Report any transport mode that has no factor defined.
factor_missing = df['Transport_factor'].isna()
print("Unmapped Transport_Mode:", df.loc[factor_missing, 'Transport_Mode'].unique())

# Both helper columns are folded into AQI_travel and no longer needed.
df = df.drop(columns=['Transport_Mode', 'Transport_factor'])

df

Unmapped Transport_Mode: []


Unnamed: 0,T_home,F_home,T_work,F_work,T_travel,AQI_home,AQI_work,AQI_travel
0,9.0,0.5,6.0,0.5,2.0,230,186,20.80
1,18.0,1.0,6.0,0.5,0.5,186,186,130.20
2,18.0,1.0,6.0,0.1,2.0,186,186,186.00
3,9.0,1.0,6.0,0.1,2.0,250,250,225.00
4,9.0,1.0,6.0,0.1,0.5,186,186,186.00
...,...,...,...,...,...,...,...,...
140,9.0,0.5,6.0,0.1,2.0,183,186,147.60
141,9.0,0.5,6.0,0.5,2.0,183,183,164.70
142,9.0,0.5,6.0,0.5,2.0,298,183,216.45
143,24.0,0.5,6.0,0.5,2.0,220,183,20.15


In [15]:
# Confirm the travel AQI was computed for every row.
print("Missing AQI_travel:", df['AQI_travel'].isna().sum())

# Time-weighted exposure per setting: hours x exposure factor x AQI.
# AQI_travel already includes the transport factor, so it has no extra factor.
home_exposure = df['T_home'] * df['F_home'] * df['AQI_home']
work_exposure = df['T_work'] * df['F_work'] * df['AQI_work']
travel_exposure = df['T_travel'] * df['AQI_travel']
total_hours = df['T_home'] + df['T_work'] + df['T_travel']

# AQI_avg is the exposure total divided by the hours accounted for.
df['AQI_avg'] = ((home_exposure + work_exposure + travel_exposure) / total_hours).round(2)

df

Missing AQI_travel: 0


Unnamed: 0,T_home,F_home,T_work,F_work,T_travel,AQI_home,AQI_work,AQI_travel,AQI_avg
0,9.0,0.5,6.0,0.5,2.0,230,186,20.80,96.15
1,18.0,1.0,6.0,0.5,0.5,186,186,130.20,162.09
2,18.0,1.0,6.0,0.1,2.0,186,186,186.00,147.37
3,9.0,1.0,6.0,0.1,2.0,250,250,225.00,167.65
4,9.0,1.0,6.0,0.1,0.5,186,186,186.00,121.20
...,...,...,...,...,...,...,...,...,...
140,9.0,0.5,6.0,0.1,2.0,183,186,147.60,72.37
141,9.0,0.5,6.0,0.5,2.0,183,183,164.70,100.11
142,9.0,0.5,6.0,0.5,2.0,298,183,216.45,136.64
143,24.0,0.5,6.0,0.5,2.0,220,183,20.15,100.92


In [16]:
# Predictors (X) are the inputs of the AQI_avg formula; the target (y) is AQI_avg.
feature_columns = [
    'T_home', 'F_home',
    'T_work', 'F_work',
    'T_travel',
    'AQI_home', 'AQI_work', 'AQI_travel',
]
X = df[feature_columns]
y = df['AQI_avg']

# Hold out 20% of the rows for testing; the seed is fixed for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.2,
    random_state=42,
)

In [17]:
# Candidate regressors, keyed by the label shown in the results table.
models = {
    'Linear Regression': LinearRegression(),
    'Decision Tree': DecisionTreeRegressor(random_state=42),
    'Random Forest': RandomForestRegressor(random_state=42),
    'Support Vector Machine': SVR()
}

# One list per results column; each model appends one value to every list.
performance = {column: [] for column in ('Model', 'MSE', 'MAE', 'R2 Score')}

# Fit each model on the training split and score it on the held-out split.
for model_name, model in models.items():
    # fit() returns the estimator itself, so prediction can be chained.
    y_pred = model.fit(X_train, y_train).predict(X_test)

    mse = mean_squared_error(y_test, y_pred)
    mae = mean_absolute_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)

    for column, value in zip(performance, (model_name, mse, mae, r2)):
        performance[column].append(value)

# Tabulate the metrics, one row per model.
performance_df = pd.DataFrame(performance)
print(performance_df)

                    Model          MSE        MAE  R2 Score
0       Linear Regression   116.473561   7.827668  0.964207
1           Decision Tree   238.992271  10.532857  0.926556
2           Random Forest   192.445784  10.006700  0.940860
3  Support Vector Machine  2943.322674  40.669577  0.095502
