In [2]:
import pandas as pd

# Read the air pollution CSV; the path is relative to the notebook's working directory
dataset_path = 'global air pollution dataset.csv'
df = pd.read_csv(dataset_path)

# Preview the leading rows to get a feel for the columns
print(df.head())


              Country              City  AQI Value AQI Category  CO AQI Value  \
0  Russian Federation        Praskoveya         51     Moderate             1   
1              Brazil  Presidente Dutra         41         Good             1   
2               Italy   Priolo Gargallo         66     Moderate             1   
3              Poland         Przasnysz         34         Good             1   
4              France          Punaauia         22         Good             0   

  CO AQI Category  Ozone AQI Value Ozone AQI Category  NO2 AQI Value  \
0            Good               36               Good              0   
1            Good                5               Good              1   
2            Good               39               Good              2   
3            Good               34               Good              0   
4            Good               22               Good              0   

  NO2 AQI Category  PM2.5 AQI Value PM2.5 AQI Category  
0             Good     

In [37]:
# AQI stands for Air Quality Index.
# Print the index range, column names, non-null counts and dtypes of the frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 23035 entries, 0 to 23462
Data columns (total 12 columns):
 #   Column              Non-Null Count  Dtype 
---  ------              --------------  ----- 
 0   Country             23035 non-null  object
 1   City                23035 non-null  object
 2   AQI Value           23035 non-null  int64 
 3   AQI Category        23035 non-null  object
 4   CO AQI Value        23035 non-null  int64 
 5   CO AQI Category     23035 non-null  object
 6   Ozone AQI Value     23035 non-null  int64 
 7   Ozone AQI Category  23035 non-null  object
 8   NO2 AQI Value       23035 non-null  int64 
 9   NO2 AQI Category    23035 non-null  object
 10  PM2.5 AQI Value     23035 non-null  int64 
 11  PM2.5 AQI Category  23035 non-null  object
dtypes: int64(5), object(7)
memory usage: 2.3+ MB


In [38]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns.
df.describe()

Unnamed: 0,AQI Value,CO AQI Value,Ozone AQI Value,NO2 AQI Value,PM2.5 AQI Value
count,23035.0,23035.0,23035.0,23035.0,23035.0
mean,72.344693,1.376254,35.233905,3.084741,68.883482
std,56.360992,1.844926,28.236613,5.281708,55.057396
min,6.0,0.0,0.0,0.0,0.0
25%,39.0,1.0,21.0,0.0,35.0
50%,55.0,1.0,31.0,1.0,54.0
75%,80.0,1.0,40.0,4.0,79.0
max,500.0,133.0,235.0,91.0,500.0


In [39]:
# Count the missing values in each column.
df.isnull().sum()


Country               0
City                  0
AQI Value             0
AQI Category          0
CO AQI Value          0
CO AQI Category       0
Ozone AQI Value       0
Ozone AQI Category    0
NO2 AQI Value         0
NO2 AQI Category      0
PM2.5 AQI Value       0
PM2.5 AQI Category    0
dtype: int64

In [40]:
# There are missing values in the 'Country' and 'City' columns.
# Categorical values cannot be filled with the mean or median.
# Filling those rows with the mode or 'Unknown' would affect model performance.
# In conclusion, we can remove the null rows: the 'Country' column has 428 null rows
# out of 23463 rows in total (428 / 23463 = 1.8%), so removing them is not going to
# affect model performance noticeably.
# NOTE(review): the recorded outputs in this notebook already show 23035 rows and no
# nulls at this point, which suggests the cells were run after the drop below —
# re-run from a fresh kernel to confirm the 428 figure.

# Print the number of rows before dropping
print("Rows before dropping:", len(df))


Rows before dropping: 23035


In [41]:
# Drop rows where 'Country' or 'City' is null.
# The filtered frame is assigned back to 'df' rather than using inplace=True:
# in-place mutation gives no performance benefit, cannot be chained, and makes it
# harder to see which cell changed the frame. Later cells keep using the name 'df'.

df = df.dropna(subset=['Country', 'City'])


In [42]:
# Report the row count once the null rows are gone, to compare with the count before
rows_remaining = len(df)
print(f"Rows after dropping: {rows_remaining}")


Rows after dropping: 23035


In [43]:
# Re-count the missing values per column to confirm the cleaning worked
missing_per_column = df.isnull().sum()
print("\nMissing values after cleaning:", missing_per_column, sep="\n")



Missing values after cleaning:
Country               0
City                  0
AQI Value             0
AQI Category          0
CO AQI Value          0
CO AQI Category       0
Ozone AQI Value       0
Ozone AQI Category    0
NO2 AQI Value         0
NO2 AQI Category      0
PM2.5 AQI Value       0
PM2.5 AQI Category    0
dtype: int64


In [44]:
# Remove data leakage.
# Columns to drop: the overall AQI value and category, plus the per-pollutant
# category labels (the numeric per-pollutant values are kept).
pollutant_category_columns = [
    f'{pollutant} AQI Category' for pollutant in ('CO', 'Ozone', 'NO2', 'PM2.5')
]
columns_to_drop = ['AQI Value', 'AQI Category', *pollutant_category_columns]

In [45]:
# Build a new DataFrame 'df_cleaned' without the leaky columns;
# drop() returns a copy, so 'df' itself is left untouched
df_cleaned = df.drop(columns=columns_to_drop)


In [46]:
# Show the leading rows of the cleaned frame as a sanity check
cleaned_preview = df_cleaned.head()
print("DataFrame after dropping leaky columns:", cleaned_preview, sep="\n")

DataFrame after dropping leaky columns:
              Country              City  CO AQI Value  Ozone AQI Value  \
0  Russian Federation        Praskoveya             1               36   
1              Brazil  Presidente Dutra             1                5   
2               Italy   Priolo Gargallo             1               39   
3              Poland         Przasnysz             1               34   
4              France          Punaauia             0               22   

   NO2 AQI Value  PM2.5 AQI Value  
0              0               51  
1              1               41  
2              2               66  
3              0               20  
4              0                6  


In [47]:
# Remove the 'City' column (the author's rationale: the curse of dimensionality),
# printing the frame's shape on either side of the drop

print(f"Shape before dropping City: {df_cleaned.shape}")
df_cleaned = df_cleaned.drop(columns='City')
print(f"Shape after dropping City: {df_cleaned.shape}")

Shape before dropping City: (23035, 6)
Shape after dropping City: (23035, 5)


In [48]:
# List the columns that remain after the drops
remaining_columns = df_cleaned.columns
print("\nNew columns:", remaining_columns)


New columns: Index(['Country', 'CO AQI Value', 'Ozone AQI Value', 'NO2 AQI Value',
       'PM2.5 AQI Value'],
      dtype='object')


In [49]:
# Shape of the cleaned frame before one-hot encoding, for comparison afterwards
print("Original shape:", df_cleaned.shape)

Original shape: (23035, 5)


In [50]:
# Encode 'Country' using one-hot encoding.
# If the 'City' column had been kept, it would have to be encoded as well.
# Apply one-hot encoding using pandas get_dummies.
# Only the column listed in `columns` ('Country') is converted; drop_first=True
# leaves out the indicator column for the first category.

print("\nStarting One-Hot Encoding...")
df_encoded = pd.get_dummies(df_cleaned, columns=['Country'], drop_first=True)


Starting One-Hot Encoding...


In [51]:

# Show the leading rows of the encoded frame
encoded_preview = df_encoded.head()
print("\nDataFrame after One-Hot Encoding:", encoded_preview, sep="\n")



DataFrame after One-Hot Encoding:
   CO AQI Value  Ozone AQI Value  NO2 AQI Value  PM2.5 AQI Value  \
0             1               36              0               51   
1             1                5              1               41   
2             1               39              2               66   
3             1               34              0               20   
4             0               22              0                6   

   Country_Albania  Country_Algeria  Country_Andorra  Country_Angola  \
0            False            False            False           False   
1            False            False            False           False   
2            False            False            False           False   
3            False            False            False           False   
4            False            False            False           False   

   Country_Argentina  Country_Armenia  ...  \
0              False            False  ...   
1              False           

In [52]:
# Check the new shape: one-hot encoding adds one indicator column per encoded
# country, so the column count grows considerably.
print("\nNew shape:", df_encoded.shape)



New shape: (23035, 178)


In [53]:
# Target (y): the value the models are trained to predict is 'PM2.5 AQI Value'

target_column = 'PM2.5 AQI Value'
y = df_encoded[target_column]


In [54]:
# Features (X): every column of the encoded frame except the target column

X = df_encoded.drop(columns='PM2.5 AQI Value')

In [55]:
# Sanity check: X and y must have the same number of rows
for description, data in (("X (Features)", X), ("y (Target)", y)):
    print(f"Shape of {description}:", data.shape)

Shape of X (Features): (23035, 177)
Shape of y (Target): (23035,)


In [56]:
# Peek at the leading rows of the features and of the target
print("\nFirst 5 rows of X:", X.head(), sep="\n")
print("\nFirst 5 values of y:", y.head(), sep="\n")


First 5 rows of X:
   CO AQI Value  Ozone AQI Value  NO2 AQI Value  Country_Albania  \
0             1               36              0            False   
1             1                5              1            False   
2             1               39              2            False   
3             1               34              0            False   
4             0               22              0            False   

   Country_Algeria  Country_Andorra  Country_Angola  Country_Argentina  \
0            False            False           False              False   
1            False            False           False              False   
2            False            False           False              False   
3            False            False           False              False   
4            False            False           False              False   

   Country_Armenia  Country_Aruba  ...  Country_United Republic of Tanzania  \
0            False          False  ...         

In [57]:
# Split the data into training and test sets (the models are fitted in later cells)

from sklearn.model_selection import train_test_split

# test_size=0.2 holds out 20% of the rows; random_state=42 makes the split repeatable
split_parts = train_test_split(X, y, test_size=0.2, random_state=42)
X_train, X_test, y_train, y_test = split_parts

In [58]:
# Report the shape of each piece produced by the train/test split
split_pieces = (
    ("X_train", X_train),
    ("X_test", X_test),
    ("y_train", y_train),
    ("y_test", y_test),
)
for piece_name, piece in split_pieces:
    print(f"Shape of {piece_name}:", piece.shape)

Shape of X_train: (18428, 177)
Shape of X_test: (4607, 177)
Shape of y_train: (18428,)
Shape of y_test: (4607,)


In [59]:
from sklearn.linear_model import LinearRegression

# Create an unfitted Linear Regression model with default settings.
# It is named 'lr_model' (lr = Linear Regression) and is fitted in a later cell.

lr_model = LinearRegression()

In [60]:
# Fit the linear regression on the training split only;
# the test split is kept aside for evaluation
print("Training the Linear Regression model...")
lr_model.fit(X=X_train, y=y_train)
print("Training complete!")

Training the Linear Regression model...
Training complete!


In [61]:
# Use the fitted model (lr_model) to predict the target for the held-out test features (X_test).
# The predictions are stored in 'y_pred_lr' and compared against y_test in later cells.
print("Making predictions on the test data...")
y_pred_lr = lr_model.predict(X_test)
print("Predictions are ready!")

Making predictions on the test data...
Predictions are ready!


In [62]:
# Put the first five predictions next to the first five true values for a quick eyeball check
first_predictions = y_pred_lr[:5]
first_actuals = y_test.values[:5]
print(f"\nFirst 5 Predictions: {first_predictions}")
print(f"First 5 Actual Values: {first_actuals}")



First 5 Predictions: [143.28743436 133.46314589  26.96426152 146.23900771  34.92835843]
First 5 Actual Values: [139 154  32  99  97]


In [63]:
import numpy as np
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error

# Mean Squared Error (MSE) first; the RMSE is simply its square root
mse_lr = mean_squared_error(y_test, y_pred_lr)
rmse_lr = np.sqrt(mse_lr)

# Mean absolute error and R-squared on the same test predictions
mae_lr = mean_absolute_error(y_test, y_pred_lr)
r2_lr = r2_score(y_test, y_pred_lr)


In [64]:
# Print the three linear-regression metrics, each rounded to four decimals
lr_report_lines = [
    "\n--- Linear Regression Model Evaluation ---",
    f"R-squared (R²) Score: {r2_lr:.4f}",
    f"Mean Absolute Error (MAE): {mae_lr:.4f}",
    f"Root Mean Squared Error (RMSE): {rmse_lr:.4f}",
]
for report_line in lr_report_lines:
    print(report_line)


--- Linear Regression Model Evaluation ---
R-squared (R²) Score: -64509351397954504.0000
Mean Absolute Error (MAE): 211544538.9356
Root Mean Squared Error (RMSE): 14358559862.1514


In [65]:
from sklearn.ensemble import RandomForestRegressor
import numpy as np

# Settings for the (still unfitted) Random Forest regressor:
#   n_estimators=100 -> the forest is built from 100 decision trees
#   random_state=42  -> the same result on every run
#   n_jobs=-1        -> use all available CPU cores to speed up training
rf_settings = {'n_estimators': 100, 'random_state': 42, 'n_jobs': -1}

rf_model = RandomForestRegressor(**rf_settings)

In [66]:
# Fit the Random Forest on the SAME training split that the linear regression used
print("Training the Random Forest Model...")

rf_model.fit(X=X_train, y=y_train)
print("Training Complete")

Training the Random Forest Model...
Training Complete


In [67]:
# Make predictions on the test features with the fitted Random Forest model.
# The predictions are stored in 'y_pred_rf'.

print("\nMaking predictions with the Random Forest model...")
y_pred_rf = rf_model.predict(X_test)



Making predictions with the Random Forest model...


In [68]:
# Evaluate the Random Forest with the same metrics used for the linear regression

# MSE first; the RMSE is its square root
mse_rf = mean_squared_error(y_test, y_pred_rf)
rmse_rf = np.sqrt(mse_rf)

mae_rf = mean_absolute_error(y_test, y_pred_rf)
r2_rf = r2_score(y_test, y_pred_rf)

In [69]:
# Side-by-side table: Random Forest metrics on the left, Linear Regression on the right

comparison_lines = [
    "\n--- Model Comparison ---",
    "                    Random Forest   |   Linear Regression",
    "----------------------------------------------------------",
    f"R-squared (R²) Score:   {r2_rf:.4f}          |   {r2_lr:.4f}",
    f"Mean Absolute Error (MAE):  {mae_rf:.4f}         |   {mae_lr:.4f}",
    f"Root Mean Squared Error (RMSE):{rmse_rf:.4f}         |   {rmse_lr:.4f}",
]
for comparison_line in comparison_lines:
    print(comparison_line)


--- Model Comparison ---
                    Random Forest   |   Linear Regression
----------------------------------------------------------
R-squared (R²) Score:   0.7152          |   -64509351397954504.0000
Mean Absolute Error (MAE):  16.0553         |   211544538.9356
Root Mean Squared Error (RMSE):30.1699         |   14358559862.1514


In [70]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

ModuleNotFoundError: No module named 'matplotlib'

In [None]:
# Get the feature importances from the trained Random Forest model
# (one score per feature column the model was fitted on).
importances = rf_model.feature_importances_

In [None]:
# Get the names of the features: the column labels of the training features.
feature_names = X_train.columns

In [None]:
# Create a DataFrame pairing each feature name with its importance score,
# so they can be viewed (and sorted) together.
feature_importance_df = pd.DataFrame({'Feature': feature_names, 'Importance': importances})

In [None]:
# Sort the DataFrame by importance, highest first, so the most important
# features are at the top.
feature_importance_df = feature_importance_df.sort_values(by='Importance', ascending=False)

In [None]:
# Horizontal bar chart of the ten highest-ranked features
top_features = feature_importance_df.head(10)

plt.figure(figsize=(10, 6))  # figure size in inches
sns.barplot(data=top_features, x='Importance', y='Feature')
plt.title('Top 10 Most Important Features for Predicting PM2.5 AQI')
plt.xlabel('Importance Score')
plt.ylabel('Feature')
# Dashed, semi-transparent vertical grid lines make the bar lengths easier to read
plt.grid(linestyle='--', alpha=0.7, axis='x')
plt.show()

In [None]:
# Same top ten, printed as a table
top_ten = feature_importance_df.head(10)
print("\nTop 10 most important features:", top_ten, sep="\n")

In [1]:
import joblib
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error
import numpy as np

# Load the raw data and discard rows with a missing country or city
print("Loading and preprocessing data...")
df = pd.read_csv('global air pollution dataset.csv').dropna(subset=['Country', 'City'])
print(f"Data shape after cleaning: {df.shape}")

# Columns removed to avoid data leakage: the overall AQI value and category,
# plus the per-pollutant category labels
columns_to_drop = [
    'AQI Value',
    'AQI Category',
    'CO AQI Category',
    'Ozone AQI Category',
    'NO2 AQI Category',
    'PM2.5 AQI Category'
]

# Drop the leaky columns, then 'City' (to avoid the curse of dimensionality)
df_cleaned = df.drop(columns=columns_to_drop).drop(columns=['City'])

# One-hot encode 'Country'; drop_first=True omits the first category's indicator
df_encoded = pd.get_dummies(df_cleaned, columns=['Country'], drop_first=True)
print(f"Shape after encoding: {df_encoded.shape}")

# Target is the PM2.5 AQI value; every other column is a feature
target_name = 'PM2.5 AQI Value'
y = df_encoded[target_name]
X = df_encoded.drop(columns=[target_name])

# 80/20 train/test split with a fixed seed so the split is repeatable
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
for split_label, split_features in (("Training", X_train), ("Test", X_test)):
    print(f"{split_label} set shape: {split_features.shape}")

print("Training models...")

Loading and preprocessing data...
Data shape after cleaning: (23035, 12)
Shape after encoding: (23035, 178)
Training set shape: (18428, 177)
Test set shape: (4607, 177)
Training models...


In [2]:
def _regression_metrics(y_true, y_pred):
    """Return (R², MAE, MSE, RMSE) for one set of predictions."""
    mse = mean_squared_error(y_true, y_pred)
    return (
        r2_score(y_true, y_pred),
        mean_absolute_error(y_true, y_pred),
        mse,
        np.sqrt(mse),
    )


# --- Linear Regression: fit on the training split, score on the test split ---
# NOTE(review): the recorded run of this cell reports R² of about -6.5e16 and an RMSE
# of about 1.4e10 for this model, so its test predictions are not usable as-is.
# Investigate (possibly an ill-conditioned one-hot design — unconfirmed) before
# relying on lr_model.
print("Training Linear Regression model...")
lr_model = LinearRegression().fit(X_train, y_train)
y_pred_lr = lr_model.predict(X_test)
r2_lr, mae_lr, mse_lr, rmse_lr = _regression_metrics(y_test, y_pred_lr)

print("Linear Regression Results:")
print(f"R² Score: {r2_lr:.4f}")
print(f"MAE: {mae_lr:.4f}")
print(f"RMSE: {rmse_lr:.4f}")

# --- Random Forest: 100 trees, fixed seed, all CPU cores ---
print("\nTraining Random Forest model...")
rf_model = RandomForestRegressor(n_estimators=100, random_state=42, n_jobs=-1).fit(X_train, y_train)
y_pred_rf = rf_model.predict(X_test)
r2_rf, mae_rf, mse_rf, rmse_rf = _regression_metrics(y_test, y_pred_rf)

print("Random Forest Results:")
print(f"R² Score: {r2_rf:.4f}")
print(f"MAE: {mae_rf:.4f}")
print(f"RMSE: {rmse_rf:.4f}")

# --- Side-by-side comparison: Random Forest on the left, Linear Regression on the right ---
comparison_lines = [
    "\n--- Model Comparison ---",
    "                    Random Forest   |   Linear Regression",
    "----------------------------------------------------------",
    f"R-squared (R²) Score:   {r2_rf:.4f}          |   {r2_lr:.4f}",
    f"Mean Absolute Error (MAE):  {mae_rf:.4f}         |   {mae_lr:.4f}",
    f"Root Mean Squared Error (RMSE):{rmse_rf:.4f}         |   {rmse_lr:.4f}",
]
for comparison_line in comparison_lines:
    print(comparison_line)

Training Linear Regression model...
Linear Regression Results:
R² Score: -64509351397954504.0000
MAE: 211544538.9356
RMSE: 14358559862.1514

Training Random Forest model...
Random Forest Results:
R² Score: 0.7152
MAE: 16.0553
RMSE: 30.1699

--- Model Comparison ---
                    Random Forest   |   Linear Regression
----------------------------------------------------------
R-squared (R²) Score:   0.7152          |   -64509351397954504.0000
Mean Absolute Error (MAE):  16.0553         |   211544538.9356
Root Mean Squared Error (RMSE):30.1699         |   14358559862.1514


In [3]:
# Persist both fitted models, plus the training column names, with joblib
print("\nSaving models to joblib files...")

# The column names (in training order) are stored alongside the models for future use
feature_names = X_train.columns.tolist()

# (label used in the confirmation message, object to persist, output file name)
artifacts_to_save = [
    ("Linear Regression model", lr_model, 'linear_regression_air_pollution_model.joblib'),
    ("Random Forest model", rf_model, 'random_forest_air_pollution_model.joblib'),
    ("Feature names", feature_names, 'feature_names.joblib'),
]
for artifact_label, artifact, artifact_file in artifacts_to_save:
    joblib.dump(artifact, artifact_file)
    print(f"✓ {artifact_label} saved as '{artifact_file}'")

print(f"\nTotal features used: {len(feature_names)}")
print("Models successfully saved and ready for deployment!")


Saving models to joblib files...
✓ Linear Regression model saved as 'linear_regression_air_pollution_model.joblib'
✓ Random Forest model saved as 'random_forest_air_pollution_model.joblib'
✓ Feature names saved as 'feature_names.joblib'

Total features used: 177
Models successfully saved and ready for deployment!


In [4]:
# Demonstration: how to load the saved models and score one row with them
print("=== Loading Saved Models ===")

# Load the saved models and the saved training column names.
# NOTE: joblib files are pickle-based, so loading one can execute arbitrary code —
# only load artifacts that you created yourself or otherwise trust.
loaded_lr_model = joblib.load('linear_regression_air_pollution_model.joblib')
loaded_rf_model = joblib.load('random_forest_air_pollution_model.joblib')
loaded_feature_names = joblib.load('feature_names.joblib')

print("✓ Linear Regression model loaded successfully")
print("✓ Random Forest model loaded successfully")
print("✓ Feature names loaded successfully")

print("\nModel details:")
print(f"- Number of features: {len(loaded_feature_names)}")
print(f"- Random Forest estimators: {loaded_rf_model.n_estimators}")

# Example prediction using a sample from the test set
print("\n=== Example Prediction ===")
sample_index = 0
# Align the input to the exact column set and order saved with the models. Any new
# input (not just this test row) must go through the same step: columns missing from
# the input are filled with 0 (an unset one-hot indicator) and columns the models
# were not trained on are dropped. For this test row the columns already match, so
# the data is unchanged.
sample_features = (
    X_test.iloc[sample_index:sample_index + 1]
    .reindex(columns=loaded_feature_names, fill_value=0)
)
actual_value = y_test.iloc[sample_index]

# Make predictions with both models
lr_prediction = loaded_lr_model.predict(sample_features)[0]
rf_prediction = loaded_rf_model.predict(sample_features)[0]

print(f"Actual PM2.5 AQI Value: {actual_value}")
print(f"Linear Regression Prediction: {lr_prediction:.2f}")
print(f"Random Forest Prediction: {rf_prediction:.2f}")
print(f"Random Forest Error: {abs(rf_prediction - actual_value):.2f}")

# NOTE(review): a single matching row does not validate a model — check the full
# test-set metrics for each model before treating it as production-ready.
print("\n🎉 Models are ready for production use!")

=== Loading Saved Models ===
✓ Linear Regression model loaded successfully
✓ Random Forest model loaded successfully
✓ Feature names loaded successfully

Model details:
- Number of features: 177
- Random Forest estimators: 100

=== Example Prediction ===
Actual PM2.5 AQI Value: 139
Linear Regression Prediction: 143.29
Random Forest Prediction: 144.63
Random Forest Error: 5.63

🎉 Models are ready for production use!
