In [None]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns

In [None]:
# Path to the master CSV that feeds the whole notebook
DATA_PATH = '/content/drive/MyDrive/DML/AQI_for Public Hel/master.csv'

# Read the dataset into `df` and preview its first rows
df = pd.read_csv(DATA_PATH)
display(df.head())

In [None]:
# Dataset dimensions as a (rows, columns) tuple
df.shape

In [None]:
# Print a concise summary of the DataFrame (per-column dtype and non-null counts)
df.info()

In [None]:
# Count the missing values in each column
df.isnull().sum()

In [None]:
# Data type of each column
df.dtypes

In [None]:
# Summary statistics for every column; include='all' also covers non-numeric columns
df.describe(include='all')

In [None]:
# --- AQI Distribution Visualization ---
# Histogram of the 'aqi' column (30 bins) with a KDE curve overlaid
fig, ax = plt.subplots(figsize=(8, 5))
sns.histplot(df['aqi'], bins=30, kde=True, color="skyblue", ax=ax)
ax.set_title("Distribution of AQI in Dataset", fontsize=14)
ax.set_xlabel("AQI Value")
ax.set_ylabel("Frequency")
plt.show()

In [None]:
from sklearn.preprocessing import LabelEncoder

In [None]:
# Use one encoder per column. Refitting a single LabelEncoder on 'division'
# would overwrite the classes it learned for 'district', so district codes
# could no longer be mapped back to names (inverse_transform) or applied
# consistently to new district values.
district_encoder = LabelEncoder()
division_encoder = LabelEncoder()

# Integer-encode the two categorical location columns
df['district_encoded'] = district_encoder.fit_transform(df['district'])
df['division_encoded'] = division_encoder.fit_transform(df['division'])

# Backward compatibility: the previous version of this cell left
# `label_encoder` fitted on 'division', so keep that name pointing there.
label_encoder = division_encoder

# Display the first few rows with the new encoded columns
display(df.head())

In [None]:
# Distinct integer codes present in each encoded column
unique_districts_encoded = df['district_encoded'].unique()
unique_divisions_encoded = df['division_encoded'].unique()

# Print each heading followed by its array of codes on the next line
print("Unique Encoded Districts:", unique_districts_encoded, sep="\n")
print("\nUnique Encoded Divisions:", unique_divisions_encoded, sep="\n")

In [None]:
# Lookup table: one row per district name with its integer code, sorted by code
district_mapping = (
    df[['district', 'district_encoded']]
    .drop_duplicates()
    .sort_values('district_encoded')
    .reset_index(drop=True)
)

# Same lookup table for divisions
division_mapping = (
    df[['division', 'division_encoded']]
    .drop_duplicates()
    .sort_values('division_encoded')
    .reset_index(drop=True)
)

# The district table is printed in full via to_string(); the division table
# is shown with the rich display
print("District Encoding Mapping:")
print(district_mapping.to_string())

print("\nDivision Encoding Mapping:")
display(division_mapping)

In [None]:
# Number of rows recorded for each division
division_counts = df['division'].value_counts()

# Heading, then the counts on the following lines
print("Number of entries per division:", division_counts, sep="\n")

In [None]:
# Distinct AQI values and how often each one occurs
aqi_column = df['aqi']
unique_aqi_values = aqi_column.unique()
aqi_value_counts = aqi_column.value_counts()

# Print each heading followed by its result on the next line
print("Unique AQI Values:", unique_aqi_values, sep="\n")
print("\nAQI Value Counts:", aqi_value_counts, sep="\n")

In [None]:
# Mean of every numeric column, per division
division_mean_stats = df.groupby('division').mean(numeric_only=True)
display(division_mean_stats)

# Overall AQI range across the whole dataset
overall_aqi = df['aqi']
print("\nMaximum AQI value:", overall_aqi.max())
print("Minimum AQI value:", overall_aqi.min())

# Divisions with the highest and lowest mean AQI
mean_aqi_by_division = division_mean_stats['aqi']
max_mean_aqi_division = mean_aqi_by_division.idxmax()
min_mean_aqi_division = mean_aqi_by_division.idxmin()

print(f"\nMaximum mean AQI across divisions: {mean_aqi_by_division.max():.4f} ({max_mean_aqi_division})")
print(f"Minimum mean AQI across divisions: {mean_aqi_by_division.min():.4f} ({min_mean_aqi_division})")

In [None]:
# Mean of every numeric column, per district
district_mean_stats = df.groupby('district').mean(numeric_only=True)
display(district_mean_stats)

# Largest and smallest district-level mean for each numeric column
column_maxima = district_mean_stats.max()
column_minima = district_mean_stats.min()

print("\nMax values per numerical column:")
print(column_maxima)

print("\nMin values per numerical column:")
print(column_minima)

# Range of the district-level mean AQI
mean_aqi_by_district = district_mean_stats['aqi']
print("\nMaximum mean AQI across districts:", mean_aqi_by_district.max())
print("Minimum mean AQI across districts:", mean_aqi_by_district.min())

In [None]:
# List the column names currently present in df
df.columns

In [None]:
# Numeric columns summarised per division
numerical_cols_to_analyze = [
    'temp_c', 'humidity', 'pressure', 'wind_speed', 'clouds', 'rain',
    'aqi', 'pm2_5', 'pm10', 'o3', 'no2', 'so2', 'co'
]

# Aggregations computed for every column above, within each division
summary_functions = ['max', 'min', 'median']
division_stats = df.groupby('division')[numerical_cols_to_analyze].agg(summary_functions)

# to_string() prints the complete table
print(division_stats.to_string())

# EDA


## Analyze numerical features

### Subtask:
Explore the distribution and summary statistics of numerical columns.


**Reasoning**:
Select numerical columns, generate descriptive statistics, and visualize distributions using histograms for key numerical features.



In [None]:
# Descriptive statistics for all numeric columns
numerical_cols = df.select_dtypes(include=np.number).columns
numerical_stats = df[numerical_cols].describe()
display(numerical_stats)

# Histograms (with KDE) for a handful of key features, laid out on a 2x3 grid
key_numerical_features = ['temp_c', 'humidity', 'wind_speed', 'aqi', 'pm2_5', 'pm10']

fig, axes = plt.subplots(2, 3, figsize=(15, 10))
for ax, col in zip(axes.flat, key_numerical_features):
    sns.histplot(df[col], kde=True, ax=ax)
    ax.set_title(f'Distribution of {col}')
fig.tight_layout()
plt.show()

## Analyze categorical features

### Subtask:
Examine the unique values and counts of categorical columns.


**Reasoning**:
Select categorical columns and print the number of unique values and value counts for each to examine their distributions.



In [None]:
# Columns stored with the generic 'object' dtype are treated as categorical
categorical_cols = df.select_dtypes(include='object').columns

# For each one, report its cardinality and the frequency of every value
separator = "-" * 30
for col in categorical_cols:
    column_values = df[col]
    print(f"Column: {col}")
    print(f"Number of unique values: {column_values.nunique()}")
    print("Value Counts:")
    print(column_values.value_counts())
    print(separator)

## Analyze relationships

### Subtask:
Investigate the relationships between different features, including the target variable 'aqi', using visualizations and correlation analysis.


**Reasoning**:
Calculate the correlation matrix for numerical columns and visualize it using a heatmap to understand relationships between features, especially with 'aqi'.



In [None]:
# Pairwise correlations between all numeric columns
numerical_cols = df.select_dtypes(include=np.number).columns
correlation_matrix = df[numerical_cols].corr()

# Annotated heatmap of the full correlation matrix
fig, ax = plt.subplots(figsize=(12, 10))
sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', fmt=".2f", ax=ax)
ax.set_title("Correlation Matrix of Numerical Features", fontsize=16)
plt.show()

# Correlation of each numeric column with 'aqi', highest first
aqi_correlations = correlation_matrix['aqi'].sort_values(ascending=False)
print("\nCorrelation with AQI:")
print(aqi_correlations)

**Reasoning**:
Based on the correlation analysis, visualize the relationship between 'aqi' and the features with strong correlations ('pm10', 'pm2_5', 'co', 'so2', 'no2', 'lat', 'lon') using scatter plots to further explore these relationships visually.



In [None]:
# Features plotted against AQI, one scatter panel each
features_to_plot = ['pm10', 'pm2_5', 'co', 'so2', 'no2', 'lat', 'lon']

# Panels are added one at a time to a 3x3 grid, so unused slots stay empty
fig = plt.figure(figsize=(15, 15))
for panel_number, col in enumerate(features_to_plot, start=1):
    ax = fig.add_subplot(3, 3, panel_number)
    sns.scatterplot(data=df, x=col, y='aqi', alpha=0.6, ax=ax)
    ax.set_title(f'AQI vs {col}')
    ax.set_xlabel(col)
    ax.set_ylabel('AQI')
fig.tight_layout()
plt.show()

## Visualize distributions

### Subtask:
Create visualizations to understand the distribution of key variables.


**Reasoning**:
Create box plots to visualize the distribution of 'aqi' across different divisions and districts.



In [None]:
# Box plot of AQI for every division
plt.figure(figsize=(12, 6))
sns.boxplot(x='division', y='aqi', data=df)
plt.title('AQI Distribution Across Divisions', fontsize=14)
plt.xlabel('Division')
plt.ylabel('AQI Value')
plt.xticks(rotation=45, ha='right')
plt.tight_layout()
plt.show()

# Plotting every district would be unreadable, so keep only the 10 districts
# with the highest mean AQI.
# NOTE: despite its name, `district_mean_aqi` holds the *names* (index) of
# those districts, sorted from highest to lowest mean AQI.
district_mean_aqi = df.groupby('district')['aqi'].mean().sort_values(ascending=False).head(10).index
df_top_districts = df[df['district'].isin(district_mean_aqi)]

plt.figure(figsize=(15, 7))
# `order` makes the boxes follow the mean-AQI ranking; without it they appear
# in whatever order the districts happen to occur in the data.
sns.boxplot(x='district', y='aqi', data=df_top_districts, order=list(district_mean_aqi))
plt.title('AQI Distribution Across Top 10 Districts by Mean AQI', fontsize=14)
plt.xlabel('District')
plt.ylabel('AQI Value')
plt.xticks(rotation=45, ha='right')
plt.tight_layout()
plt.show()

**Reasoning**:
Create count plots to visualize the distribution of data across different months and seasons.



In [None]:
# Convert 'date' to datetime, then derive the calendar month number from it
parsed_dates = pd.to_datetime(df['date'])
df['date'] = parsed_dates
df['month'] = parsed_dates.dt.month

# Create a 'season' column based on the month
def get_season(month):
    """Return the season name for a month number.

    12, 1, 2 -> 'Winter'; 3, 4, 5 -> 'Spring'; 6, 7, 8 -> 'Summer'.
    Every other value (including 9, 10, 11) -> 'Autumn'.
    """
    season_months = (
        ('Winter', (12, 1, 2)),
        ('Spring', (3, 4, 5)),
        ('Summer', (6, 7, 8)),
    )
    for season, months in season_months:
        if month in months:
            return season
    # Fallback covers months 9-11 as well as any unexpected value
    return 'Autumn'

# Label every row with the season of its month
df['season'] = df['month'].apply(get_season)

# Count plots of how many rows fall in each month / season.
# seaborn >= 0.13 deprecates passing `palette` without `hue` (FutureWarning,
# slated for removal in 0.14). The recommended replacement is to assign the x
# variable to `hue` and switch the redundant legend off.
# NOTE(review): the `legend` argument requires seaborn >= 0.13.

# Create count plot for month distribution
plt.figure(figsize=(8, 5))
sns.countplot(x='month', hue='month', data=df, palette='viridis', legend=False)
plt.title('Distribution of Data Across Months', fontsize=14)
plt.xlabel('Month')
plt.ylabel('Number of Entries')
plt.show()

# Create count plot for season distribution
plt.figure(figsize=(8, 5))
sns.countplot(x='season', hue='season', data=df, palette='viridis', legend=False)
plt.title('Distribution of Data Across Seasons', fontsize=14)
plt.xlabel('Season')
plt.ylabel('Number of Entries')
plt.show()

In [None]:
# Features whose per-division spread is compared below
key_numerical_features = ['temp_c', 'humidity', 'aqi', 'pm2_5', 'pm10']

# One standalone figure per feature: box plot of that feature for each division
for col in key_numerical_features:
    fig, ax = plt.subplots(figsize=(12, 6))
    sns.boxplot(x='division', y=col, data=df, ax=ax)
    ax.set_title(f'Distribution of {col} Across Divisions', fontsize=14)
    ax.set_xlabel('Division')
    ax.set_ylabel(col)
    plt.xticks(rotation=45, ha='right')
    fig.tight_layout()
    plt.show()

## Train Test Split

In [None]:
from sklearn.model_selection import train_test_split

# Model inputs. 'date', 'district', 'division', 'month' and 'season' are left
# out, and 'aqi' is the prediction target.
features = [
    'lat', 'lon',
    'temp_c', 'humidity', 'pressure', 'wind_speed', 'clouds', 'rain',
    'pm2_5', 'pm10', 'o3', 'no2', 'so2', 'co',
    'district_encoded', 'division_encoded',
]
X, y = df[features], df['aqi']

# Hold out 20% of the rows for testing; fixed seed for a reproducible split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Report the size of each split
print("\n".join([
    f"Shape of X_train: {X_train.shape}",
    f"Shape of X_test: {X_test.shape}",
    f"Shape of y_train: {y_train.shape}",
    f"Shape of y_test: {y_test.shape}",
]))

**Note on AQI Scale:**

Based on the dataset's definition, the Air Quality Index (AQI) values in this dataset follow a scale where:

*   **1** represents **Good** air quality.
*   **5** represents **Bad** air quality.

Values in between represent varying degrees of air quality between good and bad.

## Linear Regression

In [None]:
from sklearn.linear_model import LinearRegression

# Create the Linear Regression model and fit it on the training split
# (fit() returns the estimator itself)
model = LinearRegression().fit(X_train, y_train)

# Show the fitted estimator as the cell output
model

In [None]:
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

# Predict AQI for the held-out rows with the Linear Regression model
y_pred = model.predict(X_test)

# Score the predictions: MAE, MSE and R2, in that order
mae, mse, r2 = (
    score(y_test, y_pred)
    for score in (mean_absolute_error, mean_squared_error, r2_score)
)

print("\n".join([
    f"Mean Absolute Error (MAE): {mae}",
    f"Mean Squared Error (MSE): {mse}",
    f"R-squared (R2): {r2}",
]))

In [None]:
# Overlay KDE curves of the actual and the Linear Regression-predicted AQI values
fig, ax = plt.subplots(figsize=(10, 6))
sns.kdeplot(y_test, label='Actual AQI', fill=True, ax=ax)
sns.kdeplot(y_pred, label='Predicted AQI', fill=True, ax=ax)
ax.set_xlabel("AQI Value")
ax.set_ylabel("Density")
ax.set_title("Distribution of Actual vs. Predicted AQI Values (Linear Regression)")
ax.legend()
ax.grid(True)
plt.show()

## Random Forest Regressor

In [None]:
from sklearn.ensemble import RandomForestRegressor

# Random Forest with 100 trees and a fixed seed, fitted on the training split
# (fit() returns the estimator itself)
rf_model = RandomForestRegressor(n_estimators=100, random_state=42).fit(X_train, y_train)

# Show the fitted estimator as the cell output
rf_model

In [None]:
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

# Predict AQI for the held-out rows with the Random Forest model
y_pred_rf = rf_model.predict(X_test)

# Score the predictions: MAE, MSE and R2, in that order
mae_rf, mse_rf, r2_rf = (
    score(y_test, y_pred_rf)
    for score in (mean_absolute_error, mean_squared_error, r2_score)
)

print("\n".join([
    "Random Forest Regressor Model Evaluation:",
    f"Mean Absolute Error (MAE): {mae_rf}",
    f"Mean Squared Error (MSE): {mse_rf}",
    f"R-squared (R2): {r2_rf}",
]))

In [None]:
# Overlay KDE curves of the actual and the Random Forest-predicted AQI values
fig, ax = plt.subplots(figsize=(10, 6))
sns.kdeplot(y_test, label='Actual AQI', fill=True, ax=ax)
sns.kdeplot(y_pred_rf, label='Predicted AQI (Random Forest)', fill=True, ax=ax)
ax.set_xlabel("AQI Value")
ax.set_ylabel("Density")
ax.set_title("Distribution of Actual vs. Predicted AQI Values (Random Forest Regressor)")
ax.legend()
ax.grid(True)
plt.show()

## Gradient Boosting Regressor

In [None]:
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.svm import SVR

# Hyperparameters for the Gradient Boosting Regressor
gbr_params = {
    'n_estimators': 100,
    'learning_rate': 0.1,
    'max_depth': 3,
    'random_state': 42,
}

# Build the model and fit it on the training split
gbr_model = GradientBoostingRegressor(**gbr_params)
gbr_model.fit(X_train, y_train)

# Show the fitted estimator as the cell output
gbr_model


In [None]:
# Predict AQI for the held-out test rows with the fitted Gradient Boosting model;
# the result is scored and plotted in the following cells
y_pred_gbr = gbr_model.predict(X_test)



In [None]:
# Score the Gradient Boosting predictions: MAE, MSE and R2, in that order
mae_gbr, mse_gbr, r2_gbr = (
    score(y_test, y_pred_gbr)
    for score in (mean_absolute_error, mean_squared_error, r2_score)
)

print("\n".join([
    "\nGradient Boosting Regressor Model Evaluation:",
    f"Mean Absolute Error (MAE): {mae_gbr}",
    f"Mean Squared Error (MSE): {mse_gbr}",
    f"R-squared (R2): {r2_gbr}",
]))

In [None]:
# Overlay KDE curves of the actual and the Gradient Boosting-predicted AQI values
fig, ax = plt.subplots(figsize=(10, 6))
sns.kdeplot(y_test, label='Actual AQI', fill=True, ax=ax)
sns.kdeplot(y_pred_gbr, label='Predicted AQI (Gradient Boosting)', fill=True, ax=ax)
ax.set_xlabel("AQI Value")
ax.set_ylabel("Density")
ax.set_title("Distribution of Actual vs. Predicted AQI Values (Gradient Boosting Regressor)")
ax.legend()
ax.grid(True)
plt.show()

## Support Vector Regressor

In [None]:
# Imported locally because this cell is the only place that needs them
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

# Support Vector Regression is not scale invariant, and the feature columns
# are on very different numeric scales, so standardize them before the SVR.
# Putting scaler and SVR in one pipeline means the scaler is fitted on the
# training split only and is re-applied automatically whenever
# svr_model.predict() is called.
# A linear kernel is used for simplicity; other kernels like 'rbf' can be explored.
svr_model = make_pipeline(StandardScaler(), SVR(kernel='linear'))

# Train the pipeline (scaler + SVR) on the training data
svr_model.fit(X_train, y_train)

# Show the fitted pipeline as the cell output
svr_model

In [None]:
# Predict AQI for the held-out rows with the SVR model
y_pred_svr = svr_model.predict(X_test)

# Score the predictions: MAE, MSE and R2, in that order
mae_svr, mse_svr, r2_svr = (
    score(y_test, y_pred_svr)
    for score in (mean_absolute_error, mean_squared_error, r2_score)
)

print("\n".join([
    "\nSupport Vector Regressor Model Evaluation:",
    f"Mean Absolute Error (MAE): {mae_svr}",
    f"Mean Squared Error (MSE): {mse_svr}",
    f"R-squared (R2): {r2_svr}",
]))

In [None]:
# Overlay KDE curves of the actual and the SVR-predicted AQI values
fig, ax = plt.subplots(figsize=(10, 6))
sns.kdeplot(y_test, label='Actual AQI', fill=True, ax=ax)
sns.kdeplot(y_pred_svr, label='Predicted AQI (SVR)', fill=True, ax=ax)
ax.set_xlabel("AQI Value")
ax.set_ylabel("Density")
ax.set_title("Distribution of Actual vs. Predicted AQI Values (Support Vector Regressor)")
ax.legend()
ax.grid(True)
plt.show()

We have trained and evaluated the following regression models for predicting AQI:

*   **Linear Regression:**
    *   Mean Absolute Error (MAE): {{mae}}
    *   Mean Squared Error (MSE): {{mse}}
    *   R-squared (R2): {{r2}}

*   **Random Forest Regressor:**
    *   Mean Absolute Error (MAE): {{mae_rf}}
    *   Mean Squared Error (MSE): {{mse_rf}}
    *   R-squared (R2): {{r2_rf}}

*   **Gradient Boosting Regressor:**
    *   Mean Absolute Error (MAE): {{mae_gbr}}
    *   Mean Squared Error (MSE): {{mse_gbr}}
    *   R-squared (R2): {{r2_gbr}}

*   **Support Vector Regressor (SVR):**
    *   Mean Absolute Error (MAE): {{mae_svr}}
    *   Mean Squared Error (MSE): {{mse_svr}}
    *   R-squared (R2): {{r2_svr}}

Based on these metrics, the Random Forest Regressor and Gradient Boosting Regressor models appear to have the best performance with the lowest MAE and MSE and the highest R2 values.

Two further models, an XGBoost Regressor and a Decision Tree Regressor, are trained and evaluated in the sections below.

## XGBoost Regressor

In [None]:
import xgboost as xgb

# Hyperparameters for the XGBoost regressor; n_estimators, learning_rate,
# max_depth, etc. can be tuned here
xgb_params = {
    'objective': 'reg:squarederror',
    'n_estimators': 100,
    'learning_rate': 0.1,
    'random_state': 42,
}

# Build the model and fit it on the training split
xgb_model = xgb.XGBRegressor(**xgb_params)
xgb_model.fit(X_train, y_train)

# Show the fitted estimator as the cell output
xgb_model

In [None]:
 #Make predictions on the testing data
y_pred_xgb = xgb_model.predict(X_test)

In [None]:
# Score the XGBoost predictions: MAE, MSE and R2, in that order
mae_xgb, mse_xgb, r2_xgb = (
    score(y_test, y_pred_xgb)
    for score in (mean_absolute_error, mean_squared_error, r2_score)
)

print("\n".join([
    "\nXGBoost Regressor Model Evaluation:",
    f"Mean Absolute Error (MAE): {mae_xgb}",
    f"Mean Squared Error (MSE): {mse_xgb}",
    f"R-squared (R2): {r2_xgb}",
]))

In [None]:
# Overlay KDE curves of the actual and the XGBoost-predicted AQI values
fig, ax = plt.subplots(figsize=(10, 6))
sns.kdeplot(y_test, label='Actual AQI', fill=True, ax=ax)
sns.kdeplot(y_pred_xgb, label='Predicted AQI (XGBoost)', fill=True, ax=ax)
ax.set_xlabel("AQI Value")
ax.set_ylabel("Density")
ax.set_title("Distribution of Actual vs. Predicted AQI Values (XGBoost Regressor)")
ax.legend()
ax.grid(True)
plt.show()

## Decision Tree Regressor

In [None]:
from sklearn.tree import DecisionTreeRegressor
# Decision Tree Regressor with a fixed seed, fitted on the training split
# (fit() returns the estimator itself)
dt_model = DecisionTreeRegressor(random_state=42).fit(X_train, y_train)

# Show the fitted estimator as the cell output
dt_model

In [None]:
# Predict AQI for the held-out test rows with the fitted Decision Tree model;
# the result is scored and plotted in the following cells
y_pred_dt = dt_model.predict(X_test)

In [None]:
# Score the Decision Tree predictions: MAE, MSE and R2, in that order
mae_dt, mse_dt, r2_dt = (
    score(y_test, y_pred_dt)
    for score in (mean_absolute_error, mean_squared_error, r2_score)
)

print("\n".join([
    "\nDecision Tree Regressor Model Evaluation:",
    f"Mean Absolute Error (MAE): {mae_dt}",
    f"Mean Squared Error (MSE): {mse_dt}",
    f"R-squared (R2): {r2_dt}",
]))


In [None]:
# Overlay KDE curves of the actual and the Decision Tree-predicted AQI values
fig, ax = plt.subplots(figsize=(10, 6))
sns.kdeplot(y_test, label='Actual AQI', fill=True, ax=ax)
sns.kdeplot(y_pred_dt, label='Predicted AQI (Decision Tree)', fill=True, ax=ax)
ax.set_xlabel("AQI Value")
ax.set_ylabel("Density")
ax.set_title("Distribution of Actual vs. Predicted AQI Values (Decision Tree Regressor)")
ax.legend()
ax.grid(True)
plt.show()

Enter your own input features below to get predictions from the trained models. Make sure the input features are in the same order as the features used for training (`lat`, `lon`, `temp_c`, `humidity`, `pressure`, `wind_speed`, `clouds`, `rain`, `pm2_5`, `pm10`, `o3`, `no2`, `so2`, `co`, `district_encoded`, `division_encoded`).

In [None]:
# One example row of inputs (replace with your own values). The values must
# follow the order of the training columns.
custom_input = [[
    23.8, 90.4,                       # lat, lon
    30.0, 70, 1005, 3.0, 80, 0.1,     # temp_c, humidity, pressure, wind_speed, clouds, rain
    15.0, 20.0, 40.0, 5.0, 2.0, 150.0,  # pm2_5, pm10, o3, no2, so2, co
    13, 2,                            # district_encoded, division_encoded
]]

# Wrap the row in a DataFrame that carries the training column names
custom_input_df = pd.DataFrame(custom_input, columns=X_train.columns)

# Predict with the Random Forest, Gradient Boosting and Decision Tree models
prediction_rf_custom, prediction_gbr_custom, prediction_dt_custom = (
    fitted_model.predict(custom_input_df)
    for fitted_model in (rf_model, gbr_model, dt_model)
)

# Report each model's prediction as a plain Python list
print("\n".join([
    "Predictions for custom input:",
    f"Random Forest: {prediction_rf_custom.tolist()}",
    f"Gradient Boosting: {prediction_gbr_custom.tolist()}",
    f"Decision Tree: {prediction_dt_custom.tolist()}",
]))