In [1]:
import pandas as pd
import numpy as np

# Read the AQI CSV into a DataFrame and preview its first five rows.
df = pd.read_csv(filepath_or_buffer='AQI_Pre.csv')
df.head(n=5)

Unnamed: 0,Date,PM2.5,PM10,NO,NO2,NOx,NH3,CO,SO2,O3,...,City_Kochi,City_Kolkata,City_Lucknow,City_Mumbai,City_Patna,City_Shillong,City_Talcher,City_Thiruvananthapuram,City_Visakhapatnam,Month
0,2015-01-29,83.13,,6.93,28.71,33.72,,6.93,49.52,59.76,...,False,False,False,False,False,False,False,False,False,1
1,2015-01-30,79.84,,13.85,28.68,41.08,,13.85,48.49,97.07,...,False,False,False,False,False,False,False,False,False,1
2,2015-01-31,94.52,,24.39,32.66,52.61,,24.39,67.39,111.33,...,False,False,False,False,False,False,False,False,False,1
3,2015-02-01,135.99,,43.48,42.08,84.57,,43.48,75.23,102.7,...,False,False,False,False,False,False,False,False,False,2
4,2015-02-02,178.33,,54.56,35.31,72.8,,54.56,55.04,107.38,...,False,False,False,False,False,False,False,False,False,2


In [2]:
# Print a summary of the frame: column names, non-null counts and dtypes.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 24850 entries, 0 to 24849
Data columns (total 40 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   Date                     24850 non-null  object 
 1   PM2.5                    24172 non-null  float64
 2   PM10                     17764 non-null  float64
 3   NO                       24463 non-null  float64
 4   NO2                      24459 non-null  float64
 5   NOx                      22993 non-null  float64
 6   NH3                      18314 non-null  float64
 7   CO                       24405 non-null  float64
 8   SO2                      24245 non-null  float64
 9   O3                       24043 non-null  float64
 10  Benzene                  21315 non-null  float64
 11  Toluene                  19024 non-null  float64
 12  Xylene                   9478 non-null   float64
 13  AQI                      24850 non-null  float64
 14  City_Aizawl           

In [2]:
# Columns that are removed from the frame before any model is trained.
columns_to_drop = [
    'Date',
    'PM2.5',
    'NOx',
    'Benzene',
    'Toluene',
    'Xylene',
    'Month',
]
# labels=... with axis=1 is the positional spelling of drop(columns=...).
df = df.drop(labels=columns_to_drop, axis=1)


In [4]:
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.tree import DecisionTreeRegressor
from sklearn.impute import SimpleImputer
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
import numpy as np

# Decision-tree baseline: split, mean-impute, tune with a 5-fold grid search,
# then score the tuned tree on the held-out 20%.

# 'AQI' is the target; every remaining column of df is used as a feature.
X = df.drop('AQI', axis=1)
Y = df['AQI']

# Hold out 20% of the rows for the final evaluation (seeded, so the split is repeatable).
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=42)

# Mean imputation: the column means are learned from the training split only
# and then applied unchanged to the test split.
imputer = SimpleImputer(strategy='mean')

# NOTE: X_train / X_test are rebound to the imputed NumPy arrays here.
# Later cells in this notebook read these names, so they are kept as-is.
X_train = imputer.fit_transform(X_train)
X_test = imputer.transform(X_test)

# NOTE(review): because imputation happens before the grid search, each CV
# validation fold has been filled with means computed over the whole training
# split. Wrap the imputer and the tree in a Pipeline if leakage-free CV scores
# are required.

# Seed the tree. scikit-learn permutes the features at every split, so without
# a fixed random_state ties between equally good splits can resolve differently
# from run to run, changing the fitted tree, the metrics and the saved model.
dt_model = DecisionTreeRegressor(random_state=42)

# Hyperparameter grid explored by the search (4 x 3 x 3 = 36 combinations).
param_grid = {
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

# 5-fold cross-validated search, scored by negative MSE, using all CPU cores.
grid_search = GridSearchCV(dt_model, param_grid, cv=5, scoring='neg_mean_squared_error', n_jobs=-1)

# Run the search on the imputed training data.
grid_search.fit(X_train, Y_train)

# Best hyperparameter combination found by the search.
best_params = grid_search.best_params_

# GridSearchCV defaults to refit=True, so best_estimator_ is already a tree
# trained on the full training split with best_params; no second fit is needed.
final_dt_model = grid_search.best_estimator_

# Predict on the held-out test data.
Y_pred = final_dt_model.predict(X_test)

# Evaluate the model.
mse = mean_squared_error(Y_test, Y_pred)
rmse = np.sqrt(mse)
mae = mean_absolute_error(Y_test, Y_pred)
r2 = r2_score(Y_test, Y_pred)

# Print the evaluation metrics.
print("Decision Tree Regression Metrics:")
print("Best Parameters:", best_params)
print("Mean Squared Error (MSE):", mse)
print("Root Mean Squared Error (RMSE):", rmse)
print("Mean Absolute Error (MAE):", mae)
print("R-squared (R2):", r2)


Decision Tree Regression Metrics:
Best Parameters: {'max_depth': 10, 'min_samples_leaf': 4, 'min_samples_split': 10}
Mean Squared Error (MSE): 4529.538907527653
Root Mean Squared Error (RMSE): 67.30184921328427
Mean Absolute Error (MAE): 38.37355128701566
R-squared (R2): 0.7526331891832327


In [6]:
import joblib

# Persist the tuned decision tree to disk with joblib.
joblib.dump(value=final_dt_model, filename='AQI_Web.joblib')


['AQI_Web.joblib']

In [7]:
from sklearn.ensemble import RandomForestRegressor

# Mean imputer for this model; its statistics are learned from X_train only
# and then applied to both splits.
imputer = SimpleImputer(strategy='mean')
imputer.fit(X_train)
X_train_imputed = imputer.transform(X_train)
X_test_imputed = imputer.transform(X_test)

# Random forest with hand-picked (untuned) hyperparameters. fit() returns the
# estimator itself, so construction and training are chained.
rf_model = RandomForestRegressor(
    n_estimators=100,
    max_depth=None,
    min_samples_split=2,
    min_samples_leaf=1,
    random_state=42,
).fit(X_train_imputed, Y_train)

# Score the forest on the held-out split.
Y_pred_rf = rf_model.predict(X_test_imputed)
mse_rf = mean_squared_error(Y_test, Y_pred_rf)
mae_rf = mean_absolute_error(Y_test, Y_pred_rf)
r2_rf = r2_score(Y_test, Y_pred_rf)
rmse_rf = np.sqrt(mse_rf)

# Report the metrics in the same order as the decision-tree cell.
print("\nRandom Forest Regression Metrics:")
print("Mean Squared Error (MSE):", mse_rf)
print("Root Mean Squared Error (RMSE):", rmse_rf)
print("Mean Absolute Error (MAE):", mae_rf)
print("R-squared (R2):", r2_rf)



Random Forest Regression Metrics:
Mean Squared Error (MSE): 2995.126412562444
Root Mean Squared Error (RMSE): 54.727748104251866
Mean Absolute Error (MAE): 29.90675057647472
R-squared (R2): 0.8364303996953549


In [9]:
# Persist the random forest to disk, zlib-compressed at level 3.
joblib.dump(value=rf_model, filename='AQI_RWeb.joblib', compress=('zlib', 3))


['AQI_RWeb.joblib']

In [10]:
import joblib
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
import numpy as np

# Round-trip check: reload the compressed random forest from disk and confirm
# it reproduces the metrics of the in-memory model.
# NOTE: joblib.load unpickles the file, so only load artifacts you trust.
loaded_rf_model = joblib.load('AQI_RWeb.joblib')

# Predict on X_test_imputed, the same imputed test matrix the forest was
# originally evaluated on, instead of depending on X_test having been
# overwritten with imputed values by an earlier cell.
Y_pred_rf = loaded_rf_model.predict(X_test_imputed)

# Evaluate the reloaded model.
mse_rf = mean_squared_error(Y_test, Y_pred_rf)
rmse_rf = np.sqrt(mse_rf)
mae_rf = mean_absolute_error(Y_test, Y_pred_rf)
r2_rf = r2_score(Y_test, Y_pred_rf)

# Print the evaluation metrics for the reloaded Random Forest Regressor.
print("Random Forest Regression Metrics:")
print("Mean Squared Error (MSE):", mse_rf)
print("Root Mean Squared Error (RMSE):", rmse_rf)
print("Mean Absolute Error (MAE):", mae_rf)
print("R-squared (R2):", r2_rf)


Random Forest Regression Metrics:
Mean Squared Error (MSE): 2995.126412562444
Root Mean Squared Error (RMSE): 54.727748104251866
Mean Absolute Error (MAE): 29.90675057647472
R-squared (R2): 0.8364303996953549


In [15]:
# Print a summary of the feature matrix: column names, non-null counts and dtypes.
X.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 24850 entries, 0 to 24849
Data columns (total 32 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   PM10                     17764 non-null  float64
 1   NO                       24463 non-null  float64
 2   NO2                      24459 non-null  float64
 3   NH3                      18314 non-null  float64
 4   CO                       24405 non-null  float64
 5   SO2                      24245 non-null  float64
 6   O3                       24043 non-null  float64
 7   City_Aizawl              24850 non-null  bool   
 8   City_Amaravati           24850 non-null  bool   
 9   City_Amritsar            24850 non-null  bool   
 10  City_Bengaluru           24850 non-null  bool   
 11  City_Bhopal              24850 non-null  bool   
 12  City_Brajrajnagar        24850 non-null  bool   
 13  City_Chandigarh          24850 non-null  bool   
 14  City_Chennai          

In [4]:
from sklearn.model_selection import train_test_split, GridSearchCV

# Separate the 'AQI' target from the feature columns.
# NOTE(review): this split appears to be superseded by the re-split performed
# after df.dropna() further down; confirm whether this cell is still needed.
Y = df['AQI']
X = df.drop(columns='AQI')

# 80/20 train/test split with a fixed seed.
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=0.2,
    random_state=42,
)



In [5]:
# Keep only the rows that have no missing value in any column.
df = df.dropna(axis=0, how='any')


In [6]:
from sklearn.model_selection import train_test_split, GridSearchCV

# Rebuild target and features from the current df.
Y = df['AQI']
X = df.drop(columns='AQI')

# 80/20 train/test split with a fixed seed.
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=0.2,
    random_state=42,
)



In [8]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import GridSearchCV

# Ordinary least-squares regressor used as the base estimator for the search.
linear_reg = LinearRegression()

# Search space: with/without an intercept, and with/without the
# positive-coefficient constraint.
param_grid = dict(
    fit_intercept=[True, False],
    positive=[True, False],
)

# 5-fold grid search scored by negative MSE. fit() returns the search object,
# so construction and fitting are chained.
grid_search = GridSearchCV(
    estimator=linear_reg,
    param_grid=param_grid,
    scoring='neg_mean_squared_error',
    cv=5,
).fit(X_train, Y_train)

# Winning configuration and the estimator the search refit with it.
best_linear_reg = grid_search.best_estimator_
best_params = grid_search.best_params_

# Score the refit estimator on the held-out split.
Y_pred = best_linear_reg.predict(X_test)
r2 = r2_score(Y_test, Y_pred)
mse = mean_squared_error(Y_test, Y_pred)

# Report
print("Best Hyperparameters:", best_params)
print("Mean Squared Error (MSE):", mse)
print("R-squared (R2):", r2)


Best Hyperparameters: {'fit_intercept': True, 'positive': False}
Mean Squared Error (MSE): 1027.1117491972482
R-squared (R2): 0.8682496453675741


In [9]:
from joblib import dump

# Write the refit best estimator from the grid search to disk.
dump(value=best_linear_reg, filename='linreg.joblib')


['linreg.joblib']