In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [None]:
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split, KFold
from sklearn.pipeline import Pipeline

In [None]:
from sklearn.linear_model import LinearRegression
from sklearn.svm import SVR
from sklearn.neighbors import KNeighborsRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import time

In [None]:
# Location of the input CSV; the notebook reads it from this absolute path.
AIR_QUALITY_CSV = r'C:\Users\tvams\OneDrive\Desktop\Imputation methods\Step-3.csv'

# Read the air quality table into a DataFrame (later cells use its 'Date' and 'AQI' columns).
air_quality_data = pd.read_csv(AIR_QUALITY_CSV)

In [None]:
# Build the feature matrix and target, split, then min-max normalize.
#
# The scaler is fitted on the TRAINING rows only. Fitting it on the whole
# dataset before splitting lets the min/max of the held-out rows influence
# the training features (data leakage), which makes test scores optimistic.
feature_frame = air_quality_data.drop(columns=['Date', 'AQI'])
y = air_quality_data['AQI']

# Split data into train and test sets (80/20, fixed seed so the split is reproducible)
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    feature_frame, y, test_size=0.2, random_state=42
)

scaler = MinMaxScaler()
X_train = scaler.fit_transform(X_train_raw)  # learns per-column min/max from training rows
X_test = scaler.transform(X_test_raw)        # reuses training min/max; values may fall outside [0, 1]

# Kept so that code referring to the fully normalized matrix keeps working;
# it is scaled with the training-set statistics as well.
normalized_data = scaler.transform(feature_frame)
X_normalized = normalized_data

In [None]:

# Regressors to compare, keyed by the label shown in printed results and plot titles.
# Each estimator is created with its default hyperparameters.
# Note: the 'SVM ' and 'KNN ' labels carry a trailing space.
models = dict([
    ('Linear Regression', LinearRegression()),
    ('SVM ', SVR()),
    ('KNN ', KNeighborsRegressor()),
])

In [None]:
# Evaluation metrics, keyed by display name.
# Every entry is a callable taking (y_true, y_pred) and returning a score.
def _rmse(y_true, y_pred):
    """Root mean squared error: the square root of mean_squared_error."""
    return np.sqrt(mean_squared_error(y_true, y_pred))


metrics = dict(zip(
    ('MAE', 'MSE', 'RMSE', 'R2 Score'),
    (mean_absolute_error, mean_squared_error, _rmse, r2_score),
))

In [None]:
# Perform k-fold cross-validation and evaluate each model.
#
# results[model_name] is a dict of lists with one entry per fold:
#   'Time'  -> seconds spent fitting + predicting on that fold
#   others  -> the value of each function in `metrics` on the validation fold
#
# The splitter is seeded, so every model is evaluated on the same five folds.
kf = KFold(n_splits=5, shuffle=True, random_state=42)

results = {}
for model_name, model in models.items():
    print(f'Running {model_name}...')
    scores = {'Time': [], **{metric_name: [] for metric_name in metrics}}
    for train_index, val_index in kf.split(X_train):
        X_train_fold, X_val_fold = X_train[train_index], X_train[val_index]
        y_train_fold, y_val_fold = y_train.iloc[train_index], y_train.iloc[val_index]

        # Start the clock per fold: a single start time per model would make
        # each recorded value cumulative and inflate the reported mean.
        fold_start = time.perf_counter()
        model.fit(X_train_fold, y_train_fold)
        y_pred = model.predict(X_val_fold)
        scores['Time'].append(time.perf_counter() - fold_start)

        for metric_name, metric_func in metrics.items():
            scores[metric_name].append(metric_func(y_val_fold, y_pred))
    results[model_name] = scores

In [None]:
# Report the fold-averaged value of every recorded score, one section per model.
# The 'Time' entry is printed with a unit suffix; all other entries are bare numbers.
for model_name, scores in results.items():
    print(f'\n{model_name}:')
    for metric_name, metric_values in scores.items():
        unit = ' seconds' if metric_name == 'Time' else ''
        print(f'{metric_name}: {np.mean(metric_values):.4f}{unit}')


In [None]:
# Scatter plot of actual versus predicted AQI on the test set, one panel per model.
# Each model is refitted on the full training set before predicting.
#
# The grid is sized from the number of models (one row), so no panels are left
# empty and adding models does not overflow a fixed grid.
fig, axes = plt.subplots(1, len(models), figsize=(15, 5), squeeze=False)

# End points of the y = x reference line (perfect prediction), spanning the full AQI range.
aqi_range = [y.min(), y.max()]

for ax, (model_name, model) in zip(axes.ravel(), models.items()):
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    ax.scatter(y_test, y_pred)
    ax.plot(aqi_range, aqi_range, 'k--', lw=2)
    ax.set_xlabel('Actual AQI')
    ax.set_ylabel('Predicted AQI')
    ax.set_title(f'Actual vs Predicted AQI ({model_name})')

fig.tight_layout()
plt.show()

In [None]:
# Overlay actual and predicted AQI for the test set, one panel per model.
# Each model is refitted on the full training set before predicting.
#
# The test rows come from a shuffled train/test split, so the x-axis is the
# position of a sample within the test set, not a time axis.
# The grid is sized from the number of models (one row) so no panels are left empty.
fig, axes = plt.subplots(1, len(models), figsize=(15, 5), squeeze=False)

for ax, (model_name, model) in zip(axes.ravel(), models.items()):
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    ax.plot(y_test.values, label='Actual', color='blue')
    ax.plot(y_pred, label='Predicted', color='red')
    ax.set_xlabel('Test sample index')
    ax.set_ylabel('AQI')
    ax.set_title(f'Actual vs Predicted AQI ({model_name})')
    ax.legend()

fig.tight_layout()
plt.show()

In [None]:
# Plot actual AQI values against dates.
#
# The 'Date' column is parsed into a local datetime series first: plotting the
# raw strings makes matplotlib treat every date as a separate category, which
# produces one tick per row and an unreadable axis. The DataFrame itself is
# left untouched here.
aqi_dates = pd.to_datetime(air_quality_data['Date'], format='%Y-%m-%d')

plt.figure(figsize=(10, 6))
plt.plot(aqi_dates, air_quality_data['AQI'], color='blue')
plt.xlabel('Date')
plt.ylabel('AQI')
plt.title('Actual AQI over Time')
plt.xticks(rotation=45)
plt.tight_layout()
plt.show()


In [None]:
# Average AQI per calendar year.
#
# NOTE: this cell modifies air_quality_data in place: 'Date' becomes a datetime
# column and a new 'Year' column is added. Re-running the normalization cell
# afterwards would therefore pick up 'Year' as an extra feature, since that cell
# only drops 'Date' and 'AQI'.
air_quality_data['Date'] = pd.to_datetime(air_quality_data['Date'], format='%Y-%m-%d')
air_quality_data['Year'] = air_quality_data['Date'].dt.year

# One mean AQI value per year, indexed by year
average_aqi_by_year = air_quality_data.groupby('Year')['AQI'].mean()

# Line chart with a marker on each yearly mean
fig, ax = plt.subplots(figsize=(10, 6))
ax.plot(
    average_aqi_by_year.index,
    average_aqi_by_year.values,
    color='blue',
    marker='o',
    linestyle='-',
)
ax.set_xlabel('Year')
ax.set_ylabel('Average AQI')
ax.set_title('Average AQI by Year')
plt.xticks(rotation=45)
fig.tight_layout()
plt.show()


In [None]:


import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# Load the model-comparison table from the Excel workbook.
comparision = pd.read_excel(r'C:\Users\tvams\OneDrive\Desktop\Imputation methods\Comparison1.xlsx')

# Pull each column the charts need out as a plain Python list.
methods, rmse_values, mse_values, mae_values, r_squared_values = (
    comparision[column].tolist()
    for column in ('Models', 'RMSE', 'MSE', 'MAE', 'R2 Score')
)

# Width of each bar, and one x position per method
bar_width = 0.15
index = np.arange(len(methods))

# Grouped bar chart: four bars per method, centred on the method's x position.
# Each entry is (offset in bar widths, values, legend label, colour).
plt.figure(figsize=(16, 8))
grouped_bars = (
    (-1.5, rmse_values, 'RMSE', 'blue'),
    (-0.5, mse_values, 'MSE', 'green'),
    (0.5, mae_values, 'MAE', 'orange'),
    (1.5, r_squared_values, 'R-squared', 'red'),
)
for slot, values, series_label, series_color in grouped_bars:
    plt.bar(index + slot * bar_width, values, bar_width, label=series_label, color=series_color)

plt.xlabel('Methods')
plt.ylabel('Error Value')
plt.title('Comparison of Methods by Evaluation  Metrics')
plt.xticks(index, methods)
plt.legend()
plt.tight_layout()
plt.show()

# One standalone bar chart per metric, drawn in the order RMSE, MSE, MAE, R-squared.
# Each entry is (values per method, label used for the y-axis and title, bar colour).
per_metric_charts = (
    (rmse_values, 'RMSE', 'blue'),
    (mse_values, 'MSE', 'green'),
    (mae_values, 'MAE', 'orange'),
    (r_squared_values, 'R-squared', 'red'),
)

for values, metric_label, bar_color in per_metric_charts:
    plt.figure(figsize=(10, 6))
    plt.bar(methods, values, color=bar_color)
    plt.xlabel('Methods')
    plt.ylabel(metric_label)
    plt.title(f'Comparison of Methods by {metric_label}')
    plt.xticks(rotation=45, ha='right')  # slanted labels so long method names do not overlap
    plt.tight_layout()
    plt.show()
