In [None]:
# Import necessary libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.preprocessing import StandardScaler
import matplotlib.pyplot as plt

# Load the data
safedata = pd.read_csv('../Processed_Data/AQI_Final.csv')
data = pd.read_csv('../Processed_Data/AQI_Final.csv')

In [None]:
def randomRF(State_Name):
    safedata = pd.read_csv('/Users/rahib/Air_Quality_Predictor/Processed_Data/AQI_Final.csv')
    data = pd.read_csv('/Users/rahib/Air_Quality_Predictor/Processed_Data/AQI_Final.csv')
    # Filter the state
    data = data[data['State Name'] == State_Name]
    
    # Drop non-numeric and irrelevant columns for prediction (e.g., 'State Name' and 'Year')
    safedata = safedata.drop(columns=[ 'Unknown Fuel','Ethanol/Flex (E85)', 'Diesel', 'Hybrid Electric (HEV)', 
                            'Electric (EV)','Biodiesel' ,'Compressed Natural Gas (CNG)', 'Gasoline', 'Plug-In Hybrid Electric (PHEV)', ])

    data = data.drop(columns=[ 'Year','State Name' ])

    # Check for missing values and fill or drop them as needed
    if data.isnull().sum().sum() > 0:
        data = data.fillna(data.mean())  # Fill missing values with column means (alternative: data.dropna())

    # Split data into features (X) and target (y)
    X = data.drop(columns=['Overall AQI'])  # Features
    y = data['Overall AQI']  # Target

    # Split the data into training and testing sets
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    # Standardize features to improve model performance
    scaler = StandardScaler()
    X_train = scaler.fit_transform(X_train)
    X_test = scaler.transform(X_test)

    # Initialize the model
    model = RandomForestRegressor(n_estimators=100, random_state=42)

    # Train the model
    model.fit(X_train, y_train)

    # Make predictions
    y_pred = model.predict(X_test)

    plt.figure(figsize=(8, 6))
    plt.scatter(y_test, y_pred, color='blue', alpha=0.7)
    plt.plot([y_test.min(), y_test.max()], [y_test.min(), y_test.max()], color='red', linewidth=2)  # 45-degree line
    plt.xlabel('Actual AQI')
    plt.ylabel('Predicted AQI')
    plt.title(f'Actual vs Predicted AQI for {State_Name} ')
    plt.show()

    # Evaluate the model
    mse = mean_squared_error(y_test, y_pred)
    mae = mean_absolute_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)

    print(f"Model Performance for {State_Name}")
    print(f"Mean Squared Error: {mse}")
    print(f"Mean Absolute Error: {mae}")
    print(f"R^2 Score: {r2}")

    # Optional: Feature Importance
    feature_importances = pd.DataFrame(model.feature_importances_, index=data.drop(columns=['Overall AQI']).columns, columns=['Importance']).sort_values('Importance', ascending=False)
    #print("\nFeature Importances:")
    #print(feature_importances)
    merge = pd.merge(safedata,data, how='inner')
    #print(merge)
    return mse, r2, mae

In [None]:
states = [
    "Alabama", "Alaska", "Arizona", "Arkansas", "California", "Colorado", "Connecticut", 
    "Delaware", "Florida", "Georgia", "Hawaii", "Idaho", "Illinois", 
    "Indiana", "Iowa", "Kansas", "Kentucky", "Louisiana", "Maryland", 
    "Massachusetts", "Michigan", "Minnesota", "Mississippi", "Missouri", "Montana", 
    "Nebraska", "Nevada", "New Hampshire", "New Jersey", "New Mexico", "New York", 
    "North Carolina", "North Dakota", "Ohio", "Oklahoma", "Oregon", "Pennsylvania", 
    "Rhode Island", "South Carolina", "South Dakota", "Tennessee", "Texas", "Utah", 
    "Vermont", "Virginia", "Washington", "West Virginia", "Wisconsin", "Wyoming"
]

year = [2016,2017,2018,2019,2020,2021,2022]


In [None]:
AQI_error = []
AQI_r2 = []
AQi_MAe = []
for state in states:
    mse, r2, mae = randomRF(state)
    AQI_error.append(mse)
    AQi_MAe.append(mae)
    AQI_r2.append(r2)

In [None]:
print(f"Overall mean MSE: {np.mean(AQI_error)}")
print(f"Overall mean Mae: {np.mean(AQi_MAe)}")
print(f"Overall mean R^2: {np.mean(AQI_r2)}")

In [None]:
safedata = pd.read_csv('/Users/rahib/Air_Quality_Predictor/Processed_Data/AQI_Final.csv')
data = pd.read_csv('/Users/rahib/Air_Quality_Predictor/Processed_Data/AQI_Final.csv')
# Filter the state
#data = data[data['State Name'] == State_Name]

# Drop non-numeric and irrelevant columns for prediction (e.g., 'State Name' and 'Year')
safedata = safedata.drop(columns=[ 'Unknown Fuel','Ethanol/Flex (E85)', 'Diesel', 'Hybrid Electric (HEV)', 
                        'Electric (EV)','Biodiesel' ,'Compressed Natural Gas (CNG)', 'Gasoline', 'Plug-In Hybrid Electric (PHEV)', ])

data = data.drop(columns=[ 'Year','State Name', 'Unknown Fuel','Ethanol/Flex (E85)', 'Diesel', 'Hybrid Electric (HEV)', 
                        'Electric (EV)','Biodiesel' ,'Compressed Natural Gas (CNG)', 'Gasoline', 'Plug-In Hybrid Electric (PHEV)' ])

# Check for missing values and fill or drop them as needed
if data.isnull().sum().sum() > 0:
    data = data.fillna(data.mean())  # Fill missing values with column means (alternative: data.dropna())

# Split data into features (X) and target (y)
X = data.drop(columns=['Overall AQI'])  # Features
y = data['Overall AQI']  # Target

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.4, random_state=500)

# Standardize features to improve model performance
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Initialize the model
model = RandomForestRegressor(n_estimators=100, random_state=42)

# Train the model
model.fit(X_train, y_train)

# Make predictions
y_pred = model.predict(X_test)

plt.figure(figsize=(8, 6))
plt.scatter(y_test, y_pred, color='blue', alpha=0.7)
plt.plot([y_test.min(), y_test.max()], [y_test.min(), y_test.max()], color='red', linewidth=2)  # 45-degree line
plt.xlabel('Actual AQI')
plt.ylabel('Predicted AQI')
plt.title(f'Actual vs Predicted AQI ')
plt.show()

# Evaluate the model
mse = mean_squared_error(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print(f"Model Performance ")
print(f"Mean Squared Error: {mse}")
print(f"Mean Absolute Error: {mae}")
print(f"R^2 Score: {r2}")

# Optional: Feature Importance
feature_importances = pd.DataFrame(model.feature_importances_, index=data.drop(columns=['Overall AQI']).columns, columns=['Importance']).sort_values('Importance', ascending=False)
#print("\nFeature Importances:")
#print(feature_importances)
merge = pd.merge(safedata,data, how='inner')
#print(merge)


In [None]:
def predict_aqi(state_name, population, non_renewable_vehicles, model, df):
    
    df = df.dropna()
    df.reset_index(inplace=True)

    # One-hot encode the State Name for the custom input
    df_encoded = pd.get_dummies(df, columns=['State Name'], drop_first=True)

    # Prepare the custom input data
    input_data = pd.DataFrame({
        'Population': [population],
        'Non-Renewable Vehicles': [non_renewable_vehicles],
        'State Name_' + state_name: [1],  # One-hot encode the State Name input
    })
    
    # Ensure all possible one-hot encoded columns are present, fill with 0 for missing states
    for column in df_encoded.columns:
        if column not in input_data.columns:
            input_data[column] = 0

    # Ensure the column names match
    input_data = input_data[['Population', 'Non-Renewable Vehicles'] + [col for col in df_encoded.columns if 'State Name_' in col]]
    
    # Make the prediction
    predicted_aqi = model.predict(input_data)
    return predicted_aqi[0]  # Return the predicted AQI

In [36]:
state_name = "New York"  # Example state
population = 1500  # / 1000
non_renewable_vehicles = 9956000  # Example number of non-renewable vehicles

predicted_aqi = predict_aqi(state_name, population, non_renewable_vehicles, randomRF, data)
print(f"Predicted AQI for {state_name}: {predicted_aqi}")

KeyError: "None of [Index(['State Name'], dtype='object')] are in the [columns]"