In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import sklearn
from sklearn.model_selection import TimeSeriesSplit
from sklearn.preprocessing import MinMaxScaler
# import xgboost
import tensorflow as tf


In [None]:
# Download the Beijing multi-site air-quality dataset through the Kaggle API.

import os 

print(os.getcwd())

# The API token is expected at ../secrets/kaggle.json. A missing folder is
# created first so there is an obvious place to drop the file; either way the
# cell stops until the token is present.
secrets_dir = "../secrets"
secrets_dir_absent = not os.path.exists(secrets_dir)
if secrets_dir_absent:
    os.makedirs(secrets_dir)
    print("Created directory: ../secrets")
if secrets_dir_absent or not os.path.exists("../secrets/kaggle.json"):
    raise Exception("Place kaggle.json in the secrets directory")

# Ignore everything inside the secrets folder so the token is never committed.
gitignore_path = os.path.join("../secrets", ".gitignore")
if not os.path.exists(gitignore_path):
    with open(gitignore_path, 'w') as gitignore_file:
        gitignore_file.write("*\n")

# Keep a non-empty KAGGLE_CONFIG_DIR if one is already set; otherwise point it
# at the secrets folder. This happens before the kaggle imports below —
# presumably the client reads the variable when it loads; TODO confirm.
os.environ['KAGGLE_CONFIG_DIR'] = (
    os.environ.get('KAGGLE_CONFIG_DIR') or os.path.abspath("../secrets")
)
print("KAGGLE_CONFIG_DIR set to:", os.environ['KAGGLE_CONFIG_DIR'])

from kaggle.api.kaggle_api_extended import KaggleApi
import kaggle

# Authenticate, then fetch and unzip the dataset into ../data.
api = KaggleApi()
api.authenticate()

dataset_name = "sid321axn/beijing-multisite-airquality-data-set"
download_dir = "../data"

if not os.path.exists(download_dir):
    os.makedirs(download_dir)
    print(f"Created directory: {download_dir}")

print(f"Downloading dataset {dataset_name} to {download_dir}")
api.dataset_download_files(dataset_name, path=download_dir, unzip=True)
print(f"Dataset downloaded and unzipped to {download_dir}")

# Keep the downloaded files out of version control as well.
data_gitignore_path = os.path.join(download_dir, ".gitignore")
if not os.path.exists(data_gitignore_path):
    with open(data_gitignore_path, 'w') as gitignore_file:
        gitignore_file.write("*\n")
    print(f"Created .gitignore in {download_dir}")



In [None]:
# Load the Aotizhongxin station file from the downloaded dataset
raw_csv_path = '../data/PRSA_Data_Aotizhongxin_20130301-20170228.csv'
data = pd.read_csv(raw_csv_path)

In [None]:
# Quick look at the raw frame: its (rows, columns) shape, then the first rows
print("Shape:", data.shape)
data.head()

In [None]:
# Per-column summary of the raw frame (dtypes and non-null counts)
data.info()

In [None]:
# Count missing entries per column and list only the columns that have any
missing = data.isna().sum()
has_missing = missing > 0
print(missing.loc[has_missing])

In [None]:
# Heatmap of the null mask: one row per record, one column per field
fig, ax = plt.subplots(figsize=(10, 5))
sns.heatmap(data.isnull(), cbar=False, cmap='viridis', ax=ax)
ax.set_title('Visualization of Missing Values')
plt.show()

In [None]:
# Share of missing values per column, in percent, largest first
null_counts = data.isnull().sum()
missing_percent = (null_counts / len(data)) * 100
affected_columns = missing_percent[missing_percent > 0]
print(affected_columns.sort_values(ascending=False))

In [None]:
# Forward-fill missing values (carry the last observed reading forward).
# DataFrame.ffill()/bfill() replace fillna(method=...), which pandas has
# deprecated since 2.1.
data_ffill = data.ffill()

# Backward-fill whatever is still missing, i.e. gaps at the very start of a
# column that have no earlier value to carry forward
data_ffill_bfill = data_ffill.bfill()

# Verify no missing values remain
print("Remaining Missing:", data_ffill_bfill.isnull().sum().sum())

In [None]:
# Same null-mask heatmap as before, now on the imputed frame
fig, ax = plt.subplots(figsize=(10, 5))
sns.heatmap(data_ffill_bfill.isnull(), cbar=False, cmap='viridis', ax=ax)
ax.set_title('Visualization of Missing Values After Imputation')
plt.show()

In [None]:
# Write the imputed frame to ../data (row index omitted from the file)
data_ffill_bfill.to_csv('../data/cleaned_AQI_data.csv', index=False)

In [None]:
# Build one timestamp per row from the separate year/month/day/hour columns
timestamps = pd.to_datetime(data_ffill_bfill[['year', 'month', 'day', 'hour']])
data_ffill_bfill['datetime'] = timestamps

# Calendar features derived from that timestamp
weekday = timestamps.dt.dayofweek  # Monday=0 ... Sunday=6
data_ffill_bfill['day_of_week'] = weekday
data_ffill_bfill['is_weekend'] = weekday.isin([5, 6]).astype(int)
data_ffill_bfill['month'] = timestamps.dt.month
data_ffill_bfill['hour'] = timestamps.dt.hour

# Season code — 1: Winter (Dec-Feb), 2: Spring, 3: Summer, 4: Autumn.
# month % 12 maps December to 0 so it groups with January and February.
data_ffill_bfill['season'] = (data_ffill_bfill['month'] % 12) // 3 + 1

# View newly created features
data_ffill_bfill.head()

In [None]:
# Remove the row counter, the raw date parts and the station name.
# NOTE(review): 'month' and 'hour' were refreshed in the previous cell and are
# dropped here, so only day_of_week / is_weekend / season carry time
# information into the features — confirm this is intended.
columns_to_drop = ['No', 'year', 'month', 'day', 'hour', 'station']
data_features = data_ffill_bfill.drop(columns=columns_to_drop)

In [None]:
# Write the feature-engineered frame to ../data (row index omitted from the file)
data_features.to_csv('../data/feature_engineered_AQI_data.csv', index=False)

In [None]:
# Preview the first rows of the feature frame
data_features.head()


In [None]:
# PM2.5 is the regression target; every other column except the raw timestamp
# becomes a feature.
target_column = 'PM2.5'
y = data_features[target_column]
X = data_features.drop(columns=['datetime', target_column])

# Sanity-check the dimensions
print("Features shape:", X.shape)
print("Target shape:", y.shape)

In [None]:
from sklearn.model_selection import train_test_split

# Chronological 80/20 hold-out: shuffle=False keeps the row order, so the test
# set is the last 20% of the rows.
holdout_parts = train_test_split(X, y, test_size=0.2, shuffle=False)
X_train, X_test, y_train, y_test = holdout_parts

print(f"Training size: {X_train.shape[0]}, Testing size: {X_test.shape[0]}")

In [None]:
# One-hot encode the 'wd' column (first level dropped), then redo the
# unshuffled 80/20 split on the encoded features.
X_encoded = pd.get_dummies(X, columns=['wd'], drop_first=True)
encoded_parts = train_test_split(X_encoded, y, test_size=0.2, shuffle=False)
X_train, X_test, y_train, y_test = encoded_parts


In [None]:
# Create the scaler in this cell: it was previously first defined further down
# the notebook, so a fresh Restart & Run All stopped here with a NameError.
scaler = MinMaxScaler()

# Fit on the training split only, then apply the same scaling to the test split
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Verify scaling
print("Scaled training data sample:\n", X_train_scaled[:5])

In [None]:
# Learn the min-max ranges from the training split, then transform both splits
scaler = MinMaxScaler().fit(X_train)

X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [None]:
# Shapes of the scaled train and test matrices, in that order
for scaled_split in (X_train_scaled, X_test_scaled):
    print(scaled_split.shape)

In [None]:
from sklearn.preprocessing import MinMaxScaler

scaler = MinMaxScaler()

# The ranges come from the training split ONLY; both splits are then
# transformed with those same ranges.
scaler.fit(X_train)
X_train_scaled, X_test_scaled = (
    scaler.transform(split) for split in (X_train, X_test)
)

# Verify scaling
print("Scaled training data sample:\n", X_train_scaled[:5])

In [None]:
# Write the scaled feature matrices and the targets to ../data as .npy files
arrays_by_path = {
    '../data/X_train_scaled.npy': X_train_scaled,
    '../data/X_test_scaled.npy': X_test_scaled,
    '../data/y_train.npy': y_train,
    '../data/y_test.npy': y_test,
}
for npy_path, array in arrays_by_path.items():
    np.save(npy_path, array)


In [None]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

In [None]:
# Baseline random forest configuration
rf_settings = {
    'n_estimators': 100,   # number of trees
    'max_depth': None,     # no depth limit on the trees
    'random_state': 42,    # fixed seed for reproducibility
    'n_jobs': -1,          # use all available cores
}
rf_model = RandomForestRegressor(**rf_settings)

In [None]:
# Train the baseline forest on the scaled training features
rf_model.fit(X_train_scaled, y_train)

In [None]:
# Predict PM2.5 for the held-out test rows
y_pred = rf_model.predict(X_test_scaled)

In [None]:
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import numpy as np

# Hold-out error of the baseline forest; RMSE is the square root of the MSE
mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
r2 = r2_score(y_test, y_pred)

print(
    f"MAE: {mae:.2f}",
    f"RMSE: {rmse:.2f}",
    f"R² Score: {r2:.3f}",
    sep="\n",
)

In [None]:
# Overlay the first 200 test observations with the matching predictions
fig, ax = plt.subplots(figsize=(12, 6))
ax.plot(y_test.values[:200], label='Actual', alpha=0.7)
ax.plot(y_pred[:200], label='Predicted', alpha=0.7)
ax.set_title("Random Forest - Actual vs Predicted PM2.5")
ax.set_xlabel("Time (hours)")
ax.set_ylabel("PM2.5 Concentration")
ax.legend()
plt.show()

In [None]:
importances = rf_model.feature_importances_
feature_names = X_train.columns

# Pair each feature with its importance, most important first
feat_imp_df = (
    pd.DataFrame({'Feature': feature_names, 'Importance': importances})
    .sort_values(by='Importance', ascending=False)
)

# Horizontal bar chart of the ranked importances
fig, ax = plt.subplots(figsize=(10, 6))
sns.barplot(x='Importance', y='Feature', data=feat_imp_df, ax=ax)
ax.set_title("Random Forest Feature Importances")
plt.show()

In [None]:
from sklearn.model_selection import GridSearchCV

In [None]:
# Search space for the random forest: 2 * 3 * 2 * 2 * 2 = 48 combinations
param_grid = dict(
    n_estimators=[100, 200],
    max_depth=[10, 20, None],
    min_samples_split=[2, 5],
    min_samples_leaf=[1, 2],
    max_features=['sqrt', 'log2'],
)


In [None]:
# Base estimator for the search; the fixed seed keeps candidates comparable
rf = RandomForestRegressor(random_state=42, n_jobs=-1)

# The rows are in chronological order (the hold-out split is unshuffled), so
# cross-validate with expanding time-ordered folds. Plain K-fold (cv=3) would
# train on later observations to score earlier ones, which makes the CV error
# look better than it is.
grid_search = GridSearchCV(
    estimator=rf,
    param_grid=param_grid,
    scoring='neg_root_mean_squared_error',  # negated RMSE: scikit-learn maximizes scores
    cv=TimeSeriesSplit(n_splits=3),  # each fold validates on data after its training window
    verbose=2,
    n_jobs=-1
)

In [None]:
# Run the hyperparameter search on the scaled training data
grid_search.fit(X_train_scaled, y_train)

In [None]:
# best_score_ holds the negated RMSE, so flip the sign before reporting it
best_params = grid_search.best_params_
best_cv_rmse = -grid_search.best_score_
print("Best Parameters:", best_params)
print("Best RMSE (CV):", best_cv_rmse)


In [None]:
# Score the best estimator from the grid search on the hold-out set
best_rf = grid_search.best_estimator_
y_pred_tuned = best_rf.predict(X_test_scaled)

mae = mean_absolute_error(y_test, y_pred_tuned)
mse_tuned = mean_squared_error(y_test, y_pred_tuned)
rmse = np.sqrt(mse_tuned)
r2 = r2_score(y_test, y_pred_tuned)

print(f"🔧 Tuned RF — MAE: {mae:.2f}, RMSE: {rmse:.2f}, R²: {r2:.3f}")

In [None]:
# Rank the tuned model's feature importances, highest first
importances = best_rf.feature_importances_
importance_table = pd.DataFrame({
    'Feature': X_train.columns,
    'Importance': importances
})
feat_imp_df = importance_table.sort_values(by='Importance', ascending=False)

fig, ax = plt.subplots(figsize=(10, 6))
sns.barplot(x='Importance', y='Feature', data=feat_imp_df, ax=ax)
ax.set_title("Tuned Random Forest Feature Importances")
plt.show()