<a href="https://colab.research.google.com/github/Rustam64/RandomForest/blob/main/XGBoost_Regression.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd

# One Excel workbook per area; each file is named "<area>.xlsx"
files = {area: f"{area}.xlsx" for area in ("central", "east", "west", "north", "south")}

# Read every workbook and tag its rows with the area it was loaded for
dfs = [
    pd.read_excel(filename).assign(area=area)
    for area, filename in files.items()
]

# Stack the per-area frames into one frame with a fresh 0..n-1 index
df = pd.concat(dfs, ignore_index=True)

# Clean 'tmin': strip stray backticks, then convert to numbers; anything that
# still cannot be parsed becomes NaN
tmin_text = df["tmin"].astype(str).str.replace("`", "", regex=False)
df["tmin"] = pd.to_numeric(tmin_text, errors="coerce")

# Drop every row that has a NaN in any column (this runs while the
# 'observation ' column is still present), then remove that column.
# Note the trailing space in the column name.
df = df.dropna().drop(columns=["observation "])

print(df.head())
print(df["area"].value_counts())

FileNotFoundError: [Errno 2] No such file or directory: 'central.xlsx'

In [None]:
from sklearn.preprocessing import LabelEncoder

# Turn the area name into an integer code stored in 'area_id', then discard
# the original text column
area_encoder = LabelEncoder()
df["area_id"] = area_encoder.fit_transform(df["area"])
df = df.drop(columns=["area"])

In [None]:
# The per-area frames were concatenated one after another, so order the
# combined frame by 'Time' before cutting it into chronological splits.
df = df.sort_values(by="Time")

n = len(df)

# Positional cut points: first 70% train, next 15% validation, last 15% test
train_end = int(0.7 * n)
val_end = int(0.85 * n)

train_df, val_df, test_df = (
    df.iloc[:train_end],
    df.iloc[train_end:val_end],
    df.iloc[val_end:],
)

In [None]:
# Column the model is trained to predict
target = "logPM2.5"

# Predictor columns. The target must NOT be listed here: feeding "logPM2.5"
# in as an input lets the model read the answer directly (target leakage),
# which makes every validation/test metric meaningless.
# NOTE(review): the 'area_id' column created earlier in the notebook is not
# used as a predictor — confirm whether that is intentional.
features = ["Rain", "tmin", "Traffic", "Temp", "RH", "tmax", "Wd", "Ws", "wc", "ws", "Lo", "La"]

# Guard against the target being re-added to the predictors by accident
assert target not in features, "target column leaked into the feature list"

# Split each partition into a predictor matrix X and a target vector y
X_train, y_train = train_df[features], train_df[target]
X_val, y_val = val_df[features], val_df[target]
X_test, y_test = test_df[features], test_df[target]

In [None]:
from xgboost import XGBRegressor
from sklearn.metrics import mean_squared_error

# Initialize model
xgb = XGBRegressor(
    n_estimators=500,      # number of boosting rounds (trees)
    learning_rate=0.05,
    max_depth=6,           # maximum depth of each tree
    subsample=0.8,
    colsample_bytree=0.8,
    random_state=42,       # fixed seed so runs are repeatable
    n_jobs=-1,
    # reg_lambda=1.0,        # L2 regularization
    # reg_alpha=0.0          # L1 regularization
)

# Fit on the training split; the validation split is passed as an evaluation
# set, with per-round logging silenced.
xgb.fit(X_train, y_train,
        eval_set=[(X_val, y_val)],
        verbose=False)

# Validate
val_preds = xgb.predict(X_val)
# mean_squared_error returns the MSE; take the square root so the number
# printed under the "RMSE" label really is a root-mean-squared error.
val_rmse = mean_squared_error(y_val, val_preds) ** 0.5
print("Validation RMSE:", val_rmse)

In [None]:
# Imported in this cell so the metrics below do not depend on earlier cells
from sklearn.metrics import mean_absolute_error, mean_squared_error

# Predict on the held-out test split
test_preds = xgb.predict(X_test)

# MAE: mean absolute difference between predictions and true values
# (previously computed with mean_squared_error, which reported MSE instead)
test_mae = mean_absolute_error(y_test, test_preds)
print("Test MAE:", test_mae)

# RMSE: square root of the mean squared error
test_rmse = mean_squared_error(y_test, test_preds) ** 0.5
print("Test RMSE:", test_rmse)

# Regressor score() on the test split, reported as R2
test_r2 = xgb.score(X_test, y_test)
print("Test R2:", test_r2)

In [None]:
from xgboost import plot_importance
# Plot the fitted model's feature importances using the "gain" importance type
plot_importance(booster=xgb, importance_type="gain")