In [11]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# xgboost
import xgboost as xgb
from xgboost import XGBClassifier
from xgboost import plot_importance

# sklearn
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.preprocessing import LabelEncoder
import pickle
from sklearn.model_selection import GridSearchCV
from imblearn.over_sampling import SMOTE

In [3]:
# Load the cleaned daily observations used for training and preview the first rows
data = pd.read_csv(filepath_or_buffer=r"..\data\part_2\daily_data_cleaned.csv")
data.head(5)

Unnamed: 0,day_id,city_id,temperature_celsius,wind_kph,wind_degree,pressure_mb,precip_mm,humidity,cloud,feels_like_celsius,visibility_km,uv_index,gust_kph,air_quality_us-epa-index,condition_text
0,D0003,0,20.0,3.6,10,1011.0,4.5,100,75,20.0,10.0,1.0,12.6,1,3
1,D0004,0,17.0,6.1,150,1018.0,0.0,88,0,17.0,10.0,1.0,11.2,1,0
2,D0007,0,21.0,4.0,310,1015.0,0.0,100,50,21.0,10.0,1.0,15.1,2,6
3,D0019,0,19.0,3.6,64,1017.0,0.0,88,0,19.0,10.0,1.0,8.3,3,0
4,D0028,1,19.0,3.6,83,1010.0,0.0,73,25,19.0,10.0,1.0,8.3,1,6


In [8]:
# Rebuild the label encoder from the pickled object so numeric predictions can be
# decoded back to text later on.
# presumably the pickle holds the original condition label values — TODO confirm.
# NOTE(review): pickle.load can execute arbitrary code; only load trusted artifacts.
# The context manager closes the file handle, which the bare open() call never did.
with open(r"..\models\label_encoder.pkl", "rb") as encoder_file:
    le = LabelEncoder().fit(pickle.load(encoder_file))

In [9]:
# Separate the feature matrix from the target column.
# The identifier columns and the target itself are excluded from the features.
non_feature_columns = ["city_id", "day_id", "condition_text"]
X = data.drop(non_feature_columns, axis=1)
y = data.loc[:, "condition_text"]

In [12]:
# Balance the classes by oversampling with SMOTE.
# A fixed random_state makes the synthetic samples (and every downstream score)
# reproducible across kernel restarts; 42 matches the seed used for the split below.
# NOTE(review): resampling happens before the train/test split, so synthetic rows
# interpolated from rows that later land in the test set can end up in the training
# set. Consider splitting first and resampling only the training portion.
smote = SMOTE(random_state=42)

X, y = smote.fit_resample(X, y)

In [13]:
# Per-class row counts after resampling (most frequent first)
y.value_counts(sort=True, ascending=False)

condition_text
3    122
0    122
6    122
2    122
1    122
4    122
7    122
5    122
8    122
Name: count, dtype: int64

In [14]:
# Hold out 20% of the rows for evaluation; the seed keeps the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [15]:
# Exhaustive hyper-parameter search over an XGBoost classifier.
# The grid has 3 values for each of 5 parameters (243 combinations), each scored
# with 5-fold cross-validation on accuracy, using all available cores.
model = XGBClassifier()

param_grid = dict(
    n_estimators=[100, 200, 300],
    max_depth=[3, 4, 5],
    learning_rate=[0.1, 0.01, 0.001],
    subsample=[0.5, 0.7, 1],
    colsample_bytree=[0.5, 0.7, 1],
)

grid_search = GridSearchCV(
    estimator=model,
    param_grid=param_grid,
    scoring="accuracy",
    n_jobs=-1,
    cv=5,
)
grid_search.fit(X_train, y_train)

# Keep the estimator refitted with the best-scoring parameter combination
best_model = grid_search.best_estimator_

best_model


In [16]:
# Score the tuned model on the held-out split
y_pred = best_model.predict(X_test)

# Compute all three summaries first ...
accuracy = accuracy_score(y_test, y_pred)
conf_matrix = confusion_matrix(y_test, y_pred)
report = classification_report(y_test, y_pred)

# ... then report them: overall accuracy, per-class confusion counts, per-class metrics
print(f"Accuracy: {accuracy}")
print(f"Confusion Matrix: \n{conf_matrix}")
print(f"Classification Report: \n{report}")

Accuracy: 0.8727272727272727
Confusion Matrix: 
[[31  0  0  0  1  0  0  0  0]
 [ 0 25  0  0  0  0  0  0  1]
 [ 0  1 19  1  0  1  3  1  0]
 [ 0  0  0 16  0  0  2  0  0]
 [ 4  0  0  0 24  0  2  0  0]
 [ 0  0  1  0  0 20  1  0  0]
 [ 0  0  2  1  0  0 17  1  0]
 [ 0  0  1  0  0  0  0 22  0]
 [ 0  1  1  1  0  0  1  0 18]]
Classification Report: 
              precision    recall  f1-score   support

           0       0.89      0.97      0.93        32
           1       0.93      0.96      0.94        26
           2       0.79      0.73      0.76        26
           3       0.84      0.89      0.86        18
           4       0.96      0.80      0.87        30
           5       0.95      0.91      0.93        22
           6       0.65      0.81      0.72        21
           7       0.92      0.96      0.94        23
           8       0.95      0.82      0.88        22

    accuracy                           0.87       220
   macro avg       0.88      0.87      0.87       220
weighte

In [17]:
# Accuracy on the rows the model was fitted on, for comparison with the test score
y_train_pred = best_model.predict(X_train)
train_accuracy = accuracy_score(y_true=y_train, y_pred=y_train_pred)
print(f"Training Accuracy: {train_accuracy}")

Training Accuracy: 1.0


In [18]:
# Load the unlabeled rows that need a predicted condition
predictions = pd.read_csv(filepath_or_buffer=r"..\data\part_2\predict_data.csv")

In [21]:
# Build the feature matrix for prediction by removing the identifier columns.
X_pred = predictions.drop(columns=["day_id", "city_id"])
# A later cell writes a text "condition_text" column into `predictions`; drop it here
# if present so re-running this cell never feeds the label back in as a feature.
X_pred = X_pred.drop(columns=["condition_text"], errors="ignore")

In [22]:
# Predict the encoded class for every unlabeled row and decode it back to label text
y_pred = le.inverse_transform(best_model.predict(X_pred))

# Store the decoded labels alongside the input rows
predictions["condition_text"] = y_pred

predictions

Unnamed: 0,day_id,city_id,temperature_celsius,wind_kph,wind_degree,pressure_mb,precip_mm,humidity,cloud,feels_like_celsius,visibility_km,uv_index,gust_kph,air_quality_us-epa-index,condition_text
0,D0001,0,27.0,6.1,210,1006.0,0.0,54,75,28.0,10.0,6.0,11.9,2,Partly Cloudy
1,D0002,0,22.0,6.1,170,1006.0,0.0,73,75,24.5,10.0,1.0,23.4,1,Partly Cloudy
2,D0005,0,18.0,3.6,92,1019.0,0.0,94,0,18.0,10.0,1.0,9.0,1,Clear and Sunny
3,D0006,0,20.0,3.6,96,1019.0,0.0,88,0,20.0,10.0,1.0,11.2,1,Clear and Sunny
4,D0008,0,21.0,20.2,330,1011.0,0.0,53,75,21.0,10.0,1.0,17.3,1,Partly Cloudy
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2409,D2888,111,19.5,16.6,113,1015.0,0.0,38,0,19.5,10.0,1.0,26.7,1,Clear and Sunny
2410,D2890,111,17.4,13.0,51,1017.0,0.0,49,0,17.4,10.0,1.0,22.2,1,Clear and Sunny
2411,D2891,111,19.2,11.5,46,1016.0,0.0,34,0,19.2,10.0,1.0,21.3,2,Clear and Sunny
2412,D2892,111,19.2,14.4,76,1017.0,0.0,45,2,19.2,10.0,1.0,24.9,2,Clear and Sunny


In [23]:
# Preview only the identifier and the predicted label
predictions.loc[:, ["day_id", "condition_text"]]

Unnamed: 0,day_id,condition_text
0,D0001,Partly Cloudy
1,D0002,Partly Cloudy
2,D0005,Clear and Sunny
3,D0006,Clear and Sunny
4,D0008,Partly Cloudy
...,...,...
2409,D2888,Clear and Sunny
2410,D2890,Clear and Sunny
2411,D2891,Clear and Sunny
2412,D2892,Clear and Sunny


In [24]:
# Load the submission template; rows without a label show up as missing values
submission = pd.read_csv(filepath_or_buffer=r"..\data\part_2\submission.csv")
submission

Unnamed: 0,day_id,condition_text
0,D0001,
1,D0002,
2,D0003,Light Rain with Thunder
3,D0004,Clear and Sunny
4,D0005,
...,...,...
2888,D2889,Clear and Sunny
2889,D2890,
2890,D2891,
2891,D2892,


In [25]:
# Left-join the predicted labels onto the submission template by day_id.
# Both frames carry a "condition_text" column, so pandas suffixes them:
# "_x" for the template's value and "_y" for the prediction.
submission = pd.merge(
    submission,
    predictions[["day_id", "condition_text"]],
    how="left",
    on="day_id",
)

In [26]:
# Keep the template's label where one exists (condition_text_x) and fall back to
# the model's prediction (condition_text_y) where it is missing.
known_labels = submission["condition_text_x"]
predicted_labels = submission["condition_text_y"]
submission["condition_text"] = known_labels.fillna(value=predicted_labels)
submission

Unnamed: 0,day_id,condition_text_x,condition_text_y,condition_text
0,D0001,,Partly Cloudy,Partly Cloudy
1,D0002,,Partly Cloudy,Partly Cloudy
2,D0003,Light Rain with Thunder,,Light Rain with Thunder
3,D0004,Clear and Sunny,,Clear and Sunny
4,D0005,,Clear and Sunny,Clear and Sunny
...,...,...,...,...
2888,D2889,Clear and Sunny,,Clear and Sunny
2889,D2890,,Clear and Sunny,Clear and Sunny
2890,D2891,,Clear and Sunny,Clear and Sunny
2891,D2892,,Clear and Sunny,Clear and Sunny


In [27]:
# Remove the two helper columns left over from the merge.
# Reassigning instead of mutating in place, together with errors="ignore", keeps
# this cell safe to re-run: the in-place version raised KeyError on a second run
# because the columns were already gone.
submission = submission.drop(
    columns=["condition_text_x", "condition_text_y"],
    errors="ignore",
)
submission

Unnamed: 0,day_id,condition_text
0,D0001,Partly Cloudy
1,D0002,Partly Cloudy
2,D0003,Light Rain with Thunder
3,D0004,Clear and Sunny
4,D0005,Clear and Sunny
...,...,...
2888,D2889,Clear and Sunny
2889,D2890,Clear and Sunny
2890,D2891,Clear and Sunny
2891,D2892,Clear and Sunny


In [28]:
# Write the completed submission to disk without the DataFrame index
submission.to_csv(path_or_buf=r"..\data\part_2\submission_xg.csv", index=False)