# Import the Dataset

In [None]:
import pandas as pd

# Read the raw daily weather table from its comma-separated source file.
raw_csv_path = '../Dataset/daily_data.csv'
daily_data = pd.read_csv(raw_csv_path, sep=',')
daily_data

In [None]:
# Count the distinct values in every column.
daily_data.nunique()

# Check for Outliers

In [None]:
# Tally how many rows each city contributes.
daily_data['city_id'].value_counts()  # Author's finding on inspection: no outliers

In [None]:
# Turn the 12-hour clock strings (e.g. "06:15 AM") in 'sunrise' and 'sunset'
# into the number of minutes elapsed since midnight.
for clock_col in ('sunrise', 'sunset'):
    parsed_clock = pd.to_datetime(daily_data[clock_col], format='%I:%M %p')
    daily_data[clock_col] = parsed_clock.dt.hour * 60 + parsed_clock.dt.minute
daily_data

In [None]:
# One-hot encode 'city_id': the column is replaced by one 'city_id_<value>'
# indicator column per distinct city.
daily_data = pd.get_dummies(daily_data, columns=['city_id'])
daily_data

In [None]:
# Inspect the dtype of one of the generated indicator columns.
daily_data['city_id_C109'].dtypes

In [None]:
# Cast every boolean column to integers so True/False become 1/0.
bool_columns = daily_data.select_dtypes(include='bool').columns
daily_data = daily_data.astype(dict.fromkeys(bool_columns, 'int'))
daily_data.dtypes

In [None]:
# Display the frame as it stands after the preprocessing cells.
daily_data

In [None]:
# Persist the preprocessed frame without the index column.
# NOTE: this exact file name is read back by the "Predictions" cells, so keep the two in sync.
daily_data.to_csv('daily_data.csv_processed.csv', index=False)

# Create the training set

In [None]:
# The training set is the subset of rows whose weather label is present.
# dropna() returns a new frame, so daily_data itself is left unchanged.
df = daily_data.dropna(subset=['condition_text'])
df

# Encode The labels

In [None]:
# Weather labels in code order: the position in this list is the integer code.
weather_labels = [
    'Clear and Sunny',
    'Partly Cloudy',
    'Light Precipitation',
    'Cloudy and Overcast',
    'Mist or Fog',
    'Rain Showers',
    'Light Rain with Thunder',
    'Thunderstorms',
    'Moderate to Heavy Rain',
]
# Lookup table from label string to integer class code.
weather_mapping = {label: code for code, label in enumerate(weather_labels)}

In [None]:
# Replace each label string with its integer code; any label that is not a key
# of weather_mapping becomes NaN.
df['condition_text'] = df['condition_text'].map(weather_mapping)
df

In [None]:
# NOTE: the snippet below is wrapped in a string literal, so it is never executed.
# It refers to a 'weather_encoded' column that no visible cell creates.
'''
# Step 3: Decode the integers back into weather conditions
reverse_weather_mapping = {v: k for k, v in weather_mapping.items()}
df['weather_decoded'] = df['weather_encoded'].map(reverse_weather_mapping)
print("Decoded DataFrame:\n", df)
'''

In [None]:
# Write the labelled, integer-encoded training rows to disk (index omitted);
# the feature-engineering section reloads this file.
df.to_csv('weather_data_processed.csv', index=False)

# Feature Engineering (start here when reusing the saved CSV)

In [None]:
import pandas as pd

# Reload the encoded training rows that an earlier cell saved to disk.
df = pd.read_csv('weather_data_processed.csv', sep=',')
df

Unnamed: 0,day_id,temperature_celsius,condition_text,wind_kph,wind_degree,pressure_mb,precip_mm,humidity,cloud,feels_like_celsius,...,city_id_C103,city_id_C104,city_id_C105,city_id_C106,city_id_C107,city_id_C108,city_id_C109,city_id_C110,city_id_C111,city_id_C112
0,D0003,20.0,6,3.6,10,1011.0,4.50,100,75,20.0,...,0,0,0,0,0,0,0,0,0,0
1,D0004,17.0,0,6.1,150,1018.0,0.00,88,0,17.0,...,0,0,0,0,0,0,0,0,0,0
2,D0007,21.0,1,4.0,310,1015.0,0.00,100,50,21.0,...,0,0,0,0,0,0,0,0,0,0
3,D0019,19.0,0,3.6,64,1017.0,0.00,88,0,19.0,...,0,0,0,0,0,0,0,0,0,0
4,D0028,19.0,1,3.6,83,1010.0,0.00,73,25,19.0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
474,D2839,27.0,1,19.1,90,1014.0,0.01,70,25,29.8,...,0,0,0,0,0,0,0,1,0,0
475,D2864,21.4,2,3.6,178,1012.0,0.00,89,75,21.4,...,0,0,0,0,0,0,0,0,1,0
476,D2874,16.9,0,16.6,68,1018.0,0.00,44,2,16.9,...,0,0,0,0,0,0,0,0,0,1
477,D2887,19.5,0,5.4,27,1014.0,0.00,34,0,19.4,...,0,0,0,0,0,0,0,0,0,1


In [None]:
# Remove the row identifier so it does not end up among the model features.
df = df.drop(columns=['day_id'])

In [None]:
# List the dtype of every remaining column.
df.dtypes

temperature_celsius    float64
condition_text           int64
wind_kph               float64
wind_degree              int64
pressure_mb            float64
                        ...   
city_id_C108             int64
city_id_C109             int64
city_id_C110             int64
city_id_C111             int64
city_id_C112             int64
Length: 127, dtype: object

## Baseline Changes

In [None]:
# Baseline feature set: discard 'wind_degree' and add 'temp_diff', the gap
# between the felt and the measured temperature.
df = (
    df.drop(columns=['wind_degree'])
      .assign(temp_diff=lambda frame: frame['feels_like_celsius'] - frame['temperature_celsius'])
)
# (Dropping 'uv_index' as well was tried and is left disabled.)
df

Unnamed: 0,temperature_celsius,condition_text,wind_kph,pressure_mb,precip_mm,humidity,cloud,feels_like_celsius,visibility_km,uv_index,...,city_id_C104,city_id_C105,city_id_C106,city_id_C107,city_id_C108,city_id_C109,city_id_C110,city_id_C111,city_id_C112,temp_diff
0,20.0,6,3.6,1011.0,4.50,100,75,20.0,10.0,1.0,...,0,0,0,0,0,0,0,0,0,0.0
1,17.0,0,6.1,1018.0,0.00,88,0,17.0,10.0,1.0,...,0,0,0,0,0,0,0,0,0,0.0
2,21.0,1,4.0,1015.0,0.00,100,50,21.0,10.0,1.0,...,0,0,0,0,0,0,0,0,0,0.0
3,19.0,0,3.6,1017.0,0.00,88,0,19.0,10.0,1.0,...,0,0,0,0,0,0,0,0,0,0.0
4,19.0,1,3.6,1010.0,0.00,73,25,19.0,10.0,1.0,...,0,0,0,0,0,0,0,0,0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
474,27.0,1,19.1,1014.0,0.01,70,25,29.8,10.0,6.0,...,0,0,0,0,0,0,1,0,0,2.8
475,21.4,2,3.6,1012.0,0.00,89,75,21.4,10.0,6.0,...,0,0,0,0,0,0,0,1,0,0.0
476,16.9,0,16.6,1018.0,0.00,44,2,16.9,10.0,1.0,...,0,0,0,0,0,0,0,0,1,0.0
477,19.5,0,5.4,1014.0,0.00,34,0,19.4,10.0,1.0,...,0,0,0,0,0,0,0,0,1,-0.1


# Imports

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
import scipy
import re
from scipy import stats
from scipy.stats import ttest_ind
from scipy.stats import pearsonr
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.ensemble import GradientBoostingClassifier
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.optimizers import Adam
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.datasets import make_classification
from sklearn.preprocessing import LabelBinarizer
import numpy as np


# XGBoost Model

In [None]:
import xgboost as xgb
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import accuracy_score, classification_report


In [None]:
# Work on a copy so the XGBoost experiments leave df untouched.
df_xgb = df.copy()

In [None]:
# Separate the label column (y) from the feature columns (X).
target_column = 'condition_text'
y = df_xgb[target_column].copy()
X = df_xgb.drop(columns=[target_column]).copy()

In [None]:
# Hold out 20% of the rows as a test set; random_state fixes the shuffle so the split is repeatable.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=2)

In [None]:
# Hyper-parameter grid explored by the XGBoost grid search.
# Every list holds distinct values only: the original 'learning_rate' list
# contained 0.001 twice, which made GridSearchCV fit a third of its candidates
# redundantly without widening the search.
param_grid = {
    'n_estimators': [150, 200, 250],
    'max_depth': [2, 4, 6, 8],
    'learning_rate': [0.001, 0.005],
    'subsample': [0.8, 0.9, 0.7],
    'colsample_bytree': [0.8, 0.7, 0.6],
    'gamma': [0],
}


In [None]:

# Tune the XGBoost classifier by exhaustive grid search with 3-fold
# cross-validation, scored on accuracy and run on all available cores.
xgb_clf = xgb.XGBClassifier(use_label_encoder=False, eval_metric='mlogloss')

grid_search = GridSearchCV(estimator=xgb_clf, param_grid=param_grid,
                           scoring='accuracy', cv=3, verbose=1, n_jobs=-1)

# Fit on the training split only. Fitting on the full X, y would let the
# held-out rows influence both the hyper-parameter choice and the refit
# best_estimator_, so any score computed on X_test would be optimistic.
# If a final model trained on every labelled row is wanted, refit the chosen
# configuration on X, y after the evaluation.
grid_search.fit(X_train, y_train)


Fitting 3 folds for each of 324 candidates, totalling 972 fits


In [None]:
# Keep the refit winner of the grid search for later predictions.
best_model = grid_search.best_estimator_
print(f'Best parameters found: {grid_search.best_params_}')

# Score the winner on the held-out split. This is only an unbiased estimate
# when the grid search was fitted without the test rows.
y_pred = best_model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred)

print(f'Accuracy: {accuracy}')
print('Classification Report:', report, sep='\n')

Best parameters found: {'colsample_bytree': 0.8, 'gamma': 0, 'learning_rate': 0.001, 'max_depth': 4, 'n_estimators': 200, 'subsample': 0.9}
Accuracy: 0.8020833333333334
Classification Report:
              precision    recall  f1-score   support

           0       0.94      1.00      0.97        29
           1       0.74      0.96      0.84        27
           2       0.62      0.42      0.50        12
           3       0.67      0.89      0.76         9
           4       1.00      0.70      0.82        10
           5       0.00      0.00      0.00         1
           7       0.00      0.00      0.00         5
           8       1.00      0.67      0.80         3

    accuracy                           0.80        96
   macro avg       0.62      0.58      0.59        96
weighted avg       0.77      0.80      0.77        96



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


## Predictions

In [None]:
import pandas as pd

# Reload the full preprocessed table (labelled and unlabelled rows) saved by the preprocessing section.
df = pd.read_csv('daily_data.csv_processed.csv', sep = ',')
df

Unnamed: 0,day_id,temperature_celsius,condition_text,wind_kph,wind_degree,pressure_mb,precip_mm,humidity,cloud,feels_like_celsius,...,city_id_C103,city_id_C104,city_id_C105,city_id_C106,city_id_C107,city_id_C108,city_id_C109,city_id_C110,city_id_C111,city_id_C112
0,D0001,27.0,,6.1,210,1006.0,0.0,54,75,28.0,...,0,0,0,0,0,0,0,0,0,0
1,D0002,22.0,,6.1,170,1006.0,0.0,73,75,24.5,...,0,0,0,0,0,0,0,0,0,0
2,D0003,20.0,Light Rain with Thunder,3.6,10,1011.0,4.5,100,75,20.0,...,0,0,0,0,0,0,0,0,0,0
3,D0004,17.0,Clear and Sunny,6.1,150,1018.0,0.0,88,0,17.0,...,0,0,0,0,0,0,0,0,0,0
4,D0005,18.0,,3.6,92,1019.0,0.0,94,0,18.0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2888,D2889,17.1,Clear and Sunny,13.3,61,1017.0,0.0,57,3,17.1,...,0,0,0,0,0,0,0,0,0,1
2889,D2890,17.4,,13.0,51,1017.0,0.0,49,0,17.4,...,0,0,0,0,0,0,0,0,0,1
2890,D2891,19.2,,11.5,46,1016.0,0.0,34,0,19.2,...,0,0,0,0,0,0,0,0,0,1
2891,D2892,19.2,,14.4,76,1017.0,0.0,45,2,19.2,...,0,0,0,0,0,0,0,0,0,1


In [None]:
# Apply the same baseline feature changes used for training:
# discard 'wind_degree' and add 'temp_diff' (felt minus measured temperature).
df = (
    df.drop(columns=['wind_degree'])
      .assign(temp_diff=lambda frame: frame['feels_like_celsius'] - frame['temperature_celsius'])
)
df

Unnamed: 0,day_id,temperature_celsius,condition_text,wind_kph,pressure_mb,precip_mm,humidity,cloud,feels_like_celsius,visibility_km,...,city_id_C104,city_id_C105,city_id_C106,city_id_C107,city_id_C108,city_id_C109,city_id_C110,city_id_C111,city_id_C112,temp_diff
0,D0001,27.0,,6.1,1006.0,0.0,54,75,28.0,10.0,...,0,0,0,0,0,0,0,0,0,1.0
1,D0002,22.0,,6.1,1006.0,0.0,73,75,24.5,10.0,...,0,0,0,0,0,0,0,0,0,2.5
2,D0003,20.0,Light Rain with Thunder,3.6,1011.0,4.5,100,75,20.0,10.0,...,0,0,0,0,0,0,0,0,0,0.0
3,D0004,17.0,Clear and Sunny,6.1,1018.0,0.0,88,0,17.0,10.0,...,0,0,0,0,0,0,0,0,0,0.0
4,D0005,18.0,,3.6,1019.0,0.0,94,0,18.0,10.0,...,0,0,0,0,0,0,0,0,0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2888,D2889,17.1,Clear and Sunny,13.3,1017.0,0.0,57,3,17.1,10.0,...,0,0,0,0,0,0,0,0,1,0.0
2889,D2890,17.4,,13.0,1017.0,0.0,49,0,17.4,10.0,...,0,0,0,0,0,0,0,0,1,0.0
2890,D2891,19.2,,11.5,1016.0,0.0,34,0,19.2,10.0,...,0,0,0,0,0,0,0,0,1,0.0
2891,D2892,19.2,,14.4,1017.0,0.0,45,2,19.2,10.0,...,0,0,0,0,0,0,0,0,1,0.0


In [None]:
# Weather labels in code order (same table as the one used to encode the
# training set): the position in the list is the integer code.
weather_labels = [
    'Clear and Sunny',
    'Partly Cloudy',
    'Light Precipitation',
    'Cloudy and Overcast',
    'Mist or Fog',
    'Rain Showers',
    'Light Rain with Thunder',
    'Thunderstorms',
    'Moderate to Heavy Rain',
]
weather_mapping = {label: code for code, label in enumerate(weather_labels)}
# Rows without a label stay NaN after the mapping.
df['condition_text'] = df['condition_text'].map(weather_mapping)
df

Unnamed: 0,day_id,temperature_celsius,condition_text,wind_kph,pressure_mb,precip_mm,humidity,cloud,feels_like_celsius,visibility_km,...,city_id_C104,city_id_C105,city_id_C106,city_id_C107,city_id_C108,city_id_C109,city_id_C110,city_id_C111,city_id_C112,temp_diff
0,D0001,27.0,,6.1,1006.0,0.0,54,75,28.0,10.0,...,0,0,0,0,0,0,0,0,0,1.0
1,D0002,22.0,,6.1,1006.0,0.0,73,75,24.5,10.0,...,0,0,0,0,0,0,0,0,0,2.5
2,D0003,20.0,6.0,3.6,1011.0,4.5,100,75,20.0,10.0,...,0,0,0,0,0,0,0,0,0,0.0
3,D0004,17.0,0.0,6.1,1018.0,0.0,88,0,17.0,10.0,...,0,0,0,0,0,0,0,0,0,0.0
4,D0005,18.0,,3.6,1019.0,0.0,94,0,18.0,10.0,...,0,0,0,0,0,0,0,0,0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2888,D2889,17.1,0.0,13.3,1017.0,0.0,57,3,17.1,10.0,...,0,0,0,0,0,0,0,0,1,0.0
2889,D2890,17.4,,13.0,1017.0,0.0,49,0,17.4,10.0,...,0,0,0,0,0,0,0,0,1,0.0
2890,D2891,19.2,,11.5,1016.0,0.0,34,0,19.2,10.0,...,0,0,0,0,0,0,0,0,1,0.0
2891,D2892,19.2,,14.4,1017.0,0.0,45,2,19.2,10.0,...,0,0,0,0,0,0,0,0,1,0.0


In [None]:
# Use pandas' nullable integer dtype so the codes are integers while missing labels remain <NA>.
df['condition_text']=df['condition_text'].astype('Int64')
df

NameError: name 'df' is not defined

In [None]:
# Rows whose label is still missing are the ones to predict.
missing_label = df['condition_text'].isna()

# Features for those rows: everything except the target and the row identifier.
X_pred = df[missing_label].drop(columns=['condition_text', 'day_id'])
predictions = best_model.predict(X_pred)

# Write the predicted codes into the gaps.
df.loc[missing_label, 'condition_text'] = predictions
df

Unnamed: 0,day_id,temperature_celsius,condition_text,wind_kph,pressure_mb,precip_mm,humidity,cloud,feels_like_celsius,visibility_km,...,city_id_C104,city_id_C105,city_id_C106,city_id_C107,city_id_C108,city_id_C109,city_id_C110,city_id_C111,city_id_C112,temp_diff
0,D0001,27.0,1,6.1,1006.0,0.0,54,75,28.0,10.0,...,0,0,0,0,0,0,0,0,0,1.0
1,D0002,22.0,1,6.1,1006.0,0.0,73,75,24.5,10.0,...,0,0,0,0,0,0,0,0,0,2.5
2,D0003,20.0,6,3.6,1011.0,4.5,100,75,20.0,10.0,...,0,0,0,0,0,0,0,0,0,0.0
3,D0004,17.0,0,6.1,1018.0,0.0,88,0,17.0,10.0,...,0,0,0,0,0,0,0,0,0,0.0
4,D0005,18.0,0,3.6,1019.0,0.0,94,0,18.0,10.0,...,0,0,0,0,0,0,0,0,0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2888,D2889,17.1,0,13.3,1017.0,0.0,57,3,17.1,10.0,...,0,0,0,0,0,0,0,0,1,0.0
2889,D2890,17.4,0,13.0,1017.0,0.0,49,0,17.4,10.0,...,0,0,0,0,0,0,0,0,1,0.0
2890,D2891,19.2,0,11.5,1016.0,0.0,34,0,19.2,10.0,...,0,0,0,0,0,0,0,0,1,0.0
2891,D2892,19.2,0,14.4,1017.0,0.0,45,2,19.2,10.0,...,0,0,0,0,0,0,0,0,1,0.0


In [None]:
# Keep only the identifier and the (now complete) label column.
# The explicit copy makes submission an independent frame, so assigning to its
# columns later does not raise pandas' SettingWithCopyWarning.
submission = df[['day_id','condition_text']].copy()
submission

Unnamed: 0,day_id,condition_text
0,D0001,1
1,D0002,1
2,D0003,6
3,D0004,0
4,D0005,0
...,...,...
2888,D2889,0
2889,D2890,0
2890,D2891,0
2891,D2892,0


In [None]:
# Invert the label table so integer codes can be turned back into label strings.
reverse_weather_mapping = {v: k for k, v in weather_mapping.items()}
# assign() builds a new frame instead of writing into one that was selected out
# of df, which avoids pandas' SettingWithCopyWarning.
submission = submission.assign(
    condition_text=submission['condition_text'].map(reverse_weather_mapping)
)
submission.to_csv('submission_XGB_full_set.csv', index=False)
submission

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  submission['condition_text'] = submission['condition_text'].map(reverse_weather_mapping)


Unnamed: 0,day_id,condition_text
0,D0001,Partly Cloudy
1,D0002,Partly Cloudy
2,D0003,Light Rain with Thunder
3,D0004,Clear and Sunny
4,D0005,Clear and Sunny
...,...,...
2888,D2889,Clear and Sunny
2889,D2890,Clear and Sunny
2890,D2891,Clear and Sunny
2891,D2892,Clear and Sunny


# LGBM

In [None]:
import lightgbm as lgb
from sklearn.model_selection import GridSearchCV
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

In [None]:
# Rebuild the labelled training frame from disk. At this point in the notebook
# `df` holds the full prediction table (it still contains 'day_id' and its
# labels were partly filled in by the XGBoost model), so copying it would train
# LightGBM on another model's predictions and on a non-numeric identifier.
df_lgbm = (
    pd.read_csv('weather_data_processed.csv', sep=',')
      .drop(columns=['day_id', 'wind_degree'])
      .assign(temp_diff=lambda frame: frame['feels_like_celsius'] - frame['temperature_celsius'])
)
# Separate the label column from the features and hold out 20% for testing.
X = df_lgbm.drop(columns=['condition_text']).copy()
y = df_lgbm['condition_text'].copy()
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=2)


In [None]:
# LightGBM classifier to tune (the original cell built a second, unused
# estimator named lgb_model; only one is needed).
model = lgb.LGBMClassifier()

# Hyper-parameter grid explored by the search.
param_grid = {
    'boosting_type': ['gbdt', 'dart', 'goss'],
    'num_leaves': [31, 50, 70],
    'learning_rate': [0.1, 0.01, 0.001],
    'n_estimators': [100, 200, 500],
    'max_depth': [-1, 10, 20]
}

# Exhaustive search with 3-fold cross-validation, scored on accuracy.
grid_search = GridSearchCV(estimator=model, param_grid=param_grid,
                           scoring='accuracy', cv=3, verbose=1)

# Fit on the training split only. Fitting on the full X, y would let the
# held-out rows influence both the hyper-parameter choice and the refit model,
# so any score computed on X_test would be optimistic.
grid_search.fit(X_train, y_train)



In [None]:
# Report the winning configuration and its mean cross-validated accuracy.
print(f"Best parameters found: {grid_search.best_params_}")
print(f"Best cross-validation accuracy: {grid_search.best_score_}")

# Score the refit winner on the held-out split. This is only an unbiased
# estimate when the grid search was fitted without the test rows.
y_pred = grid_search.predict(X_test)
test_accuracy = accuracy_score(y_test, y_pred)
print(f"Test set accuracy: {test_accuracy}")

## Predictions


In [None]:
import pandas as pd

# Reload the full preprocessed table (labelled and unlabelled rows) saved by the preprocessing section.
df = pd.read_csv('daily_data.csv_processed.csv', sep = ',')
df

In [None]:
# Apply the same baseline feature changes used for training:
# discard 'wind_degree' and add 'temp_diff' (felt minus measured temperature).
df = (
    df.drop(columns=['wind_degree'])
      .assign(temp_diff=lambda frame: frame['feels_like_celsius'] - frame['temperature_celsius'])
)
df

In [None]:
# Weather labels in code order (same table as the one used to encode the
# training set): the position in the list is the integer code.
weather_labels = [
    'Clear and Sunny',
    'Partly Cloudy',
    'Light Precipitation',
    'Cloudy and Overcast',
    'Mist or Fog',
    'Rain Showers',
    'Light Rain with Thunder',
    'Thunderstorms',
    'Moderate to Heavy Rain',
]
weather_mapping = {label: code for code, label in enumerate(weather_labels)}
# Rows without a label stay NaN after the mapping.
df['condition_text'] = df['condition_text'].map(weather_mapping)
df

In [None]:
# Use pandas' nullable integer dtype so the codes are integers while missing labels remain <NA>.
df['condition_text']=df['condition_text'].astype('Int64')
df

In [None]:
# Rows whose label is still missing are the ones to predict.
missing_label = df['condition_text'].isna()

# Features for those rows: everything except the target and the row identifier.
X_pred = df[missing_label].drop(columns=['condition_text', 'day_id'])
predictions = grid_search.predict(X_pred)

# Write the predicted codes into the gaps.
df.loc[missing_label, 'condition_text'] = predictions
df

In [None]:
# Keep only the identifier and the (now complete) label column.
# The explicit copy makes submission an independent frame, so assigning to its
# columns later does not raise pandas' SettingWithCopyWarning.
submission = df[['day_id','condition_text']].copy()
submission

In [None]:
# Invert the label table so integer codes can be turned back into label strings.
reverse_weather_mapping = {v: k for k, v in weather_mapping.items()}
# assign() builds a new frame instead of writing into one that was selected out
# of df, which avoids pandas' SettingWithCopyWarning.
submission = submission.assign(
    condition_text=submission['condition_text'].map(reverse_weather_mapping)
)
submission.to_csv('submission_lgbm_1.csv', index=False)
submission