In [None]:
from datetime import datetime
from pathlib import Path

import pandas as pd
import numpy as np

import mafese
from mafese.wrapper.mha import MhaSelector

from sklearn.svm import SVR
from sklearn.preprocessing import MultiLabelBinarizer, StandardScaler
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import GradientBoostingRegressor, RandomForestRegressor
from sklearn.metrics import root_mean_squared_error, mean_absolute_error, r2_score
from sklearn.model_selection import cross_val_score, KFold

import xgboost as xgb
from xgboost import XGBRegressor

from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, Input
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping

import matplotlib.pyplot as plt
import altair as alt

import shap

# Read in Data

In [None]:
# Load the combined dataset; the path is relative to the notebook's working directory.
data = pd.read_csv("../data/combined_data.csv")

In [None]:
# Preview 10 randomly sampled rows (no seed is passed, so the rows differ between runs).
data.sample(10)

## Format the Languages to be a list

In [None]:
# Convert the bracketed, comma-separated "Language" strings into Python lists:
# strip one leading "[" and one trailing "]", split on ",", then trim whitespace and
# quote characters from every entry.
language_without_brackets = data["Language"].str.removeprefix("[").str.removesuffix("]")
language_tokens = language_without_brackets.str.split(",")
data["Language"] = language_tokens.apply(
    lambda tokens: [token.strip().strip("'\" ") for token in tokens]
)



## Encode the Languages as Binaries

In [None]:
# Fit a multi-label binarizer on the per-row language lists and encode them as an
# indicator matrix (one column per distinct language).
mlb = MultiLabelBinarizer()
language_encoded = mlb.fit_transform(data['Language'])

In [None]:
# Inspect the distinct language labels learned by the binarizer.
mlb.classes_

In [None]:
# Wrap the indicator matrix in a DataFrame whose columns are the learned language labels.
# No index is passed, so the frame gets a default integer index; the later column-wise
# concat aligns on index and therefore relies on `data` still having its default index.
language_df = pd.DataFrame(language_encoded, columns=mlb.classes_)

In [None]:
# Append the binary language columns, then discard the original list-valued column.
data_with_language_flags = pd.concat([data, language_df], axis=1)
data = data_with_language_flags.drop(columns=['Language'])


In [None]:
# Preview 10 random rows after the language columns were attached (unseeded sample).
data.sample(10)

## Transform for ML

In [None]:
# Text/identifier columns that are excluded from the numeric imputation below.
categorical_columns = ['Country Name', "Country Code", "Song", "Artist"]
# Every other column of `data` is treated as numeric.
numeric_cols = data.columns.difference(categorical_columns)

In [None]:
# Fill missing values in the numeric columns with each column's own median.
numeric_medians = data[numeric_cols].median()
data[numeric_cols] = data[numeric_cols].fillna(numeric_medians)


In [None]:
# Columns that contain nothing but nulls carry no information.
empty_columns = data.columns[data.isnull().all()]

# Columns removed explicitly before modelling (identifiers, other result columns and
# leftover index columns).
unused_columns = [
    "Song",
    "Country Code",
    "Year",
    "Grand Final Points",
    "Semifinal",
    "Artist",
    "Semifinal Points",
    "Semifinal Place",
    "index",
    "level_0",
    "Unnamed: 0",
]

data = data.drop(columns=empty_columns)
data = data.drop(columns=unused_columns)
data.sample(10)

In [None]:
# selected_cols = gb_importance_df["Feature"].head(50).values

In [None]:
# One-hot encode "Country Name"; drop_first=True omits the indicator of the first category.
data_encoded = pd.get_dummies(data, columns=["Country Name"], drop_first=True)


In [None]:
# target = data_encoded["Grand Final Place"].values
# non_targets = data_encoded[selected_cols].values


In [None]:
# Separate the regression target from the feature matrix (both as NumPy arrays).
target_column = "Grand Final Place"
target = data_encoded[target_column].values
non_targets = data_encoded.drop(columns=[target_column]).values


## Transform using Mafese

In [None]:
# Wrap the feature matrix and the target in a mafese Data container.
# NOTE(review): this rebinds `data` from the pandas DataFrame used above to a mafese.Data
# object, so the DataFrame cells above cannot be re-run without reloading the CSV first.
data = mafese.Data(non_targets, target)


In [None]:
# Hold out 20% of the rows for testing; the following cells read the resulting
# X_train / X_test / y_train / y_test attributes from `data`.
data.split_train_test(test_size=0.2, inplace=True)

In [None]:
# DataFrame copies of the splits taken before scaling is applied.
# NOTE(review): neither frame is referenced again in the visible cells.
X_train_df = pd.DataFrame(data.X_train)
X_test_df = pd.DataFrame(data.X_test)

In [None]:
# Standardise the features: the scaler learns its statistics from the training split only
# and the same fitted scaler is then applied to both splits.
standard_scaler = StandardScaler().fit(data.X_train)

data.X_train = standard_scaler.transform(data.X_train)
data.X_test = standard_scaler.transform(data.X_test)


In [None]:
# "Grand Final Place" is a numeric regression target, so it is kept on its original scale
# as a flat float array rather than being label-encoded.
# Label encoding is a classification step: it replaces every value by a class index learned
# from the training targets. That would make the reported RMSE/MAE no longer be expressed in
# places, and a test value that never occurs in the training split could not be encoded.
data.y_train = np.asarray(data.y_train, dtype=float).ravel()
data.y_test = np.asarray(data.y_test, dtype=float).ravel()

## Feature Selection

In [None]:
# Metaheuristic wrapper feature selector configured for a regression problem; every other
# option is left at the library default.
feat_selector = MhaSelector(problem="regression")

In [None]:
# Passed to the selector as `fit_weights` and recorded in the run report.
# NOTE(review): presumably the trade-off between model score and number of selected
# features - confirm against the mafese documentation.
weights = [0.9, 0.1]

In [None]:
# Run the feature-selection search on the scaled training split (verbose progress output).
feat_selector.fit(data.X_train, data.y_train, fit_weights=weights, verbose=True)

In [None]:
# Reduce both splits to the columns chosen by the fitted selector.
X_train_selected, X_test_selected = (
    feat_selector.transform(split) for split in (data.X_train, data.X_test)
)

In [None]:
# Tabulate the distinct training-target values and how often each one occurs.
unique_classes, class_counts = np.unique(data.y_train, return_counts=True)
for label, values in (
    ("Unique classes in y_train:", unique_classes),
    ("Counts of each class in y_train:", class_counts),
):
    print(label, values)

In [None]:
# Column positions (within the feature matrix) that the selector kept.
feat_selector.selected_feature_indexes

In [None]:
# Map the selector's column positions back to the column names of the encoded frame
# (the target column is removed first so positions line up with the feature matrix).
feature_frame = data_encoded.drop(columns=["Grand Final Place"])
feature_names = feature_frame.columns.values
selected_feature_indices = feat_selector.selected_feature_indexes
selected_features = feature_names[selected_feature_indices]


In [None]:
# Display the mafese Data container (at this point `data` is no longer the DataFrame).
data

## Gradient Boosting

In [None]:
# Hyper-parameter candidates explored by the Gradient Boosting grid search.
gb_param_grid = dict(
    n_estimators=[100, 200, 300],
    learning_rate=[0.01, 0.1, 0.2],
    max_depth=[3, 4, 5],
    min_samples_split=[2, 5, 10],
    min_samples_leaf=[1, 2, 4],
)

In [None]:
# Base estimator handed to the grid search; seeded so repeated searches are comparable.
gb_model = GradientBoostingRegressor(random_state=42)

In [None]:
# Exhaustive 5-fold search over `gb_param_grid`, scored by negative MSE, using all cores.
grid_search_gb = GridSearchCV(
    gb_model,
    gb_param_grid,
    cv=5,
    verbose=2,
    n_jobs=-1,
    scoring='neg_mean_squared_error',
)

In [None]:
# Run the Gradient Boosting search on the selected training features.
grid_search_gb.fit(X_train_selected, data.y_train)

In [None]:
# Pull the winning estimator and its hyper-parameters out of the finished search.
best_gb_model = grid_search_gb.best_estimator_
best_params_gb = grid_search_gb.best_params_
print("Best parameters: ", best_params_gb)

In [None]:
# Reuse the estimator that the grid search already refit on the whole training split with
# the best parameters. Building a new model from `best_params_` alone dropped
# random_state=42, so the feature importances came from a different, unseeded model than
# the one used for the test predictions and changed from run to run.
gb_model = grid_search_gb.best_estimator_
gb_model

In [None]:
# Pair every selected feature name with its Gradient Boosting importance,
# most important first.
gb_importances = gb_model.feature_importances_
gb_importance_columns = {
    'Feature': feature_names[selected_feature_indices],
    'Importance': gb_importances,
}
gb_importance_df = pd.DataFrame(gb_importance_columns)
gb_importance_df = gb_importance_df.sort_values(by='Importance', ascending=False)


In [None]:
# Show the 15 most important features according to Gradient Boosting.
print("\nGradient Boosting - Feature Importances:")
gb_importance_df.head(15)

In [None]:
# Horizontal bar chart of the ten largest Gradient Boosting importances.
gb_top10 = gb_importance_df.head(10)

gb_importance_bars = alt.Chart(gb_top10).mark_bar().encode(
    x=alt.X('Importance:Q', title='Importance'),
    y=alt.Y('Feature:N', sort='-x', title=None),  # bars ordered by descending importance
    color="Importance",
)

gb_importance_chart = gb_importance_bars.properties(
    title='Top 10 Feature Importances - Gradient Boosting',
    width=1000,
    height=400,
).configure_axisY(
    labelAngle=-25,  # Rotate the y-axis labels by -25 degrees
    labelLimit=500,  # Maximum allowed pixel width of a label
    labelAlign='right',  # Right-align the label text
)

gb_importance_chart

In [None]:
# Predict the held-out split with the best Gradient Boosting estimator from the search.
y_pred_gb = best_gb_model.predict(X_test_selected)


In [None]:
# Score the Gradient Boosting predictions on the test split.
test_rmse_gb, test_mae_gb, test_r2_gb = (
    metric(data.y_test, y_pred_gb)
    for metric in (root_mean_squared_error, mean_absolute_error, r2_score)
)

for label, value in (
    ("Test RMSE: ", test_rmse_gb),
    ("Test MAE: ", test_mae_gb),
    ("Test R²: ", test_r2_gb),
):
    print(label, value)

## XGBoost

In [None]:
# Base XGBoost regressor for the grid search: squared-error objective, fixed seed.
xgb_model = XGBRegressor(
    objective='reg:squarederror',
    random_state=42,
)


In [None]:
# NOTE(review): this rebinds `gb_model` to a new, unfitted Gradient Boosting estimator in the
# middle of the XGBoost section, replacing the fitted model assigned earlier. `gb_model` is
# not read again in the visible cells, so this looks like a leftover cell - confirm and remove.
gb_model = GradientBoostingRegressor(random_state=42)

In [None]:
# Hyper-parameter candidates explored by the XGBoost grid search.
xgb_param_grid = dict(
    n_estimators=[100, 200, 300],
    max_depth=[3, 4, 5],
    learning_rate=[0.01, 0.1, 0.2],
    colsample_bytree=[0.3, 0.7],
)

In [None]:
# Exhaustive 5-fold search over `xgb_param_grid`, scored by negative MSE, using all cores.
grid_search_xgb = GridSearchCV(
    xgb_model,
    xgb_param_grid,
    cv=5,
    verbose=2,
    n_jobs=-1,
    scoring='neg_mean_squared_error',
)


In [None]:
# Run the XGBoost search on the selected training features.
grid_search_xgb.fit(X_train_selected, data.y_train)


In [None]:
# Pull the winning estimator and its hyper-parameters out of the finished search.
best_xgb_model = grid_search_xgb.best_estimator_
best_params_xgb = grid_search_xgb.best_params_
print("Best parameters: ", best_params_xgb)

In [None]:
# Raw importance vector of the best XGBoost estimator. This cell sits in the XGBoost
# section but previously displayed the Gradient Boosting model's importances.
best_xgb_model.feature_importances_

In [None]:
# From here on `xgb_model` refers to the tuned estimator.
xgb_model = best_xgb_model

# Pair every selected feature name with its XGBoost importance, most important first.
xgb_importances = best_xgb_model.feature_importances_
xgb_importance_columns = {
    'Feature': feature_names[selected_feature_indices],
    'Importance': xgb_importances,
}
xgb_importance_df = pd.DataFrame(xgb_importance_columns)
xgb_importance_df = xgb_importance_df.sort_values(by='Importance', ascending=False)


In [None]:
# Show the 15 most important features according to XGBoost.
xgb_importance_df.head(15)

In [None]:
# Horizontal bar chart of the ten largest XGBoost importances.
# The title now names XGBoost; it was copied from the Gradient Boosting chart and
# mislabelled this figure.
alt.Chart(xgb_importance_df.head(10)).mark_bar().encode(
    x=alt.X('Importance:Q', title='Importance'),
    y=alt.Y('Feature:N', sort='-x', title=None),  # bars ordered by descending importance
    color="Importance"
).properties(
    title='Top 10 Feature Importances - XGBoost',
    width=1000,
    height=400
).configure_axisY(
    labelAngle=-25,  # Rotate the y-axis labels by -25 degrees
    labelLimit=500,  # Maximum allowed pixel width of a label
    labelAlign='right'  # Right-align the label text
)

In [None]:
# Predict the held-out split with the best XGBoost estimator from the search.
y_pred_xgb = best_xgb_model.predict(X_test_selected)


In [None]:
# Score the XGBoost predictions on the test split.
test_rmse_xgb, test_mae_xgb, test_r2_xgb = (
    metric(data.y_test, y_pred_xgb)
    for metric in (root_mean_squared_error, mean_absolute_error, r2_score)
)

for label, value in (
    ("Test RMSE: ", test_rmse_xgb),
    ("Test MAE: ", test_mae_xgb),
    ("Test R²: ", test_r2_xgb),
):
    print(label, value)

## Deep Neural Networks

In [None]:
# Seeded, shuffled 5-fold splitter; used by the cross_val_score cell further down.
kf = KFold(n_splits=5, shuffle=True, random_state=42)

In [None]:
# Fully connected regression network: 128 -> 64 -> 32 ReLU units with 20% dropout after
# the first two hidden layers and a single linear output.
# The input width is the full scaled feature matrix (`data.X_train`), not the selected subset.
model = Sequential([
    Input(shape=(data.X_train.shape[1],)),
    Dense(128, activation='relu'),
    Dropout(0.2),
    Dense(64, activation='relu'),
    Dropout(0.2),
    Dense(32, activation='relu'),
    Dense(1),
])

In [None]:
# Train with Adam (learning rate 0.001) on mean squared error; also track MAE per epoch.
model.compile(optimizer=Adam(learning_rate=0.001), loss='mse', metrics=['mae'])


In [None]:
# Stop once validation loss has not improved for 10 epochs and roll back to the best weights.
early_stopping = EarlyStopping(monitor='val_loss', patience=10, restore_best_weights=True)

In [None]:
# Train for at most 100 epochs, holding out 20% of the training split for validation;
# `history` keeps the per-epoch curves that are plotted further down.
history = model.fit(
    data.X_train,
    data.y_train,
    epochs=100,
    batch_size=32,
    validation_split=0.2,
    verbose=1,
    callbacks=[early_stopping],
)


In [None]:
# Cross-validate a Gradient Boosting model built from the best grid-search parameters.
# This uses the full scaled feature matrix (`data.X_train`) and prints the mean of the
# per-fold R² scores.
gb_cv_estimator = GradientBoostingRegressor(**best_params_gb)
r2_scores = cross_val_score(
    gb_cv_estimator,
    data.X_train,
    data.y_train,
    cv=kf,
    scoring='r2',
)
print("Cross-validated R² scores for Gradient Boosting:", np.mean(r2_scores))

In [None]:
# Predict the held-out split with the trained network and score the predictions.
y_pred_dnn = model.predict(data.X_test)
rmse_dnn, mae_dnn, r2_dnn = (
    metric(data.y_test, y_pred_dnn)
    for metric in (root_mean_squared_error, mean_absolute_error, r2_score)
)

for label, value in (
    ("Test RMSE:", rmse_dnn),
    ("Test MAE:", mae_dnn),
    ("Test R²:", r2_dnn),
):
    print(label, value)

### Visualise the DNN

In [None]:
# Opens a 12x5 inch figure.
# NOTE(review): with the usual inline notebook backend a figure is rendered and closed when
# its cell finishes, so the plotting cells below presumably do not draw into this figure -
# confirm and consider creating the figure in the same cell as the plots.
plt.figure(figsize=(12, 5))

In [None]:
# Training vs. validation loss per epoch.
# The figure is created inside this cell so the plot no longer depends on a figure opened
# in another cell; it previously drew into the left half of a two-panel grid whose second
# panel was never added to the same figure.
fig_loss, ax_loss = plt.subplots(figsize=(6, 5))
ax_loss.plot(history.history['loss'], label='Training Loss')
ax_loss.plot(history.history['val_loss'], label='Validation Loss')
ax_loss.set_title('Model Loss')
ax_loss.set_xlabel('Epoch')
ax_loss.set_ylabel('Loss')
ax_loss.legend()
plt.show()

In [None]:
# Training vs. validation MAE per epoch.
# The figure is created inside this cell so the plot no longer depends on a figure opened
# in another cell; it previously drew into the right half of a two-panel grid whose first
# panel lived in a different figure.
fig_mae, ax_mae = plt.subplots(figsize=(6, 5))
ax_mae.plot(history.history['mae'], label='Training MAE')
ax_mae.plot(history.history['val_mae'], label='Validation MAE')
ax_mae.set_title('Model MAE')
ax_mae.set_xlabel('Epoch')
ax_mae.set_ylabel('Mean Absolute Error')
ax_mae.legend()
plt.show()

## Get metrics

In [None]:
# Evaluate the selected feature subset with an SVR estimator on the listed metrics.
evaluation_metrics = ["RMSE", "MAE", "MAPE", "R2", "NSE", "KGE"]
results = feat_selector.evaluate(
    estimator=SVR(),
    data=data,
    metrics=evaluation_metrics,
)


In [None]:
def _format_mapping(mapping):
    """Render every entry of ``mapping`` as a tab-indented ``key: value`` line."""
    return "".join(f"\t{key}: {mapping[key]}\n" for key in mapping.keys())


def _format_model_section(title, rmse, mae, r2, best_params=None):
    """Render one model's test metrics, followed by its best parameters when given."""
    section = f"{title}\n\tTest RMSE: {rmse}\n\tTest MAE: {mae}\n\tTest R²: {r2}\n"
    if best_params is not None:
        section += "\tBest parameters: \n" + _format_mapping(best_params)
    return section


# Assemble the plain-text report for this run: timestamp, selector weights, selector
# evaluation results, then one section per model.
output = f"\nRun at: {datetime.now()}\nWeights = {weights}\nOutputs:\n"
output += _format_mapping(results)
output += _format_model_section(
    "Gradient Boosting", test_rmse_gb, test_mae_gb, test_r2_gb, best_params_gb
)
output += _format_model_section(
    "XGBoost", test_rmse_xgb, test_mae_xgb, test_r2_xgb, best_params_xgb
)
output += _format_model_section("Deep Neural Network", rmse_dnn, mae_dnn, r2_dnn)

# Append to the run log. The directory is created first so a fresh checkout without an
# "outputs" folder does not fail only after all models have been trained.
output_path = Path("outputs/outputs.txt")
output_path.parent.mkdir(parents=True, exist_ok=True)
with output_path.open("a", encoding="UTF-8") as f:
    f.write(output)


In [None]:
# Echo the report that was just appended to the run log.
print(output)