In [1]:
import re

import pandas as pd
from sklearn.model_selection import train_test_split, StratifiedKFold, cross_val_score
from sklearn.preprocessing import OneHotEncoder, PolynomialFeatures
from sklearn.impute import SimpleImputer
from sklearn.metrics import mean_squared_error
import numpy as np
import lightgbm as lgb
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
combined_data = pd.read_csv("./data/combined_data.csv")

In [3]:
combined_data["Grand Final Place"] = combined_data["Grand Final Place"].fillna(27 + combined_data["Semifinal Points"])

In [4]:
numeric_features = combined_data.drop(
    ['Country Name', 'Country Code', 'Year', 'Grand Final Place', "Song", "Artist", 
     "Grand Final Points", "Semifinal", "Semifinal Points", "Semifinal Place", "Language", "index",
     "Use of insecticide-treated bed nets (% of under-5 population)"], 
    axis=1)

In [5]:
target = combined_data['Grand Final Place']

In [6]:
categorical_features = combined_data[['Country Name', 'Language']]
encoder = OneHotEncoder(sparse_output=False)
encoded_categorical_features = encoder.fit_transform(categorical_features)
feature_names = encoder.get_feature_names_out(categorical_features.columns)
encoded_categorical_df = pd.DataFrame(encoded_categorical_features, columns=feature_names)

In [7]:
all_features = pd.concat([numeric_features, encoded_categorical_df], axis=1)


In [8]:
all_features.columns = [re.sub(r'\W+', '_', col) for col in all_features.columns]

In [9]:
imputer = SimpleImputer(strategy='median')
all_features = pd.DataFrame(imputer.fit_transform(all_features), columns=all_features.columns)


In [10]:
model = lgb.LGBMRegressor(max_depth=10, min_data_in_leaf=4, num_leaves=31, n_estimators=300, random_state=42)
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
y_binned = pd.qcut(target, q=5, labels=False)
cv_scores = cross_val_score(model, all_features, y_binned, cv=skf, scoring='neg_mean_squared_error')
cv_rmse_scores = np.sqrt(-cv_scores)
print(f'Stratified Cross-Validation RMSE scores: {cv_rmse_scores}')
print(f'Average Stratified Cross-Validation RMSE: {cv_rmse_scores.mean()}')

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.023228 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 261498
[LightGBM] [Info] Number of data points in the train set: 2480, number of used features: 1565
[LightGBM] [Info] Start training from score 1.955242
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.029406 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 260761
[LightGBM] [Info] Number of data points in the train set: 2480, number of used features: 1564
[LightGBM] [Info] Start training from score 1.954839
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.034734 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 262843
[LightGBM] [Info] Number of data points in the train set: 2480, number of used features: 1566
[LightGBM] [Info] 

In [None]:
plt.figure(figsize=(10, 6))
sns.histplot(target, kde=True)
plt.title('Distribution of Grand Final Place')
Q1 = target.quantile(0.25)
Q3 = target.quantile(0.75)
IQR = Q3 - Q1
lower_bound = Q1 - 1.5 * IQR
upper_bound = Q3 + 1.5 * IQR
outliers = (target < lower_bound) | (target > upper_bound)
print(f"Number of outliers: {outliers.sum()}")
X_no_outliers = all_features[~outliers]
y_no_outliers = target[~outliers]

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X_no_outliers, y_no_outliers, test_size=0.2, random_state=42)
model.fit(X_train, y_train)
y_pred = model.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
print(f'Root Mean Squared Error without Outliers: {rmse}')

In [None]:
residuals = y_test - y_pred
plt.figure(figsize=(10, 6))
sns.histplot(residuals, kde=True)
plt.title('Residuals Distribution')
plt.xlabel('Residual')
plt.ylabel('Frequency')
plt.show()

In [None]:
mean_target = target.mean()
median_target = target.median()
y_pred_mean = [mean_target] * len(y_test)
y_pred_median = [median_target] * len(y_test)
mse_mean = mean_squared_error(y_test, y_pred_mean)
rmse_mean = np.sqrt(mse_mean)
mse_median = mean_squared_error(y_test, y_pred_median)
rmse_median = np.sqrt(mse_median)
print(f'RMSE of Mean Baseline: {rmse_mean}')
print(f'RMSE of Median Baseline: {rmse_median}')