In [97]:
from datetime import datetime

import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.preprocessing import OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
import numpy as np
from tqdm import tqdm


In [74]:
combined_data = pd.read_csv("./data/combined_data.csv")

In [75]:
combined_data["Grand Final Place"] = combined_data["Grand Final Place"].fillna(27+combined_data["Semifinal Points"])

In [76]:
combined_data.columns[:15]

Index(['Country Name', 'Country Code', 'Year', 'Song', 'Artist', 'Language',
       'Grand Final Place', 'Grand Final Points', 'Semifinal',
       'Semifinal Place', 'Semifinal Points', 'index',
       'Access to clean fuels and technologies for cooking (% of population)',
       'Access to clean fuels and technologies for cooking, rural (% of rural population)',
       'Access to clean fuels and technologies for cooking, urban (% of urban population)'],
      dtype='object')

In [77]:
numeric_features = combined_data.drop(
    ['Country Name',
     'Country Code', 
     'Year', 
     'Grand Final Place', 
     "Song", 
     "Artist", 
     "Grand Final Points", 
     "Semifinal",
     "Semifinal Points",
     "Semifinal Place",
     "Language",
     "index",
     "Use of insecticide-treated bed nets (% of under-5 population)"
     ], 
     axis=1)
target = combined_data['Grand Final Place']
numeric_features.sample(10)

Unnamed: 0,Access to clean fuels and technologies for cooking (% of population),"Access to clean fuels and technologies for cooking, rural (% of rural population)","Access to clean fuels and technologies for cooking, urban (% of urban population)",Access to electricity (% of population),"Access to electricity, rural (% of rural population)","Access to electricity, urban (% of urban population)",Account ownership at a financial institution or with a mobile-money-service provider (% of population ages 15+),"Account ownership at a financial institution or with a mobile-money-service provider, female (% of population ages 15+)","Account ownership at a financial institution or with a mobile-money-service provider, male (% of population ages 15+)","Account ownership at a financial institution or with a mobile-money-service provider, older adults (% of population ages 25+)",...,"People with basic handwashing facilities including soap and water, rural (% of rural population)","People with basic handwashing facilities including soap and water, urban (% of urban population)","Risk premium on lending (lending rate minus treasury bill rate, %)","Incidence of malaria (per 1,000 population at risk)","Net financial flows, RDB concessional (NFL, current US$)",Financial intermediary services indirectly Measured (FISIM) (constant LCU),"Net financial flows, IMF concessional (NFL, current US$)",Newborns protected against tetanus (%),"Net official flows from UN agencies, UNWTO (current US$)",Children with fever receiving antimalarial drugs (% of children under age 5 with fever)
430,,,,100.0,100.0,100.0,,,,,...,,,,,,,,,,
838,46.65,23.8,72.6,99.7,99.8,99.5,,,,,...,,,,,,499728300.0,,,,
2128,,,,100.0,100.0,100.0,,,,,...,,,,,,,,,,
219,,,,,,,,,,,...,,,,,,,,,,
1279,100.0,100.0,100.0,100.0,100.0,100.0,,,,,...,,,,,,,,,,
134,,,,,,,,,,,...,,,,,,,,,,
1256,99.5,98.6,99.9,98.0,94.7,99.0,,,,,...,,,,,,,,,,
2993,100.0,100.0,100.0,100.0,100.0,100.0,99.38,99.6,99.12,99.29,...,,,,,,,,,,
1625,,,,,,,,,,,...,,,,,,,,,,
450,,,,100.0,100.0,100.0,,,,,...,,,,,,,,,,


In [78]:
categorical_features = combined_data[['Country Name', 'Language']]

In [79]:
encoder = OneHotEncoder(sparse_output=False)
encoded_categorical_features = encoder.fit_transform(categorical_features)

In [80]:
encoded_categorical_features.shape

(3100, 108)

In [81]:
feature_names = encoder.get_feature_names_out(categorical_features.columns)
len(feature_names)

108

In [82]:
encoded_categorical_df = pd.DataFrame(encoded_categorical_features, columns=feature_names)
encoded_categorical_df.sample(10)

Unnamed: 0,Country Name_Albania,Country Name_Andorra,Country Name_Armenia,Country Name_Australia,Country Name_Austria,Country Name_Azerbaijan,Country Name_Belarus,Country Name_Belgium,Country Name_Bosnia and Herzegovina,Country Name_Bulgaria,...,Language_Spanish,Language_Srnan Tongo,Language_Swahili,Language_Swedish,Language_Turkish,Language_Udmurt,Language_Ukrainian,Language_Viennese,Language_Vorarlbergish,Language_Võro
194,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
538,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2329,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2573,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
135,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2469,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2500,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1891,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2672,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2264,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [83]:
numeric_features = pd.concat([numeric_features, encoded_categorical_df], axis=1)
numeric_features.sample(10)



Unnamed: 0,Access to clean fuels and technologies for cooking (% of population),"Access to clean fuels and technologies for cooking, rural (% of rural population)","Access to clean fuels and technologies for cooking, urban (% of urban population)",Access to electricity (% of population),"Access to electricity, rural (% of rural population)","Access to electricity, urban (% of urban population)",Account ownership at a financial institution or with a mobile-money-service provider (% of population ages 15+),"Account ownership at a financial institution or with a mobile-money-service provider, female (% of population ages 15+)","Account ownership at a financial institution or with a mobile-money-service provider, male (% of population ages 15+)","Account ownership at a financial institution or with a mobile-money-service provider, older adults (% of population ages 25+)",...,Language_Spanish,Language_Srnan Tongo,Language_Swahili,Language_Swedish,Language_Turkish,Language_Udmurt,Language_Ukrainian,Language_Viennese,Language_Vorarlbergish,Language_Võro
1044,100.0,100.0,100.0,100.0,100.0,100.0,85.24,83.07,87.62,87.54,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
877,99.7,99.4,99.5,98.2,97.6,98.4,,,,,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1449,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2377,93.7,86.5,97.2,99.5,99.3,99.7,,,,,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1087,99.2,97.4,99.8,95.6,89.7,97.5,,,,,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
400,,,,100.0,100.0,100.0,,,,,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
192,,,,,,,,,,,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
947,100.0,100.0,100.0,100.0,100.0,100.0,,,,,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2740,100.0,100.0,100.0,100.0,100.0,100.0,77.91,77.88,77.94,86.89,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
404,,,,100.0,100.0,100.0,,,,,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [84]:
imputer = SimpleImputer(strategy='median')
numeric_features = pd.DataFrame(imputer.fit_transform(numeric_features), columns=numeric_features.columns)

In [85]:
for col in numeric_features.columns:
    if numeric_features[col].isna().sum() != 0:
        print(col)

In [86]:
final_features = pd.concat([numeric_features, encoded_categorical_df], axis=1)
final_data = pd.concat([final_features, combined_data['Grand Final Place'].reset_index(drop=True)], axis=1)


In [87]:
correlation_matrix = final_data.corr()

In [88]:
target_correlation = correlation_matrix['Grand Final Place']

In [89]:
target_correlation_sorted = target_correlation.abs().sort_values(ascending=False)

In [90]:
top_features = target_correlation_sorted.index[1:21] 
print("Top features correlated with the target variable:\n", target_correlation_sorted.head(21))

Top features correlated with the target variable:
 Grand Final Place                                                    1.000000
Mobile cellular subscriptions (per 100 people)                       0.205909
Gross national expenditure deflator (base year varies by country)    0.188794
Consumer price index (2010 = 100)                                    0.177268
GDP deflator (base year varies by country)                           0.176330
Population ages 55-59, male (% of male population)                   0.174222
Women Business and the Law Index Score (scale 1-100)                 0.170785
School enrollment, tertiary, female (% gross)                        0.162978
School enrollment, tertiary (% gross)                                0.160244
Population ages 55-59, female (% of female population)               0.160102
Individuals using the Internet (% of population)                     0.156795
Population ages 15-19, male (% of male population)                   0.156427
Agricultural 

In [91]:
final_features_top = final_data[top_features]
final_features.sample(20)


Unnamed: 0,Access to clean fuels and technologies for cooking (% of population),"Access to clean fuels and technologies for cooking, rural (% of rural population)","Access to clean fuels and technologies for cooking, urban (% of urban population)",Access to electricity (% of population),"Access to electricity, rural (% of rural population)","Access to electricity, urban (% of urban population)",Account ownership at a financial institution or with a mobile-money-service provider (% of population ages 15+),"Account ownership at a financial institution or with a mobile-money-service provider, female (% of population ages 15+)","Account ownership at a financial institution or with a mobile-money-service provider, male (% of population ages 15+)","Account ownership at a financial institution or with a mobile-money-service provider, older adults (% of population ages 25+)",...,Language_Spanish,Language_Srnan Tongo,Language_Swahili,Language_Swedish,Language_Turkish,Language_Udmurt,Language_Ukrainian,Language_Viennese,Language_Vorarlbergish,Language_Võro
1876,100.0,100.0,100.0,100.0,100.0,100.0,91.8,90.38,91.89,94.52,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1787,100.0,100.0,100.0,100.0,100.0,100.0,91.8,90.38,91.89,94.52,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2575,67.4,43.8,86.35,99.7,99.5,99.9,91.8,90.38,91.89,94.52,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2414,100.0,100.0,100.0,100.0,100.0,100.0,91.8,90.38,91.89,94.52,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1021,100.0,100.0,100.0,100.0,100.0,100.0,91.8,90.38,91.89,94.52,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2362,100.0,100.0,100.0,99.8,100.0,99.6,91.8,90.38,91.89,94.52,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1819,100.0,100.0,100.0,100.0,100.0,100.0,91.8,90.38,91.89,94.52,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1418,100.0,100.0,100.0,100.0,100.0,100.0,91.8,90.38,91.89,94.52,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2240,100.0,100.0,100.0,99.9,100.0,99.6,91.8,90.38,91.89,94.52,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2809,100.0,100.0,100.0,97.3,94.2,98.4,91.8,90.38,91.89,94.52,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [103]:
target_top = final_data['Grand Final Place']

In [104]:
X_train, X_test, y_train, y_test = train_test_split(final_features, target, test_size=0.2, random_state=42)


In [105]:
model = RandomForestRegressor(random_state=42)
model.fit(X_train, y_train)

In [106]:
y_pred = model.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
print(f'Root Mean Squared Error: {rmse}')

Root Mean Squared Error: 24.869860647873956


In [107]:
param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

rf = RandomForestRegressor(random_state=42)
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=3, n_jobs=-1, scoring='neg_mean_squared_error', verbose=2)
grid_search.fit(X_train, y_train)

Fitting 3 folds for each of 108 candidates, totalling 324 fits
[CV] END max_depth=None, min_samples_leaf=1, min_samples_split=2, n_estimators=100; total time= 1.8min
[CV] END max_depth=None, min_samples_leaf=1, min_samples_split=2, n_estimators=100; total time= 1.9min
[CV] END max_depth=None, min_samples_leaf=1, min_samples_split=2, n_estimators=100; total time= 2.0min
[CV] END max_depth=None, min_samples_leaf=1, min_samples_split=2, n_estimators=200; total time= 3.5min
[CV] END max_depth=None, min_samples_leaf=1, min_samples_split=2, n_estimators=200; total time= 3.6min
[CV] END max_depth=None, min_samples_leaf=1, min_samples_split=5, n_estimators=100; total time= 2.0min
[CV] END max_depth=None, min_samples_leaf=1, min_samples_split=5, n_estimators=100; total time= 1.8min
[CV] END max_depth=None, min_samples_leaf=1, min_samples_split=2, n_estimators=200; total time= 4.1min
[CV] END max_depth=None, min_samples_leaf=1, min_samples_split=5, n_estimators=100; total time= 1.7min
[CV] END m

In [108]:
best_params = grid_search.best_params_
best_score = np.sqrt(-grid_search.best_score_)

In [109]:
best_model = grid_search.best_estimator_
y_pred = best_model.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)


In [110]:
cv_scores = cross_val_score(rf, X, y, cv=5, scoring='neg_mean_squared_error')
cv_rmse_scores = np.sqrt(-cv_scores)


In [111]:
print(f'Best Parameters: {best_params}')
print(f'Best RMSE via GridSearchCV: {best_score}')
print(f'Root Mean Squared Error with Best Model: {rmse}')
print(f'Cross-Validation RMSE scores: {cv_rmse_scores}')
print(f'Average Cross-Validation RMSE: {cv_rmse_scores.mean()}')

Best Parameters: {'max_depth': 10, 'min_samples_leaf': 4, 'min_samples_split': 10, 'n_estimators': 300}
Best RMSE via GridSearchCV: 21.834940980713395
Root Mean Squared Error with Best Model: 20.350522482054455
Cross-Validation RMSE scores: [ 2.16189231 35.06866258 35.60824252 23.78682515 37.96141396]
Average Cross-Validation RMSE: 26.9174073042444


In [112]:

with open("logs/model_outputs.txt", "a", encoding="UTF-8") as f:
    f.write(f"""Run at {datetime.now()}:
Best Parameters: {best_params}
Best RMSE via GridSearchCV: {best_score}
Root Mean Squared Error with Best Model: {rmse}
Cross-Validation RMSE scores: {cv_rmse_scores}
Average Cross-Validation RMSE: {cv_rmse_scores.mean()}""")
