In [33]:
import pandas as pd
import numpy as np
import mafese
from sklearn.preprocessing import StandardScaler
from mafese.wrapper.mha import MhaSelector
from mafese import get_dataset
from sklearn.svm import SVR, SVC
from scipy import sparse
from datetime import datetime
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score, precision_score, recall_score, mean_squared_error, mean_absolute_error, r2_score
from imblearn.over_sampling import SMOTE
import xgboost as xgb




## Read in Data and Transform

In [41]:
data_array = pd.read_csv("combined_data.csv", index_col=0)

In [35]:
categorical_columns = ['Country Name', 'Country Code', 'Year', 'Song', 'Artist', 'Language']
numeric_cols = data_array.columns.difference(categorical_columns)

In [36]:
data_array[numeric_cols] = data_array[numeric_cols].fillna(data_array[numeric_cols].median())


In [38]:
empty_columns = data_array.columns[data_array.isnull().all()]
data_array = data_array.drop(columns=empty_columns)
data_array

Unnamed: 0_level_0,Country Code,Year,Song,Artist,Language,Grand Final Place,Grand Final Points,Semifinal,Semifinal Place,Semifinal Points,...,"People with basic handwashing facilities including soap and water, rural (% of rural population)","People with basic handwashing facilities including soap and water, urban (% of urban population)","Risk premium on lending (lending rate minus treasury bill rate, %)","Incidence of malaria (per 1,000 population at risk)","Net financial flows, RDB concessional (NFL, current US$)",Financial intermediary services indirectly Measured (FISIM) (constant LCU),"Net financial flows, IMF concessional (NFL, current US$)",Newborns protected against tetanus (%),"Net official flows from UN agencies, UNWTO (current US$)",Children with fever receiving antimalarial drugs (% of children under age 5 with fever)
Country Name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Netherlands,NLD,1969,De troubadour,Lenny Kuhr,['Dutch'],1.0,18.0,2.0,10.0,75.0,...,87.371663,94.597914,3.095015,0.0,2058076.5,1.695209e+10,1878743.7,81.0,608.647591,9.3
France,FRA,1970,Marie-Blanche,Guy Bonnet,['French'],4.0,8.0,2.0,10.0,75.0,...,87.371663,94.597914,3.095015,0.0,2058076.5,1.695209e+10,1878743.7,81.0,608.647591,9.3
Italy,ITA,1970,Occhi di ragazza,Gianni Morandi,['Italian'],8.0,5.0,2.0,10.0,75.0,...,87.371663,94.597914,3.095015,0.0,2058076.5,1.695209e+10,1878743.7,81.0,608.647591,9.3
Netherlands,NLD,1970,Waterman,Hearts of Soul,['Dutch'],7.0,7.0,2.0,10.0,75.0,...,87.371663,94.597914,3.095015,0.0,2058076.5,1.695209e+10,1878743.7,81.0,608.647591,9.3
Spain,ESP,1970,Gwendolyne,Julio Iglesias,['Spanish'],4.0,8.0,2.0,10.0,75.0,...,87.371663,94.597914,3.095015,0.0,2058076.5,1.695209e+10,1878743.7,81.0,608.647591,9.3
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Portugal,PRT,2023,Ai coração,Mimicat,['Portuguese'],23.0,59.0,1.0,9.0,74.0,...,87.371663,94.597914,3.095015,0.0,2058076.5,1.695209e+10,1878743.7,81.0,608.647591,9.3
Romania,ROU,2023,D.G.T. (Off and On),Theodor Andrei,"['Romanian', 'English']",12.0,64.5,2.0,15.0,0.0,...,87.371663,94.597914,2.316832,0.0,2058076.5,1.695209e+10,1878743.7,81.0,608.647591,9.3
Spain,ESP,2023,Eaea,Blanca Paloma,['Spanish'],17.0,100.0,2.0,10.0,75.0,...,87.371663,94.597914,3.095015,0.0,2058076.5,1.695209e+10,1878743.7,81.0,608.647591,9.3
Sweden,SWE,2023,Tattoo,Loreen,['English'],1.0,583.0,1.0,2.0,135.0,...,87.371663,94.597914,3.095015,0.0,2058076.5,1.695209e+10,1878743.7,81.0,608.647591,9.3


In [39]:
data_array["Grand Final Place"].value_counts()

Grand Final Place
12.0    322
18.0     56
5.0      54
8.0      53
2.0      53
1.0      51
14.0     51
9.0      51
7.0      51
10.0     51
11.0     50
3.0      50
4.0      50
16.0     50
6.0      47
13.0     45
17.0     45
15.0     44
19.0     41
22.0     38
20.0     36
21.0     35
23.0     26
24.0     25
25.0     19
26.0     11
27.0      1
Name: count, dtype: int64

In [40]:
data_encoded = pd.get_dummies(data_array, columns=categorical_columns, drop_first=True)


KeyError: "['Country Name'] not in index"

In [82]:
# data_array = data_array.sample(50)
target = data_encoded["Grand Final Place"].values
non_targets = data_encoded.drop(columns=["Grand Final Place"]).values


In [83]:
data = mafese.Data(non_targets, target)


In [84]:
data.split_train_test(test_size=0.2, inplace=True)

In [85]:
X_train_df = pd.DataFrame(data.X_train)
X_test_df = pd.DataFrame(data.X_test)

In [86]:
standard_scaler = StandardScaler()

data.X_train = standard_scaler.fit_transform(data.X_train)
data.X_test = standard_scaler.transform(data.X_test)


In [87]:
data.y_train, scaler_y = data.encode_label(data.y_train)
data.y_test = scaler_y.transform(data.y_test)

## Feature Selection

In [88]:
feat_selector = MhaSelector(problem="regression")

In [89]:
weights = [0.9, 0.1]

In [None]:
feat_selector.fit(data.X_train, data.y_train, fit_weights=weights, verbose=True)

2024/10/21 03:18:49 PM, INFO, mealpy.evolutionary_based.GA.BaseGA: Solving 2-objective optimization problem with weights: [1. 0.].
2024/10/21 03:18:56 PM, INFO, mealpy.evolutionary_based.GA.BaseGA: >Problem: P, Epoch: 1, Current best: 33.25881320705882, Global best: 33.25881320705882, Runtime: 2.71950 seconds
2024/10/21 03:18:59 PM, INFO, mealpy.evolutionary_based.GA.BaseGA: >Problem: P, Epoch: 2, Current best: 32.79046966282853, Global best: 32.79046966282853, Runtime: 2.59029 seconds
2024/10/21 03:19:00 PM, INFO, mealpy.evolutionary_based.GA.BaseGA: >Problem: P, Epoch: 3, Current best: 32.60068415376995, Global best: 32.60068415376995, Runtime: 1.63383 seconds
2024/10/21 03:19:02 PM, INFO, mealpy.evolutionary_based.GA.BaseGA: >Problem: P, Epoch: 4, Current best: 33.044224305930506, Global best: 32.60068415376995, Runtime: 1.51167 seconds


In [39]:
print(len(feat_selector.selected_feature_indexes))

492


In [40]:
print(feat_selector.selected_feature_indexes)


[   0    4   47   62   65   66   67   92  101  116  150  155  160  161
  178  197  207  232  237  243  246  251  256  258  268  303  322  331
  335  337  339  372  375  388  396  411  431  435  436  439  446  452
  453  470  476  484  493  504  525  528  537  538  540  541  557  571
  577  578  582  583  585  588  602  605  616  656  662  669  673  687
  691  703  722  724  726  740  785  789  807  818  821  850  853  855
  865  866  882  887  896  905  908  921  937  939  963  964  969 1003
 1004 1020 1035 1038 1102 1113 1117 1126 1134 1163 1173 1174 1175 1187
 1195 1196 1203 1209 1210 1221 1226 1240 1256 1260 1281 1282 1286 1300
 1311 1337 1338 1344 1370 1371 1372 1376 1383 1384 1400 1426 1452 1473
 1503 1505 1509 1523 1538 1555 1562 1579 1589 1625 1631 1641 1658 1666
 1671 1677 1682 1690 1696 1709 1725 1747 1749 1799 1800 1804 1808 1811
 1836 1840 1849 1855 1858 1859 1861 1919 1936 1945 1953 1957 1963 1966
 1970 1974 2001 2002 2004 2006 2008 2012 2017 2029 2058 2077 2089 2101
 2117 

In [41]:
X_train_selected = feat_selector.transform(data.X_train)
X_test_selected = feat_selector.transform(data.X_test)

In [42]:
unique_classes, class_counts = np.unique(data.y_train, return_counts=True)
print("Unique classes in y_train:", unique_classes)
print("Counts of each class in y_train:", class_counts)

Unique classes in y_train: [ 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20 21 22 23
 24 25 26 27]
Counts of each class in y_train: [250 110 104 107 120 107 108 121 111 107 371  91 102 120 112  87 102  80
  89  70  64  63  69  46  50  32  17   1]


## Grid Search

In [43]:
param_grid = {
    'C': [0.1, 1, 10, 100],         # Regularization parameter
    'kernel': ['linear', 'rbf', 'poly'],  # Kernel types suitable for regression
    'gamma': ['scale', 'auto', 0.01, 0.1, 1],  # Kernel coefficient for RBF and poly
}


In [44]:
grid_search = GridSearchCV(SVR(), param_grid, cv=2, verbose=2, scoring='neg_mean_squared_error')


In [45]:
# grid_search.fit(X_train_selected, data.y_train)


In [46]:
# best_params = grid_search.best_params_
# best_score = grid_search.best_score_

In [47]:
# print("Best parameters found: ", best_params)
# print("Best cross-validation score: ", best_score)


In [48]:
# test_rmse = np.sqrt(mean_squared_error(data.y_test, y_pred))
# test_mae = mean_absolute_error(data.y_test, y_pred)
# test_r2 = r2_score(data.y_test, y_pred)

# print("Test RMSE: ", test_rmse)
# print("Test MAE: ", test_mae)
# print("Test R²: ", test_r2)

## Random Forest

In [49]:
rf_param_grid = {
    'n_estimators': [50, 100, 200],
    'max_features': ['auto', 'sqrt', 'log2'],
    'max_depth': [4, 6, 8, 10],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

In [50]:
rf_model = RandomForestRegressor(random_state=42)
grid_search_rf = GridSearchCV(rf_model, rf_param_grid, cv=3, verbose=2, n_jobs=-1, scoring='neg_mean_squared_error')


In [51]:

grid_search_rf.fit(X_train_selected, data.y_train)

Fitting 3 folds for each of 324 candidates, totalling 972 fits
[CV] END max_depth=4, max_features=auto, min_samples_leaf=1, min_samples_split=2, n_estimators=100; total time=   0.0s
[CV] END max_depth=4, max_features=auto, min_samples_leaf=1, min_samples_split=2, n_estimators=50; total time=   0.0s
[CV] END max_depth=4, max_features=auto, min_samples_leaf=1, min_samples_split=2, n_estimators=100; total time=   0.0s
[CV] END max_depth=4, max_features=auto, min_samples_leaf=1, min_samples_split=2, n_estimators=50; total time=   0.0s
[CV] END max_depth=4, max_features=auto, min_samples_leaf=1, min_samples_split=2, n_estimators=50; total time=   0.0s
[CV] END max_depth=4, max_features=auto, min_samples_leaf=1, min_samples_split=2, n_estimators=200; total time=   0.0s
[CV] END max_depth=4, max_features=auto, min_samples_leaf=1, min_samples_split=2, n_estimators=100; total time=   0.0s
[CV] END max_depth=4, max_features=auto, min_samples_leaf=1, min_samples_split=2, n_estimators=200; total t



324 fits failed out of a total of 972.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
250 fits failed with the following error:
Traceback (most recent call last):
  File "/Users/ruyzambrano/Desktop/Fun/eurovision_data/.venv/lib/python3.12/site-packages/sklearn/model_selection/_validation.py", line 888, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/Users/ruyzambrano/Desktop/Fun/eurovision_data/.venv/lib/python3.12/site-packages/sklearn/base.py", line 1466, in wrapper
    estimator._validate_params()
  File "/Users/ruyzambrano/Desktop/Fun/eurovision_data/.venv/lib/python3.12/site-packages/sklearn/base.py", line 666, in _validate_params
    validate_parameter_constraints(
  File "/Users/ruyzambrano/Desktop/

In [52]:
best_params_rf = grid_search_rf.best_params_
best_rf_model = grid_search_rf.best_estimator_
print("Best parameters: ", best_params_rf)



Best parameters:  {'max_depth': 10, 'max_features': 'sqrt', 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 100}


In [53]:
y_pred_rf = best_rf_model.predict(X_test_selected)

In [54]:
test_rmse_rf = np.sqrt(mean_squared_error(data.y_test, y_pred_rf))
test_mae_rf = mean_absolute_error(data.y_test, y_pred_rf)
test_r2_rf = r2_score(data.y_test, y_pred_rf)

In [55]:
print("Random Forest Regressor Test RMSE: ", test_rmse_rf)
print("Random Forest Regressor Test MAE: ", test_mae_rf)
print("Random Forest Regressor Test R²: ", test_r2_rf)

Random Forest Regressor Test RMSE:  5.25546010500264
Random Forest Regressor Test MAE:  4.215658583605033
Random Forest Regressor Test R²:  0.4404618370714425


In [56]:
best_model = grid_search_rf.best_estimator_
y_pred = best_model.predict(X_test_selected)

## Gradient Boosting

In [57]:
gb_param_grid = {
    'n_estimators': [100, 200, 300],
    'learning_rate': [0.01, 0.1, 0.2],
    'max_depth': [3, 4, 5],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

In [58]:
gb_model = GradientBoostingRegressor(random_state=42)

In [59]:
grid_search_gb = GridSearchCV(gb_model, gb_param_grid, cv=3, verbose=2, n_jobs=-1, scoring='neg_mean_squared_error')

In [60]:
grid_search_gb.fit(X_train_selected, data.y_train)

Fitting 3 folds for each of 243 candidates, totalling 729 fits
[CV] END learning_rate=0.01, max_depth=3, min_samples_leaf=1, min_samples_split=2, n_estimators=100; total time=   3.5s
[CV] END learning_rate=0.01, max_depth=3, min_samples_leaf=1, min_samples_split=2, n_estimators=100; total time=   3.6s
[CV] END learning_rate=0.01, max_depth=3, min_samples_leaf=1, min_samples_split=2, n_estimators=100; total time=   3.7s
[CV] END learning_rate=0.01, max_depth=3, min_samples_leaf=1, min_samples_split=5, n_estimators=100; total time=   4.4s
[CV] END learning_rate=0.01, max_depth=3, min_samples_leaf=1, min_samples_split=2, n_estimators=200; total time=   8.3s
[CV] END learning_rate=0.01, max_depth=3, min_samples_leaf=1, min_samples_split=2, n_estimators=200; total time=   8.3s
[CV] END learning_rate=0.01, max_depth=3, min_samples_leaf=1, min_samples_split=5, n_estimators=100; total time=   4.6s
[CV] END learning_rate=0.01, max_depth=3, min_samples_leaf=1, min_samples_split=2, n_estimators=2

In [61]:
best_params_gb = grid_search_gb.best_params_
best_gb_model = grid_search_gb.best_estimator_
print("Best parameters: ", best_params_gb)

Best parameters:  {'learning_rate': 0.2, 'max_depth': 5, 'min_samples_leaf': 4, 'min_samples_split': 2, 'n_estimators': 300}


In [62]:
y_pred_gb = best_gb_model.predict(X_test_selected)


In [63]:
test_rmse_gb = np.sqrt(mean_squared_error(data.y_test, y_pred_gb))
test_mae_gb = mean_absolute_error(data.y_test, y_pred_gb)
test_r2_gb = r2_score(data.y_test, y_pred_gb)


print("Test RMSE: ", test_rmse_gb)
print("Test MAE: ", test_mae_gb)
print("Test R²: ", test_r2_gb)

Test RMSE:  1.1893783915377896
Test MAE:  0.6072201612700227
Test R²:  0.9713418393144176


## XGBoost

In [64]:
xgb_model = xgb.XGBRegressor(objective='reg:squarederror', random_state=42)


In [65]:
xgb_param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [3, 4, 5],
    'learning_rate': [0.01, 0.1, 0.2],
    'colsample_bytree': [0.3, 0.7]
}

In [66]:
grid_search_xgb = GridSearchCV(xgb_model, xgb_param_grid, cv=3, verbose=2, n_jobs=-1, scoring='neg_mean_squared_error')


In [67]:
grid_search_xgb.fit(X_train_selected, data.y_train)


Fitting 3 folds for each of 54 candidates, totalling 162 fits
[CV] END colsample_bytree=0.3, learning_rate=0.01, max_depth=3, n_estimators=100; total time=   0.4s
[CV] END colsample_bytree=0.3, learning_rate=0.01, max_depth=3, n_estimators=100; total time=   0.4s
[CV] END colsample_bytree=0.3, learning_rate=0.01, max_depth=3, n_estimators=100; total time=   0.4s
[CV] END colsample_bytree=0.3, learning_rate=0.01, max_depth=3, n_estimators=200; total time=   0.6s
[CV] END colsample_bytree=0.3, learning_rate=0.01, max_depth=3, n_estimators=200; total time=   0.6s
[CV] END colsample_bytree=0.3, learning_rate=0.01, max_depth=3, n_estimators=200; total time=   0.6s
[CV] END colsample_bytree=0.3, learning_rate=0.01, max_depth=3, n_estimators=300; total time=   0.8s
[CV] END colsample_bytree=0.3, learning_rate=0.01, max_depth=4, n_estimators=100; total time=   0.3s
[CV] END colsample_bytree=0.3, learning_rate=0.01, max_depth=4, n_estimators=100; total time=   0.3s
[CV] END colsample_bytree=0.3

In [68]:
best_params_xgb = grid_search_xgb.best_params_
best_xgb_model = grid_search_xgb.best_estimator_
print("Best parameters: ", best_params_xgb)

Best parameters:  {'colsample_bytree': 0.7, 'learning_rate': 0.2, 'max_depth': 5, 'n_estimators': 300}


In [69]:
y_pred_xgb = best_xgb_model.predict(X_test_selected)


In [70]:
test_rmse_xgb = np.sqrt(mean_squared_error(data.y_test, y_pred_xgb))
test_mae_xgb = mean_absolute_error(data.y_test, y_pred_xgb)
test_r2_xgb = r2_score(data.y_test, y_pred_xgb)


print("Test RMSE: ", test_rmse_xgb)
print("Test MAE: ", test_mae_xgb)
print("Test R²: ", test_r2_xgb)

Test RMSE:  1.4721340719190512
Test MAE:  0.8319332377275036
Test R²:  0.9560961127281189


## Get Report

In [71]:
results = feat_selector.evaluate(estimator=SVC(), data=data, metrics=["RMSE", "MAE", "MAPE", "R2", "NSE", "KGE"])



divide by zero encountered in divide


invalid value encountered in divide



In [74]:
output = f"""
Run at: {datetime.now()}
Weights = {weights}
Outputs:
"""
for key in results.keys():
    output += f"\t{key}: {results[key]}\n"

# output += f"""GridSearch Results
# \tBest parameters: {best_params}
# \tBest cross-validation score: {best_score}
# \tTest RMSE: : {test_rmse}
# \tTest MAE: {test_mae}
# \tTest R²: {test_r2}"""

output += f"""Random Forest Regressor:
\tTest RMSE: {test_rmse_rf}
\tTest MAE: {test_mae_rf}
\tTest R²: {test_r2_rf}
\tBest parameters: 
"""
for key in best_params_rf.keys():
    output += f"\t{key}: {best_params_rf[key]}\n"

output += f"""Gradient Boosting
\tTest RMSE: {test_rmse_gb}
\tTest MAE: {test_mae_gb}
\tTest R²: {test_r2_gb}
\tBest parameters: 
"""
for key in best_params_gb.keys():
    output += f"\t{key}: {best_params_gb[key]}\n"

output += f"""XGBoost
\tTest RMSE: {test_rmse_xgb}
\tTest MAE: {test_mae_xgb}
\tTest R²: {test_r2_xgb}
\tBest parameters: 
"""
for key in best_params_xgb.keys():
    output += f"\t{key}: {best_params_xgb[key]}\n"

with open("outputs/outputs.txt", "a", encoding="UTF-8") as f:
    f.write(output)


In [73]:
print(output)


Run at: 2024-10-21 14:59:55.391003
Weights = [0.9, 0.1]
Outputs:
	RMSE_train: 6.369953482085048
	MAE_train: 3.7872643187477766
	MAPE_train: 1.0
	R2_train: 0.1475650224865015
	NSE_train: 0.1475650224865015
	KGE_train: 0.4930734726682814
	RMSE_test: 6.898471185282131
	MAE_test: 4.647226173541963
	MAPE_test: 1.0
	R2_test: 0.03591808843500888
	NSE_test: 0.03591808843500888
	KGE_test: 0.4279639876210033
Random Forest Regressor:
	Test RMSE: {test_rmse_rf}
	Test MAE: {test_mae_rf}
	Test R²: {test_r2_rf}
	Best parameters: 
	max_depth: 10
	max_features: sqrt
	min_samples_leaf: 1
	min_samples_split: 2
	n_estimators: 100
Gradient Boosting
	Test RMSE: {test_rmse_gb}
	Test MAE: {test_mae_gb}
	Test R²: {test_r2_gb}
	Best parameters: 
	learning_rate: 0.2
	max_depth: 5
	min_samples_leaf: 4
	min_samples_split: 2
	n_estimators: 300
XGBoost
	Test RMSE: {test_rmse_xgb}
	Test MAE: {test_mae_xgb}
	Test R²: {test_r2_xgb}
	Best parameters: 
	colsample_bytree: 0.7
	learning_rate: 0.2
	max_depth: 5
	n_estimato