In [1]:
import pandas as pd
import numpy as np
import mafese
from sklearn.preprocessing import StandardScaler
from mafese.wrapper.mha import MhaSelector
from mafese import get_dataset
from sklearn.svm import SVR, SVC
from scipy import sparse
from datetime import datetime
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score, precision_score, recall_score, mean_squared_error, mean_absolute_error, r2_score
from imblearn.over_sampling import SMOTE
import xgboost as xgb




## Read in Data and Transform

In [2]:
data_array = pd.read_csv("combined_data.csv", index_col=0)

In [3]:
categorical_columns = ['Country Name', 'Country Code', 'Year', 'Song', 'Artist', 'Language']
numeric_cols = data_array.columns.difference(categorical_columns)

In [4]:
data_array[numeric_cols] = data_array[numeric_cols].fillna(data_array[numeric_cols].median())


In [5]:
empty_columns = data_array.columns[data_array.isnull().all()]
data_array = data_array.drop(columns=empty_columns)
data_array

Unnamed: 0,Country Name,Country Code,Year,Song,Artist,Language,Grand Final Place,Grand Final Points,Semifinal,Semifinal Place,...,"People with basic handwashing facilities including soap and water, rural (% of rural population)","People with basic handwashing facilities including soap and water, urban (% of urban population)","Risk premium on lending (lending rate minus treasury bill rate, %)","Incidence of malaria (per 1,000 population at risk)","Net financial flows, RDB concessional (NFL, current US$)",Financial intermediary services indirectly Measured (FISIM) (constant LCU),"Net financial flows, IMF concessional (NFL, current US$)",Newborns protected against tetanus (%),"Net official flows from UN agencies, UNWTO (current US$)",Children with fever receiving antimalarial drugs (% of children under age 5 with fever)
0,Austria,AUT,1960.0,Du hast mich so fasziniert,Harry Winter,German,7.0,6.0,0.0,3.0,...,87.371663,94.597914,3.302041,0.0,4686996.8,3.323646e+10,1878743.7,76.0,608.647591,9.3
1,Belgium,BEL,1960.0,Mon amour pour toi,Fud Leclerc,French,6.0,9.0,0.0,3.0,...,87.371663,94.597914,3.302041,0.0,4686996.8,3.323646e+10,1878743.7,76.0,608.647591,9.3
2,Denmark,DNK,1960.0,Det var en yndig tid,Katy Bødtger,Danish,10.0,4.0,0.0,3.0,...,87.371663,94.597914,3.302041,0.0,4686996.8,3.323646e+10,1878743.7,76.0,608.647591,9.3
3,France,FRA,1960.0,Tom Pillibi,Jacqueline Boyer,French,1.0,32.0,0.0,3.0,...,87.371663,94.597914,3.302041,0.0,4686996.8,3.323646e+10,1878743.7,76.0,608.647591,9.3
4,Germany,DEU,1960.0,Bonne nuit ma chérie,Wyn Hoop,German,4.0,11.0,0.0,3.0,...,87.371663,94.597914,3.302041,0.0,4686996.8,3.323646e+10,1878743.7,76.0,608.647591,9.3
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3509,Sweden,SWE,2023.0,Tattoo,Loreen,English,1.0,583.0,1.0,2.0,...,87.371663,94.597914,3.302041,0.0,4686996.8,3.323646e+10,1878743.7,76.0,608.647591,9.3
3510,Switzerland,CHE,2023.0,Watergun,Remo Forrer,English,20.0,92.0,1.0,7.0,...,87.371663,94.597914,3.302041,0.0,4686996.8,3.323646e+10,1878743.7,76.0,608.647591,9.3
3511,Ukraine,UKR,2023.0,Heart of Steel,Tvorchi,English,6.0,243.0,0.0,0.0,...,87.371663,94.597914,3.302041,0.0,4686996.8,3.323646e+10,1878743.7,76.0,608.647591,9.3
3512,Ukraine,UKR,2023.0,Heart of Steel,Tvorchi,Ukrainian,6.0,243.0,0.0,0.0,...,87.371663,94.597914,3.302041,0.0,4686996.8,3.323646e+10,1878743.7,76.0,608.647591,9.3


In [6]:
data_array["Grand Final Place"].value_counts()

Grand Final Place
10.0    469
0.0     323
13.0    154
4.0     144
7.0     144
8.0     138
1.0     134
3.0     134
2.0     134
12.0    132
16.0    132
5.0     130
9.0     130
14.0    128
6.0     122
15.0    118
11.0    116
18.0    114
17.0    104
19.0     88
22.0     88
20.0     78
21.0     76
24.0     60
23.0     58
25.0     40
26.0     24
27.0      2
Name: count, dtype: int64

In [7]:
data_encoded = pd.get_dummies(data_array, columns=categorical_columns, drop_first=True)


In [8]:
# data_array = data_array.sample(50)
target = data_encoded["Grand Final Place"].values
non_targets = data_encoded.drop(columns=["Grand Final Place"]).values


In [9]:
data = mafese.Data(non_targets, target)


In [10]:
data.split_train_test(test_size=0.2, inplace=True)

In [11]:
X_train_df = pd.DataFrame(data.X_train)
X_test_df = pd.DataFrame(data.X_test)

In [12]:
standard_scaler = StandardScaler()

data.X_train = standard_scaler.fit_transform(data.X_train)
data.X_test = standard_scaler.transform(data.X_test)


In [13]:
data.y_train, scaler_y = data.encode_label(data.y_train)
data.y_test = scaler_y.transform(data.y_test)

## Feature Selection

In [14]:
feat_selector = MhaSelector(problem="regression")

In [15]:
weights = [0.9, 0.1]

In [16]:
feat_selector.fit(data.X_train, data.y_train, fit_weights=weights, verbose=True)

2024/10/08 05:25:20 PM, INFO, mealpy.evolutionary_based.GA.BaseGA: Solving 2-objective optimization problem with weights: [1. 0.].
2024/10/08 05:25:25 PM, INFO, mealpy.evolutionary_based.GA.BaseGA: >Problem: P, Epoch: 1, Current best: 32.39216585697216, Global best: 32.39216585697216, Runtime: 2.58030 seconds
2024/10/08 05:25:27 PM, INFO, mealpy.evolutionary_based.GA.BaseGA: >Problem: P, Epoch: 2, Current best: 34.11817474127564, Global best: 32.39216585697216, Runtime: 2.02805 seconds
2024/10/08 05:25:29 PM, INFO, mealpy.evolutionary_based.GA.BaseGA: >Problem: P, Epoch: 3, Current best: 33.56557218236565, Global best: 32.39216585697216, Runtime: 1.61585 seconds
2024/10/08 05:25:30 PM, INFO, mealpy.evolutionary_based.GA.BaseGA: >Problem: P, Epoch: 4, Current best: 32.536889043672986, Global best: 32.39216585697216, Runtime: 1.34265 seconds
2024/10/08 05:25:32 PM, INFO, mealpy.evolutionary_based.GA.BaseGA: >Problem: P, Epoch: 5, Current best: 32.74388985698286, Global best: 32.392165856

In [17]:
print(len(feat_selector.selected_feature_indexes))

483


In [18]:
print(feat_selector.selected_feature_indexes)


[   0    1    2    4    5    8   10   20   45   48   51   52   62   97
  120  124  126  140  143  154  160  162  164  165  167  173  176  178
  184  188  202  223  235  241  246  247  266  269  275  284  297  322
  327  366  376  382  388  402  408  414  432  459  466  513  532  559
  564  583  590  602  603  614  619  623  634  639  650  653  659  661
  664  677  680  692  695  698  705  709  736  737  749  764  825  835
  845  846  847  855  864  888  890  891  901  910  915  922  923  933
  946  953  958  971  975  986 1005 1006 1027 1032 1037 1038 1043 1078
 1104 1109 1114 1144 1148 1149 1153 1167 1172 1191 1205 1233 1236 1245
 1263 1266 1285 1302 1316 1326 1339 1351 1364 1372 1392 1400 1403 1410
 1426 1435 1450 1467 1482 1483 1486 1501 1520 1538 1555 1561 1567 1568
 1577 1617 1620 1626 1630 1632 1642 1659 1680 1681 1690 1691 1699 1702
 1718 1741 1746 1750 1762 1763 1773 1783 1784 1799 1812 1814 1819 1831
 1832 1839 1843 1851 1854 1866 1872 1878 1892 1900 1901 1908 1930 1942
 1944 

In [19]:
X_train_selected = feat_selector.transform(data.X_train)
X_test_selected = feat_selector.transform(data.X_test)

In [20]:
unique_classes, class_counts = np.unique(data.y_train, return_counts=True)
print("Unique classes in y_train:", unique_classes)
print("Counts of each class in y_train:", class_counts)

Unique classes in y_train: [ 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20 21 22 23
 24 25 26 27]
Counts of each class in y_train: [250 110 104 107 120 107 108 121 111 107 371  91 102 120 112  87 102  80
  89  70  64  63  69  46  50  32  17   1]


## Grid Search

In [21]:
# param_grid = {
#     'C': [0.1, 1, 10, 100],         # Regularization parameter
#     'kernel': ['linear', 'rbf', 'poly'],  # Kernel types suitable for regression
#     'gamma': ['scale', 'auto', 0.01, 0.1, 1],  # Kernel coefficient for RBF and poly
# }


In [22]:
grid_search = GridSearchCV(SVR(), param_grid, cv=2, verbose=2, scoring='neg_mean_squared_error')


In [23]:
# grid_search.fit(X_train_selected, data.y_train)


In [24]:
# best_params = grid_search.best_params_
# best_score = grid_search.best_score_

In [25]:
# print("Best parameters found: ", best_params)
# print("Best cross-validation score: ", best_score)


In [26]:
# test_rmse = np.sqrt(mean_squared_error(data.y_test, y_pred))
# test_mae = mean_absolute_error(data.y_test, y_pred)
# test_r2 = r2_score(data.y_test, y_pred)

# print("Test RMSE: ", test_rmse)
# print("Test MAE: ", test_mae)
# print("Test R²: ", test_r2)

## Random Forest

In [27]:
rf_param_grid = {
    'n_estimators': [50, 100, 200],
    'max_features': ['auto', 'sqrt', 'log2'],
    'max_depth': [4, 6, 8, 10],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

In [28]:
rf_model = RandomForestRegressor(random_state=42)
grid_search_rf = GridSearchCV(rf_model, rf_param_grid, cv=3, verbose=2, n_jobs=-1, scoring='neg_mean_squared_error')


In [29]:

grid_search_rf.fit(X_train_selected, data.y_train)

Fitting 3 folds for each of 324 candidates, totalling 972 fits
[CV] END max_depth=4, max_features=auto, min_samples_leaf=1, min_samples_split=2, n_estimators=50; total time=   0.0s[CV] END max_depth=4, max_features=auto, min_samples_leaf=1, min_samples_split=2, n_estimators=100; total time=   0.0s

[CV] END max_depth=4, max_features=auto, min_samples_leaf=1, min_samples_split=2, n_estimators=200; total time=   0.0s
[CV] END max_depth=4, max_features=auto, min_samples_leaf=1, min_samples_split=2, n_estimators=100; total time=   0.0s
[CV] END max_depth=4, max_features=auto, min_samples_leaf=1, min_samples_split=2, n_estimators=200; total time=   0.0s
[CV] END max_depth=4, max_features=auto, min_samples_leaf=1, min_samples_split=2, n_estimators=50; total time=   0.0s
[CV] END max_depth=4, max_features=auto, min_samples_leaf=1, min_samples_split=2, n_estimators=100; total time=   0.0s
[CV] END max_depth=4, max_features=auto, min_samples_leaf=1, min_samples_split=2, n_estimators=50; total t



324 fits failed out of a total of 972.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
124 fits failed with the following error:
Traceback (most recent call last):
  File "/Users/ruyzambrano/Desktop/Fun/eurovision_data/.venv/lib/python3.12/site-packages/sklearn/model_selection/_validation.py", line 888, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/Users/ruyzambrano/Desktop/Fun/eurovision_data/.venv/lib/python3.12/site-packages/sklearn/base.py", line 1466, in wrapper
    estimator._validate_params()
  File "/Users/ruyzambrano/Desktop/Fun/eurovision_data/.venv/lib/python3.12/site-packages/sklearn/base.py", line 666, in _validate_params
    validate_parameter_constraints(
  File "/Users/ruyzambrano/Desktop/

In [30]:
best_params_rf = grid_search_rf.best_params_
best_rf_model = grid_search_rf.best_estimator_
print("Best parameters: ", best_params_rf)



Best parameters:  {'max_depth': 10, 'max_features': 'sqrt', 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 100}


In [31]:
y_pred_rf = best_rf_model.predict(X_test_selected)

In [32]:
test_rmse_rf = np.sqrt(mean_squared_error(data.y_test, y_pred_rf))
test_mae_rf = mean_absolute_error(data.y_test, y_pred_rf)
test_r2_rf = r2_score(data.y_test, y_pred_rf)

In [33]:
print("Random Forest Regressor Test RMSE: ", test_rmse_rf)
print("Random Forest Regressor Test MAE: ", test_mae_rf)
print("Random Forest Regressor Test R²: ", test_r2_rf)

Random Forest Regressor Test RMSE:  4.927585708909978
Random Forest Regressor Test MAE:  4.094553564455989
Random Forest Regressor Test R²:  0.508100252800584


In [34]:
best_model = grid_search_rf.best_estimator_
y_pred = best_model.predict(X_test_selected)

## Gradient Boosting

In [35]:
gb_param_grid = {
    'n_estimators': [100, 200, 300],
    'learning_rate': [0.01, 0.1, 0.2],
    'max_depth': [3, 4, 5],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

In [36]:
gb_model = GradientBoostingRegressor(random_state=42)

In [37]:
grid_search_gb = GridSearchCV(gb_model, gb_param_grid, cv=3, verbose=2, n_jobs=-1, scoring='neg_mean_squared_error')

In [38]:
grid_search_gb.fit(X_train_selected, data.y_train)

Fitting 3 folds for each of 243 candidates, totalling 729 fits
[CV] END learning_rate=0.01, max_depth=3, min_samples_leaf=1, min_samples_split=2, n_estimators=100; total time=   3.8s
[CV] END learning_rate=0.01, max_depth=3, min_samples_leaf=1, min_samples_split=2, n_estimators=100; total time=   3.9s
[CV] END learning_rate=0.01, max_depth=3, min_samples_leaf=1, min_samples_split=2, n_estimators=100; total time=   4.1s
[CV] END learning_rate=0.01, max_depth=3, min_samples_leaf=1, min_samples_split=2, n_estimators=200; total time=   7.8s
[CV] END learning_rate=0.01, max_depth=3, min_samples_leaf=1, min_samples_split=5, n_estimators=100; total time=   3.7s
[CV] END learning_rate=0.01, max_depth=3, min_samples_leaf=1, min_samples_split=2, n_estimators=200; total time=   7.9s
[CV] END learning_rate=0.01, max_depth=3, min_samples_leaf=1, min_samples_split=2, n_estimators=200; total time=   8.2s
[CV] END learning_rate=0.01, max_depth=3, min_samples_leaf=1, min_samples_split=5, n_estimators=1

In [39]:
best_params_gb = grid_search_gb.best_params_
best_gb_model = grid_search_gb.best_estimator_
print("Best parameters: ", best_params_gb)

Best parameters:  {'learning_rate': 0.2, 'max_depth': 5, 'min_samples_leaf': 2, 'min_samples_split': 5, 'n_estimators': 300}


In [40]:
y_pred_gb = best_gb_model.predict(X_test_selected)


In [41]:
test_rmse_gb = np.sqrt(mean_squared_error(data.y_test, y_pred_gb))
test_mae_gb = mean_absolute_error(data.y_test, y_pred_gb)
test_r2_gb = r2_score(data.y_test, y_pred_gb)


print("Test RMSE: ", test_rmse_gb)
print("Test MAE: ", test_mae_gb)
print("Test R²: ", test_r2_gb)

Test RMSE:  1.303510035152193
Test MAE:  0.4071617378448259
Test R²:  0.9655779300126035


## XGBoost

In [42]:
xgb_model = xgb.XGBRegressor(objective='reg:squarederror', random_state=42)


In [43]:
xgb_param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [3, 4, 5],
    'learning_rate': [0.01, 0.1, 0.2],
    'colsample_bytree': [0.3, 0.7]
}

In [44]:
grid_search_xgb = GridSearchCV(xgb_model, xgb_param_grid, cv=3, verbose=2, n_jobs=-1, scoring='neg_mean_squared_error')


In [45]:
grid_search_xgb.fit(X_train_selected, data.y_train)


Fitting 3 folds for each of 54 candidates, totalling 162 fits
[CV] END colsample_bytree=0.3, learning_rate=0.01, max_depth=3, n_estimators=100; total time=   0.5s
[CV] END colsample_bytree=0.3, learning_rate=0.01, max_depth=3, n_estimators=100; total time=   0.5s
[CV] END colsample_bytree=0.3, learning_rate=0.01, max_depth=3, n_estimators=100; total time=   0.5s
[CV] END colsample_bytree=0.3, learning_rate=0.01, max_depth=3, n_estimators=200; total time=   0.6s
[CV] END colsample_bytree=0.3, learning_rate=0.01, max_depth=3, n_estimators=200; total time=   0.6s
[CV] END colsample_bytree=0.3, learning_rate=0.01, max_depth=3, n_estimators=200; total time=   0.7s
[CV] END colsample_bytree=0.3, learning_rate=0.01, max_depth=3, n_estimators=300; total time=   0.8s
[CV] END colsample_bytree=0.3, learning_rate=0.01, max_depth=4, n_estimators=100; total time=   0.3s
[CV] END colsample_bytree=0.3, learning_rate=0.01, max_depth=4, n_estimators=100; total time=   0.3s
[CV] END colsample_bytree=0.3

In [46]:
best_params_xgb = grid_search_xgb.best_params_
best_xgb_model = grid_search_xgb.best_estimator_
print("Best parameters: ", best_params_xgb)

Best parameters:  {'colsample_bytree': 0.7, 'learning_rate': 0.2, 'max_depth': 5, 'n_estimators': 300}


In [47]:
y_pred_xgb = best_xgb_model.predict(X_test_selected)


In [48]:
test_rmse_xgb = np.sqrt(mean_squared_error(data.y_test, y_pred_xgb))
test_mae_xgb = mean_absolute_error(data.y_test, y_pred_xgb)
test_r2_xgb = r2_score(data.y_test, y_pred_xgb)


print("Test RMSE: ", test_rmse_xgb)
print("Test MAE: ", test_mae_xgb)
print("Test R²: ", test_r2_xgb)

Test RMSE:  1.1068566868340788
Test MAE:  0.5042680718293175
Test R²:  0.9751806259155273


## Get Report

In [49]:
results = feat_selector.evaluate(estimator=SVC(), data=data, metrics=["RMSE", "MAE", "MAPE", "R2", "NSE", "KGE"])



divide by zero encountered in divide


invalid value encountered in divide



In [50]:
output = f"""
Run at: {datetime.now()}
Weights = {weights}
Outputs:
"""
for key in results.keys():
    output += f"\t{key}: {results[key]}\n"

# output += f"""GridSearch Results
# \tBest parameters: {best_params}
# \tBest cross-validation score: {best_score}
# \tTest RMSE: : {test_rmse}
# \tTest MAE: {test_mae}
# \tTest R²: {test_r2}"""

output += """Random Forest Regressor:
\tTest RMSE: {test_rmse_rf}
\tTest MAE: {test_mae_rf}
\tTest R²: {test_r2_rf}
\tBest parameters: 
"""
for key in best_params_rf.keys():
    output += f"\t{key}: {best_params_rf[key]}\n"

output += """Gradient Boosting
\tTest RMSE: {test_rmse_gb}
\tTest MAE: {test_mae_gb}
\tTest R²: {test_r2_gb}
\tBest parameters: 
"""
for key in best_params_gb.keys():
    output += f"\t{key}: {best_params_gb[key]}\n"

output += """XGBoost
\tTest RMSE: {test_rmse_xgb}
\tTest MAE: {test_mae_xgb}
\tTest R²: {test_r2_xgb}
\tBest parameters: 
"""
for key in best_params_xgb.keys():
    output += f"\t{key}: {best_params_xgb[key]}\n"

with open("outputs/outputs.txt", "a", encoding="UTF-8") as f:
    f.write(output)


In [51]:
print(output)


Run at: 2024-10-08 17:42:38.341171
Weights = [0.9, 0.1]
Outputs:
	RMSE_train: 6.084838363944652
	MAE_train: 3.590181430096051
	MAPE_train: 1.0
	R2_train: 0.22216616176003945
	NSE_train: 0.22216616176003945
	KGE_train: 0.5105487803786962
	RMSE_test: 7.102080315510791
	MAE_test: 4.908961593172119
	MAPE_test: 1.0
	R2_test: -0.02183172925371446
	NSE_test: -0.02183172925371446
	KGE_test: 0.32638202381968373
Random Forest Regressor:
	Test RMSE: {test_rmse_rf}
	Test MAE: {test_mae_rf}
	Test R²: {test_r2_rf}
	Best parameters: 
	max_depth: 10
	max_features: sqrt
	min_samples_leaf: 1
	min_samples_split: 2
	n_estimators: 100
Gradient Boosting
	Test RMSE: {test_rmse_gb}
	Test MAE: {test_mae_gb}
	Test R²: {test_r2_gb}
	Best parameters: 
	learning_rate: 0.2
	max_depth: 5
	min_samples_leaf: 2
	min_samples_split: 5
	n_estimators: 300
XGBoost
	Test RMSE: {test_rmse_xgb}
	Test MAE: {test_mae_xgb}
	Test R²: {test_r2_xgb}
	Best parameters: 
	colsample_bytree: 0.7
	learning_rate: 0.2
	max_depth: 5
	n_esti