In [1]:
import pandas as pd, numpy as np, seaborn as sbn, matplotlib.pyplot as plt

from Modules import utils as utl
from Modules import data_exploration as de
from sklearn.model_selection import train_test_split
from sklearn import linear_model
from sklearn.linear_model import LinearRegression
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
import sklearn.metrics as sm
import pickle

In [2]:
# Location of the cleaned finalists dataset, relative to the working directory.
csv_file_path = 'Data/finalists_cleaned.csv'
# The CSV is decoded as Windows-1252 text.
eurovision_df = pd.read_csv(
    csv_file_path,
    encoding='windows-1252',
)

In [3]:
# Echo the loaded finalists frame so the notebook renders it as a table.
eurovision_df

Unnamed: 0,year,final_draw_position,country,style,final_televote_points,final_jury_points,final_televote_votes,final_jury_votes,final_place,final_total_points
0,2023,20,Norway,Pop,216.0,52.0,36.0,11.0,5.0,268.0
1,2023,5,Serbia,Pop,16.0,14.0,4.0,6.0,24.0,30.0
2,2023,2,Portugal,Pop,16.0,43.0,3.0,9.0,23.0,59.0
3,2023,25,Croatia,Pop,112.0,11.0,20.0,2.0,13.0,123.0
4,2023,3,Switzerland,Ballad,31.0,61.0,10.0,15.0,20.0,92.0
...,...,...,...,...,...,...,...,...,...,...
353,2009,3,France,Ballad,54.0,164.0,,,8.0,218.0
354,2009,10,Russia,Ballad,118.0,67.0,,,11.0,185.0
355,2009,17,Germany,Pop,18.0,73.0,,,20.0,91.0
356,2009,23,United Kingdom,Ballad,105.0,223.0,,,5.0,328.0


In [7]:
# Columns that are removed before the jury/televote analysis.
dropped_columns = ['style', 'final_televote_votes', 'final_jury_votes']
# `drop` returns a new frame; `eurovision_df` itself is left untouched.
jury_televote = eurovision_df.drop(dropped_columns, axis='columns')

In [9]:
# Echo the reduced frame to confirm the three columns are gone.
jury_televote

Unnamed: 0,year,final_draw_position,country,final_televote_points,final_jury_points,final_place,final_total_points
0,2023,20,Norway,216.0,52.0,5.0,268.0
1,2023,5,Serbia,16.0,14.0,24.0,30.0
2,2023,2,Portugal,16.0,43.0,23.0,59.0
3,2023,25,Croatia,112.0,11.0,13.0,123.0
4,2023,3,Switzerland,31.0,61.0,20.0,92.0
...,...,...,...,...,...,...,...
353,2009,3,France,54.0,164.0,8.0,218.0
354,2009,10,Russia,118.0,67.0,11.0,185.0
355,2009,17,Germany,18.0,73.0,20.0,91.0
356,2009,23,United Kingdom,105.0,223.0,5.0,328.0


In [13]:
# Number of missing values in each column of jury_televote.
jury_televote.isnull().sum()

year                      0
final_draw_position       0
country                   0
final_televote_points    32
final_jury_points        32
final_place               0
final_total_points        0
dtype: int64

In [17]:
# Boolean mask: True for every row that has at least one missing value.
has_missing = jury_televote.isna().any(axis='columns')
# Keep only the incomplete rows (boolean indexing returns a new frame).
null_rows = jury_televote[has_missing]

# Render the incomplete rows.
null_rows

Unnamed: 0,year,final_draw_position,country,final_televote_points,final_jury_points,final_place,final_total_points
212,2014,24,Netherlands,,,2.0,0.0
231,2013,7,Estonia,,,20.0,0.0
232,2013,18,Denmark,,,1.0,0.0
233,2013,10,Russia,,,5.0,0.0
234,2013,22,Ukraine,,,3.0,0.0
235,2013,13,Netherlands,,,9.0,0.0
236,2013,2,Lithuania,,,22.0,0.0
237,2013,8,Belarus,,,16.0,0.0
238,2013,3,Moldova,,,11.0,0.0
239,2013,26,Ireland,,,26.0,0.0


In [None]:

# Number of missing values in each column of the finalists frame.
eurovision_df.isnull().sum()


# In[20]:


# Summary statistics for the semifinal frame.
# NOTE(review): eurovision_semi_df is not created in the cells shown here — confirm it is
# loaded earlier in the notebook so this survives Restart & Run All.
eurovision_semi_df.describe()


# In[21]:


# Pass the semifinal draw position and semifinal place columns to the project's lmplot helper.
de.vs.lmplot(eurovision_semi_df, 'semi_draw_position', 'semi_place')
# the smaller the semi_place - the better


# In[22]:


# Same two columns, this time through the project's boxplot helper.
de.vs.boxplot(eurovision_semi_df, 'semi_draw_position', 'semi_place')


# In[23]:


# Restrict the semifinal frame to the columns listed below so that string columns are left out.
eurovision_semi_num = [
    'year',
    'semi_final',
    'semi_draw_position',
    'semi_televote_points',
    'semi_jury_points',
    'semi_total_points',
    'semi_place',
]
# The name is rebound to the reduced frame; every later cell works on this subset.
eurovision_semi_df = eurovision_semi_df[eurovision_semi_num]


# In[24]:


# Correlation heatmap of the reduced frame, drawn by the project helper.
de.vs.correlation_heatmap(eurovision_semi_df)


# ### Separating the non-numeric data from the data we want to train and test on

# In[26]:


# Names of the columns used as features
feature_cols = ['semi_final', 'semi_draw_position']

# Feature matrix: all rows, only the feature columns
X = eurovision_semi_df.loc[:, feature_cols]

# Preview the first 5 rows
X.head(5)


# In[27]:


# Target vector: the semifinal place column as a Series
y = eurovision_semi_df.loc[:, 'semi_place']

# Preview the first 5 values
y.head(5)


# In[28]:


# Sanity check on X: container type on the first line, (rows, columns) on the second
print(type(X), X.shape, sep='\n')


# In[29]:


# Sanity check on y: container type on the first line, shape on the second
print(type(y), y.shape, sep='\n')


# In[30]:


import sys
import os

# Use the current working directory instead of __file__
current_directory = os.getcwd()

# Make the 'Modules' folder importable by appending it to sys.path.
# The folder is looked up directly under the current working directory.
sys.path.append(os.path.join(current_directory, 'Modules'))

# Now import the machine_learning helper module (used below as machine_learning.<function>)
import machine_learning
# NOTE(review): the wildcard import also copies every public name of machine_learning into
# the notebook namespace; presumably this is where the bare `load_model` call further down
# comes from — confirm, and prefer the explicit `machine_learning.` prefix.
from machine_learning import *


# ### Training and testing

# In[32]:


# Build the train/test features and targets with the project helper; 'semi_place' is passed
# as the second argument (presumably the target column — verify against prepare_data).
# NOTE(review): the whole eurovision_semi_df is passed here, not the X / y selected from
# `feature_cols` above. The coefficient write-up below lists six features, including the
# three points columns, so the model appears to be trained on every remaining column.
# Since the place is presumably derived from the points, this looks like target leakage —
# confirm this is intended.
X_train, X_test, y_train, y_test = machine_learning.prepare_data(eurovision_semi_df, 'semi_place')


# In[33]:


# Two-stage estimator: NaN values (e.g. in the points columns) are first replaced by the
# column mean, and the completed data is then passed to a linear regression.
pipeline_steps = [
    ('imputer', SimpleImputer(strategy='mean')),
    ('regressor', LinearRegression()),
]
model = Pipeline(steps=pipeline_steps)


# In[34]:


# Fit both pipeline steps (imputer, then regressor) on the training split.
model.fit(X_train, y_train)


# In[35]:


# Pull the fitted regression step out of the pipeline by its step name
# so its intercept and coefficients can be inspected.
reg = model.named_steps['regressor']


# In[36]:


# Raw fitted parameters of the regression step.
print("Intercept:", reg.intercept_)
print("Coefficients:", reg.coef_)

# Pair every training column with its coefficient and print one aligned row per feature:
# the name is padded to 30 characters, the coefficient is right-aligned with 4 decimals.
feature_names = X_train.columns
coef_table = list(zip(feature_names, reg.coef_))
for row in coef_table:
    name, coef = row
    print(f"{name:30} {coef:>10.4f}")


# ####
# Intercept: -536.99
#  The intercept is the predicted semifinal place when all features are 0, which has little practical interpretation in this case (there is no year 0). If it did, a negative value this large would be a reason for concern that the values aren't coherent with each other; here it mostly just offsets the 'Year' term (0.2738 × roughly 2016 ≈ 552).
# 
#  Coefficients:
# 1. 'Year' (0.2738):
#        A one-year increase is associated with a slight increase (0.27) in the predicted semifinal place. 
#         This suggests that over the years, the rankings have slightly moved upwards on average - aka they have gotten worse.
# 2. 'Semi Final' (0.1695): 
#     Moving from one semifinal to another is associated with a small increase (0.17) in the predicted place (so semi 2 is worse than semi 1), 
#     reflecting possible differences in competition between the two semifinals. Perhaps because when the number of semifinalists is odd, one of the semifinals will have one more entry than the other - making it less likely to qualify from that semifinal.
# 3. 'Semi Draw Position' (-0.0332): 
#     A later performance position (higher number) is associated with a slight improvement (lower `semi_place`) in ranking. 
#     This supports the idea that later performances might be more memorable or have an advantage, potentially due to recency bias.
# 4. 'Semi Televote Points' (-0.0048): 
#     As televote points increase, the predicted semifinal place improves (i.e., a better placement). 
#     This is intuitive since more televote points would likely lead to a better semifinal result.
# 5. 'Semi Jury Points' (0.0093): 
#     A higher number of jury points is associated with a worse semifinal place (higher `semi_place`), which is unexpected and suggests that higher jury points alone don't necessarily guarantee a better place in the semifinal. Perhaps the televote disagrees more strongly with jury favorites than juries do with televote favorites, leading to jury favorites performing slightly worse overall in the total result? We also have to keep in mind that the total result has, in most of the recorded years (2009-2023), been based 50%/50% on jury and televote - with the exception of 2023 onwards, where the juries were removed from the semifinals entirely while keeping the same weight in the final. Note also that the total points are included as a separate feature and are largely the sum of jury and televote points, so these three coefficients are strongly interdependent and the sign of a single one should be interpreted with caution.
# 6. 'Semi Total Points' (-0.0597): 
#     An increase in total points (combination of jury and televote) is associated with a better semifinal position 
#     (lower `semi_place`), which is consistent with the idea that more points lead to a higher placement.
# 
# A negative coefficient generally represents a negative correlation with semi_place - which is actually good - because it means the feature in question "negatively" impacts the place result (the number becomes lower), and the lower the place number, the better, as 1 is the best result here. For example, a high number of points should give you a low place number - like 3rd place - and vice versa, a low number of points would give you a "high" placing like 15th, which would make it less likely to qualify.
# 

# In[38]:


# Pairwise correlations between the three semifinal points columns.
points_columns = ['semi_jury_points', 'semi_televote_points', 'semi_total_points']
correlation_matrix = eurovision_semi_df[points_columns].corr()
print(correlation_matrix)


# ##### Jury points seem to align slightly more with the overall (total) result than the televote points do

# In[40]:


# Score the fitted pipeline on X_test / y_test with the project helper.
machine_learning.evaluate_model(model, X_test, y_test)


# In[41]:


# NOTE(review): saving is commented out, so the load below presumably relies on a
# 'semifinalpredictions' artifact written by an earlier run — confirm it exists,
# otherwise the notebook will not survive Restart & Run All.
#machine_learning.save_model(model, 'semifinalpredictions')


# In[ ]:





# In[42]:


# NOTE(review): the return value is not assigned, so the metrics cell that follows still
# scores the in-memory `model`, not whatever is loaded here. `load_model` is not defined in
# this notebook; presumably it comes from `from machine_learning import *` — confirm.
load_model('semifinalpredictions')


# In[43]:


from sklearn import metrics

# Predict once on the test split and reuse the predictions for every metric below.
y_predicted = model.predict(X_test)

# The MSE is computed a single time; the RMSE further down is derived from this value.
mse = metrics.mean_squared_error(y_test, y_predicted)

print("Mean Absolute Error (MAE) = ", metrics.mean_absolute_error(y_test, y_predicted))

print("Mean Squared Error (MSE) = ", mse)


# RMSE is the square root of the MSE, which brings the error back to the unit of the target
print("Root Mean Squared Error (RMSE) = ", np.sqrt(mse))


# R-squared
r2_score = metrics.r2_score(y_test, y_predicted)
print ("R2 (R-squared) score = ", r2_score)
# Explained variance, rounded to 4 decimals; taken from the same `metrics` module as the
# other scores instead of mixing two aliases of sklearn.metrics in one cell.
eV = round(metrics.explained_variance_score(y_test, y_predicted), 4)
print('Explained variance score = ',eV )

# Visualise the regression results: actual values on x, predictions on y.
# Axis labels are set so the figure can be read on its own.
plt.title('Multiple Linear Regression')
plt.scatter(y_test, y_predicted, color='magenta')
plt.xlabel("Actual Values")
plt.ylabel("Predicted Values")
plt.show()




# ##### 
# - MAE = on average, the predicted semifinal places are off by approximately 2.2 places - so a predicted 10th place would typically correspond to an actual result between roughly 7.8 and 12.2 - which is pretty close.
# - MSE = A value of 7.04 indicates that, on average, the squared difference between predicted and actual places is 7.04; because the errors are squared, this metric in particular highlights large errors.
# - RMSE = A value of 2.65 is the square root of the MSE, which brings the error back to the unit of places: the typical prediction is off by around 2.65 places (with larger misses weighted more heavily than in the MAE) - which is also still reasonable.
# - Both R2 and the explained variance score are 0.68 - which means the model explains 68% of the total variance in the semifinal results.

# In[45]:


# Residual = actual minus predicted, so points above the dashed zero line were
# predicted too low and points below it were predicted too high.
residuals = y_test - y_predicted

plt.title("What is the deviation between predicted vs actual results? The line is the actual result")
plt.xlabel("Predicted")
plt.ylabel("Residuals")
plt.scatter(y_predicted, residuals)
plt.axhline(y=0, color='red', linestyle='--')
plt.show()


# In[46]:


# End points of the reference diagonal: smallest and largest actual value.
lowest, highest = min(y_test), max(y_test)

plt.title("Actual vs Predicted")
plt.xlabel("Actual Values")
plt.ylabel("Predicted Values")
plt.scatter(y_test, y_predicted, color='magenta')
# Line of perfect prediction (predicted == actual)
plt.plot([lowest, highest], [lowest, highest], color='red', linestyle='--')
plt.show()


# #### Since Eurovision results have a tendency to be quite unpredictable, we don't expect these predictions to be entirely accurate. But a variance score of 68% is pretty alright for a song contest with a small number of possible positions - in theory up to 26 - but usually fewer than 20. That means of course that there is 32% of the result variance in the semifinals we don't account for. This can generally just stem from the unpredictability of Eurovision results - like country-specific tendencies (diaspora voting, political voting, bloc-voting etc.), or just discrepancies between the juries and televotes.

# In[ ]:
