## MP-3 Project - Analysis of Eurovision Song Contest (ESC) semifinal running order and results

In [None]:
import pandas as pd, numpy as np, seaborn as sbn, matplotlib.pyplot as plt

from Modules import utils as utl
from Modules import data_exploration as de
from sklearn.model_selection import train_test_split
from sklearn import linear_model
from sklearn.linear_model import LinearRegression
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
import sklearn.metrics as sm
import pickle

In [None]:
# Path of the cleaned semifinalists dataset
csv_file_path = 'Data/semifinalists_cleaned.csv'

# Load the CSV through the project helper, skipping no rows and
# requesting the windows-1252 encoding
eurovision_semi_df = utl.load_csv(
    csv_file_path,
    skiprows=0,
    encoding='windows-1252',
)

In [None]:
# Show the loaded DataFrame as the cell output for a first visual check
eurovision_semi_df

In [None]:
# (number of rows, number of columns) of the loaded DataFrame
eurovision_semi_df.shape

In [None]:
# Plot the response (semi_place) against the feature semi_draw_position
sbn.pairplot(
    eurovision_semi_df,
    x_vars=['semi_draw_position'],
    y_vars='semi_place',
    height=5,
    aspect=0.8,
)
plt.show()

In [None]:
# histograms: draw one per column that DataFrame.hist() can plot and keep
# the returned axes in `hist`
hist = eurovision_semi_df.hist()

### Investigating the gap between 7.5 and 10 in the histograms of semi_draw_position and semi_place

In [None]:
# Histogram of the draw position alone, to look at the gap more closely
eurovision_semi_df['semi_draw_position'].hist()

In [None]:
# Columns to remove when building the reduced `semi_places` view
not_semi_places = [
    'semi_final',
    'country',
    'style',
    'semi_televote_points',
    'semi_jury_points',
    'semi_total_points',
]

In [None]:
# New DataFrame without the columns named in not_semi_places
# (the original DataFrame is left untouched)
semi_places = eurovision_semi_df.drop(not_semi_places, axis=1)

In [None]:
# Print the column names, non-null counts and dtypes of the reduced DataFrame
semi_places.info()

### Back to exploration of data

In [None]:
# Count the missing values in every column
eurovision_semi_df.isna().sum()

In [None]:
# Summary statistics of the DataFrame's columns
eurovision_semi_df.describe()

In [None]:
# Keep only the columns listed below; every other column (for example
# 'country' and 'style') is dropped from eurovision_semi_df from here on
eurovision_semi_num = [
    'semi_final',
    'semi_draw_position',
    'semi_televote_points',
    'semi_jury_points',
    'semi_total_points',
    'semi_place',
]
eurovision_semi_df = eurovision_semi_df[eurovision_semi_num]

In [None]:
# Names of the columns used as features
feature_cols = [
    'semi_final',
    'semi_draw_position',
]

# Feature matrix: just those columns of the DataFrame
X = eurovision_semi_df[feature_cols]

# Preview the first 5 rows
X.head(5)

In [None]:
# Response vector: the semi_place column as a Series
y = eurovision_semi_df.loc[:, 'semi_place']

# Preview the first 5 values
y.head(5)

In [None]:
# Report the type of X and its shape, one per line
print(type(X), X.shape, sep='\n')

In [None]:
# Report the type of y and its shape, one per line
print(type(y), y.shape, sep='\n')

In [None]:
import os
import sys

# __file__ is not used here; the current working directory serves as the base path
current_directory = os.getcwd()

# Put <cwd>/Modules on the import path so its modules can be imported by bare name
# (adjust the path if the Modules folder lives somewhere else)
sys.path.append(os.path.join(current_directory, 'Modules'))

# Import the helper module itself, plus its public names for unqualified use
# (a later cell calls load_model without the module prefix)
import machine_learning
from machine_learning import *

In [None]:
# Build the train/test features and targets with the project helper,
# using 'semi_place' as the target column.
# NOTE(review): at this point eurovision_semi_df still contains
# semi_televote_points, semi_jury_points and semi_total_points, and the
# feature_cols / X / y defined in earlier cells are not used here. If
# prepare_data treats every non-target column as a feature, the points
# columns are part of the model input - confirm this is intended.
(X_train, X_test, y_train, y_test) = machine_learning.prepare_data(
    eurovision_semi_df,
    'semi_place',
)

In [None]:
# Two-step pipeline: fill missing values with the column mean, then fit
# an ordinary least squares linear regression
model = Pipeline([
    ('imputer', SimpleImputer(strategy='mean')),
    ('regressor', LinearRegression()),
])

In [None]:
# Fit the imputer and the regressor on the training data; the fitted
# pipeline is returned and shown as the cell output
model.fit(X_train, y_train)

In [None]:
# Grab the linear regression step out of the pipeline to inspect its parameters
reg = model.named_steps.regressor

In [None]:
# Raw model parameters
print("Intercept:", reg.intercept_)
print("Coefficients:", reg.coef_)

# Pair each training column with its coefficient and print an aligned table:
# name padded to 30 characters, coefficient right-aligned with 4 decimals
feature_names = X_train.columns
coef_table = list(zip(feature_names, reg.coef_))
for name, coef in coef_table:
    print("{:30} {:>10.4f}".format(name, coef))


In [None]:
# Evaluate the fitted pipeline on the held-out test data using the project helper
# (presumably reports regression metrics - confirm in Modules/machine_learning)
machine_learning.evaluate_model(model, X_test, y_test)

In [None]:
# Persist the fitted pipeline under the name 'semifinalpredictions' via the
# project helper (storage format and location are defined in save_model)
machine_learning.save_model(model, 'semifinalpredictions')

In [None]:
# Load the model saved under 'semifinalpredictions' back in and show it as the
# cell output; called through the module like the other machine_learning helpers
machine_learning.load_model('semifinalpredictions')