## MP-3 Project

In [1]:
import pandas as pd, numpy as np, seaborn as sbn, matplotlib.pyplot as plt

from Modules import utils as utl
from Modules import data_exploration as de
from Modules import machine_learning as ml

In [2]:
from sklearn.model_selection import train_test_split
from sklearn import linear_model
from sklearn.linear_model import LinearRegression
import sklearn.metrics as sm

In [3]:
import pickle

### Data Loading & Cleaning

In [7]:
# Load the raw song dataset into `eurovision_df`.
# NOTE(review): `utl.load_csv` is a project helper not visible here (presumably a
# thin wrapper around `pd.read_csv` — confirm). The file is read as windows-1252.
csv_file_path = 'Data/song_data.csv'
eurovision_df = utl.load_csv(csv_file_path, skiprows=0, encoding='windows-1252')

In [9]:
# Preview the raw frame; pandas truncates the display to the first and last rows.
eurovision_df

Unnamed: 0,year,semi_final,semi_draw_position,final_draw_position,country,artist_name,song_name,language,style,direct_qualifier_10,...,final_jury_votes,final_place,final_total_points,semi_place,semi_televote_points,semi_jury_points,semi_total_points,favourite_10,race,host_10
0,2023,1,1,20,Norway,Alessandra,Queen of Kings,English,Pop,0,...,11.0,5.0,268.0,6.0,102.0,,102.0,0,unknown,0
1,2023,1,2,,Malta,The Busker,Dance (Our Own Party),English,Pop,-,...,,,,15.0,3.0,,3.0,0,unknown,0
2,2023,1,3,5,Serbia,Luke Black,Samo mi se spava,"Serbian, English",Pop,0,...,6.0,24.0,30.0,10.0,37.0,,37.0,0,unknown,0
3,2023,1,4,,Latvia,Sudden Lights,Aija,English,Rock,-,...,,,,11.0,34.0,,34.0,0,unknown,0
4,2023,1,5,2,Portugal,Mimicat,Ai coração,Portuguese,Pop,0,...,9.0,23.0,59.0,9.0,74.0,,74.0,0,unknown,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
560,2009,-,-,3,France,Patricia Kaas,Et s'il fallait le faire,French,Ballad,1,...,,8.0,218.0,,,,,0,unknown,0
561,2009,-,-,10,Russia,Anastasiya Prikhodko,Mamo,"Russian, Ukrainian",Ballad,1,...,,11.0,185.0,,,,,0,unknown,1
562,2009,-,-,17,Germany,Alex Swings Oscar Sings!,Miss Kiss Kiss Bang,English,Pop,1,...,,20.0,91.0,,,,,0,unknown,0
563,2009,-,-,23,United Kingdom,Jade Ewen,It's My Time,English,Ballad,1,...,,5.0,328.0,,,,,0,unknown,0


In [11]:
# Count missing values per column to see which features are incomplete.
eurovision_df.isna().sum()

year                       0
semi_final                 0
semi_draw_position         0
final_draw_position       11
country                    0
artist_name                0
song_name                  0
language                   0
style                      0
direct_qualifier_10        0
gender                     0
main_singers               0
age                        0
selection                  0
key                        0
BPM                        0
energy                     0
danceability               0
happiness                  0
loudness                  90
acousticness               0
instrumentalness           0
liveness                   0
speechiness                0
release_date               0
key_change_10              0
backing_dancers            0
backing_singers            0
backing_instruments        0
instrument_10              0
qualified_10               0
final_televote_points    237
final_jury_points        237
final_televote_votes     332
final_jury_vot

##### The dataset has numerous null values; however, this is due to the nature of the Eurovision Song Contest rather than to faulty data. For instance, every contestant who performed in a semi-final but did not make it through to the final has null values in all final-related features. Likewise, some semi-final features are expected to be null, because some nations qualify automatically for the final each year.

In [13]:
# Keep only the entries that reached the final.
# NOTE(review): `utl.remove_missing` is a project helper not visible here; it is
# assumed to drop rows whose `final_place` is null. The later `.info()` output
# (358 rows, `final_place` fully non-null) is consistent with that.
finalists_df = utl.remove_missing(eurovision_df, columns='final_place')

##### Now we have two dataframes: `eurovision_df` with all the data, and `finalists_df` with only the songs that made it to the final.

In [None]:
# Re-check missing values per column, now restricted to the finalists.
finalists_df.isna().sum()

##### Next, we remove the columns we don't need.

In [15]:
# Columns excluded from the cleaned frames. The same list is dropped from both
# `eurovision_df` and `finalists_df`, so it is defined once here.
columns_to_remove = [
    'artist_name',
    'song_name',
    'BPM',
    'energy',
    'danceability',
    'happiness',
    'loudness',
    'acousticness',
    'instrumentalness',
    'liveness',
    'speechiness',
    'backing_dancers',
    'backing_singers',
    'backing_instruments',
    'instrument_10',
    'race',
    'favourite_10',
    'host_10',
    'age',
    'gender',
    'main_singers',
    'key',
    'key_change_10',
    'selection',
    'release_date',
    'direct_qualifier_10',
    'qualified_10',
    'language',
]

In [17]:
# Strip the unused feature columns from the full frame and from the
# finalists-only frame. `drop` returns new frames, so the inputs are unchanged.
eurovision_clean_df = eurovision_df.drop(labels=columns_to_remove, axis="columns")
finalists_clean_df = finalists_df.drop(labels=columns_to_remove, axis="columns")

In [None]:
# Spot-check five random rows. The seed is fixed so the preview shows the same
# rows on every Restart & Run All.
eurovision_clean_df.sample(5, random_state=42)

In [None]:
# Persist the column-reduced full dataset; the DataFrame index is not written.
eurovision_clean_df.to_csv('Data/eurovision_cleaned.csv', index=False)

In [19]:
# Semi-final columns that are additionally dropped from the finalists-only frame.
more_columns_to_remove = [
    'semi_final',
    'semi_draw_position',
    'semi_place',
    'semi_televote_points',
    'semi_jury_points',
    'semi_total_points',
]

In [21]:
# Build the finalists-only frame directly from `finalists_df` instead of
# reassigning `finalists_clean_df` from itself. The self-referential form raised
# KeyError when the cell was run a second time, because the semi-final columns
# had already been dropped. The result is the same: both column lists removed.
finalists_clean_df = finalists_df.drop(
    columns=[*columns_to_remove, *more_columns_to_remove]
)

In [29]:
# Spot-check five random finalist rows.
# NOTE(review): the sample is unseeded, so the rows shown change on every run;
# pass `random_state=...` if the preview needs to be reproducible.
finalists_clean_df.sample(5)

Unnamed: 0,year,final_draw_position,country,style,final_televote_points,final_jury_points,final_televote_votes,final_jury_votes,final_place,final_total_points
176,2018,7,Norway,Pop,84.0,60.0,18.0,12.0,15.0,144.0
273,2016,21,Ukraine,Pop,323.0,211.0,40.0,24.0,1.0,534.0
416,2012,23,Ireland,Dance,89.0,14.0,,,19.0,103.0
538,2009,6,Portugal,Traditional,45.0,64.0,,,15.0,109.0
343,2014,9,Poland,Traditional,23.0,162.0,23.0,6.0,14.0,185.0


In [None]:
# Persist the finalists-only dataset; the DataFrame index is not written.
finalists_clean_df.to_csv('Data/finalists_cleaned.csv', index=False)

### Data Exploration

In [23]:
# Dtypes and non-null counts for the full cleaned frame.
eurovision_clean_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 565 entries, 0 to 564
Data columns (total 16 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   year                   565 non-null    int64  
 1   semi_final             565 non-null    object 
 2   semi_draw_position     565 non-null    object 
 3   final_draw_position    554 non-null    object 
 4   country                565 non-null    object 
 5   style                  565 non-null    object 
 6   final_televote_points  328 non-null    float64
 7   final_jury_points      328 non-null    float64
 8   final_televote_votes   233 non-null    float64
 9   final_jury_votes       233 non-null    float64
 10  final_place            358 non-null    float64
 11  final_total_points     358 non-null    float64
 12  semi_place             483 non-null    float64
 13  semi_televote_points   243 non-null    float64
 14  semi_jury_points       212 non-null    float64
 15  semi_t

In [25]:
# Summary statistics for the numeric columns of the full cleaned frame.
eurovision_clean_df.describe()

Unnamed: 0,year,final_televote_points,final_jury_points,final_televote_votes,final_jury_votes,final_place,final_total_points,semi_place,semi_televote_points,semi_jury_points,semi_total_points
count,565.0,328.0,328.0,233.0,233.0,358.0,358.0,483.0,243.0,212.0,483.0
mean,2015.653097,90.539634,90.655488,15.184549,15.107296,13.391061,164.849162,9.192547,68.263374,67.575472,97.78882
std,4.312691,86.852074,74.096517,12.828622,9.868602,7.430208,143.603444,5.069863,50.617148,47.545328,73.844662
min,2009.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0
25%,2012.0,25.75,36.0,4.0,7.0,7.0,68.25,5.0,24.5,26.75,43.0
50%,2016.0,59.0,71.0,11.0,13.0,13.0,124.5,9.0,54.0,58.0,77.0
75%,2019.0,127.5,126.0,26.0,22.0,20.0,225.25,13.5,104.5,98.25,134.0
max,2023.0,439.0,382.0,42.0,39.0,27.0,758.0,19.0,204.0,222.0,403.0


In [27]:
# Dtypes and non-null counts for the finalists-only frame.
finalists_clean_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 358 entries, 0 to 564
Data columns (total 10 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   year                   358 non-null    int64  
 1   final_draw_position    358 non-null    object 
 2   country                358 non-null    object 
 3   style                  358 non-null    object 
 4   final_televote_points  326 non-null    float64
 5   final_jury_points      326 non-null    float64
 6   final_televote_votes   230 non-null    float64
 7   final_jury_votes       230 non-null    float64
 8   final_place            358 non-null    float64
 9   final_total_points     358 non-null    float64
dtypes: float64(6), int64(1), object(3)
memory usage: 30.8+ KB


In [None]:
# Summary statistics for the numeric columns of the finalists-only frame.
finalists_clean_df.describe()

### Data loading, cleaning and exploration of semifinal data (removal of Big-5 countries and host country)

In [None]:
# Restrict the cleaned frame to entries that have a semi-final placing.
# NOTE(review): assumes `utl.remove_missing` (project helper, not visible here)
# drops rows where `semi_place` is null — TODO confirm. Per the section heading,
# the rows removed are the Big-5 and host-country entries.
esc_sf_df = utl.remove_missing(eurovision_clean_df, columns='semi_place')

In [None]:
# Summary statistics for the semi-final subset, before the final-only columns are dropped.
esc_sf_df.describe()

In [None]:
# Dtypes and non-null counts for the semi-final subset.
esc_sf_df.info()

In [None]:
# Final-related columns that are dropped from the semi-final frame.
more_columns_to_remove_semivers = [
    'final_draw_position',
    'final_televote_points',
    'final_jury_points',
    'final_televote_votes',
    'final_jury_votes',
    'final_place',
    'final_total_points',
]

In [None]:
# Drop the final-related columns from the semi-final subset. `drop` returns a
# new frame, so `esc_sf_df` is unchanged.
esc_sf_clean_df = esc_sf_df.drop(
    labels=more_columns_to_remove_semivers,
    axis="columns",
)

In [None]:
# Persist the semi-final dataset; the DataFrame index is not written.
esc_sf_clean_df.to_csv('Data/semifinalists_cleaned.csv', index=False)

In [None]:
# Spot-check five random semi-final rows. The seed is fixed so the preview shows
# the same rows on every Restart & Run All.
esc_sf_clean_df.sample(5, random_state=42)

In [None]:
# Dtypes and non-null counts for the cleaned semi-final frame.
esc_sf_clean_df.info()

In [None]:
# Summary statistics for the numeric columns of the cleaned semi-final frame.
esc_sf_clean_df.describe()