## MP-3 Project

In [None]:
import pandas as pd, numpy as np, seaborn as sbn, matplotlib.pyplot as plt

from Modules import utils as utl
from Modules import data_exploration as de
from Modules import machine_learning as ml

In [None]:
from sklearn.model_selection import train_test_split
from sklearn import linear_model
from sklearn.linear_model import LinearRegression
import sklearn.metrics as sm

In [None]:
import pickle

### Data Loading & Cleaning

In [None]:
# Load the raw song data through the project's CSV helper, requesting the
# windows-1252 encoding. skiprows=0 presumably means no leading rows are
# skipped -- confirm against utl.load_csv.
csv_file_path = 'Data/song_data.csv'
eurovision_df = utl.load_csv(csv_file_path, skiprows=0, encoding='windows-1252')

In [None]:
# Show the loaded frame as the cell output.
eurovision_df

In [None]:
# Count the missing values in each column of the full dataset.
eurovision_df.isna().sum()

##### The dataset has numerous null values; however, this is due to the nature of the Eurovision Song Contest rather than to faulty data. For instance, contestants who performed in a semi-final but did not make it through to the final have null values in all final-related features. Likewise, some semi-final features are expected to be null, because some nations qualify automatically for the final each year.

In [None]:
# Derive the finalists-only frame. Presumably remove_missing drops the rows
# whose 'final_place' is missing -- confirm in Modules/utils.
finalists_df = utl.remove_missing(eurovision_df, columns='final_place')

##### Now we have two dataframes: eurovision_df with all the data, and finalists_df with only the songs that made it to the final.

In [None]:
# Count the missing values in each column of the finalists-only frame.
finalists_df.isna().sum()

##### Now to remove some columns we don't need.

In [None]:
# Columns that are dropped from both the full frame and the finalists frame
# when the cleaned frames are built.
columns_to_remove = [
    'artist_name',
    'song_name',
    'BPM',
    'energy',
    'danceability',
    'happiness',
    'loudness',
    'acousticness',
    'instrumentalness',
    'liveness',
    'speechiness',
    'backing_dancers',
    'backing_singers',
    'backing_instruments',
    'instrument_10',
    'race',
    'favourite_10',
    'host_10',
    'age',
    'gender',
    'main_singers',
    'key',
    'key_change_10',
    'selection',
    'release_date',
    'direct_qualifier_10',
    'qualified_10',
    'language',
]

In [None]:
# Build the cleaned frames by dropping the unused columns. drop() returns a
# new frame, so eurovision_df and finalists_df themselves are left unchanged.
eurovision_clean_df = eurovision_df.drop(labels=columns_to_remove, axis='columns')
finalists_clean_df = finalists_df.drop(labels=columns_to_remove, axis='columns')

In [None]:
# Preview five randomly sampled rows of the cleaned full dataset.
eurovision_clean_df.sample(5)

In [None]:
# Save the cleaned full dataset as CSV, leaving out the index column.
eurovision_clean_df.to_csv(
    'Data/eurovision_cleaned.csv',
    index=False,
)

In [None]:
# Semi-final columns that are additionally dropped from the finalists frame.
more_columns_to_remove = [
    'semi_final',
    'semi_draw_position',
    'semi_place',
    'semi_televote_points',
    'semi_jury_points',
    'semi_total_points',
]

In [None]:
# Strip the semi-final columns from the finalists frame.
# NOTE: this rebinds finalists_clean_df, so running the cell a second time
# raises KeyError because the columns are already gone.
finalists_clean_df = finalists_clean_df.drop(
    labels=more_columns_to_remove,
    axis='columns',
)

In [None]:
# Preview five randomly sampled rows of the cleaned finalists frame.
finalists_clean_df.sample(5)

In [None]:
# Save the cleaned finalists frame as CSV, leaving out the index column.
finalists_clean_df.to_csv(
    'Data/finalists_cleaned.csv',
    index=False,
)

### Data Exploration

In [None]:
# Print the column names, non-null counts and dtypes of the cleaned full dataset.
eurovision_clean_df.info()

In [None]:
# Summary statistics for the cleaned full dataset.
eurovision_clean_df.describe()

In [None]:
# Print the column names, non-null counts and dtypes of the cleaned finalists frame.
finalists_clean_df.info()

In [None]:
# Summary statistics for the cleaned finalists frame.
finalists_clean_df.describe()

### Data loading, cleaning and exploration of semifinal data (removal of Big-5 countries and host country)

In [None]:
# Derive the semi-final frame from the cleaned full dataset. Presumably
# remove_missing drops the rows whose 'semi_place' is missing, i.e. entries
# that did not compete in a semi-final -- confirm in Modules/utils.
esc_sf_df = utl.remove_missing(eurovision_clean_df, columns='semi_place')

In [None]:
# Summary statistics for the semi-final frame.
esc_sf_df.describe()

In [None]:
# Print the column names, non-null counts and dtypes of the semi-final frame.
esc_sf_df.info()

In [None]:
# Final-round columns that are dropped from the semi-final frame.
more_columns_to_remove_semivers = [
    'final_draw_position',
    'final_televote_points',
    'final_jury_points',
    'final_televote_votes',
    'final_jury_votes',
    'final_place',
    'final_total_points',
]

In [None]:
# Strip the final-round columns to get the cleaned semi-final frame;
# esc_sf_df itself is left unchanged because drop() returns a new frame.
esc_sf_clean_df = esc_sf_df.drop(
    labels=more_columns_to_remove_semivers,
    axis='columns',
)

In [None]:
# Save the cleaned semi-final frame as CSV, leaving out the index column.
esc_sf_clean_df.to_csv(
    'Data/semifinalists_cleaned.csv',
    index=False,
)

In [None]:
# Preview five randomly sampled rows of the cleaned semi-final frame.
esc_sf_clean_df.sample(5)

In [None]:
# Print the column names, non-null counts and dtypes of the cleaned semi-final frame.
esc_sf_clean_df.info()

In [None]:
# Summary statistics for the cleaned semi-final frame.
esc_sf_clean_df.describe()