In [220]:
# Imports: third-party (pandas, numpy) and standard library (math)
import pandas as pd
import numpy as np
import math

In [221]:
# Load the raw Eurovision results; the first CSV column is used as the row index.
raw_df = pd.read_csv(
    filepath_or_buffer="../data/raw/eurovision.csv",
    index_col=0,
)

In [222]:
# Build the modelling subset of raw_df in one pass: drop metadata columns and
# keep only rows from final sections of the years we trust.
#
# Rows are selected with a boolean mask rather than repeated
# `drop(<index labels>, inplace=True)` calls: a label-based drop would also
# remove unrelated rows if the index ever contained duplicate labels, and a
# single non-mutating expression keeps this cell idempotent on re-run.

# Columns that carry no signal for the analysis (URLs, names, emoji).
unused_columns = ['event_url', 'artist', 'song', 'artist_url', 'image_url', 'country_emoji']

# 1956: rank is not proper in this year. 2020: dropped because of covid.
excluded_years = [1956, 2020]

# Non-final sections.
non_final_sections = ['semi-final', 'first-semi-final', 'second-semi-final']

rows_to_exclude = (
    raw_df['year'].isin(excluded_years)
    | raw_df['section'].isin(non_final_sections)
)
trimmed_df = raw_df.loc[~rows_to_exclude].drop(columns=unused_columns)
trimmed_df

Unnamed: 0,event,host_city,year,host_country,section,artist_country,running_order,total_points,rank,rank_ordinal,qualified,winner
36,Turin 2022,Turin,2022,Italy,grand-final,United Kingdom,22.0,466.0,2.0,2nd,False,False
37,Turin 2022,Turin,2022,Italy,grand-final,Spain,10.0,459.0,3.0,3rd,False,False
38,Turin 2022,Turin,2022,Italy,grand-final,Italy,9.0,268.0,6.0,6th,False,False
39,Turin 2022,Turin,2022,Italy,grand-final,France,6.0,17.0,24.0,24th,False,False
40,Turin 2022,Turin,2022,Italy,grand-final,Germany,13.0,6.0,25.0,25th,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...
1987,Frankfurt 1957,Frankfurt,1957,Germany,final,Austria,5.0,3.0,10.0,10th,False,False
1988,Frankfurt 1957,Frankfurt,1957,Germany,final,Italy,4.0,7.0,6.0,6th,False,False
1989,Frankfurt 1957,Frankfurt,1957,Germany,final,United Kingdom,3.0,6.0,7.0,7th,False,False
1990,Frankfurt 1957,Frankfurt,1957,Germany,final,Luxembourg,2.0,8.0,4.0,4th,False,False


In [223]:
# Derive model features and targets from trimmed_df.
# Works on a copy so trimmed_df is never mutated and this cell can be re-run safely.
prepped_df = trimmed_df.copy()

# Encode the flag columns as 0/1 integers; any value that is not True becomes 0.
for flag_col in ('winner', 'qualified'):
    prepped_df[flag_col] = np.where(prepped_df[flag_col].eq(True), 1, 0)

#Helper Columns
# Key identifying one show: event name concatenated with its section.
prepped_df['event-section'] = prepped_df['event'] + prepped_df['section']
# Number of rows (contestants) per show, indexed by the key above.
contestants_per_contest = prepped_df['event-section'].value_counts().sort_index()
# Vectorized lookup of each row's show size (replaces a per-row .loc lookup).
prepped_df['section_contestants'] = prepped_df['event-section'].map(contestants_per_contest)

running_order = prepped_df['running_order']
n_contestants = prepped_df['section_contestants']

#Prepped Columns - Features to Test
# Running-order position as a fraction of the show size; equals 1 for the last slot.
prepped_df['relative_order'] = running_order / n_contestants
prepped_df['first_to_perform'] = np.where(running_order == 1, 1, 0)
prepped_df['last_to_perform'] = np.where(running_order == n_contestants, 1, 0)
prepped_df['is_host_country'] = np.where(prepped_df['host_country'] == prepped_df['artist_country'], 1, 0)

#Prepped Columns - Targets to Test
# Rank rescaled so that rank 1 maps to 1.0 and rank n maps to 1/n.
prepped_df['relative_rank'] = (n_contestants - prepped_df['rank'] + 1) / n_contestants
# Quintile label computed as 6 - ceil(5 * relative_rank): a relative_rank of 1.0
# gives 1, values in (0, 0.2] give 5. np.ceil is the vectorized equivalent of
# applying math.ceil row by row; the cast raises if any relative_rank is NaN.
prepped_df['rank_quintiles'] = (6 - np.ceil(5 * prepped_df['relative_rank'])).astype('int64')

In [226]:
# Columns exported for modelling: three running-order features plus the target.
exported_columns = ['relative_order', 'first_to_perform', 'last_to_perform', 'relative_rank']
# filter(items=...) keeps only the listed columns that exist (it does not raise on
# a missing one); reset_index() then moves the current index into a regular column.
output_df = prepped_df.filter(items=exported_columns).reset_index()

In [227]:
# Persist the preprocessed table. The DataFrame index is written as the first
# CSV column (pandas default), in addition to the columns of output_df.
output_df.to_csv(
    path_or_buf='../data/preprocessed/eurovision_data_preprocessed.csv',
)