In [147]:
import pandas as pd
from sklearn.preprocessing import MultiLabelBinarizer
import ast

# Read in Data

In [148]:
data = pd.read_csv("data/combined_data.csv")

In [149]:
data.sample(10)

Unnamed: 0,Country Name,Country Code,Year,Song,Artist,Language,Grand Final Place,Grand Final Points,Semifinal,Semifinal Place,...,"People with basic handwashing facilities including soap and water, urban (% of urban population)","Risk premium on lending (lending rate minus treasury bill rate, %)","Incidence of malaria (per 1,000 population at risk)","Net financial flows, RDB concessional (NFL, current US$)",Financial intermediary services indirectly Measured (FISIM) (constant LCU),"Net financial flows, IMF concessional (NFL, current US$)",Newborns protected against tetanus (%),"Net official flows from UN agencies, UNWTO (current US$)",Children with fever receiving antimalarial drugs (% of children under age 5 with fever),Use of insecticide-treated bed nets (% of under-5 population)
1030,Denmark,DNK,2013,Only Teardrops,Emmelie de Forest,['English'],1.0,281.0,1.0,1.0,...,,,,,,,,,,
1189,Estonia,EST,2017,Verona,Koit Toome and Laura,['English'],,,2.0,14.0,...,,,,,,,,,,
645,Estonia,EST,2003,Eighties Coming Back,Ruffus,['English'],21.0,14.0,,,...,,,,,,,,,,
838,Lithuania,LTU,2008,Nomads in the Night,Jeronimas Milius,['English'],,,2.0,16.0,...,,,,,,,,,,
715,Hungary,HUN,2005,"Forogj, világ!",NOX,['Hungarian'],12.0,97.0,,5.0,...,,1.587988,,,,,,,,
1033,France,FRA,2013,L'enfer et moi,Amandine Bourgeois,['French'],23.0,14.0,,,...,,,,,,,,,,
1227,Croatia,HRV,2018,Crazy,Franka,['English'],,,1.0,17.0,...,,,,,,,,,,
61,Finland,FIN,1975,Old Man Fiddle,Pihasoittajat,['English'],7.0,74.0,,,...,,,,,,,,,,
674,Denmark,DNK,2004,Shame on You,Tomas Thordarson,['English'],,,,13.0,...,,,,,,,,,,
532,Malta,MLT,1998,The One That I Love,Chiara,['English'],3.0,165.0,,,...,,2.6775,,,,,,,,


## Format the Languages to be a list

In [150]:
data["Language"] = data["Language"].str.removeprefix("[").str.removesuffix("]").str.split(",")



## Encode the Languages as Binaries

In [151]:
mlb = MultiLabelBinarizer()
language_encoded = mlb.fit_transform(data['Language'])

In [152]:
mlb.classes_

array([" 'Albanian'", " 'Antillean Creole'", " 'Arabic'", " 'Armenian'",
       " 'Bosnian'", " 'Bulgarian'", " 'Catalan'", " 'Corsican'",
       " 'Crimean Tatar'", " 'Croatian'", " 'Czech'",
       ' \'English ("Franglais")\'', " 'English'", " 'Finnish'",
       " 'French'", " 'Georgian'", " 'German'", " 'Greek'", " 'Hebrew'",
       " 'Hungarian'", " 'Imaginary'", " 'Italian'", " 'Latin'",
       " 'Lithuanian'", " 'Luxembourgish'", " 'Macedonian'", " 'Maltese'",
       " 'Northern Sami'", " 'Polish'", " 'Portuguese'", " 'Romani'",
       " 'Romanian'", " 'Russian'", " 'Slovene'", " 'Spanish'",
       " 'Srnan Tongo'", " 'Swahili'", " 'Turkish'", " 'Ukrainian'",
       "'Albanian'", "'Arabic'", "'Armenian'", "'Belarusian'",
       "'Bosnian'", "'Breton'", "'Bulgarian'", "'Catalan'", "'Corsican'",
       "'Croatian'", "'Czech'", "'Danish'", "'Dutch'", "'English'",
       "'Estonian'", "'Finnish'", "'French'", "'Georgian'", "'German'",
       "'Greek'", "'Hebrew'", "'Hungarian'", "'Ic

In [153]:
language_df = pd.DataFrame(language_encoded, columns=mlb.classes_)

In [154]:
data = pd.concat([data, language_df], axis=1).drop(columns=['Language'])


In [155]:
data.sample(10)

Unnamed: 0,Country Name,Country Code,Year,Song,Artist,Grand Final Place,Grand Final Points,Semifinal,Semifinal Place,Semifinal Points,...,'Slovak','Slovene','Spanish','Swedish','Turkish','Udmurt','Ukrainian','Viennese','Vorarlbergish','Võro'
352,Portugal,PRT,1990,Há sempre alguém,Nucha,20.0,9.0,,,,...,0,0,0,0,0,0,0,0,0,0
756,Lithuania,LTU,2006,We Are the Winners,LT United,6.0,162.0,,5.0,163.0,...,0,0,0,0,0,0,0,0,0,0
296,Austria,AUT,1988,Lisa Mona Lisa,Wilfried,21.0,0.0,,,,...,0,0,0,0,0,0,0,0,0,0
197,Switzerland,CHE,1982,Amour on t'aime,Arlette Zola,3.0,97.0,,,,...,0,0,0,0,0,0,0,0,0,0
318,Cyprus,CYP,1989,Apopse as vrethume (Απόψε ας βρεθούμε),Fani Polymeri & Yiannis Savvidakis,11.0,51.0,,,,...,0,0,0,0,0,0,0,0,0,0
980,Albania,ALB,2012,Suus,Rona Nishliu,5.0,146.0,1.0,2.0,146.0,...,0,0,0,0,0,0,0,0,0,0
584,North Macedonia,MKD,2000,100% te ljubam (100% те љубам),XXL,15.0,29.0,,,,...,0,0,0,0,0,0,0,0,0,0
39,Luxembourg,LUX,1973,Tu te reconnaîtras,Anne-Marie David,1.0,129.0,,,,...,0,0,0,0,0,0,0,0,0,0
906,Croatia,HRV,2010,Lako je sve,Feminnem,,,2.0,13.0,33.0,...,0,0,0,0,0,0,0,0,0,0
875,Iceland,ISL,2009,Is It True?,Yohanna,2.0,218.0,1.0,1.0,174.0,...,0,0,0,0,0,0,0,0,0,0
