In [309]:
import pandas as pd
import torch

In [None]:
# features
# Load the feature table, using the CSV column that pandas labelled "Unnamed: 0" as the index.
df = pd.read_csv("../data/eurovision_tune_plus.csv", index_col="Unnamed: 0")

# fixes
# One row carries a song identifier in its "year" field instead of a year; patch it by hand.
df.loc[df["year"] == "Azerbaijan_Running Scared_Ell", "year"] = 2011

# Cast to int so the year comparisons below are numeric.
df["year"] = df["year"].astype(int)

# Drop the 2020 and 2023 rows. Take an explicit copy so that later column
# assignments on `df` write to an independent frame rather than to a slice of
# the original (which triggers pandas' SettingWithCopyWarning).
df = df[~df["year"].isin([2020, 2023])].copy()

def feature_vec(row, feature_len=512):
    """Collect a row's numbered feature columns into one list.

    Args:
        row: Mapping-like row (e.g. a pandas Series produced by
            ``df.apply(..., axis=1)``) whose features are stored under the
            string keys ``"0"``, ``"1"``, ...
        feature_len: Number of consecutive feature columns to read, starting
            at ``"0"``. Defaults to 512, the value previously hard-coded here.

    Returns:
        list: ``[row["0"], row["1"], ..., row[str(feature_len - 1)]]``.

    Raises:
        KeyError: If one of the expected keys is missing from ``row``.
    """
    return [row[str(f)] for f in range(feature_len)]

# Pack each row's numbered feature columns into a single list-valued "features" column.
feature_lists = df.apply(feature_vec, axis=1)
df["features"] = feature_lists

In [None]:
# Lookup table joined on (year, country_name); presumably the source of the
# `stan_contestant` id column used below -- verify against the CSV.
stan_contestants = pd.read_csv("../data/stan-contestants.csv")

# Left join so every song keeps its row even when no contestant entry matches.
merged_df = pd.merge(
    df,
    stan_contestants,
    how="left",
    left_on=["year", "country"],
    right_on=["year", "country_name"],
)
# Count unmatched songs on the frame that was just built (`new_df` is never defined).
print("Missing datapoints: {}".format(merged_df["stan_contestant"].isna().sum()))

# Rows without a contestant id cannot be joined to the draws later, so drop them.
merged_df = merged_df.dropna(subset=["stan_contestant"])

In [None]:
# draws
# Load the draws table and discard the ".chain" / ".iteration" / ".draw" bookkeeping columns.
draws = pd.read_csv("../data/contest-draws.csv").drop(
    columns=[".chain", ".iteration", ".draw"]
)
# Strip the "beta_contestant[...]" wrapper so each column label is just the bracketed id.
draws = draws.rename(
    columns=lambda label: label.replace("beta_contestant[", "").replace("]", "")
)
# Transpose: one row per contestant id, one column per original CSV row.
draws = draws.T
# Expose the id (currently the index) as an integer column to merge on.
draws["stan_contestant"] = draws.index
draws["stan_contestant"] = draws["stan_contestant"].astype(int)

In [None]:
def draws_vec(row, n_draws=4000):
    """Collect a row's per-draw values into one list.

    Args:
        row: Mapping-like row (e.g. a pandas Series produced by
            ``draws.apply(..., axis=1)``) whose draw values are stored under
            the integer keys ``0``, ``1``, ...
        n_draws: Number of consecutive draws to read, starting at key ``0``.
            Defaults to 4000, the value previously hard-coded here.

    Returns:
        list: ``[row[0], row[1], ..., row[n_draws - 1]]``.
    """
    # Build the list directly instead of accumulating into a local called
    # `draws`, which would shadow the module-level `draws` DataFrame.
    return [row[d] for d in range(n_draws)]
    
# Fold every contestant row's per-draw columns into one list, then keep only
# that list and the join key.
draw_lists = draws.apply(draws_vec, axis=1)
draws = draws.assign(draws=draw_lists)[["draws", "stan_contestant"]]

# Attach each contestant's draw list to its song row; the left join keeps every song.
merged_df = merged_df.merge(draws, how="left", on="stan_contestant")

In [None]:
# NaNs in `stan_contestant` were already dropped before the draws were merged in,
# so counting them again can never report a problem. Count the rows whose
# contestant id found no match in `draws` instead: the left merge leaves NaN in
# their "draws" column, which would break any later iteration over the draw lists.
print("Missing datapoints: {}".format(merged_df["draws"].isna().sum()))

## Prepare inputs and targets

For every input $X$ we observe multiple draws $Y$, so each feature vector is repeated once per draw and paired with that draw as its target.

In [None]:
from tqdm import tqdm
# Flatten to one training pair per (song, draw): the song's feature list is
# repeated once for every draw and paired with that draw as the target.
inputs = []
targets = []
row_pairs = zip(merged_df["features"], merged_df["draws"])
for song_features, song_draws in tqdm(row_pairs, total=len(merged_df)):
    inputs.extend([song_features] * len(song_draws))
    targets.extend(song_draws)


In [None]:
# Spot-check the flattening: each entry is (flat index, source row, draw within that row).
# The flat indices assume 4000 draws per row.
expected_positions = [
    (0, 0, 0),
    (10, 0, 10),
    (4000, 1, 0),
    (8005, 2, 5),
]
for flat_idx, row_idx, draw_idx in expected_positions:
    source_row = merged_df.iloc[row_idx]
    assert inputs[flat_idx] == source_row["features"]
    assert targets[flat_idx] == source_row["draws"][draw_idx]

In [310]:
# Total number of flattened (features, draw) pairs.
len(inputs)

5044000

In [None]:
from torch.utils.data import TensorDataset
# Convert with `torch.as_tensor` and an explicit dtype instead of the legacy
# `torch.Tensor(...)` constructor: the result is float32 either way, but the
# dtype is now stated in the code, and re-running the cell on values that are
# already float32 tensors is a no-op rather than another conversion.
inputs = torch.as_tensor(inputs, dtype=torch.float32)
targets = torch.as_tensor(targets, dtype=torch.float32)

# Pair sample i of `inputs` with sample i of `targets`.
dataset = TensorDataset(inputs, targets)

In [74]:
import torch.nn as nn
import lightning

class MLP(nn.Module):
    """Single linear layer mapping feature vectors to ``n_classes`` outputs.

    Args:
        n_classes: Size of the output dimension.
        in_features: Size of each input feature vector. Defaults to 512, the
            value previously hard-coded here.
    """

    def __init__(self, n_classes: int, in_features: int = 512):
        # nn.Module must be initialised before any sub-module is assigned;
        # without this call, setting ``self.model`` raises AttributeError.
        super().__init__()

        self.model = nn.Sequential(
            nn.Linear(in_features, n_classes)
        )

    def forward(self, x):
        """Apply the network to ``x``, whose last dimension must equal ``in_features``."""
        return self.model(x)

In [75]:
# Peek at the first sample (if there is one) to confirm the per-item shapes.
first_sample = next(iter(dataset), None)
if first_sample is not None:
    x, y = first_sample
    print(x.shape, y.shape)

torch.Size([512]) torch.Size([])
