# Verifying Assumption 1 on FIFA 2020 Task

In [5]:
# Import necessary packages
import os
import sys
sys.path.append('..')
import numpy as np
import pandas as pd
from balancers import BinaryBalancer
from utils import BertClassifier, Dataset, train

# sklearn
from sklearn.ensemble import RandomForestClassifier 
from sklearn.model_selection import train_test_split

# pytorch
import torch

In [6]:
# Import fifa 20 dataset and build the binary (a, y) analysis frame.
all_data = pd.read_csv('data/players_20.csv')
results_dir = 'results/'

# Set the nationalities you want to use
c1 = 'England'
c2 = 'Argentina'
nationalities = [c1, c2]

# Keep relevant columns and rename nationality -> a, wage_eur -> y.
# Column selection and rename both return new frames, so all_data is untouched.
o_data = (
    all_data[['long_name', 'age', 'nationality', 'overall', 'wage_eur']]
    .rename(columns={'nationality': 'a', 'wage_eur': 'y'})
)

# Restrict to specified nationalities
o_data = o_data.loc[o_data['a'].isin(nationalities)]
print('Number of players:', o_data.shape[0])

# Only keep relevant features and outcomes, with a fresh 0..n-1 index
data = (
    o_data[['long_name', 'age', 'a', 'overall', 'y']]
    .copy(deep=True)
    .reset_index(drop=True)
)

# Binarize nationality: first nationality -> 0, second -> 1.
# Only column 'a' is mapped; a frame-wide replace would also rewrite any other
# cell (e.g. a long_name) that happens to equal one of the nationality strings.
data['a'] = data['a'].map({nationalities[0]: 0, nationalities[1]: 1}).astype(int)

# Binarize outcome using the median outcome.
# The median is computed ONCE, before y is modified: recomputing it after the
# low half has been zeroed can yield a different threshold for the second step.
median_wage = np.median(data['y'])
data['y'] = (data['y'] >= median_wage).astype(int)

Number of players: 2553


In [7]:
# Create new column to stratify data and calculate base rates.
# Group codes, one per (a, y) combination:
#   1 = w : a=0, y=0
#   2 = v : a=1, y=0
#   3 = s : a=0, y=1
#   4 = r : a=1, y=1

# Fail early with a clear message if either column is not strictly binary;
# the code 1 + a + 2*y below is only meaningful for values in {0, 1}.
assert data['a'].isin([0, 1]).all(), "column 'a' must contain only 0/1"
assert data['y'].isin([0, 1]).all(), "column 'y' must contain only 0/1"

# Vectorised equivalent of checking each row against the four (a, y) pairs
group = (1 + data['a'].astype(int) + 2 * data['y'].astype(int)).tolist()

# Add column to the data
data['group'] = group

# Calculate base rates (fraction of all rows falling in each group)
total = data.shape[0]
r = np.sum(data['group'] == 4) / total
s = np.sum(data['group'] == 3) / total
v = np.sum(data['group'] == 2) / total
w = np.sum(data['group'] == 1) / total

### Learn a predictor for A (nationality) using player name

In [8]:
# Learn a predictor for nationality using name with BERT

# Fix the random seeds so the split (and, as far as the libraries allow, the
# training run) can be reproduced after Restart & Run All.
# NOTE(review): GPU kernels may still introduce some nondeterminism.
SEED = 0
np.random.seed(SEED)
torch.manual_seed(SEED)

# Hyper-parameters
LR = 1e-5
EPOCHS = 5

# Model is created after seeding so any random initialisation is repeatable
model = BertClassifier()

# Splitting data into necessary datasets, stratified on the (a, y) group so
# both splits keep the same group proportions
d_train, d_test = train_test_split(
    data,
    train_size=0.75,
    stratify=data['group'],
    random_state=SEED,
)

# Train model
# NOTE(review): d_test appears to be used as the validation set inside train()
# (the log reports Val Loss / Val Accuracy) - confirm against utils.train.
train(model, d_train, d_test, LR, EPOCHS)

100%|██████████| 957/957 [00:40<00:00, 23.56it/s]


Epochs: 1 | Train Loss:  0.219             | Train Accuracy:  0.918             | Val Loss:  0.121             | Val Accuracy:  0.959


100%|██████████| 957/957 [00:40<00:00, 23.37it/s]


Epochs: 2 | Train Loss:  0.078             | Train Accuracy:  0.978             | Val Loss:  0.125             | Val Accuracy:  0.956


100%|██████████| 957/957 [00:40<00:00, 23.51it/s]


Epochs: 3 | Train Loss:  0.031             | Train Accuracy:  0.990             | Val Loss:  0.109             | Val Accuracy:  0.978


100%|██████████| 957/957 [00:40<00:00, 23.42it/s]


Epochs: 4 | Train Loss:  0.011             | Train Accuracy:  0.996             | Val Loss:  0.256             | Val Accuracy:  0.950


100%|██████████| 957/957 [00:40<00:00, 23.42it/s]


Epochs: 5 | Train Loss:  0.026             | Train Accuracy:  0.991             | Val Loss:  0.101             | Val Accuracy:  0.972


In [9]:
# Use BERT on test set to generate a_hat
xy_test = Dataset(d_test)
xy_test_dataloader = torch.utils.data.DataLoader(xy_test, batch_size=len(xy_test))

# Prefer the second GPU (the original choice) but fall back gracefully so the
# notebook also runs on single-GPU and CPU-only machines.
if torch.cuda.device_count() > 1:
    device = torch.device("cuda:1")
elif torch.cuda.is_available():
    device = torch.device("cuda:0")
else:
    device = torch.device("cpu")

model.to(device)
model = model.eval()

# Collect predictions per batch and concatenate, so the result stays correct
# even if the batch size is later reduced below len(xy_test).
a_hat_batches = []
with torch.no_grad():
    # The labels / remaining fields yielded by the loader are not needed here
    for xy_test_input, xy_test_label, xy_test_remain in xy_test_dataloader:
        mask = xy_test_input['attention_mask'].to(device)
        input_id = xy_test_input['input_ids'].squeeze(1).to(device)
        # Flatten the model output to one score per example
        output = model(input_id, mask).reshape(-1)
        # Threshold at 0.5 to obtain hard 0.0/1.0 predictions
        a_hat_batches.append(((output >= 0.5) * 1.0).cpu().numpy())

a_hat = np.concatenate(a_hat_batches)

### Learn a predictor for Y (wage >= median(wage)) using each player's age and overall attribute

In [10]:
# Learn classifier for y from the 'overall' and 'age' columns
feature_cols = ['overall', 'age']

# Split data into features and labels
x_train = d_train[feature_cols]
y_train = d_train['y']
x_test = d_test[feature_cols]
y_test = d_test['y'].to_numpy()

# Create Classifier; random_state is fixed so the fitted forest (and therefore
# y_hat) is the same on every run
clf = RandomForestClassifier(n_estimators=10, random_state=0)

# Fit classifier
clf.fit(x_train.to_numpy(), y_train)

# Evaluate probabilities/y_hat on test set
y_prob = clf.predict_proba(x_test.to_numpy())

# Calculate y_hat by thresholding the second probability column at 0.5
# (assumes both classes 0 and 1 occur in y_train so that column 1 is class 1)
y_hat = (y_prob[:, 1] >= 0.5).astype('float')

### Perform necessary calculations to verify Assumption 1

In [11]:
# Bootstrap the test set and record, for each resample, the per-group rates
# and the corresponding U / estimated-base-rate ratios from BinaryBalancer.

# Number of bootstrap resamples (named so it does not shadow the builtin `iter`)
n_bootstrap = 1000
y = y_test
a = d_test['a'].values
num_rows = len(a)

# Dedicated, seeded generator so the resampling is reproducible
rng = np.random.default_rng(0)

alpha_h_11, alpha_h_11_lb = [], []
alpha_h_01, alpha_h_01_lb = [], []
alpha_h_10, alpha_h_10_lb = [], []
alpha_h_00, alpha_h_00_lb = [], []

for k in range(n_bootstrap):
    # Resample row indices with replacement and apply them to every array
    random_indices = rng.choice(num_rows, size=num_rows, replace=True)
    a_s = a[random_indices]
    y_s = y[random_indices]
    a_hat_s = a_hat[random_indices]
    y_hat_s = y_hat[random_indices]
    true_balancer = BinaryBalancer(y=y_s, y_=y_hat_s, a=a_s, a_hat = a_hat_s, adjusted = False)

    # See if conditions are met; if not, adjust the predictions and rebuild
    # the balancer from the adjusted predictions
    if true_balancer.assumption == 0:
        true_balancer.adjust(con = 'tpr/fpr', obj = 'project')
        true_balancer = BinaryBalancer(y=y_s, y_=true_balancer.y_adj, a=a_s, a_hat = a_hat_s)

    # Show assumption holds
    # NOTE(review): a_gr_list[i] is presumably the group with a == i - confirm
    # against balancers.BinaryBalancer.
    est_rates = true_balancer.est_base_rates

    alpha_h_11.append(true_balancer.a_gr_list[1].tpr)
    alpha_h_11_lb.append(true_balancer.U1/est_rates['rh_11'])

    alpha_h_01.append(true_balancer.a_gr_list[0].tpr)
    alpha_h_01_lb.append(true_balancer.U0/est_rates['rh_01'])

    alpha_h_10.append(true_balancer.a_gr_list[1].fpr)
    alpha_h_10_lb.append(true_balancer.U1/est_rates['rh_10'])

    alpha_h_00.append(true_balancer.a_gr_list[0].fpr)
    alpha_h_00_lb.append(true_balancer.U0/est_rates['rh_00'])

    # Lightweight progress indicator
    if k % 200 == 0:
        print(k)

0
200
400
600
800


In [12]:
# Statistics to show assumption holds: for each alpha_h_* quantity, report the
# bootstrap mean of the value, of (1 - lb), and of lb.
bootstrap_results = {
    'alpha_h_11': (alpha_h_11, alpha_h_11_lb),
    'alpha_h_01': (alpha_h_01, alpha_h_01_lb),
    'alpha_h_10': (alpha_h_10, alpha_h_10_lb),
    'alpha_h_00': (alpha_h_00, alpha_h_00_lb),
}

rows = []
values = []
for name, (alpha_samples, lb_samples) in bootstrap_results.items():
    lb_samples = np.array(lb_samples)
    rows += [name, name + '_ub', name + '_lb']
    values += [
        np.mean(np.array(alpha_samples)),
        np.mean(1 - lb_samples),
        np.mean(lb_samples),
    ]

# NOTE: the 'Paramater' header spelling is kept as-is so files written by
# earlier runs and any downstream readers stay compatible.
df = pd.DataFrame(data = zip(rows, values), columns = ['Paramater', 'Value'])

# Make sure the per-pair results directory exists before writing; to_csv does
# not create missing directories.
out_dir = os.path.join(results_dir, c1 + '_' + c2)
os.makedirs(out_dir, exist_ok=True)
df.to_csv(os.path.join(out_dir, 'assumption_results.csv'), index=False)