# Experiment Preparation 1. Groupby  #
## For Brownlow Predictor Project ##

Group the different folds of each model configuration together and summarise their results into a single row per configuration for further experiments.

**Author: `Lang (Ron) Chen` 2021.12-2022.1**

---

**0. Import Libraries**

In [1]:
import pandas as pd

In [2]:
# Load the four result tables written by the earlier scripts, in the same
# order as before, and stack them into one frame. `new` keeps each source
# file's own row labels (the concat does not renumber them).
file, file2, file3, file4 = (
    pd.read_csv(path)
    for path in (
        './Script Output/R.csv',
        './Script Output/(B).csv',
        './Script Output/(2)(B).csv',
        './Script Output/(2).csv',
    )
)

new = pd.concat([file, file2, file3, file4])

**1. Initial Manipulation and Defining Variables**

In [3]:
# Drop every row whose Datatype is 'PN': running the scripts showed that PN
# gives the same results as N, so keeping both would only duplicate rows.
file = new.loc[new['Datatype'] != 'PN']

In [4]:
# Ground truth used for scoring: the top 6 pollers in 2021, listed in
# finishing order. VOTES[i] is the vote tally that belongs to PLAYERS[i].
PLAYERS = [
    'Oliver Wines',
    'Marcus Bontempelli',
    'Clayton Oliver',
    'Sam Walsh',
    'Darcy Parish',
    'Jack Steele',
]
VOTES = [36, 33, 31, 30, 26, 26]

# Lookup from 0-based finishing position to player name.
PDICT = dict(enumerate(PLAYERS))

In [5]:
# Create the initial (empty) dataframe. Each column starts as an empty list;
# the column order below is the column order of the exported summary.
data1 = pd.DataFrame({column: [] for column in (
    # identification of the model configuration
    'Method',
    'Datatype',
    'Use',
    'FS_Val',
    'FS_Rule',
    'WinLoss',

    # fold-averaged TP* and coefficient columns
    'tp3',
    'tp2',
    'tp1',
    'tp0.5',
    'tp0',
    'coef_avg',
    'coef1',
    'coef2_1',
    'coef2_2',

    # scores computed against the known 2021 result
    'm1_Top3score',
    'm1_avgvotediff(3)',
    'm1_Top4score',
    'm1_avgvotediff(4)',
    'm2_Winner',
    'm2_avgvotediff',
    'v1',
    'v1m',
)})

**2. Defining functions that compute the scores used to rank empirical observations**

In [6]:
def top3score(model, n, players=None):
    """Total placement score of the true top-n players over all folds.

    Despite the name, this works for any n (the notebook calls it with 3 and
    4): it adds up findscore() for the true positions 0 .. n-1.

    Parameters
    ----------
    model : pandas.DataFrame
        One row per fold, with the predicted names in columns 'P1' .. 'P{n}'.
        The frame is only read; its index is left untouched.
    n : int
        How many leading positions are scored.
    players : mapping or sequence, optional
        True player name for each 0-based position. Defaults to PDICT.

    Returns
    -------
    int or float
        Sum of the per-position scores (0 if nothing matched).
    """
    return sum(findscore(model, i, n, players) for i in range(n))



def findscore(model, i, n, players=None):
    """Placement score of the player who truly finished in position i.

    For every fold the player is looked up among the predicted 'P1' .. 'P{n}'.
    When found at predicted position k the fold contributes
    (n - i) * (n - |k - i|) / n, so higher true positions weigh more and the
    contribution shrinks with the distance between predicted and true
    position. A fold that does not list the player in its top n contributes 0.

    Parameters
    ----------
    model : pandas.DataFrame
        One row per fold with columns 'P1' .. 'P{n}'. Rows are visited by
        position, so any index is accepted.
    i : int
        0-based true finishing position of the player being scored.
    n : int
        Number of predicted positions to search.
    players : mapping or sequence, optional
        True player name for each 0-based position. Defaults to PDICT.

    Returns
    -------
    int or float
        Sum of the contributions of all folds.
    """
    if players is None:
        players = PDICT

    # Pull the n prediction columns out once; iterating the array by position
    # avoids both the per-cell .loc lookups and any dependence on the index.
    predicted = model[[f'P{k+1}' for k in range(n)]].to_numpy()

    total = 0
    for fold in predicted:
        for k, name in enumerate(fold):
            if players[i] == name:
                total += (n - i) * (n - abs(k - i)) / n
                break  # only the first (best) predicted position counts

    return total

In [7]:
def m1_votediff1(model, n, players=None, votes=None):
    """Average absolute vote error over all correctly placed top-n predictions.

    For each true position 0 .. n-1 the folds that predicted the right player
    in that position are collected, and the absolute difference between the
    predicted votes and the true votes is taken. The result is the total of
    those differences divided by the number of correct placements.

    Parameters
    ----------
    model : pandas.DataFrame
        One row per fold with columns 'P1'/'V1' .. 'P{n}'/'V{n}'.
    n : int
        How many leading positions are considered.
    players, votes : sequence, optional
        True names and vote tallies by 0-based position. Default to the
        module-level PLAYERS and VOTES.

    Returns
    -------
    float or int
        The average absolute difference, or 0 when no fold placed any of the
        n players correctly (note this is the same value a perfect vote
        prediction would give).
    """
    total_diff = 0
    total_hits = 0

    # Keep the accumulators separate from the loop variable, and add each of
    # the helper's two return values to the accumulator it belongs to.
    for position in range(n):
        diff_sum, hits = m1_votediff1_help(model, position, players, votes)
        total_diff += diff_sum
        total_hits += hits

    if total_hits:
        return total_diff / total_hits

    return 0



def m1_votediff1_help(model, i, players=None, votes=None):
    """Vote error for the folds that placed true position i correctly.

    Parameters
    ----------
    model : pandas.DataFrame
        One row per fold with columns 'P{i+1}' and 'V{i+1}'.
    i : int
        0-based true finishing position.
    players, votes : sequence, optional
        True names and vote tallies by 0-based position. Default to the
        module-level PLAYERS and VOTES.

    Returns
    -------
    tuple
        (sum of absolute vote differences, number of matching folds);
        (0, 0) when no fold matches.
    """
    if players is None:
        players = PLAYERS
    if votes is None:
        votes = VOTES

    correct = model[model[f'P{i+1}'] == players[i]]
    # .tolist() keeps the plain left-to-right Python summation of the values.
    diffs = (correct[f'V{i+1}'] - votes[i]).abs().tolist()

    return sum(diffs), len(diffs)

In [8]:
def m2_win(model, players=None):
    """Count the folds whose predicted winner ('P1') is the true winner.

    Parameters
    ----------
    model : pandas.DataFrame
        One row per fold with a 'P1' column.
    players : sequence, optional
        True player names by 0-based position; only players[0] is used.
        Defaults to the module-level PLAYERS.

    Returns
    -------
    int
        Number of rows where 'P1' equals the true winner.
    """
    if players is None:
        players = PLAYERS

    return int((model['P1'] == players[0]).sum())

In [9]:
def m2_votediff(model, players=None, votes=None):
    """Average absolute vote error of the folds that picked the true winner.

    Only folds whose 'P1' equals the true winner are considered; for those,
    the absolute difference between the predicted 'V1' and the winner's true
    vote tally is averaged.

    Parameters
    ----------
    model : pandas.DataFrame
        One row per fold with columns 'P1' and 'V1'.
    players, votes : sequence, optional
        True names and vote tallies by 0-based position; only index 0 is
        used. Default to the module-level PLAYERS and VOTES.

    Returns
    -------
    float or int
        The average absolute difference, or 0 when no fold picked the true
        winner (note this is the same value a perfect vote prediction gives).
    """
    if players is None:
        players = PLAYERS
    if votes is None:
        votes = VOTES

    predicted_votes = model.loc[model['P1'] == players[0], 'V1']
    # .tolist() keeps the plain left-to-right Python summation of the values.
    diffs = (predicted_votes - votes[0]).abs().tolist()

    if diffs:
        return sum(diffs) / len(diffs)

    return 0

**3. Run Script**

In [10]:
# Summarise the folds of every model configuration into one row of `data1`.
# A configuration is one unique combination of the six grouping columns.
# NOTE(review): groupby leaves out groups whose key contains NaN by default
# (dropna=True) - confirm none of the grouping columns can be missing.
records = []
for char, model in file.groupby(['Method', 'Datatype', 'Use', 'Feature Selection Value', 'Feature Selection Rule', 'Winloss']):

    # A configuration with a missing TP0 in any of its folds is skipped.
    if model['TP0'].hasnans:
        continue

    # Renumber the folds 0 .. len-1 on a fresh frame. This works for any
    # number of folds, where assigning range(5) raised for groups that did
    # not have exactly five rows.
    model = model.reset_index(drop=True)

    # Methods whose name starts with 'LR(2)' are treated as two-step models:
    # only for them are the 'TP0.5' and 'Coef2' columns read.
    twostep = char[0][0:5] == 'LR(2)'

    coef1_mean = model['Coef1'].mean()
    coef2_mean = model['Coef2'].mean() if twostep else None

    records.append({
        'Method': char[0],
        'Datatype': char[1],
        'Use': char[2],
        'FS_Val' : char[3],
        'FS_Rule' : char[4],
        'WinLoss' : char[5],

        'tp3' : model['TP3'].mean(),
        'tp2' : model['TP2'].mean(),
        'tp1' : model['TP1'].mean(),
        'tp0.5' : model['TP0.5'].mean() if twostep else None,
        'tp0' : model['TP0'].mean(),
        # One-step: the single coefficient. Two-step: mean of both.
        'coef_avg' : (coef1_mean + coef2_mean)/2 if twostep else coef1_mean,
        # Coef1 is reported under 'coef1' for one-step models and under
        # 'coef2_1' for two-step models; the other column stays empty.
        'coef1' : None if twostep else coef1_mean,
        'coef2_1' : coef1_mean if twostep else None,
        'coef2_2' : coef2_mean,

        'm1_Top3score' : top3score(model, 3),
        'm1_avgvotediff(3)' : m1_votediff1(model, 3),
        'm1_Top4score' : top3score(model, 4),
        'm1_avgvotediff(4)' : m1_votediff1(model, 4),
        'm2_Winner' : m2_win(model),
        'm2_avgvotediff' : m2_votediff(model),
        'v1': model['V1'].mean(),
        'v1m': model['V1'].median()})

# Build the summary in one go instead of concatenating inside the loop. The
# column order is taken from the existing `data1`, and because the frame is
# rebuilt rather than appended to, re-running this cell gives the same result.
data1 = pd.DataFrame(records, columns=data1.columns)

In [11]:
# Write the per-configuration summary to disk, without the row index column.
data1.to_csv('All.csv', index=False)