# Packages

In [28]:
import os
from getpass import getpass
from urllib.parse import quote_plus

import numpy as np
import pandas as pd
import psycopg2
from sklearn.model_selection import train_test_split
from sqlalchemy import create_engine

# SQL Engine Setup

In [29]:
# Connection settings are read from the environment (libpq-style variable
# names) so that no credentials live in the notebook. The non-secret settings
# fall back to the local development defaults; the password is prompted for
# when PGPASSWORD is not set.
DB_HOST = os.environ.get('PGHOST', 'localhost')
DB_PORT = os.environ.get('PGPORT', '5432')
DB_NAME = os.environ.get('PGDATABASE', 'projectdb')
DB_USER = os.environ.get('PGUSER', 'postgres')
DB_PASSWORD = os.environ.get('PGPASSWORD') or getpass('PostgreSQL password: ')

# SQLAlchemy engine, used by every pd.read_sql_query call below.
# User and password are URL-escaped so special characters cannot break the URL.
db_string = 'postgresql://{user}:{password}@{host}:{port}/{name}'.format(
    user=quote_plus(DB_USER),
    password=quote_plus(DB_PASSWORD),
    host=DB_HOST,
    port=DB_PORT,
    name=DB_NAME,
)
db = create_engine(db_string)

# Raw psycopg2 connection and cursor, built from the same settings as the engine.
# NOTE(review): neither is closed in the cells shown here - call con.close()
# once they are no longer needed.
con = psycopg2.connect(database=DB_NAME, user=DB_USER, password=DB_PASSWORD, host=DB_HOST, port=DB_PORT)
curr = con.cursor()

# Data Querying

In [30]:
# Load every match with friendlier column names:
# id -> match_id, sca/scb -> score_a/score_b, rca/rcb -> race_a/race_b.
matches_query = """
    SELECT id as match_id, period_id, pla_id, plb_id,
    sca as score_a, scb as score_b, rca as race_a, 
    rcb as race_b
    FROM MATCH
"""

match_df = pd.read_sql_query(sql=matches_query, con=db)

In [31]:
# Load the per-period player ratings: the overall rating and position plus
# the vp / vt / vz variants. Rows without a position are excluded.
ratings_query = """
    SELECT period_id, player_id, comp_rat, comp_rat_vp,
    comp_rat_vt, comp_rat_vz, position, position_vp, position_vt,
    position_vz
    FROM RATING
    WHERE position IS NOT NULL
"""

ratings_df = pd.read_sql_query(sql=ratings_query, con=db)

In [32]:
# Look at the PERIOD row with id 268 (the period of the last matches shown
# at the end of this notebook).
pd.read_sql_query(
    "SELECT * FROM PERIOD WHERE id = '268'",
    con=db,
)

Unnamed: 0,id,start,end,computed,needs_recompute,num_retplayers,num_newplayers,num_games,dom_p,dom_t,dom_z
0,268,2020-05-21,2020-06-03,True,False,659,161,4622,0.795011,1.107612,1.097377


# Data Transformations

In [33]:
def assign_winner(df, sca, scb):
    """
    Label a single match with its winner.

    Inputs:
    df: one match record (e.g. a dataframe row) indexable by column name
    sca: name of the column holding player A's score
    scb: name of the column holding player B's score

    Output:
    1 when player A's score is strictly greater than player B's,
    0 otherwise (a tie therefore also yields 0)
    """
    player_a_won = df[sca] > df[scb]
    return 1 if player_a_won else 0

In [34]:
def rearrange_columns(df):
    """
    Return df with the 'winner' (label) column moved to the last position.

    All other columns keep their relative order. Indexing raises a KeyError
    when df has no 'winner' column.
    """
    label = 'winner'

    # collect every non-label column in its current order
    ordered = []
    for name in df.columns:
        if name != label:
            ordered.append(name)

    # the label goes last
    ordered.append(label)

    return df[ordered]

In [35]:
# assign a winner column: 1 when player A scored more than player B, else 0
# (ties and missing scores give 0, exactly as assign_winner does).
# The comparison is vectorised; a row-wise apply over every match is
# needlessly slow for a plain column comparison.
match_df['winner'] = (match_df['score_a'] > match_df['score_b']).astype('int64')

In [36]:
# Attach each player's ratings for the period in which the match was played.
# The first merge brings in player A's ratings unsuffixed; the second merge
# brings in player B's, and the name clash is resolved with _a / _b suffixes.
match_df = (
    match_df
    .merge(
        ratings_df,
        how='left',
        left_on=['period_id', 'pla_id'],
        right_on=['period_id', 'player_id'],
    )
    .merge(
        ratings_df,
        how='left',
        left_on=['period_id', 'plb_id'],
        right_on=['period_id', 'player_id'],
        suffixes=('_a', '_b'),
    )
)

# the merged-in player ids duplicate pla_id / plb_id, so drop them
drop_cols = ['player_id_a', 'player_id_b']
match_df = match_df.drop(columns=drop_cols)

In [37]:
# number of matches after the ratings merge
match_df.shape[0]

362893

In [38]:
# drop every match in which either player has race 'R' (random)
match_df = match_df[
    ~(match_df['race_a'].eq('R') | match_df['race_b'].eq('R'))
]

In [39]:
# number of matches left once random-race games are removed
match_df.shape[0]

356777

In [40]:
# Turn each player's race into indicator columns
# (pla_race_<race> for player A, plb_race_<race> for player B).
pla_race, plb_race = match_df['race_a'], match_df['race_b']
pla_race_encoding = pd.get_dummies(pla_race, prefix='pla_race')
plb_race_encoding = pd.get_dummies(plb_race, prefix='plb_race')

# add the indicator columns to the right of the existing ones
match_df = pd.concat(
    [match_df, pla_race_encoding, plb_race_encoding],
    axis='columns',
)

In [41]:
# the column-wise concat should leave the number of matches unchanged
match_df.shape[0]

356777

In [42]:
# Create effective ratings column
# A player's effective rating is the matchup-specific rating that matches the
# opponent's race: comp_rat_vz against 'Z', comp_rat_vp against 'P' and
# comp_rat_vt against 'T'.

def _effective_ratings(df, opponent_race_col, player_suffix, name):
    """
    Pick, for every match, one player's rating against the opponent's race.

    Inputs:
    df: match dataframe holding the race and comp_rat_v* columns
    opponent_race_col: column with the opponent's race ('Z', 'P' or 'T')
    player_suffix: '_a' or '_b', selects whose rating columns are read
    name: name given to the returned Series

    Output:
    Series of ratings that keeps df's index labels. Rows whose opponent
    race is not 'Z', 'P' or 'T' are left out, which is what a row-by-row
    if/elif chain without an else branch produces.
    """
    rating_col_by_race = {
        'Z': 'comp_rat_vz' + player_suffix,
        'P': 'comp_rat_vp' + player_suffix,
        'T': 'comp_rat_vt' + player_suffix,
    }
    opponent_race = df[opponent_race_col]

    # one boolean mask and one candidate rating column per race; the masks
    # are mutually exclusive, so np.select picks at most one value per row
    conditions = [(opponent_race == race).values for race in rating_col_by_race]
    choices = [df[col].values for col in rating_col_by_race.values()]
    values = np.select(conditions, choices, default=np.nan)

    ratings = pd.Series(values, index=df.index, name=name)

    # positional mask: keep only the rows with a recognised opponent race
    return ratings[opponent_race.isin(list(rating_col_by_race)).values]


# Player B: rating against player A's race
plb_eff_ratings = _effective_ratings(match_df, 'race_a', '_b', 'plb_eff_ratings')

# Player A: rating against player B's race
pla_eff_ratings = _effective_ratings(match_df, 'race_b', '_a', 'pla_eff_ratings')

# index -> rating dictionaries, kept because later cells count their keys
player_b_eff_ratings = plb_eff_ratings.to_dict()
player_a_eff_ratings = pla_eff_ratings.to_dict()

In [43]:
# Store both effective ratings on the match frame (aligned on the index).
match_df['pla_eff_rating'] = pla_eff_ratings
match_df['plb_eff_rating'] = plb_eff_ratings

# positive when player A has the higher effective rating
match_df['ratings_diff'] = match_df['pla_eff_rating'].sub(match_df['plb_eff_rating'])

In [44]:
# how many matches have no effective rating for player A
match_df['pla_eff_rating'].isna().sum()

0

In [45]:
# number of matches that received an effective rating for player A
len(player_a_eff_ratings)

356777

In [46]:
# number of matches that received an effective rating for player B
len(player_b_eff_ratings)

356777

In [47]:
# put the label column ('winner') at the far right of the frame
match_df = match_df.pipe(rearrange_columns)

In [48]:
# Split a train and test set
# The last 20% are the most recent matches
# train/test split
# All features - races, all comp ratings, all positions
# (columns[8:-1] skips the 8 leading id / score / race columns and the
# trailing 'winner' label)
train_cols = list(match_df.columns[8:-1])

# The matches query has no ORDER BY, so the row order of match_df is whatever
# the database returned. Sort explicitly before slicing so that the tail
# really holds the latest matches and the split is reproducible.
# NOTE(review): assumes period_id (and match_id within a period) increases
# with time - confirm against the PERIOD table.
matches_by_time = match_df.sort_values(['period_id', 'match_id'])

test_idx = round(len(matches_by_time) * 0.8)

train_df = matches_by_time.iloc[:test_idx]
test_df = matches_by_time.iloc[test_idx:]


In [49]:
# size of the training split (first 80% of the matches)
train_df.shape[0]

285422

In [50]:
# size of the test split (remaining 20% of the matches)
test_df.shape[0]

71355

In [51]:
# Export training and test dataset
train_fname = 'data/train.csv'
test_fname = 'data/test.csv'

# to_csv does not create missing folders, so make sure the target
# directories exist before writing (a fresh checkout may lack data/)
for fname in (train_fname, test_fname):
    os.makedirs(os.path.dirname(fname) or '.', exist_ok=True)

# the row index is dropped; only the columns are written
train_df.to_csv(train_fname, index=False)
test_df.to_csv(test_fname, index=False)

In [52]:
# Final column layout of match_df; 'winner' (the label) is last after rearrange_columns.
match_df.columns

Index(['match_id', 'period_id', 'pla_id', 'plb_id', 'score_a', 'score_b',
       'race_a', 'race_b', 'comp_rat_a', 'comp_rat_vp_a', 'comp_rat_vt_a',
       'comp_rat_vz_a', 'position_a', 'position_vp_a', 'position_vt_a',
       'position_vz_a', 'comp_rat_b', 'comp_rat_vp_b', 'comp_rat_vt_b',
       'comp_rat_vz_b', 'position_b', 'position_vp_b', 'position_vt_b',
       'position_vz_b', 'pla_race_P', 'pla_race_T', 'pla_race_Z', 'plb_race_P',
       'plb_race_T', 'plb_race_Z', 'pla_eff_rating', 'plb_eff_rating',
       'ratings_diff', 'winner'],
      dtype='object')

In [53]:
# peek at the last ten rows of the transformed frame
match_df.tail(10)

Unnamed: 0,match_id,period_id,pla_id,plb_id,score_a,score_b,race_a,race_b,comp_rat_a,comp_rat_vp_a,...,pla_race_P,pla_race_T,pla_race_Z,plb_race_P,plb_race_T,plb_race_Z,pla_eff_rating,plb_eff_rating,ratings_diff,winner
362883,346711,268,13279,317,0,2,P,P,-1000.0,-3000.0,...,1,0,0,1,0,0,-3000.0,1.184659,-3001.184659,0
362884,346712,268,18135,17030,2,0,P,T,-1000.0,0.476942,...,1,0,0,0,1,0,-2000.0,-3000.0,1000.0,1
362885,346713,268,317,18135,3,2,P,P,0.9834,1.184659,...,1,0,0,1,0,0,1.184659,0.476942,0.707718,1
362886,346714,268,2566,1652,2,5,Z,P,1.380382,1.351047,...,0,0,1,1,0,0,1.351047,1.471305,-0.120258,0
362887,346715,268,5938,274,2,0,Z,P,1.009912,1.012771,...,0,0,1,1,0,0,1.012771,0.673791,0.33898,1
362888,346716,268,68,20275,2,1,P,T,1.171764,1.319927,...,1,0,0,0,1,0,0.773591,0.794513,-0.020922,1
362889,346717,268,4134,1652,1,2,P,P,1.47327,1.587658,...,1,0,0,1,0,0,1.587658,1.310743,0.276915,0
362890,346718,268,229,13223,2,1,T,P,0.719868,1.001348,...,0,1,0,1,0,0,1.001348,0.392331,0.609017,1
362891,346720,268,422,13890,2,0,Z,T,-1000.0,0.492184,...,0,0,1,0,1,0,-2000.0,-0.186257,-1999.813743,1
362892,346719,268,2566,20915,2,0,Z,P,1.380382,1.351047,...,0,0,1,1,0,0,1.351047,0.238551,1.112496,1
