In [None]:
import numpy as np
import matplotlib
import matplotlib.pyplot as plt
import pandas as pd
from sklearn import datasets, linear_model
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
%matplotlib inline

In [None]:
# Read the raw economy table from disk. low_memory=False asks pandas to infer
# each column's dtype from the whole file rather than chunk by chunk.
df = pd.read_csv('./economy.csv', low_memory=False)

# Column labels exactly as they were loaded from the CSV.
colnames = df.columns.tolist()

In [None]:
# For every row (match), find the label of the last column that is not NaN.
# The result is a Series indexed like df whose values are column labels;
# the cells below treat that column as the last round that has a winner.
winners = df.apply(pd.Series.last_valid_index, axis=1)

In [None]:
# Build the match-winner column: for each row, read the value stored in that
# row's last non-null column (the column label recorded in `winners`).
# Per the original note, that value is 1 or 2 (team 1 won / team 2 won).
#
# `df.at[row, col]` is a single label-based scalar lookup. It replaces the
# chained `df[col][i]` form, which materialised a whole column per row and
# silently assumed the row labels were 0..n-1.
winnerarr = np.array(winners)  # plain-array copy of the column labels, kept for later cells
winners_col = [
    df.at[row_label, col_label]
    for row_label, col_label in winners.items()
]
# To store the winning team's *name* instead of the 1/2 flag, look up
# df.at[row_label, "team_" + str(int(flag))] for each flag collected above.

In [None]:
# Attach the per-match winner values as a new trailing column named 'match_winner'.
df = df.assign(match_winner=winners_col)

In [None]:
# Labels of the per-round winner columns: '1_winner' through '30_winner'.
# The match winner has already been extracted from them, so these columns
# are no longer needed and are dropped in the next cell.
round_win_col_names = [f'{round_no}_winner' for round_no in range(1, 31)]

In [None]:
# Remove the per-round winner columns listed in round_win_col_names.
df = df.drop(round_win_col_names, axis='columns')

In [None]:
# Match metadata and identifier columns that are removed from the frame.
metadata_cols = ["best_of", "date", "t2_start", "match_id", "event_id"]
df = df.drop(columns=metadata_cols)

# Drop the rounds past halftime: rounds 16-30 for team 1, then for team 2
# ('16_t1' ... '30_t1', '16_t2' ... '30_t2').
droprounds = [f'{rnd}_{team}' for team in ('t1', 't2') for rnd in range(16, 31)]
df = df.drop(columns=droprounds)

In [None]:
# Preview the first six rows of the trimmed frame.
df.head(6)

In [None]:
# Refresh colnames so it reflects the columns of the current (trimmed) df.
colnames = df.columns.tolist()

In [None]:
# --- Build the feature matrix X and the target y ----------------------------
# Every column except the target is a feature. Excluding the target by name
# (instead of "everything but the last column") keeps it out of X even if the
# column order changes.
xnames = [name for name in colnames if name != 'match_winner']

# .copy() gives X its own data, so the column assignments below modify X
# itself and do not raise pandas' SettingWithCopyWarning.
X = df[xnames].copy()

# Give each team an integer id. Ids are assigned in order of first appearance
# (team_1 column, then team_2), so the mapping is identical on every run;
# iterating a set of strings would reorder the ids between kernel restarts.
team_names = pd.concat([X['team_1'], X['team_2']], ignore_index=True).unique()
set_of_teams = set(team_names)  # unique teams found in the dataset
team_to_num = {team: team_id for team_id, team in enumerate(team_names)}  # team name -> id

# Replace each team's name with its id. Every name is a key of team_to_num,
# so .map is a direct lookup and yields integer columns straight away.
X['team_1'] = X['team_1'].map(team_to_num)
X['team_2'] = X['team_2'].map(team_to_num)

# NOTE(review): match_winner is built from the last round-winner value (the
# loop that creates it documents this as 1 or 2), not from a team name, so
# this replace looks like a no-op -- confirm before removing it.
y = df["match_winner"].replace(team_to_num)

# t1_start is either 't' or 'ct'. t2_start was dropped because it can be
# inferred from the side team 1 started on.
# Encode as a binary flag: 1 if team 1 started as t, 0 otherwise.
X['t1_t'] = (X['t1_start'] == 't').astype(int)
X = X.drop(columns='t1_start')

# Replace map names with integer ids, numbered in order of first appearance.
map_names = X['_map'].unique()
map_to_num = {map_name: map_id for map_id, map_name in enumerate(map_names)}
X['_map'] = X['_map'].map(map_to_num)

X # show X

In [None]:
# Display the target series built from df["match_winner"].
y # show y

In [None]:
# Hold out a test split (default test size); the fixed seed keeps the split
# reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)

# Standardise the features. The scaler learns its mean and variance from the
# training split only and the same fitted transform is then applied to both
# splits, so no statistics from the test data leak into the training features.
scaler = StandardScaler().fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)