In [244]:
import pandas as pd
import xgboost as xgb
import pickle
import datetime

from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import roc_auc_score
from sklearn.pipeline import make_pipeline

In [238]:
df = pd.read_csv('data/prepared_data.csv')
df

Unnamed: 0.1,Unnamed: 0,date,map_name,player_one_race,player_two_race,player_one,player_two,game_length,winner_name,p_one_winner
0,0,2021-09-02,jagannatha le,zerg,zerg,serral,showtime,30.03,serral,1
1,17,2021-09-19,2000 atmospheres le,zerg,zerg,serral,maru,24.05,maru,0
2,18,2021-09-19,oxide le,zerg,zerg,serral,maru,15.04,maru,0
3,19,2021-09-19,lightshade le,zerg,zerg,serral,maru,11.16,maru,0
4,22,2021-04-27,oxide le,protoss,protoss,trap,dpgcure,11.19,dpgcure,0
...,...,...,...,...,...,...,...,...,...,...
12832,24665,2021-01-16,lightshade le,terran,terran,uwuthermal,gostephano,11.03,uwuthermal,1
12833,24666,2021-01-14,romanticide le,zerg,zerg,railgan,iiiiiiiiiiii,13.36,railgan,1
12834,24675,2015-05-07,coda le (void),zerg,zerg,bop,iiiiiiiiiiii,14.58,iiiiiiiiiiii,0
12835,24756,2015-04-05,coda le (void),zerg,zerg,kelazhur,masa,09.01,kelazhur,1


In [239]:
df.drop(['Unnamed: 0', 'player_one_race', 'player_two_race', 'date', 'game_length', 'winner_name'], axis=1, inplace=True)

X = df.drop("p_one_winner", axis=1)
y = df.p_one_winner

In [241]:
#encode players and map names
enc2 = OneHotEncoder(handle_unknown='ignore')
enc = OneHotEncoder(handle_unknown='ignore')
enc2 = enc2.fit(X)
X = enc.fit_transform(X)

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1121218)

In [133]:
xgb_cl = xgb.XGBClassifier()

# Fit
xgb_cl.fit(X_train, y_train)

# Predict
preds = xgb_cl.predict(X_test)

# Score
accuracy_score(y_test, preds)

0.6286604361370717

In [141]:
param_grid = {
    "max_depth": [3, 4, 5, 7],
    "learning_rate": [0.1, 0.01, 0.05],
    "gamma": [0, 0.25, 1],
    "reg_lambda": [0, 1, 10],
    "scale_pos_weight": [1, 3, 5],
    "subsample": [0.8],
    "colsample_bytree": [0.5],
}

# Init classifier
xgb_cl = xgb.XGBClassifier(objective="binary:logistic")

# Init Grid Search
grid_cv = GridSearchCV(xgb_cl, param_grid, n_jobs=-1, cv=3, scoring="roc_auc")

# Fit
_ = grid_cv.fit(X, y)

In [142]:
grid_cv.best_score_

0.6306748778687297

In [16]:
grid_cv.best_params_

{'colsample_bytree': 0.5,
 'gamma': 1,
 'learning_rate': 0.1,
 'max_depth': 7,
 'reg_lambda': 0,
 'scale_pos_weight': 1,
 'subsample': 0.8}

In [143]:
# Insert the new fixed values to the grid
param_grid["scale_pos_weight"] = [0.5]

# Give new value ranges to other params
param_grid["gamma"] = [1.5]
param_grid["max_depth"] = [8]
param_grid["reg_lambda"] = [0.2, 0.3, 0.4, 0.5]
param_grid["learning_rate"] = [0.8, 0.9, 1, 1.1]

grid_cv_5 = GridSearchCV(xgb_cl, param_grid, 
                         cv=3, scoring="roc_auc", n_jobs=-1)

_ = grid_cv_5.fit(X, y)

grid_cv_5.best_score_

0.6551337815115751

In [144]:
grid_cv_5.best_params_

{'colsample_bytree': 0.5,
 'gamma': 1.5,
 'learning_rate': 1,
 'max_depth': 8,
 'reg_lambda': 0.3,
 'scale_pos_weight': 0.5,
 'subsample': 0.8}

Using Sklearns LogReg

In [146]:
#Using Logistic Regression
from sklearn.linear_model import LogisticRegression

# Fit
clf = LogisticRegression(random_state=0, C=0.68, penalty='l2').fit(X_train, y_train)

# Predict
preds = clf.predict(X_test)

# Score
accuracy_score(y_test, preds)

0.6308411214953271

In [95]:
import numpy as np

param_grid = {
    'C' : np.logspace(-4, 4, 50),
    'penalty' : ['l1', 'l2']

}

# Init classifier
clf = LogisticRegression()

# Init Grid Search
grid_cv_2 = GridSearchCV(clf, param_grid, n_jobs=-1, cv=10, scoring="roc_auc")

# Fit
_ = grid_cv_2.fit(X, y)

        nan 0.61088787        nan 0.61120789        nan 0.61166821
        nan 0.61229969        nan 0.61327484        nan 0.61444226
        nan 0.61608484        nan 0.61822668        nan 0.62104403
        nan 0.62449147        nan 0.62877086        nan 0.63363843
        nan 0.63889759        nan 0.64423709        nan 0.64933837
        nan 0.65392775        nan 0.65780028        nan 0.66071992
        nan 0.66273398        nan 0.66382762        nan 0.66412437
        nan 0.66384061        nan 0.66296777        nan 0.66169297
        nan 0.66035211        nan 0.65879561        nan 0.65740786
        nan 0.6558935         nan 0.65473114        nan 0.6536468
        nan 0.65269411        nan 0.65217134        nan 0.6516059
        nan 0.65105329        nan 0.65097069        nan 0.65086906
        nan 0.65048599        nan 0.65033407        nan 0.65010071
        nan 0.65040893        nan 0.65015973        nan 0.64998501
        nan 0.65003877        nan 0.65008855        nan 0.650058

In [96]:
grid_cv_2.best_score_

0.6641243672091692

In [97]:
grid_cv_2.best_params_

{'C': 0.5689866029018293, 'penalty': 'l2'}

Using Gradient Boosting

In [145]:
from sklearn.ensemble import GradientBoostingClassifier

#gradient boosting
grd_clf = GradientBoostingClassifier(n_estimators=200, learning_rate=0.3, max_depth=4, random_state=0).fit(X_train, y_train)

grd_clf.score(X_test, y_test)


0.6155763239875389

In [220]:
clf = xgb.XGBClassifier(
     colsample_bytree= 0.5,
     gamma = 1.5,
     learning_rate = 1,
     max_depth = 6,
     reg_lambda = 0.3,
     scale_pos_weight = 0.7,
     subsample = 0.8,
     objective="binary:logistic"
)

#training model on whole dataset
clf.fit(X, y)

preds = clf.predict(X)

accuracy_score(y, preds)

0.6797538365661759

In [242]:
pickle_out = open("models/xgboost_model.pkl", mode = "wb") 
pickle.dump(clf, pickle_out) 
pickle_out.close()

pickle_out = open("models/enc.pkl", mode = "wb") 
pickle.dump(enc, pickle_out) 
pickle_out.close()

pickle_out = open("models/enc2.pkl", mode = "wb") 
pickle.dump(enc2, pickle_out) 
pickle_out.close()

In [236]:

test = pd.DataFrame(data={'map_name':['echo le (void)'], 'player_one':['serral'], 'player_two':['maru']})
test = enc.transform(test)

In [237]:
preds = clf.predict(test)
preds

array([0], dtype=int64)

In [246]:
class DenseTransformer(TransformerMixin):
    def fit(self, X, y=None, fit_params):
        return self

    def transform(self, X, y=None, fit_params):
        return X.todense()

SyntaxError: non-default argument follows default argument (<ipython-input-246-c9a0a428d2d3>, line 3)

In [245]:
pipeline = make_pipeline(enc,  clf)
pipeline.fit(X, y)

TypeError: A sparse matrix was passed, but dense data is required. Use X.toarray() to convert to a dense numpy array.