In [144]:
from sklearn.metrics import accuracy_score
from sklearn.model_selection import cross_val_score, KFold, train_test_split
from xgboost import XGBClassifier
import numpy as np
import pandas as pd

In [145]:
# Load the `spi_matches_latest` CSV from the FiveThirtyEight soccer API, then
# discard every row that has at least one missing value.
df = pd.read_csv(
    "https://projects.fivethirtyeight.com/soccer-api/club/spi_matches_latest.csv"
)
df = df.dropna()

In [146]:
# Absolute difference between `score1` and `score2`; it is zero exactly when
# the two scores are equal.
df['result'] = (df['score1'] - df['score2']).abs()

In [147]:
# Empty container for the per-match TIE / RESULT labels.
results = list()

In [148]:
# Label every match "TIE" when its absolute score difference is zero and
# "RESULT" otherwise, in row order.
# Slice assignment replaces the list's contents in place, so running this cell
# a second time does not append another copy of the labels (which would make
# the later column assignment fail on a length mismatch).
# Expects df['result'] to still hold the numeric difference.
results[:] = ["TIE" if diff == 0 else "RESULT" for diff in df['result']]

In [149]:
# Overwrite the numeric difference in `result` with the TIE / RESULT labels
# (one label per row, in row order).
df['result'] = results

In [150]:
# Empty container for the per-match tier labels.
ties = list()

In [151]:
# Summary statistics of the `probtie` column; the mean and standard deviation
# place the tier cut-offs. ddof=0 is the population standard deviation, which
# is what np.std computes.
mean_pt = df['probtie'].mean()
max_pt = df['probtie'].max()
sd_pt = df['probtie'].std(ddof=0)

In [152]:
# Bucket each row's `probtie` value against the column's mean and standard
# deviation:
#   TIER 1: below the mean
#   TIER 2: from the mean up to one standard deviation above it
#   TIER 3: from one up to 1.5 standard deviations above the mean
#   TIER 4: everything else
# The conditions are tested in order, so each branch only needs its upper
# bound. Slice assignment replaces the list's contents in place, so running
# this cell again cannot append a second copy of the labels.
ties[:] = [
  "TIER 1" if p < mean_pt
  else "TIER 2" if p < mean_pt + sd_pt
  else "TIER 3" if p < mean_pt + sd_pt * 1.5
  else "TIER 4"
  for p in df['probtie']
]

In [153]:
# Store each row's tier label in a new `tie_prob` column (one label per row,
# in row order); it is used as the classification target below.
df['tie_prob'] = ties

In [154]:
# Features: the `prob1` and `prob2` columns. Target: the derived tier label.
# NOTE(review): the tier comes from `probtie`; if prob1 + prob2 + probtie
# always sums to one in this data (TODO confirm), the target is a
# deterministic function of the features and a high accuracy is expected.
X = df.loc[:, ['prob1', 'prob2']]
y = df.loc[:, 'tie_prob']

# Hold out 20% of the rows for testing; the seed fixes the split.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=6,
)

In [155]:
# XGBoost classifier with max_depth set to 6; no other hyperparameter is
# passed, so the rest stay at the library's defaults.
model = XGBClassifier(max_depth=6)

In [156]:
# Fit on the training split. As the last expression in the cell, the call's
# return value is displayed as the cell output.
model.fit(X_train, y_train)

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, gamma=0,
              learning_rate=0.1, max_delta_step=0, max_depth=6,
              min_child_weight=1, missing=None, n_estimators=100, n_jobs=1,
              nthread=None, objective='multi:softprob', random_state=0,
              reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
              silent=None, subsample=1, verbosity=1)

In [157]:
# Predict a tier label for every row of the held-out split.
predictions = model.predict(X_test)

In [158]:
# Compare the predicted tiers with the derived tier labels of the held-out rows.
accuracy = accuracy_score(y_test, predictions)

In [159]:
# Report the hold-out accuracy as a percentage rounded to three decimals.
# The precision belongs to np.round; passed to str.format it is silently
# ignored and the value is rounded to a whole number instead.
print("Test Accuracy: {}%".format(np.round(accuracy * 100, 3)))

Test Accuracy: 93.0%


In [160]:
# 10-fold splitter. The shuffle is seeded so that the fold assignment, and
# therefore the cross-validation score, is the same on every run (same seed
# value as the train/test split).
kfold = KFold(n_splits=10, shuffle=True, random_state=6)

In [161]:
# Average of the per-fold scores obtained by cross-validating the classifier
# on the full feature / target set.
cv_score = np.mean(cross_val_score(model, X, y, cv=kfold))

In [162]:
# Show the mean cross-validation score, unrounded.
print("Cross-Validation Score: {}".format(cv_score))

Cross-Validation Score: 0.95239489660997


In [163]:
# Accumulator for the accuracies produced by random_data().
acc = list()

In [164]:
def random_data(df, n_runs=100):
  """Score the already-fitted global ``model`` on repeated random hold-out samples.

  The CSV at ``df`` is read, rows with any missing value are dropped and only
  rows with ``season <= 2018`` are kept. Every row is given a tier label from
  its ``probtie`` value, relative to the mean and population standard
  deviation of ``probtie`` in this (filtered) data:

    TIER 1: below the mean
    TIER 2: from the mean up to one standard deviation above it
    TIER 3: from one up to 1.5 standard deviations above the mean
    TIER 4: everything else

  ``model`` is NOT refitted here. Each run draws a different 20% sample of the
  rows (``random_state`` equal to the run index, starting at 0) and measures
  how often ``model.predict`` on ``prob1`` / ``prob2`` matches the tier label.

  Requires the module-level names ``model`` (a fitted classifier) and ``acc``
  (a list) to exist when called.

  Parameters
  ----------
  df : str or path-like
      Location of the matches CSV (anything ``pd.read_csv`` accepts). Despite
      the name, this is not a DataFrame.
  n_runs : int, default 100
      Number of hold-out samples to score.

  Returns
  -------
  list of float
      The accuracies of this call, in run order. Each value is also appended
      to the module-level ``acc`` list, which later cells read.
  """
  matches = pd.read_csv(df).dropna()
  # .copy() gives an independent frame, so adding a column below does not
  # write to a filtered selection of the original (SettingWithCopyWarning).
  matches = matches[matches.season <= 2018].copy()

  # Tier cut-offs; np.std on a Series is the population (ddof=0) deviation.
  mean_pt = np.mean(matches['probtie'])
  sd_pt = np.std(matches['probtie'])

  def tier_of(p):
    # Checks run in order, so each one only needs its upper bound.
    if p < mean_pt:
      return "TIER 1"
    if p < mean_pt + sd_pt:
      return "TIER 2"
    if p < mean_pt + sd_pt * 1.5:
      return "TIER 3"
    return "TIER 4"

  # The labels and the feature/target frames do not depend on the run, so
  # they are built once instead of once per iteration.
  matches['tie_prob'] = [tier_of(p) for p in matches['probtie']]
  X = matches[['prob1', 'prob2']]
  y = matches['tie_prob']

  run_scores = []
  for seed in range(n_runs):
    # Only the held-out 20% is used; nothing is fitted on the other 80%.
    _, X_test, _, y_test = train_test_split(X, y, test_size=0.2, random_state=seed)
    score = accuracy_score(y_test, model.predict(X_test))
    # Append as we go so partial results survive an interrupted run.
    acc.append(score)
    run_scores.append(score)

  return run_scores

In [165]:
# Run the repeated evaluation on the `spi_matches.csv` file (not the `_latest`
# file loaded at the top); the per-run accuracies accumulate in `acc`.
run = random_data("https://projects.fivethirtyeight.com/soccer-api/club/spi_matches.csv")

In [166]:
# Lowest, average and highest of the accuracies collected in `acc`.
min_acc, mean_acc, max_acc = np.min(acc), np.mean(acc), np.max(acc)

In [168]:
# Print the spread of the collected accuracies as raw, unrounded fractions.
print("Accuracy Values:")
print("Min: {} | Mean: {} | Max: {}".format(min_acc, mean_acc, max_acc))

Accuracy Values:
Min: 0.931214421252372 | Mean: 0.9430597722960151 | Max: 0.9535104364326376
