In [26]:
import numpy as np
import pandas as pd

from statistics import mean

from sklearn.preprocessing import LabelEncoder

from sklearn.feature_selection import SelectKBest, f_classif

from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import GradientBoostingClassifier
from xgboost import XGBClassifier
from sklearn.neural_network import MLPClassifier

from sklearn.model_selection import (
    cross_validate, train_test_split, GridSearchCV, learning_curve, validation_curve
)
from sklearn.metrics import classification_report, accuracy_score

# Import dataset

In [27]:
final = pd.read_csv("../DataFormating/final.csv")

In [28]:
final.head()

Unnamed: 0,Away Team Goals,Away Team Name,Home Team Goals,Home Team Name,Year,home_rank,home_total_points,home_previous_points,home_rank_change,home_cur_year_avg,...,away_cur_year_avg,away_cur_year_avg_weighted,away_last_year_avg,away_last_year_avg_weighted,away_two_year_ago_avg,away_two_year_ago_weighted,away_three_year_ago_avg,away_three_year_ago_weighted,Home Avg Goals,Away Avg Goals
0,0.0,Morocco,2.0,Nigeria,2000.0,76,0.0,444,0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,Denmark,3.0,France,2000.0,3,0.0,765,0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,1.0,Senegal,4.0,Tunisia,2000.0,28,0.0,596,3,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,2.0,England,3.0,Portugal,2000.0,15,0.0,672,0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,Germany,1.0,England,2000.0,12,0.0,695,0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


# Setup `X`, `y` data for training / testing

In [29]:
final.columns

Index(['Away Team Goals', 'Away Team Name', 'Home Team Goals',
       'Home Team Name', 'Year', 'home_rank', 'home_total_points',
       'home_previous_points', 'home_rank_change', 'home_cur_year_avg',
       'home_cur_year_avg_weighted', 'home_last_year_avg',
       'home_last_year_avg_weighted', 'home_two_year_ago_avg',
       'home_two_year_ago_weighted', 'home_three_year_ago_avg',
       'home_three_year_ago_weighted', 'away_rank', 'away_total_points',
       'away_previous_points', 'away_rank_change', 'away_cur_year_avg',
       'away_cur_year_avg_weighted', 'away_last_year_avg',
       'away_last_year_avg_weighted', 'away_two_year_ago_avg',
       'away_two_year_ago_weighted', 'away_three_year_ago_avg',
       'away_three_year_ago_weighted', 'Home Avg Goals', 'Away Avg Goals'],
      dtype='object')

In [30]:
X = final.drop(["Away Team Goals", "Home Team Goals"], axis=1)

y = []
for i in range(len(final)):
    home_team_goals = final["Home Team Goals"][i]
    away_team_goals = final["Away Team Goals"][i]
    
    if home_team_goals > away_team_goals:
        y.append(1)
    elif home_team_goals < away_team_goals:
        y.append(2)
    else:
        y.append(0)

In [31]:
# Test
assert len(X) == len(y)

### Encode textual features from the `X` dataset

In [32]:
team_name_encoder = LabelEncoder().fit(
    list(X["Home Team Name"]) + list(X["Away Team Name"])
)

X["Home Team Name"] = team_name_encoder.transform(X["Home Team Name"])
X["Away Team Name"] = team_name_encoder.transform(X["Away Team Name"])

### Feature Selection

In [33]:
X.columns

Index(['Away Team Name', 'Home Team Name', 'Year', 'home_rank',
       'home_total_points', 'home_previous_points', 'home_rank_change',
       'home_cur_year_avg', 'home_cur_year_avg_weighted', 'home_last_year_avg',
       'home_last_year_avg_weighted', 'home_two_year_ago_avg',
       'home_two_year_ago_weighted', 'home_three_year_ago_avg',
       'home_three_year_ago_weighted', 'away_rank', 'away_total_points',
       'away_previous_points', 'away_rank_change', 'away_cur_year_avg',
       'away_cur_year_avg_weighted', 'away_last_year_avg',
       'away_last_year_avg_weighted', 'away_two_year_ago_avg',
       'away_two_year_ago_weighted', 'away_three_year_ago_avg',
       'away_three_year_ago_weighted', 'Home Avg Goals', 'Away Avg Goals'],
      dtype='object')

In [34]:
feature_names = []

COLUMNS = []
for column_name in X.columns:
    for feature_name in feature_names:
        if feature_name in column_name:
            COLUMNS.append(column_name)
            break

            
COLUMNS = [
    'Away Team Name',
    'Home Team Name',

    'home_rank',
    'home_total_points',
    'home_cur_year_avg',
    'home_cur_year_avg_weighted',
    
    'away_rank',
    'away_total_points',
    'away_cur_year_avg',
    'away_cur_year_avg_weighted',
    
    'Home Avg Goals',
    'Away Avg Goals'
]
X = X[COLUMNS]

In [35]:
COLUMNS

['Away Team Name',
 'Home Team Name',
 'home_rank',
 'home_total_points',
 'home_cur_year_avg',
 'home_cur_year_avg_weighted',
 'away_rank',
 'away_total_points',
 'away_cur_year_avg',
 'away_cur_year_avg_weighted',
 'Home Avg Goals',
 'Away Avg Goals']

### Split `X` and `y` into train / test sets

In [36]:
len(X)

1376

In [37]:
len(X.columns)

12

In [38]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33)

# Fast testing

In [39]:
def test_model(model, cv=10):
    cv_scores = cross_validate(model, X, y, cv=cv)
    
    mean_train_acc = mean(cv_scores["train_score"]) 
    mean_test_acc = mean(cv_scores["test_score"])
    
    print()
    print("Train Accuracy: ", mean_train_acc)
    print("Test Accuracy: ", mean_test_acc)
    print()
    
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    print(classification_report(y_test, y_pred))
    print()
    print("Normal split accuracy score: ", accuracy_score(y_test, y_pred))

### Logistic Regression

In [40]:
test_model(LogisticRegression())


Train Accuracy:  0.5217219598704846
Test Accuracy:  0.513144599409986

             precision    recall  f1-score   support

          0       0.44      0.06      0.11       129
          1       0.54      0.78      0.64       201
          2       0.43      0.50      0.46       125

avg / total       0.48      0.50      0.44       455


Normal split accuracy score:  0.4989010989010989




### K-Nearest Neighbors

In [16]:
test_model(KNeighborsClassifier(n_neighbors=5))


Train Accuracy:  0.6228208069219797
Test Accuracy:  0.4171386931375616

             precision    recall  f1-score   support

          0       0.35      0.41      0.38       115
          1       0.55      0.62      0.58       209
          2       0.37      0.24      0.29       131

avg / total       0.45      0.46      0.45       455


Normal split accuracy score:  0.45714285714285713




### Random Forests

In [17]:
model = RandomForestClassifier(n_estimators=500, max_depth=3, bootstrap=True, n_jobs=-1)

test_model(model)




Train Accuracy:  0.523175005570466
Test Accuracy:  0.49860354683005514

             precision    recall  f1-score   support

          0       0.00      0.00      0.00       115
          1       0.51      0.89      0.65       209
          2       0.40      0.27      0.33       131

avg / total       0.35      0.49      0.39       455


Normal split accuracy score:  0.4901098901098901


  'precision', 'predicted', average, warn_for)


### Support Vector Machines

In [18]:
test_model(SVC(C=1.0, kernel="rbf", gamma="auto"))


Train Accuracy:  0.9611614585003122
Test Accuracy:  0.47742851754402105

             precision    recall  f1-score   support

          0       0.67      0.14      0.23       115
          1       0.50      0.92      0.65       209
          2       0.67      0.24      0.36       131

avg / total       0.59      0.53      0.46       455


Normal split accuracy score:  0.5296703296703297




### Extremely Randomized Trees

In [19]:
test_model(
    ExtraTreesClassifier(n_estimators=300, max_depth=3, bootstrap=True, n_jobs=-1)
)




Train Accuracy:  0.46802424605126736
Test Accuracy:  0.45428726072850156

             precision    recall  f1-score   support

          0       0.00      0.00      0.00       115
          1       0.47      0.95      0.63       209
          2       0.41      0.11      0.17       131

avg / total       0.34      0.47      0.34       455


Normal split accuracy score:  0.46813186813186813


  'precision', 'predicted', average, warn_for)


### Gradient Boosting Machines

In [20]:
model = GradientBoostingClassifier(n_estimators=200, max_depth=3, learning_rate=0.1)

test_model(model)




Train Accuracy:  0.810968778406586
Test Accuracy:  0.4636080393996823

             precision    recall  f1-score   support

          0       0.41      0.31      0.36       115
          1       0.58      0.70      0.64       209
          2       0.49      0.44      0.47       131

avg / total       0.51      0.53      0.52       455


Normal split accuracy score:  0.5274725274725275


### XGBoost

In [42]:
# best_model = XGBClassifier(n_estimators=500, max_depth=3, learning_rate=0.01, n_jobs=-1)

model = XGBClassifier(n_estimators=300, max_depth=3, learning_rate=0.01, n_jobs=-1)

test_model(model)

  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:



Train Accuracy:  0.5978698466132304
Test Accuracy:  0.5013964531699449

             precision    recall  f1-score   support

          0       0.22      0.05      0.09       129
          1       0.53      0.77      0.63       201
          2       0.42      0.45      0.43       125

avg / total       0.41      0.48      0.42       455


Normal split accuracy score:  0.47692307692307695


  if diff:


### AdaBoost with Decision Tree

In [23]:
tree = DecisionTreeClassifier()
ada = AdaBoostClassifier(tree, n_estimators=100, learning_rate=0.1)

test_model(ada)




Train Accuracy:  0.9773095177600445
Test Accuracy:  0.45642422123682125

             precision    recall  f1-score   support

          0       0.34      0.25      0.29       115
          1       0.59      0.64      0.61       209
          2       0.43      0.46      0.44       131

avg / total       0.48      0.49      0.48       455


Normal split accuracy score:  0.4901098901098901


### Neural Network

In [43]:
test_model(
    MLPClassifier(
        hidden_layer_sizes=(60,),
        activation="logistic",
        solver="adam",
        alpha=0.001
    )
)


Train Accuracy:  0.5286632424397011
Test Accuracy:  0.4999256349396186

             precision    recall  f1-score   support

          0       0.47      0.07      0.12       129
          1       0.51      0.81      0.63       201
          2       0.43      0.40      0.41       125

avg / total       0.48      0.49      0.42       455


Normal split accuracy score:  0.4879120879120879


