In [1]:
import numpy as np
import pandas as pd

from sklearn.linear_model import LogisticRegression
from sklearn.neural_network import MLPClassifier
from sklearn.preprocessing import LabelEncoder

import pickle

# Import dataset

In [2]:
# Relative path of the pre-formatted match dataset.
DATASET_CSV = "../DataFormating/final.csv"
data = pd.read_csv(DATASET_CSV)

In [3]:
# Preview the first rows to sanity-check the load.
data.head()

Unnamed: 0,Away Team Goals,Away Team Name,Home Team Goals,Home Team Name,Year,home_rank,home_total_points,home_previous_points,home_rank_change,home_cur_year_avg,...,away_cur_year_avg,away_cur_year_avg_weighted,away_last_year_avg,away_last_year_avg_weighted,away_two_year_ago_avg,away_two_year_ago_weighted,away_three_year_ago_avg,away_three_year_ago_weighted,Home Avg Goals,Away Avg Goals
0,0.0,Morocco,2.0,Nigeria,2000.0,76,0.0,444,0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,Denmark,3.0,France,2000.0,3,0.0,765,0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,1.0,Senegal,4.0,Tunisia,2000.0,28,0.0,596,3,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,2.0,England,3.0,Portugal,2000.0,15,0.0,672,0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,Germany,1.0,England,2000.0,12,0.0,695,0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


# Set `X` and `y`

In [4]:
# List every column name available for building the features and labels.
data.columns

Index(['Away Team Goals', 'Away Team Name', 'Home Team Goals',
       'Home Team Name', 'Year', 'home_rank', 'home_total_points',
       'home_previous_points', 'home_rank_change', 'home_cur_year_avg',
       'home_cur_year_avg_weighted', 'home_last_year_avg',
       'home_last_year_avg_weighted', 'home_two_year_ago_avg',
       'home_two_year_ago_weighted', 'home_three_year_ago_avg',
       'home_three_year_ago_weighted', 'away_rank', 'away_total_points',
       'away_previous_points', 'away_rank_change', 'away_cur_year_avg',
       'away_cur_year_avg_weighted', 'away_last_year_avg',
       'away_last_year_avg_weighted', 'away_two_year_ago_avg',
       'away_two_year_ago_weighted', 'away_three_year_ago_avg',
       'away_three_year_ago_weighted', 'Home Avg Goals', 'Away Avg Goals'],
      dtype='object')

In [5]:
# Features: every column except the two goal counts, which define the label.
X = data.drop(["Away Team Goals", "Home Team Goals"], axis=1)

# Labels: 1 = home win, 2 = away win, 0 = draw.
# Whole-column comparisons replace the per-row `.iloc` loop. Rows where neither
# comparison is true (equal scores, or a missing goal value, for which both
# comparisons are False) fall through to the default 0.
home_goals = data["Home Team Goals"]
away_goals = data["Away Team Goals"]
y = np.select(
    [(home_goals > away_goals).values, (home_goals < away_goals).values],
    [1, 2],
    default=0,
).tolist()  # keep `y` a plain list of Python ints

In [6]:
# Sanity check: exactly one label per feature row.
assert X.shape[0] == len(y)

### Encode textual features from the `X` dataset

In [7]:
# Fixed list of team names that is appended to the names found in the dataset
# when the encoder vocabulary (`team_names`) is built.
# NOTE(review): this looks like the 32-team line-up of the 2018 FIFA World Cup,
# except that "Korea DPR" appears where "Korea Republic" would be expected --
# confirm against the naming used in the dataset before relying on it.
word_cup_teams = [
    "Egypt",
    "Morocco",
    "Nigeria",
    "Senegal",
    "Tunisia",
    "Australia",
    "IR Iran",
    "Japan",
    "Korea DPR",
    "Saudi Arabia",
    "Belgium",
    "Croatia",
    "Denmark",
    "England",
    "France",
    "Germany",
    "Iceland",
    "Poland",
    "Portugal",
    "Russia",
    "Serbia",
    "Spain",
    "Sweden",
    "Switzerland",
    "Costa Rica",
    "Mexico",
    "Panama",
    "Argentina",
    "Brazil",
    "Colombia",
    "Peru",
    "Uruguay"
]

# Encoder vocabulary: home-side names, then away-side names, then the fixed
# `word_cup_teams` list. Duplicates are kept as-is.
team_names = []
for name_source in (
    data["Home Team Name"].unique(),
    data["Away Team Name"].unique(),
    word_cup_teams,
):
    team_names.extend(name_source)

In [8]:
# Fit a single label encoder over the combined team-name vocabulary.
team_name_encoder = LabelEncoder()
team_name_encoder.fit(team_names)

In [9]:
# Replace the team-name strings in X with their integer codes.
# The names are read from the untouched `data` frame and a new frame is bound
# to `X`, so re-running this cell never feeds already-encoded integers back
# into `transform` (which would fail on them as unknown labels).
X = X.assign(**{
    "Home Team Name": team_name_encoder.transform(data["Home Team Name"]),
    "Away Team Name": team_name_encoder.transform(data["Away Team Name"]),
})

### Feature Selection

In [10]:
# Feature columns kept for training. The order of this list is the column
# order of the frame later passed to `model.fit`.
COLUMNS = [
    'Away Team Name',
    'Home Team Name',

    'home_rank',
    'home_total_points',
    'home_cur_year_avg',
    'home_cur_year_avg_weighted',

    'away_rank',
    'away_total_points',
    'away_cur_year_avg',
    'away_cur_year_avg_weighted',

    'Home Avg Goals',
    'Away Avg Goals'
]

# Restrict X to the selected features.
X = X[COLUMNS]

In [11]:
# Display the selected feature columns.
COLUMNS

['Away Team Name',
 'Home Team Name',
 'home_rank',
 'home_total_points',
 'home_cur_year_avg',
 'home_cur_year_avg_weighted',
 'away_rank',
 'away_total_points',
 'away_cur_year_avg',
 'away_cur_year_avg_weighted',
 'Home Avg Goals',
 'Away Avg Goals']

# Training Session

In [12]:
# Check dtypes and non-null counts of the selected features.
X.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1376 entries, 0 to 1375
Data columns (total 12 columns):
Away Team Name                1376 non-null int64
Home Team Name                1376 non-null int64
home_rank                     1376 non-null int64
home_total_points             1376 non-null float64
home_cur_year_avg             1376 non-null float64
home_cur_year_avg_weighted    1376 non-null float64
away_rank                     1376 non-null int64
away_total_points             1376 non-null float64
away_cur_year_avg             1376 non-null float64
away_cur_year_avg_weighted    1376 non-null float64
Home Avg Goals                1376 non-null float64
Away Avg Goals                1376 non-null float64
dtypes: float64(8), int64(4)
memory usage: 129.1 KB


In [13]:
# Summary statistics of the selected features.
# NOTE(review): in the recorded output, `*_cur_year_avg` and
# `*_cur_year_avg_weighted` show identical values in every statistic, for both
# home and away -- presumably the two columns are duplicates; verify against
# the upstream formatting step.
X.describe()

Unnamed: 0,Away Team Name,Home Team Name,home_rank,home_total_points,home_cur_year_avg,home_cur_year_avg_weighted,away_rank,away_total_points,away_cur_year_avg,away_cur_year_avg_weighted,Home Avg Goals,Away Avg Goals
count,1376.0,1376.0,1376.0,1376.0,1376.0,1376.0,1376.0,1376.0,1376.0,1376.0,1376.0,1376.0
mean,15.388081,14.797965,25.114826,313.31702,159.992071,159.992071,27.063953,310.246824,158.726374,158.726374,0.787147,0.604247
std,10.178364,9.578303,21.412721,477.040213,249.79163,249.79163,22.966828,475.36764,248.195149,248.195149,1.075932,0.865986
min,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
25%,5.0,7.0,8.0,0.0,0.0,0.0,8.0,0.0,0.0,0.0,0.0,0.0
50%,16.0,14.0,20.0,0.0,0.0,0.0,21.0,0.0,0.0,0.0,0.0,0.0
75%,24.0,23.0,37.0,697.79,342.1075,342.1075,40.0,676.4725,345.26,345.26,1.333333,1.0
max,32.0,32.0,139.0,1725.29,1090.54,1090.54,139.0,1725.29,1090.54,1090.54,7.0,5.0


In [14]:
# Logistic regression classifier, constructed with no arguments (library defaults).
model = LogisticRegression()

In [15]:
# Fit on every row of X; no held-out split is made in the visible cells.
model.fit(X, y)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

# Save model and encoders

In [16]:
# Persist the fitted model first, then the team-name encoder, each to its own
# pickle file in the current working directory.
for artifact_path, artifact in (
    ("model.b", model),
    ("team_name_encoder.b", team_name_encoder),
):
    with open(artifact_path, "wb") as sink:
        pickle.dump(artifact, sink)