Importing Libraries

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import chardet
import IPython
import chardet

from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier 
from sklearn.metrics import accuracy_score

# Support Vector Machine (SVM)
from sklearn.svm import SVC

# Naive Bayes
from sklearn.naive_bayes import GaussianNB

# Decision Tree
from sklearn.tree import DecisionTreeClassifier


Importing Data and Converting it to a DataFrame

In [None]:

path = 'LaLiga_Matches.csv'

# Sample the first 100000 bytes and let chardet guess the file's encoding.
with open(path, 'rb') as rawdata:
    encoding_sample = rawdata.read(100000)
result = chardet.detect(encoding_sample)

# NOTE(review): `result` is not consulted here; the encoding passed to
# read_csv is hard-coded. Confirm it matches what chardet reports for this file.
matches = pd.read_csv(path, encoding='Windows-1252')


Data Attributes 

⦁	'Date'
⦁	'HomeTeam'
⦁	'AwayTeam'
⦁	'FTHG' [Full-time home goals]
⦁	'FTAG' [Full-time away goals]
⦁	'FTR' [Full-time result]
⦁	'HTHG' [Half-time home goals]
⦁	'HTAG' [Half-time away goals]
⦁	'HTR' [Half-time result]


In [None]:
# Column names of the loaded table, as a plain Python list.
matches.columns.tolist()

Data Cleaning and Preprocessing

Points scheme: the winning team (home or away) gets 3 points and each team gets 1 point for a tie.
All other cells are 0.

In [None]:
# Build one long-format points table per outcome (home win, away win, and both
# sides of a tie), each with the columns: date, Season, team, points.

# Outcome masks derived from the full-time score.
home_won = matches.FTHG > matches.FTAG
away_won = matches.FTHG < matches.FTAG
drawn = matches.FTHG == matches.FTAG

# The selections below need per-match points columns that no earlier visible
# cell creates, so a fresh run failed with a KeyError. Derive them here
# (3 for a win, 1 for a tie, 0 otherwise) unless the loaded data already
# provides them. `assign` returns a new frame, so `matches` is left untouched.
derived_points = {
    'home_team_points': np.where(home_won, 3, 0),
    'away_team_points': np.where(away_won, 3, 0),
    'tie_points': np.where(drawn, 1, 0),
}
points_source = matches.assign(
    **{name: values for name, values in derived_points.items()
       if name not in matches.columns}
)

Home_win = points_source.loc[home_won, ['Date', 'Season', 'HomeTeam', 'home_team_points']]
Away_win = points_source.loc[away_won, ['Date', 'Season', 'AwayTeam', 'away_team_points']]
Home_tie = points_source.loc[drawn, ['Date', 'Season', 'HomeTeam', 'tie_points']]
Away_tie = points_source.loc[drawn, ['Date', 'Season', 'AwayTeam', 'tie_points']]

# Give all four tables the same column names so they can be combined later.
points_columns = ['date', 'Season', 'team', 'points']
Home_win.columns = points_columns
Away_win.columns = points_columns
Home_tie.columns = points_columns
Away_tie.columns = points_columns

In [None]:
# (rows, columns) of the match table.
matches.shape

In [None]:
# Embed the externally hosted Flourish visualisation (plus its credit link) in the notebook output.
# The markup is kept as a single literal; rendering it needs network access to public.flourish.studio.
iframe = "<iframe src='https://public.flourish.studio/visualisation/6430207/' style='width:100%;height:600px;'></iframe><div style='width:100%!;margin-top:4px!important;text-align:right!important;'><a class='flourish-credit' href='https://public.flourish.studio/visualisation/6430207/' target='_top' style='text-decoration:none!important'><img alt='Made with Flourish' src='https://public.flourish.studio/resources/made_with_flourish.svg' style='width:105px!important;height:16px!important;border:none!important;margin:0!important;'> </a></div>"
IPython.display.HTML(iframe)

In [None]:
# Drop the half-time goal counts.
# errors='ignore' keeps the cell re-runnable: without it, executing the cell a
# second time raised KeyError because the columns were already gone.
matches = matches.drop(columns=['HTHG', 'HTAG'], errors='ignore')


In [None]:
# Distinct values found in the Season column.
matches['Season'].unique()


In [None]:
# Preview the first rows of the match table.
matches.head()

In [None]:
# Show every row that has a missing value in at least one column.
matches.loc[matches.isna().any(axis="columns")]

In [None]:

# Set the half-time result to 'D' for two specific index labels.
# NOTE(review): presumably these are the rows reported by the null check — confirm.
# Assigning through .loc with a label that is not in the index appends a new row.
for row_label in (136, 1472):
    matches.loc[row_label, 'HTR'] = 'D'

In [None]:
# Count the missing values remaining in each column.
matches.isna().sum()

In [None]:
# Points earned by each side, derived from the full-time result code (H/D/A).
matches['resultHome'] = matches['FTR'].map({'H': 3, 'A': 0, 'D': 1})
matches['resultAway'] = matches['FTR'].map({'H': 0, 'A': 3, 'D': 1})

# Classification target. The previous definition, resultHome + resultAway,
# produced 3 for BOTH home and away wins (3+0 and 0+3) and 2 for draws, so the
# two kinds of win were indistinguishable and the labels did not line up with
# the classes 0, 1, 2 that are counted in later cells.
# Encode the three outcomes explicitly: 0 = away win, 1 = draw, 2 = home win.
matches['result'] = matches['FTR'].map({'A': 0, 'D': 1, 'H': 2})
matches

In [None]:

# Target labels; the same series drives the stratification below.
y = matches['result']

# Hold out 20% of the rows, stratified by label, with a fixed seed.
# NOTE(review): the features are the full-time goal counts, which the notebook
# also uses to decide who won — confirm this is the intended feature set.
X_train, X_test, y_train, y_test = train_test_split(
    matches[['FTHG', 'FTAG']],
    y,
    test_size=0.2,
    random_state=0,
    stratify=y,
)

print(X_train)
print(X_train.shape, y_train.shape)
print(X_test.shape, y_test.shape)

In [None]:
# (rows, columns) of the training features.
X_train.shape

In [None]:
# Display the training features.
X_train

In [None]:
# (rows, columns) of the test features.
X_test.shape

In [None]:
# Display the test features.
X_test

In [None]:
# Number of training labels.
y_train.shape

In [None]:
# Number of test labels.
y_test.shape

In [None]:
# Total number of matches, then a quarter of that, rounded.
total_data_count = len(matches)
# NOTE(review): the train/test split uses test_size=0.2, while this takes 25%
# of the rows — confirm which fraction is intended.
round(0.25 * total_data_count)

In [None]:
# Data type of each column in the match table.
matches.dtypes

In [None]:
# Per-class sample counts in the training labels.
# Iterate over the labels that actually occur in the data instead of a
# hard-coded range(3), which reports 0 for labels that do not exist and never
# shows a label outside 0..2. value_counts() also tallies every class in one pass.
train_counts = y_train.value_counts()
for label in sorted(matches['result'].dropna().unique()):
    print("Class -", label, ":", int(train_counts.get(label, 0)))

In [None]:
# Per-class sample counts in the test labels.
# Iterate over the labels that actually occur in the data instead of a
# hard-coded range(3), which reports 0 for labels that do not exist and never
# shows a label outside 0..2. value_counts() also tallies every class in one pass.
test_counts = y_test.value_counts()
for label in sorted(matches['result'].dropna().unique()):
    print("Class -", label, ":", int(test_counts.get(label, 0)))

In [None]:
# Number of distinct labels, then the sample count of each label over all matches.
uni = len(matches['result'].unique())

# range(uni) assumed the labels are exactly 0..uni-1; any other label set was
# counted as zeros. Walk the labels that are really present instead.
overall_counts = matches['result'].value_counts()
for label in sorted(matches['result'].dropna().unique()):
    print("Class -", label, ":", int(overall_counts.get(label, 0)))

In [None]:
# Column-wise extremes of the raw training features, for comparison with the scaled values.
train_min = X_train.min(axis=0)
train_max = X_train.max(axis=0)
print("per-feature minimum before scaling:\n {}".format(train_min))
print("per-feature maximum before scaling:\n {}".format(train_max))

In [None]:
# Baseline: k-nearest-neighbours with default settings on the unscaled features.
knn = KNeighborsClassifier().fit(X_train, y_train)

test_accuracy = knn.score(X_test, y_test)
print("Test set accuracy: {:.2f}".format(test_accuracy))

In [None]:
# Predict the test set and line the predictions up with the true labels,
# keyed by the original row index of each test sample.
preds = knn.predict(X_test)
combined = pd.DataFrame(
    {"actual": y_test, "predictions": preds},
    index=X_test.index,
)

In [None]:
# Actual vs. predicted label for each test row.
combined


In [None]:
# Weighted-average precision of the KNN predictions on the test labels.
# NOTE(review): this import sits mid-notebook; the other sklearn imports are in the first cell.
from sklearn.metrics import precision_score
precision_score(y_test,preds,average='weighted')


In [None]:
def rolling_averages(group, cols, new_cols, window=3):
    """Add trailing rolling means of ``cols`` to ``group`` under ``new_cols``.

    Rows are ordered by the ``date`` column. Each new value is the mean of the
    ``window`` rows immediately before the current row (``closed='left'``), so
    a row's own values never contribute to its own average.

    Parameters
    ----------
    group : pandas.DataFrame
        Must contain a ``date`` column and every column named in ``cols``.
        The input frame is not modified.
    cols : list of str
        Source columns to average.
    new_cols : list of str
        Names for the resulting columns, matched to ``cols`` by position.
    window : int, default 3
        Number of preceding rows in each average. The default reproduces the
        previously hard-coded window of 3.

    Returns
    -------
    pandas.DataFrame
        A date-sorted copy of ``group`` with ``new_cols`` added. Rows whose
        rolling mean is missing (such as the first ``window`` rows, which lack
        a full history) are dropped.
    """
    # NOTE(review): presumably `date` sorts chronologically; string dates would
    # be ordered lexicographically — confirm the column dtype at the call site.
    ordered = group.sort_values("date")
    rolling_stat = ordered[cols].rolling(window, closed='left').mean()
    ordered[new_cols] = rolling_stat
    return ordered.dropna(subset=new_cols)

In [None]:

scaler1 = MinMaxScaler()

# Learn the per-feature minimum and maximum from the training split only.
# The call used to be scaler1.fit(X_train, X_test): the second positional
# argument of fit() is `y`, which MinMaxScaler ignores, so X_test never took
# part in the fit, but the call read as if the scaler were fitted on test data.
scaler1.fit(X_train)

# Apply the training-derived scaling to both splits.
X_train_scaled = scaler1.transform(X_train)
X_test_scaled = scaler1.transform(X_test)


In [None]:
# Column-wise extremes of the scaled training features.
scaled_min = X_train_scaled.min(axis=0)
scaled_max = X_train_scaled.max(axis=0)
print("per-feature minimum after scaling:\n {}".format(scaled_min))
print("per-feature maximum after scaling:\n {}".format(scaled_max))

In [None]:
# Refit the same KNN estimator on the min-max-scaled features.
# This replaces the earlier fit on unscaled data, so from here on `knn`
# expects scaled inputs.
knn.fit(X_train_scaled, y_train)

scaled_accuracy = knn.score(X_test_scaled, y_test)
print("Scaled test set accuracy: {:.2f}".format(scaled_accuracy))

# The same accuracy, computed from explicit predictions.
preds2 = knn.predict(X_test_scaled)
acc2 = accuracy_score(y_test, preds2)


In [None]:
# Accuracy of the KNN model on the scaled test set, as computed by accuracy_score.
acc2

In [None]:
# KNN predictions for the scaled test set.
preds2

Support Vector Machine (SVM)

In [None]:

# Three support vector classifiers that differ only in their kernel.
svm = SVC(kernel='linear', C=1, random_state=0)
svm2 = SVC(kernel='rbf', C=1, random_state=0)
svm3 = SVC(kernel='poly', C=1, random_state=0)

# Fit each one on the scaled training data and report train/test accuracy,
# in the order linear, rbf, poly.
for classifier in (svm, svm2, svm3):
    classifier.fit(X_train_scaled, y_train)
    train_accuracy = classifier.score(X_train_scaled, y_train)
    test_accuracy = classifier.score(X_test_scaled, y_test)
    print('Accuracy of SVM classifier on training set: {:.2f}'.format(train_accuracy))
    print('Accuracy of SVM classifier on test set: {:.2f}'.format(test_accuracy))


Naive Bayes

In [None]:
# Gaussian Naive Bayes on the scaled features.
nb = GaussianNB().fit(X_train_scaled, y_train)

nb_train_accuracy = nb.score(X_train_scaled, y_train)
nb_test_accuracy = nb.score(X_test_scaled, y_test)
print('Accuracy of Naive Bayes classifier on training set: {:.2f}'.format(nb_train_accuracy))
print('Accuracy of Naive Bayes classifier on test set: {:.2f}'.format(nb_test_accuracy))


Decision Tree

In [None]:
# Decision tree with a fixed seed on the scaled features.
tree = DecisionTreeClassifier(random_state=0).fit(X_train_scaled, y_train)

tree_train_accuracy = tree.score(X_train_scaled, y_train)
tree_test_accuracy = tree.score(X_test_scaled, y_test)
print('Accuracy of Decision Tree classifier on training set: {:.2f}'.format(tree_train_accuracy))
print('Accuracy of Decision Tree classifier on test set: {:.2f}'.format(tree_test_accuracy))
