### Loading data (matches) from the backend api

In [186]:
import requests
import json

leagueId="6250d75e81afe4381753aade"
r = requests.get(f'http://localhost:3000/football/{leagueId}/all/matches')
json_matches=json.loads(r.text)

### Cleaning and filtering and preprocessing data

In [187]:
from datetime import datetime

json_data = []
for json_match in json_matches:
    clean_match = {}
    if(json_match['finished'] == False):
        continue
    clean_match['homeTeam'] = json_match['homeTeam']['name']
    clean_match['awayTeam'] = json_match['awayTeam']['name']
    clean_match['date']=datetime.strptime(json_match['date'],'%Y-%m-%dT%H:%M:%S.%fZ').toordinal()
    clean_match['homeTeamGoals'] = len(
        list(filter(lambda goal: goal['homeTeam'] == True, json_match['goals'])))
    clean_match['awayTeamGoals'] = len(
        list(filter(lambda goal: goal['homeTeam'] == False, json_match['goals'])))
    json_data.append(clean_match)



### Create targets 
- result in ['home','draw','away']
- over0.5 in [0,1]

In [188]:
result_target=[]    # contain match result ('home','draw','away') 
over05_target=[]    # contain if goals sum is over 0.5
over15_target=[]    # contain if goals sum is over 1.5
over25_target=[]    # contain if goals sum is over 2.5
over35_target=[]    # contain if goals sum is over 3.5
# this targets support double prediction example 1x 12 x2
home_target=[]      
draw_target=[]
away_target=[]

for match in json_data:
    if(match['homeTeamGoals'] > match['awayTeamGoals']):
        result_target.append('home')
        home_target.append(1)      
        draw_target.append(0)
        away_target.append(0)
    elif(match['homeTeamGoals'] < match['awayTeamGoals']):
        result_target.append('away')
        home_target.append(0)
        draw_target.append(0)
        away_target.append(1)
    else:
        home_target.append(0)
        draw_target.append(1)
        away_target.append(0)
        result_target.append('draw')
        
    goals_sum=match['homeTeamGoals'] + match['awayTeamGoals']
    if(goals_sum>0.5):
        over05_target.append(1)
    else:
        over05_target.append(0)
        
    if(goals_sum>1.5):
        over15_target.append(1)
    else:
        over15_target.append(0)
        
    if(goals_sum>2.5):
        over25_target.append(1)
    else:
        over25_target.append(0)
        
    if(goals_sum>3.5):
        over35_target.append(1)
    else:
        over35_target.append(0)

   
targets={
    'result':result_target,
    'over 0.5':over05_target,
    'over 1.5':over15_target,
    'over 2.5':over25_target,
    'over 3.5':over35_target,
    'home':home_target,
    'away':draw_target,
    'draw':away_target,
    
}

### Restructuring data to pandas Dataframe

In [189]:
import pandas as pd

data=[]
home_team = []
away_team = []
date=[]
for match in json_data:
    home_team.append(match['homeTeam'])
    away_team.append(match['awayTeam'])
    date.append(match['date'])

data = pd.DataFrame({
    'homeTeam': home_team, 
    'awayTeam': away_team,
    'date':date
    
})

print(data)




           homeTeam    awayTeam    date
0         barcelona  villarreal  738297
1     real-sociedad  atl-madrid  738297
2           sevilla  ath-bilbao  738297
3            alaves       cadiz  738297
4        granada-cf    espanyol  738297
...             ...         ...     ...
4555        osasuna     almeria  734013
4556  real-sociedad  villarreal  734013
4557        levante     sevilla  734012
4558         malaga    valencia  734012
4559       hercules  ath-bilbao  734012

[4560 rows x 3 columns]


### Convert categorical variable into dummy/indicator variables.

In [190]:

data=pd.get_dummies(data)

### Preprocessing :Normalize date column

In [191]:
from sklearn.preprocessing import MinMaxScaler


scaler = MinMaxScaler()
scaler.fit(data)
scaled = scaler.transform(data)
data = pd.DataFrame(scaled, columns=data.columns)

print(data)

          date  homeTeam_alaves  homeTeam_almeria  homeTeam_ath-bilbao  \
0     1.000000              0.0               0.0                  0.0   
1     1.000000              0.0               0.0                  0.0   
2     1.000000              0.0               0.0                  0.0   
3     1.000000              1.0               0.0                  0.0   
4     1.000000              0.0               0.0                  0.0   
...        ...              ...               ...                  ...   
4555  0.000233              0.0               0.0                  0.0   
4556  0.000233              0.0               0.0                  0.0   
4557  0.000000              0.0               0.0                  0.0   
4558  0.000000              0.0               0.0                  0.0   
4559  0.000000              0.0               0.0                  0.0   

      homeTeam_atl-madrid  homeTeam_barcelona  homeTeam_betis  homeTeam_cadiz  \
0                     0.0     

### Naive bayes clasiffier average score

In [193]:
from sklearn import naive_bayes
from sklearn.model_selection import train_test_split

def average_score(classifier, data, target):
    sum = 0
    cycles = 300

    for i in range(1, cycles):

        train_data, test_data, train_target, test_target = train_test_split(
            data, target, test_size=0.2)
        classifier.fit(train_data, train_target)
        sum += classifier.score(test_data, test_target)
        
    return sum/cycles



nb_clf = naive_bayes.MultinomialNB(fit_prior=True)

for target_label in targets:
    print(f'score for {target_label} : {round(average_score(nb_clf,data,targets[target_label]),2)}',)


score for result : 0.51
score for over 0.5 : 0.81
score for over 1.5 : 0.67
score for over 2.5 : 0.6
score for over 3.5 : 0.76
score for home : 0.68
score for away : 0.67
score for draw : 0.77
