In [120]:
import pandas as pd
import numpy as np
import collections
import random
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn import metrics
from sklearn.metrics import f1_score, mean_squared_error

In [3]:
df = pd.read_csv('teams_matches_stats-2.csv')

In [4]:
df.drop('Unnamed: 0', axis = 1, inplace = True)

In [5]:
df.select_dtypes('number').columns

Index(['Attendance', 'Performance.3', 'Standard.5', 'Poss', 'Standard.4',
       'Standard.3', 'offside', 'crosses', 'fouls_drw', 'fouls_com', 'Int',
       'Tackles.1', 'Standard.1', 'Standard.2', 'Performance.2', 'Performance',
       'Performance.4', 'OwnGoals', 'Ast', 'Penalty Kicks.1',
       'Penalty Kicks.2', 'Penalty Kicks.3', 'Penalty Kicks', 'Standard.9',
       'Tkl+Int', 'sec_yel', 'red', 'yellow', 'Standard.8', 'GA', 'GF',
       'season', 'GT', 'Gdiff'],
      dtype='object')

В этом блоке построим линейную модель на преобразованных данных по примеру реализации в https://habr.com/ru/articles/456226/

Это лишь начальный пример того, как можно сделать, и от чего можно измерять качество дальнейших моделей. В ней показан примерный процесс того, как будут создаваться фичи для обучения линейной модели.

Для каждого матча статистики команды суммируются и из них вычитаются статистики противника - финальный вектор кладется в линейную регрессию, рассчитываем вероятности. Если вероятность выше 0,5 - команда выигрывает.

В данном случае Feature Engineering не лишен минусов - сумма показателей сильно зависит от числа проведенных матчей. Поэтому нужно рассмотреть другой способ аггрегации.

In [6]:
def GetSeasonTeamStat(team, season):
    goalScored = 0 #Голов забито
    goalAllowed = 0 #Голов пропущено

    gameWin = 0 #Выиграно
    gameDraw = 0 #Ничья
    gameLost = 0 #Проиграно

    totalScore = 0 #Количество набранных очков

    matches = 0 #Количество сыгранных матчей

    shot = 0 #Удары
    shotOnTarget = 0 #Удары в створ

    cross = 0 #Навесы

    Pass = 0 #Пасы


    for i in range(len(df)):
        if (df['season'][i] == season) and (df['team'][i] == team):
            matches += 1

            goalScored += df['GF'][i]
            goalAllowed += df['GA'][i]

            if (df['GF'][i] > df['GA'][i]):
                totalScore += 3
                gameWin += 1
            elif (df['GF'][i] < df['GA'][i]):
                gameLost +=1
            else:
                totalScore += 1
                gameDraw += 1


            shot += df['Standard.1'][i]
            shotOnTarget += df['Standard.1'][i]

            Pass += df['Poss'][i]

            cross += df['crosses'][i]

    return [gameWin, gameDraw, gameLost,
            goalScored, goalAllowed, totalScore,
            shot, shotOnTarget,
            Pass,
            cross]

In [7]:
num_features =  len(GetSeasonTeamStat("Barcelona", 2122))

In [8]:
teams = df['team'].unique()

In [9]:
#Функция, возвращающая статистику всех команд за сезон
def GetSeasonAllTeamStat(season):
    annual = collections.defaultdict(list)
    for team in teams:
        team_vector = GetSeasonTeamStat(team, season)
        annual[team] = team_vector
    return annual

In [10]:
def GetTrainingData(seasons):
    totalNumGames = 0
    for season in seasons:
        annual = df[df['season'] == season]
        totalNumGames += len(annual.index)
    xTrain = np.zeros(( totalNumGames, num_features))
    yTrain = np.zeros(( totalNumGames ))
    indexCounter = 0
    for season in seasons:
        team_vectors = GetSeasonAllTeamStat(season)
        annual = df[df['season'] == season]
        numGamesInYear = len(annual.index)
        xTrainAnnual = np.zeros(( numGamesInYear, num_features))
        yTrainAnnual = np.zeros(( numGamesInYear ))
        counter = 0
        for index, row in annual.iterrows():
            team = row['team']
            t_vector = team_vectors[team]
            rivals = row['opponent']
            r_vector = team_vectors[rivals]

            diff = [a - b for a, b in zip(t_vector, r_vector)]

            if len(diff) != 0:
                xTrainAnnual[counter] = diff
            if row['result'] == 'W':
                yTrainAnnual[counter] = 1
            else:
                yTrainAnnual[counter] = 0
            counter += 1
        xTrain[indexCounter:numGamesInYear+indexCounter] = xTrainAnnual
        yTrain[indexCounter:numGamesInYear+indexCounter] = yTrainAnnual
        indexCounter += numGamesInYear
    return xTrain, yTrain

In [11]:
seasons = df['season'].unique()

In [12]:
xTrain, yTrain = GetTrainingData(seasons)

In [15]:
model = LinearRegression()
model.fit(xTrain, yTrain)

In [16]:
def createGamePrediction(team1_vector, team2_vector):
    diff = [[a - b for a, b in zip(team1_vector, team2_vector)]]
    predictions = model.predict(diff)
    return predictions

In [17]:
team1_name = "Barcelona"
team2_name = "Real Madrid"

team1_vector = GetSeasonTeamStat(team1_name, 2324)
team2_vector = GetSeasonTeamStat(team2_name, 2324)

print ('Вероятность, что выиграет ' + team1_name + ':', createGamePrediction(team1_vector, team2_vector))
print ('Вероятность, что выиграет ' + team2_name + ':', createGamePrediction(team2_vector, team1_vector))

Вероятность, что выиграет Barcelona: [0.36527014]
Вероятность, что выиграет Real Madrid: [0.56258158]


In [18]:
ypred = []

In [19]:
names = df[(df['team'] == 'Barcelona')&(df['season'] == 2324)]['opponent']
for team_name in names:
    team1_name = "Barcelona"
    team2_name = team_name

    if(team1_name != team2_name):
        team1_vector = GetSeasonTeamStat(team1_name, 2324)
        team2_vector = GetSeasonTeamStat(team2_name, 2324)

        print(team1_name, createGamePrediction(team1_vector, team2_vector), " - ", team2_name, createGamePrediction(team2_vector, team1_vector,))
        if createGamePrediction(team1_vector, team2_vector)[0] > 0.5:
          ypred.append(1)
        else:
          ypred.append(0)

Barcelona [0.52425756]  -  Antwerp [0.40359416]
Barcelona [0.52425756]  -  Porto [0.40359416]
Barcelona [0.52425756]  -  Shakhtar [0.40359416]
Barcelona [0.52425756]  -  Shakhtar [0.40359416]
Barcelona [0.52425756]  -  Porto [0.40359416]
Barcelona [0.52425756]  -  Antwerp [0.40359416]
Barcelona [0.52425756]  -  Barbastro [0.40359416]
Barcelona [0.53734979]  -  Osasuna [0.39050193]
Barcelona [0.36527014]  -  Real Madrid [0.56258158]
Barcelona [0.52425756]  -  Unionistas Sal [0.40359416]
Barcelona [0.43021674]  -  Athletic Club [0.49763498]
Barcelona [0.52339296]  -  Napoli [0.40445875]
Barcelona [0.52339296]  -  Napoli [0.40445875]
Barcelona [0.43234573]  -  Paris S-G [0.49550598]
Barcelona [0.43234573]  -  Paris S-G [0.49550598]


In [None]:
ypred

[1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 1, 0, 0]

In [20]:
ytrue = list(map((lambda i: 1 if i == 'W' else 0), df[(df['team'] == 'Barcelona')&(df['season'] == 2324)]['result']))

In [21]:
f1_score(ytrue,ypred)

0.8

F1 = 0,8.

In [None]:
df[(df['team'] == 'Barcelona')&(df['season'] == 2324)][['team','opponent','result']]

Unnamed: 0,team,opponent,result
892,Barcelona,Antwerp,W
893,Barcelona,Porto,W
894,Barcelona,Shakhtar,W
895,Barcelona,Shakhtar,L
896,Barcelona,Porto,W
897,Barcelona,Antwerp,L
898,Barcelona,Barbastro,W
899,Barcelona,Osasuna,W
900,Barcelona,Real Madrid,L
901,Barcelona,Unionistas Sal,W


Больше тест:

ТЕСТ 2 - Добавляем данные

In [None]:
df.select_dtypes('number').columns

Index(['Attendance', 'Performance.3', 'Standard.5', 'Poss', 'Standard.4',
       'Standard.3', 'offside', 'crosses', 'fouls_drw', 'fouls_com', 'Int',
       'Tackles.1', 'Standard.1', 'Standard.2', 'Performance.2', 'Performance',
       'Performance.4', 'OwnGoals', 'Ast', 'Penalty Kicks.1',
       'Penalty Kicks.2', 'Penalty Kicks.3', 'Penalty Kicks', 'Standard.9',
       'Tkl+Int', 'sec_yel', 'red', 'yellow', 'Standard.8', 'GA', 'GF',
       'season', 'GT', 'Gdiff'],
      dtype='object')

In [None]:
def GetSeasonTeamStat1(team, season):
    goalScored = 0 #Голов забито
    goalAllowed = 0 #Голов пропущено

    gameWin = 0 #Выиграно
    gameDraw = 0 #Ничья
    gameLost = 0 #Проиграно

    totalScore = 0 #Количество набранных очков

    matches = 0 #Количество сыгранных матчей

    shot = 0 #Удары
    shotOnTarget = 0 #Удары в створ

    cross = 0 #Навесы
    accurateCross = 0 #Точные навесы

    Pass = 0 #Пасы
    Save = 0


    for i in range(len(df)):
        if (df['season'][i] == season) and (df['team'][i] == team):
            matches += 1

            goalScored += df['GF'][i]
            goalAllowed += df['GA'][i]

            if (df['GF'][i] > df['GA'][i]):
                totalScore += 3
                gameWin += 1
            elif (df['GF'][i] < df['GA'][i]):
                gameLost +=1
            else:
                totalScore += 1
                gameDraw += 1


            shot += df['Standard.1'][i]
            shotOnTarget += df['Standard.1'][i]

            Pass += df['Poss'][i]

            cross += df['crosses'][i]

            #Save += df['Performance.3']
    if matches != 0:
          gameWin = gameWin/matches
          gameDraw = gameDraw/matches
          gameLost = gameLost/matches
          goalScored = goalScored/matches
          goalAllowed = goalAllowed/matches
          #totalScore = totalScore/matches
          shot = shot/matches
          shotOnTarget = shotOnTarget/matches
          Pass = Pass/matches
          cross = cross/matches

    return [gameWin, gameDraw, gameLost,
            goalScored, goalAllowed, totalScore,
            shot, shotOnTarget,
            Pass,
            cross]#, Save]

In [None]:
num_features =  len(GetSeasonTeamStat1("Barcelona", 2122))

In [None]:
def GetSeasonAllTeamStat1(season):
    annual = collections.defaultdict(list)
    for team in teams:
        team_vector = GetSeasonTeamStat1(team, season)
        annual[team] = team_vector
    return annual

In [None]:
def GetTrainingData1(seasons):
    totalNumGames = 0
    for season in seasons:
        annual = df[df['season'] == season]
        totalNumGames += len(annual.index)
    xTrain = np.zeros(( totalNumGames, num_features))
    yTrain = np.zeros(( totalNumGames ))
    indexCounter = 0
    for season in seasons:
        team_vectors = GetSeasonAllTeamStat1(season)
        annual = df[df['season'] == season]
        numGamesInYear = len(annual.index)
        xTrainAnnual = np.zeros(( numGamesInYear, num_features))
        yTrainAnnual = np.zeros(( numGamesInYear ))
        counter = 0
        for index, row in annual.iterrows():
            team = row['team']
            t_vector = team_vectors[team]
            rivals = row['opponent']
            r_vector = team_vectors[rivals]

            diff = [a - b for a, b in zip(t_vector, r_vector)]

            if len(diff) != 0:
                xTrainAnnual[counter] = diff
            if row['result'] == 'W':
                yTrainAnnual[counter] = 1
            else:
                yTrainAnnual[counter] = 0
            counter += 1
        xTrain[indexCounter:numGamesInYear+indexCounter] = xTrainAnnual
        yTrain[indexCounter:numGamesInYear+indexCounter] = yTrainAnnual
        indexCounter += numGamesInYear
    return xTrain, yTrain

In [None]:
xTrain1, yTrain1 = GetTrainingData1(seasons)

In [None]:
model1 = LinearRegression()
model1.fit(xTrain1, yTrain1)

In [None]:
def createGamePrediction1(team1_vector, team2_vector):
    diff = [[a - b for a, b in zip(team1_vector, team2_vector)]]
    predictions = model1.predict(diff)
    return predictions

In [None]:
team1_name = "Barcelona"
team2_name = "Real Madrid"

team1_vector = GetSeasonTeamStat(team1_name, 2324)
team2_vector = GetSeasonTeamStat(team2_name, 2324)

print ('Вероятность, что выиграет ' + team1_name + ':', createGamePrediction1(team1_vector, team2_vector))
print ('Вероятность, что выиграет ' + team2_name + ':', createGamePrediction1(team2_vector, team1_vector))

Вероятность, что выиграет Barcelona: [-2.93917471]
Вероятность, что выиграет Real Madrid: [3.85683176]


**ЛОГИСТИЧЕСКАЯ РЕГРЕССИЯ**

In [None]:
df1 = df.drop(['game','time','GT','Gdiff','season','date','Referee','Captain','Formation', 'Opp Formation', 'GA','GF','OwnGoals'], axis = 1)

In [None]:
df1.columns[:50]

Index(['Attendance', 'Performance.3', 'Standard.5', 'Poss', 'Standard.4',
       'Standard.3', 'offside', 'crosses', 'fouls_drw', 'fouls_com', 'Int',
       'Tackles.1', 'Standard.1', 'Standard.2', 'Performance.2', 'Performance',
       'Performance.4', 'Ast', 'Penalty Kicks.1', 'Penalty Kicks.2',
       'Penalty Kicks.3', 'Penalty Kicks', 'Standard.9', 'Tkl+Int', 'sec_yel',
       'red', 'yellow', 'Standard.8', 'result', 'team', 'day', 'venue',
       'opponent'],
      dtype='object')

In [None]:
df1.select_dtypes('object').head(10)

Unnamed: 0,result,team,day,venue,opponent
0,W,Ajaccio,Sun,Away,Jura Sud Foot
1,L,Ajaccio,Sat,Away,Toulouse
2,W,Alavés,Tue,Away,Getafe
3,W,Alavés,Thu,Home,Getafe
4,W,Alavés,Wed,Away,SD Formentera
5,W,Alavés,Wed,Home,SD Formentera
6,L,Alavés,Wed,Away,Valencia
7,W,Alavés,Wed,Home,Valencia
8,D,Alavés,Wed,Home,Girona
9,L,Alavés,Wed,Away,Girona


In [None]:
df1['result'] = list(map((lambda i: 1 if i == 'W' else 0), df1['result']))

In [None]:
df1.select_dtypes('object').head(10)

Unnamed: 0,team,day,venue,opponent
0,Ajaccio,Sun,Away,Jura Sud Foot
1,Ajaccio,Sat,Away,Toulouse
2,Alavés,Tue,Away,Getafe
3,Alavés,Thu,Home,Getafe
4,Alavés,Wed,Away,SD Formentera
5,Alavés,Wed,Home,SD Formentera
6,Alavés,Wed,Away,Valencia
7,Alavés,Wed,Home,Valencia
8,Alavés,Wed,Home,Girona
9,Alavés,Wed,Away,Girona


In [None]:
data = pd.get_dummies(df1,prefix=['team','day','venue','opponent'], columns = ['team','day','venue','opponent'], drop_first=True)

In [None]:
data.columns

Index(['Attendance', 'Performance.3', 'Standard.5', 'Poss', 'Standard.4',
       'Standard.3', 'offside', 'crosses', 'fouls_drw', 'fouls_com',
       ...
       'opponent_Zorya Luhansk', 'opponent_Zrinjski Mostar',
       'opponent_Zulte Waregem', 'opponent_Zürich', 'opponent_Épinal',
       'opponent_Étoile FC FSR', 'opponent_Östersund', 'opponent_Újpest',
       'opponent_Čukarički', 'opponent_Žalgiris'],
      dtype='object', length=933)

In [None]:
y = data['result']
x = data.drop('result', axis = 1)

In [None]:
X_train, X_test, y_train, y_test = train_test_split(x, y,
                                                    train_size=0.8,
                                                    random_state=125)

In [None]:
normalizer = StandardScaler()
scaler = normalizer.fit(X_train)
X_train_norm = scaler.transform(X_train)
X_test_norm = scaler.transform(X_test)


In [None]:
#сама модель
logreg = LogisticRegression()
logreg.fit(X_train_norm, y_train)


In [None]:
#Предсказание классов
y_pred = logreg.predict(X_test_norm)
y_pred

array([0, 0, 0, ..., 0, 0, 0])

In [None]:
#Предсказание вероятностей (для расчета бизнес-метрик)
np.set_printoptions(suppress=True)
y_pred_prob = logreg.predict_proba(X_test_norm)
y_pred_prob

array([[0.94164031, 0.05835969],
       [0.99986678, 0.00013322],
       [0.69160951, 0.30839049],
       ...,
       [0.99993715, 0.00006285],
       [0.99998997, 0.00001003],
       [0.9999972 , 0.0000028 ]])

In [None]:
#потенциально ничья
j = 0
for i in y_pred_prob:
  if i[0] >= 0.4 and i[0] <= 0.7:
    print(i)
    j += 1

[0.69160951 0.30839049]
[0.45203422 0.54796578]
[0.68778024 0.31221976]
[0.69598086 0.30401914]
[0.53832568 0.46167432]
[0.5922149 0.4077851]
[0.49445747 0.50554253]
[0.52637139 0.47362861]
[0.43114754 0.56885246]
[0.64605025 0.35394975]
[0.64875264 0.35124736]
[0.48535975 0.51464025]
[0.60036854 0.39963146]
[0.64399581 0.35600419]
[0.6301901 0.3698099]
[0.52170429 0.47829571]
[0.41760251 0.58239749]
[0.67815445 0.32184555]
[0.42449421 0.57550579]
[0.48049452 0.51950548]
[0.60660081 0.39339919]
[0.69329188 0.30670812]
[0.69246171 0.30753829]
[0.524359 0.475641]
[0.44587083 0.55412917]
[0.64617422 0.35382578]
[0.44993378 0.55006622]
[0.43915637 0.56084363]
[0.59687501 0.40312499]
[0.41788436 0.58211564]
[0.43924683 0.56075317]
[0.45955685 0.54044315]
[0.58710695 0.41289305]
[0.58582991 0.41417009]
[0.47178589 0.52821411]
[0.63985325 0.36014675]
[0.48886179 0.51113821]
[0.4857308 0.5142692]
[0.47876828 0.52123172]
[0.41434764 0.58565236]
[0.44577513 0.55422487]
[0.52805831 0.47194169]
[0

In [None]:
from sklearn.metrics import f1_score, mean_squared_error
f1_score(y_test, y_pred)

0.9020044543429844

In [None]:
logreg.get_params()

{'C': 1.0,
 'class_weight': None,
 'dual': False,
 'fit_intercept': True,
 'intercept_scaling': 1,
 'l1_ratio': None,
 'max_iter': 100,
 'multi_class': 'deprecated',
 'n_jobs': None,
 'penalty': 'l2',
 'random_state': None,
 'solver': 'lbfgs',
 'tol': 0.0001,
 'verbose': 0,
 'warm_start': False}

In [None]:
coefs = logreg.coef_.tolist()
indices = [i for i, x in enumerate(coefs[0]) if x == 0]

In [None]:
X_train.columns[indices]

Index(['opponent_AS La Châtaigneraie', 'opponent_Alès OL', 'opponent_Andratx',
       'opponent_Atl Sanluqueño', 'opponent_Badajoz', 'opponent_Bourges',
       'opponent_CD Anaitasuna', 'opponent_CD Cantolagua',
       'opponent_CD Extremadura', 'opponent_CD L'Alcora',
       'opponent_CD Marchamalo', 'opponent_CD Ribadumia',
       'opponent_CE Cardassar', 'opponent_CFJ Mollerussa',
       'opponent_Cheltenham', 'opponent_Chesterfield', 'opponent_Chorley',
       'opponent_Ciudad de Lucena', 'opponent_Compostela', 'opponent_Cornellà',
       'opponent_Dinan-Léhon', 'opponent_Drochtersen/A', 'opponent_Eldense',
       'opponent_FC Loon-Plage', 'opponent_GFA Rumilly Vallières',
       'opponent_Gimnàstic', 'opponent_Hastedt', 'opponent_IC Croix',
       'opponent_Juventud Torremolinos CF', 'opponent_Kryvbas Kryvyi Rih',
       'opponent_Levski Sofia', 'opponent_Limonest', 'opponent_Llagostera',
       'opponent_Lokomotivi Tbilisi', 'opponent_Lübeck',
       'opponent_Macclesfield', 'opp

In [None]:
coefs

[[-0.11685991788171499,
  0.5203078953224193,
  0.5757200053235582,
  -0.30965395995889056,
  0.6954900342639321,
  0.018592029969117334,
  -0.02221764718631012,
  -0.3216543828008999,
  -0.008894316645031057,
  -0.1700848717069594,
  0.026604649394529008,
  -0.015457924299312795,
  0.29986733851601904,
  0.6266842193744201,
  1.5042210168787282,
  -2.4101323495570086,
  0.6758737068399401,
  -0.15510369218347644,
  0.7034124535089434,
  -0.19185812958125872,
  0.07736342296052887,
  0.03270886633423264,
  -0.10285160274439555,
  0.09310680085808634,
  -0.10312111856907875,
  -0.00665414134580316,
  -0.09686501144469636,
  0.05420787817237664,
  0.44063366371873636,
  -8.098262808172104,
  9.94801243853834,
  0.06090551832259725,
  -0.012517064996994905,
  0.13918538990883791,
  0.026411714584971394,
  -0.06899841507041701,
  0.05290399308418536,
  0.003064109119271148,
  -0.025489352678748946,
  0.03048731787693368,
  -0.048911203509631906,
  0.028823279765139705,
  -0.137120240382942

Коэффициенты не зануляются перед основными признаками.
Ради интереса попробуем выкинуть чего-нибудь с маленькими весами

In [None]:
data.columns[:48]

Index(['Attendance', 'Performance.3', 'Standard.5', 'Poss', 'Standard.4',
       'Standard.3', 'offside', 'crosses', 'fouls_drw', 'fouls_com', 'Int',
       'Tackles.1', 'Standard.1', 'Standard.2', 'Performance.2', 'Performance',
       'Performance.4', 'OwnGoals', 'Ast', 'Penalty Kicks.1',
       'Penalty Kicks.2', 'Penalty Kicks.3', 'Penalty Kicks', 'Standard.9',
       'Tkl+Int', 'sec_yel', 'red', 'yellow', 'Standard.8', 'result', 'GA',
       'GF', 'team_Alavés', 'team_Almería', 'team_Amiens', 'team_Angers',
       'team_Arminia', 'team_Arsenal', 'team_Aston Villa', 'team_Atalanta',
       'team_Athletic Club', 'team_Atlético Madrid', 'team_Augsburg',
       'team_Auxerre', 'team_Barcelona', 'team_Bayern Munich',
       'team_Benevento', 'team_Betis'],
      dtype='object')

In [None]:
data.columns[[5,6,8,10,11]]

Index(['Standard.3', 'offside', 'fouls_drw', 'Int', 'Tackles.1'], dtype='object')

In [None]:
data1 = data

In [None]:
#data1 = data.drop(['Standard.3', 'offside', 'fouls_drw', 'Int', 'Tackles.1'], axis = 1)

In [None]:
y1 = data1['result']
x1 = data1.drop('result', axis = 1)
X_train1, X_test1, y_train1, y_test1 = train_test_split(x1, y1,
                                                    test_size=0.3,
                                                    random_state=0)
normalizer = StandardScaler()
scaler1 = normalizer.fit(X_train1)
X_train_norm1 = scaler1.transform(X_train1)
X_test_norm1 = scaler1.transform(X_test1)
logreg1 = LogisticRegression(penalty = 'l1', solver = 'liblinear')
logreg1.fit(X_train_norm1, y_train1)
y_pred1 = logreg1.predict(X_test_norm1)
y_pred1
f1_1 = f1_score(y_test1, y_pred1)
f1_1

0.9062385990514411

На стандартной модели:

Попробовала убрать Standard.3 (Shots on target %) - качество ухудшилось (на 0.002) - f1 = 0.9914163090128756

Качество улучшилось когда убрали offsides, crosses вместе! (на 0.002)

Поменяли тип модели на penalty = 'l1 и solver = 'liblinear' - качество сразу стало 1 (или близко к 1 при разных random_seed, больше чем 0,9994)

In [None]:
X_train1.columns[[6,7]]

Index(['Int', 'Tackles.1'], dtype='object')

In [None]:
logreg1.coef_[0][[6,7]]

array([-0.00995473, -0.09787671])

In [None]:
y_test.value_counts()

Unnamed: 0_level_0,count
result,Unnamed: 1_level_1
0,1042
1,907


3 ТЕСТ - другой feature engineering + log reg

In [146]:
df.sort_values(by = 'date', ignore_index = 'True', inplace = True)

In [147]:
data1 = df.drop([ 'Opp Formation', 'Formation', 'Captain', 'Referee','result','time','game', 'day', 'venue', 'opponent', 'season','GT', 'Gdiff'], axis = 1)

In [148]:
data1.sort_values(by = 'date', ignore_index = 'True', inplace = True)

In [113]:
# def GetFeatures(data):
#   data = data.sort_values(by = 'date', ignore_index = 'True')
#   features = []
#   for i in range(len(data)):
#     srez = data[(data['team']==data['team'][i])&(data['date'] < data['date'][i])]
#     if 0 < len(srez) < 10:
#       stats = srez.drop(['team','date'], axis = 1).sum()/len(srez)
#     elif len(srez) >= 10:
#       stats = srez.drop(['team','date'], axis = 1)[-10:].sum()/10
#     else:
#       srez = data[(data['team']==data['team'][i])&(data['date'] <= data['date'][i])]
#       stats = srez.drop(['team','date'], axis = 1).sum()

#     features.append(stats.values.tolist())
#   return features

In [143]:
#Статистика для одной команды за последние 10 матчей (показатели в среднем)
def getStats(team, date, df = data1):
    srez = df[(df['team']== team)&(df['date'] < date)]
    if 0 < len(srez) < 10:
      stats = srez.drop(['team','date'], axis = 1).sum()/len(srez)
    elif len(srez) >= 10:
      stats = srez.drop(['team','date'], axis = 1)[-10:].sum()/10
    else:
      srez = df[(df['team']== team)&(df['date'] <= date)]
      stats = srez.drop(['team','date'], axis = 1).sum()

    return stats.values.tolist()

In [205]:
getStats('Paris S-G','2024-04-16')

[41738.0,
 73.57000000000001,
 0.337,
 64.7,
 0.15100000000000002,
 45.84,
 2.7,
 12.2,
 11.8,
 10.1,
 7.7,
 8.8,
 16.6,
 7.5,
 3.5,
 4.3,
 0.4,
 0.0,
 2.3,
 0.0,
 0.0,
 0.0,
 0.0,
 0.3,
 9.9,
 0.0,
 0.0,
 1.0,
 0.2,
 0.8,
 2.9]

In [154]:
def GetTrain(data):
  features = []
  for i in range(len(data)):
    team1 = getStats(data['team'][i], data['date'][i])
    team2 = getStats(data['opponent'][i], data['date'][i])
    diff = [a - b for a, b in zip(team1, team2)]
    features.append(diff)
  return features

In [155]:
feat_data = GetTrain(df)

In [156]:
feat_data

[[31342.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  1.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  2.0,
  0.0,
  1.0,
  1.0],
 [21337.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  2.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  3.0,
  0.0,
  1.0,
  2.0],
 [32124.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  1.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  1.0],
 [14000.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  1.0,
  0.0,
  1.0,
  0.0,
  0.0,
  0.0,
  0.0,
  1.0,
  0.0,
  0.0,
  0.0,
  1.0,
  0.0,
  0.0,
  1.0],
 [26783.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0

In [206]:
x = pd.DataFrame(feat_data, columns = data1.drop(['team','date'], axis = 1).columns)

In [207]:
#С добавлением venue показатель F1 вырос на 0,02 - оставляем в датасете
table = pd.concat([x, df['venue']], axis = 1)

In [209]:
data = pd.get_dummies(table,prefix=['venue'], columns = ['venue'], drop_first=True)

In [211]:
y = list(map((lambda i: 1 if i == 'W' else 0), df['result']))

In [212]:
X_train, X_test, y_train, y_test = train_test_split(data, y,
                                                    train_size=0.8,
                                                    random_state=125)

In [213]:
normalizer = StandardScaler()
scaler = normalizer.fit(X_train)
X_train_norm = scaler.transform(X_train)
X_test_norm = scaler.transform(X_test)

In [214]:
logreg = LogisticRegression()
logreg.fit(X_train_norm, y_train)

In [215]:
y_pred = logreg.predict(X_test_norm)
y_pred

array([0, 0, 0, ..., 1, 1, 1])

In [216]:
#Предсказание вероятностей (для расчета бизнес-метрик)
np.set_printoptions(suppress=True)
y_pred_prob = logreg.predict_proba(X_test_norm)
y_pred_prob

array([[0.5171447 , 0.4828553 ],
       [0.54215754, 0.45784246],
       [0.58252593, 0.41747407],
       ...,
       [0.29052432, 0.70947568],
       [0.31138752, 0.68861248],
       [0.40632686, 0.59367314]])

In [217]:
f1_score(y_test, y_pred)

0.6008279124778237

In [218]:
#Здесь я делаю другую спецификацию для модели, но результат остается тот же самый

logreg1 = LogisticRegression(penalty = 'l1', solver = 'liblinear')
logreg1.fit(X_train_norm, y_train)
y_pred1 = logreg1.predict(X_test_norm)
y_pred1
f1_1 = f1_score(y_test, y_pred)
f1_1

0.6008279124778237

Пробуем обучить модель на полном датасете и после взять в качестве теста часть этого же датасета

In [219]:
normalizer = StandardScaler()
scaler = normalizer.fit(data)
data_norm = scaler.transform(data)

In [224]:
data_norm

array([[ 1.29056843, -0.48235779, -0.40144257, ...,  0.45135211,
         1.06238355, -0.1344379 ],
       [ 0.76583048, -0.48235779, -0.40144257, ...,  1.32715388,
         1.06238355, -0.1344379 ],
       [ 1.33158243, -0.48235779, -0.40144257, ...,  0.45135211,
         1.06238355, -0.1344379 ],
       ...,
       [ 0.33452787,  1.45149838,  0.71022917, ...,  0.53893229,
        -0.94127964, -0.1344379 ],
       [ 0.35172693,  0.70157073,  0.30526303, ...,  1.0769248 ,
        -0.94127964, -0.1344379 ],
       [ 3.06240997,  1.83718586,  0.52680333, ...,  0.45135211,
        -0.94127964, -0.1344379 ]])

In [220]:
logreg3 = LogisticRegression()
logreg3.fit(data_norm, y)


In [225]:
y_pred2 = logreg3.predict(X_test_norm)
f1_2 = f1_score(y_test, y_pred2)
f1_2

0.6015393724097099