In [96]:
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import pandas as pd

from sklearn.feature_selection import f_classif, chi2, mutual_info_classif, SelectKBest, SelectPercentile
from sklearn.ensemble import BaggingClassifier
from sklearn.preprocessing import StandardScaler, MinMaxScaler, OneHotEncoder, OrdinalEncoder, TargetEncoder, MaxAbsScaler
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import Lasso
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import f1_score
from sklearn.feature_selection import SelectKBest, f_classif, RFE
from sklearn.preprocessing import PolynomialFeatures
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline, make_pipeline

from OutliersCorrection import OutliersIQRDeleter, OutliersLOFDeleter, OutliersIQRCorrector, OutliersLOFCorrector

## Сбор данных

In [73]:
df = pd.read_csv('teams_matches_stats-2.csv')
df.drop('Unnamed: 0', axis = 1, inplace = True)
df.sort_values(by = 'date', ignore_index = 'True', inplace = True)
y = list(map((lambda i: 1 if i == 'W' else 0), df['result']))
data = df.drop([ 'Opp Formation', 'Formation', 'Captain', 'Referee','result','time','game', 'day', 'season','GT', 'Gdiff'], axis = 1)

In [38]:
def getStats(team, date, df = data):
    srez = df[(df['team']== team)&(df['date'] < date)]
    if 0 < len(srez) < 10:
      stats = srez.drop(['team','date', 'opponent', 'venue'], axis = 1).sum()/len(srez)
    elif len(srez) >= 10:
      stats = srez.drop(['team','date', 'opponent', 'venue'], axis = 1)[-10:].sum()/10
    else:
      srez = df[(df['team']== team)&(df['date'] <= date)]
      stats = srez.drop(['team','date', 'opponent', 'venue'], axis = 1).sum()

    return stats.values.tolist()

In [51]:
def GetTrain(data):
  features = []
  for i in range(len(data)):
    team1 = getStats(data['team'][i], data['date'][i])
    team2 = getStats(data['opponent'][i], data['date'][i])
    diff = [a - b for a, b in zip(team1, team2)]
    features.append(diff)
  return features

In [66]:
feat_data = GetTrain(data)
x = pd.DataFrame(feat_data, columns=data.drop(['team', 'date', 'opponent', 'venue'], axis=1).columns)
x = pd.concat([x, data['venue']], axis=1)

In [67]:
num_cols = x.dtypes[(x.dtypes == 'int64') | (x.dtypes == 'float64')].index.tolist()
cat_cols = x.dtypes[x.dtypes == 'object'].index.tolist()

## OHE + StandardScaler

In [None]:
X_train, X_test, y_train, y_test = train_test_split(x, y,
                                                    train_size=0.8,
                                                    random_state=125)
X_train, X_test = X_train.reset_index(drop=True), X_test.reset_index(drop=True)

In [127]:
enc = OneHotEncoder(handle_unknown='ignore',drop = 'first', sparse_output = False)
encd = enc.fit_transform(X_train[['venue']])
encd_test = enc.transform(X_test[['venue']])

one_hot_df = pd.DataFrame(encd, columns=enc.get_feature_names_out(['venue']))
one_hot_df_test = pd.DataFrame(encd_test, columns=enc.get_feature_names_out(['venue']))


X_train = pd.concat([X_train.drop(['venue'], axis = 1), one_hot_df], axis = 1)
X_test = pd.concat([X_test.drop(['venue'], axis = 1), one_hot_df_test], axis = 1)

In [130]:
normalizer = StandardScaler()
scaler = normalizer.fit(X_train)
X_train_norm = scaler.transform(X_train)
X_test_norm = scaler.transform(X_test)

In [131]:
model = BaggingClassifier()
model.fit(X_train, y_train)

In [132]:
print(f1_score(y_test, model.predict(X_test)))

0.5347137637028014


## OHE + MinMaxScaler

In [133]:
normalizer = MinMaxScaler()
scaler = normalizer.fit(X_train)
X_train_norm = scaler.transform(X_train)
X_test_norm = scaler.transform(X_test)

In [134]:
model = BaggingClassifier()
model.fit(X_train, y_train)

In [135]:
print(f1_score(y_test, model.predict(X_test)))

0.536144578313253


## OrdinalEnc + StandardScaler

In [136]:
X_train, X_test, y_train, y_test = train_test_split(x, y,
                                                    train_size=0.8,
                                                    random_state=125)
X_train, X_test = X_train.reset_index(drop=True), X_test.reset_index(drop=True)

In [137]:
enc = OrdinalEncoder()
encd = enc.fit_transform(X_train[['venue']])
encd_test = enc.transform(X_test[['venue']])

one_hot_df = pd.DataFrame(encd, columns=enc.get_feature_names_out(['venue']))
one_hot_df_test = pd.DataFrame(encd_test, columns=enc.get_feature_names_out(['venue']))


X_train = pd.concat([X_train.drop(['venue'], axis = 1), one_hot_df], axis = 1)
X_test = pd.concat([X_test.drop(['venue'], axis = 1), one_hot_df_test], axis = 1)

In [138]:
normalizer = StandardScaler()
scaler = normalizer.fit(X_train)
X_train_norm = scaler.transform(X_train)
X_test_norm = scaler.transform(X_test)

In [139]:
model = BaggingClassifier()
model.fit(X_train, y_train)

In [140]:
print(f1_score(y_test, model.predict(X_test)))

0.5315370483772198


## OrdincalEnc + MinMaxScaler

In [141]:
normalizer = MinMaxScaler()
scaler = normalizer.fit(X_train)
X_train_norm = scaler.transform(X_train)
X_test_norm = scaler.transform(X_test)

In [142]:
model = BaggingClassifier()
model.fit(X_train, y_train)

In [143]:
print(f1_score(y_test, model.predict(X_test)))

0.5226006191950464


## TargetEnc + StandardScaler

In [144]:
X_train, X_test, y_train, y_test = train_test_split(x, y,
                                                    train_size=0.8,
                                                    random_state=125)
X_train, X_test = X_train.reset_index(drop=True), X_test.reset_index(drop=True)

In [147]:
enc = TargetEncoder()
encd = enc.fit_transform(X_train[['venue']], y_train)
encd_test = enc.transform(X_test[['venue']])

one_hot_df = pd.DataFrame(encd, columns=enc.get_feature_names_out(['venue']))
one_hot_df_test = pd.DataFrame(encd_test, columns=enc.get_feature_names_out(['venue']))


X_train = pd.concat([X_train.drop(['venue'], axis = 1), one_hot_df], axis = 1)
X_test = pd.concat([X_test.drop(['venue'], axis = 1), one_hot_df_test], axis = 1)

In [148]:
normalizer = StandardScaler()
scaler = normalizer.fit(X_train)
X_train_norm = scaler.transform(X_train)
X_test_norm = scaler.transform(X_test)

In [149]:
model = BaggingClassifier()
model.fit(X_train, y_train)

In [150]:
print(f1_score(y_test, model.predict(X_test)))

0.5206874602164226


## TargetEnc + MinMaxScaler

In [151]:
normalizer = MinMaxScaler()
scaler = normalizer.fit(X_train)
X_train_norm = scaler.transform(X_train)
X_test_norm = scaler.transform(X_test)

In [152]:
model = BaggingClassifier()
model.fit(X_train, y_train)

In [153]:
print(f1_score(y_test, model.predict(X_test)))

0.529192546583851


## Подбор гиперпараметров лучшей модели

In [157]:
X_train, X_test, y_train, y_test = train_test_split(x, y,
                                                    train_size=0.8,
                                                    random_state=125)
X_train, X_test = X_train.reset_index(drop=True), X_test.reset_index(drop=True)

enc = OneHotEncoder(handle_unknown='ignore',drop = 'first', sparse_output = False)
encd = enc.fit_transform(X_train[['venue']])
encd_test = enc.transform(X_test[['venue']])

one_hot_df = pd.DataFrame(encd, columns=enc.get_feature_names_out(['venue']))
one_hot_df_test = pd.DataFrame(encd_test, columns=enc.get_feature_names_out(['venue']))


X_train = pd.concat([X_train.drop(['venue'], axis = 1), one_hot_df], axis = 1)
X_test = pd.concat([X_test.drop(['venue'], axis = 1), one_hot_df_test], axis = 1)

normalizer = MinMaxScaler()
scaler = normalizer.fit(X_train)
X_train_norm = scaler.transform(X_train)
X_test_norm = scaler.transform(X_test)

In [None]:
from sklearn.model_selection import cross_val_score
import optuna

def objective_bagging(trial):
    max_samples = trial.suggest_int("max_samples", 1, 100)
    n_estimators = trial.suggest_int("n_estimators", 1200, 2000)
    max_features = trial.suggest_int("max_features", 1, 32)
    bootstrap = trial.suggest_categorical("bootstrap", [True, False])

    model = BaggingClassifier(
        n_estimators=n_estimators,
        max_samples=max_samples,
        max_features=max_features,
        bootstrap=bootstrap
    )

    score = cross_val_score(model, X_train, y_train, cv=3, scoring="f1", n_jobs=-1).mean()
    return score

study = optuna.create_study(direction="maximize")
study.optimize(objective_bagging, n_trials=100)

[I 2025-03-16 01:28:08,029] A new study created in memory with name: no-name-c8ae25e2-84f0-46bb-bd0d-7ce7c323de14
[I 2025-03-16 01:28:10,869] Trial 0 finished with value: 0.5182575140538287 and parameters: {'max_samples': 60, 'n_estimators': 1284, 'max_features': 2, 'bootstrap': False}. Best is trial 0 with value: 0.5182575140538287.
[I 2025-03-16 01:28:13,720] Trial 1 finished with value: 0.6113934666264859 and parameters: {'max_samples': 81, 'n_estimators': 1320, 'max_features': 13, 'bootstrap': True}. Best is trial 1 with value: 0.6113934666264859.
[I 2025-03-16 01:28:16,156] Trial 2 finished with value: 0.5113301527171116 and parameters: {'max_samples': 63, 'n_estimators': 1337, 'max_features': 2, 'bootstrap': True}. Best is trial 1 with value: 0.6113934666264859.
[I 2025-03-16 01:28:17,729] Trial 3 finished with value: 0.519500990662253 and parameters: {'max_samples': 6, 'n_estimators': 1435, 'max_features': 10, 'bootstrap': True}. Best is trial 1 with value: 0.6113934666264859.
[

In [161]:
study.best_params, study.best_trial

({'max_samples': 100,
  'n_estimators': 1530,
  'max_features': 26,
  'bootstrap': False},
 FrozenTrial(number=78, state=TrialState.COMPLETE, values=[0.623590098392456], datetime_start=datetime.datetime(2025, 3, 16, 1, 31, 39, 793128), datetime_complete=datetime.datetime(2025, 3, 16, 1, 31, 42, 851747), params={'max_samples': 100, 'n_estimators': 1530, 'max_features': 26, 'bootstrap': False}, user_attrs={}, system_attrs={}, intermediate_values={}, distributions={'max_samples': IntDistribution(high=100, log=False, low=1, step=1), 'n_estimators': IntDistribution(high=2000, log=False, low=1200, step=1), 'max_features': IntDistribution(high=32, log=False, low=1, step=1), 'bootstrap': CategoricalDistribution(choices=(True, False))}, trial_id=78, value=None))

## Лучшая модель

In [162]:
model = BaggingClassifier(**study.best_params)
model.fit(X_train, y_train)
print(f1_score(y_test, model.predict(X_test)))

0.5964099594672843
