In [13]:
import pandas as pd
import numpy as np

from sklearn.preprocessing import MinMaxScaler 
from sklearn.metrics import mean_squared_error,mean_absolute_error, r2_score
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score

df = pd.read_csv("new_match.csv")

# Предобработка даты
df['date'] = pd.to_datetime(df['date'], errors='coerce')

# Определение результата матчей для домашних и гостевых команд
df['home_win/not'] = df.apply(
    lambda x: 1 if x['home_team_goal'] > x['away_team_goal'] else (0 if x['home_team_goal'] < x['away_team_goal'] else 1),
    axis=1
)
df['away_win/not'] = df.apply(
    lambda x: 1 if x['home_team_goal'] < x['away_team_goal'] else (0 if x['home_team_goal'] > x['away_team_goal'] else 1),
    axis=1
)

# Создаём обобщённый DataFrame для всех матчей
df_combined = pd.concat([
    df[['date', 'home_team_goal', 'home_win/not']].rename(
        columns={'home_team_goal': 'team', 'home_win/not': 'result'}
    ),
    df[['date', 'away_team_api_id', 'away_win/not']].rename(
        columns={'away_team_api_id': 'team', 'away_win/not': 'result'}
    )
], ignore_index=True).sort_values(by=['team', 'date'], ascending=[True, False])


# Группировка команд по 6 последним матчам
last_five_matches = df_combined.groupby('team').head(6)
last_five_matches = last_five_matches.reset_index(drop=True)


#Создание первой части с последним матчем
part1 = df_combined.groupby('team').head(1)

# Создание второй части со 2го по шестой матчи
part2 = df_combined.groupby('team').apply(lambda x: x.head(6) if len(x) >= 6 else x)

# Сброс индексов
part1 = part1.reset_index(drop=True)
part2 = part2.reset_index(drop=True)


new_match_df = pd.DataFrame(df, columns=['date', 'home_team_api_id', 'away_team_api_id', 'home_win/not', 'away_win/not'])

result_sum = part2.groupby('team')['result'].sum().reset_index()




# Объединение данных по уникальному идентификатору команды
merged_data = pd.merge(new_match_df, result_sum, left_on='home_team_api_id', right_on='team')
merged_data_total = pd.merge(merged_data, result_sum, left_on='away_team_api_id', right_on='team')



merged_data_total['home_superiority/not'] = merged_data_total.apply(
    lambda x: 0 if x['result_x'] < x['result_y'] else 1, axis=1)

merged_data_total['away_superiority/not'] = merged_data_total.apply(
    lambda x: 0 if x['result_x'] > x['result_y'] else 1, axis=1)

# Оставлем только нужные столбцы в DataFrame
merged_data_total.drop(['team_x'], axis=1, inplace=True)
merged_data_total.drop(['team_y'], axis=1, inplace=True)
merged_data_total.drop(['result_x'], axis=1, inplace=True)
merged_data_total.drop(['result_y'], axis=1, inplace=True)

display(merged_data_total)

Unnamed: 0,date,home_team_api_id,away_team_api_id,home_win/not,away_win/not,home_superiority/not,away_superiority/not
0,2015-07-24,9997,8342,1,0,0,1
1,2015-12-26,8571,8342,0,1,0,1
2,2016-02-28,9987,8342,1,0,1,1
3,2016-02-14,8573,8342,0,1,0,1
4,2015-08-22,10000,8342,1,0,0,1
...,...,...,...,...,...,...,...
3321,2015-09-23,9824,10199,0,1,0,1
3322,2015-07-25,10192,10199,1,1,1,0
3323,NaT,10192,10199,1,0,1,0
3324,2015-08-29,7896,10199,0,1,1,1


In [14]:
# Анализ для home_team

merged_data_total['home_win/not'] = merged_data_total['home_win/not'].replace({1.0: 1, 0.0: 0})

log_reg = LogisticRegression()
X = merged_data_total[['home_superiority/not']]
Y=merged_data_total['home_win/not']
lab = LabelEncoder()
Y = lab.fit_transform(Y)

test_split_index = int(X.shape[0]*0.75)
X_train = X[:test_split_index]
X_test = X[test_split_index:]
Y_train = Y[:test_split_index]
Y_test = Y[test_split_index:]

log_reg.fit(X_train, Y_train)
Y_predicted = log_reg.predict(X_test)

print('acc:%.2f'% accuracy_score(Y_test,Y_predicted))
print('prec:%.2f'% precision_score(Y_test,Y_predicted)) 
print('rec:%.2f' % recall_score(Y_test,Y_predicted)) 

acc:0.69
prec:0.69
rec:1.00


In [16]:
merged_data_total['away_win/not'] = merged_data_total['away_win/not'].replace({1.0: 1, 0.0: 0})

# Анализ для away_team

log_reg = LogisticRegression()
X = merged_data_total[['away_superiority/not']]
Y=merged_data_total['away_win/not']
lab = LabelEncoder()
Y = lab.fit_transform(Y)

test_split_index = int(X.shape[0]*0.75)
X_train = X[:test_split_index]
X_test = X[test_split_index:]
Y_train = Y[:test_split_index]
Y_test = Y[test_split_index:]

log_reg.fit(X_train, Y_train)
Y_predicted = log_reg.predict(X_test)

print('acc:%.2f'% accuracy_score(Y_test,Y_predicted))
print('prec:%.2f'% precision_score(Y_test,Y_predicted)) 
print('rec:%.2f' % recall_score(Y_test,Y_predicted)) 

acc:0.62
prec:0.64
rec:0.69
