In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score
from xgboost import XGBClassifier
import matplotlib as plt
import joblib
import logging

In [8]:
logging.basicConfig(filename='model_logs.log', level=logging.INFO, format='%(asctime)s - %(message)s')
logging.info("Старт модели...")

In [None]:
df = pd.read_csv("football.csv",encoding='cp1251')
print(df)
df.columns = df.columns.str.strip()
print(df.columns.tolist())


         home_team   away_team weather neutral_ground     result
0          Арсенал         ПСЖ   clear              0  away_team
1          Спартак      Динамо   clear              0  home_team
2    Штурм Грац ll  Ферст Вена  cloudy              0  home_team
3        Барселона       Интер   clear              0       draw
4        Локомотив      Ростов   windy              0  away_team
..             ...         ...     ...            ...        ...
258            Уфа       Чайка  cloudy              0  away_team
259          Ротор  Черноморец   clear              0  away_team
260          Акрон   Краснодар  cloudy              0  away_team
261          Зенит       Факел  cloudy              0  home_team
262          Рубин        ЦСКА   clear              0       draw

[263 rows x 5 columns]
['home_team', 'away_team', 'weather', 'neutral_ground', 'result']


In [4]:
features = ['home_team', 'away_team', 'weather','neutral_ground']
target = 'result'

X = df[features]
y = df[target]

In [5]:
categorical_features = ['home_team', 'away_team', 'weather']
numeric_features = ['neutral_ground']
preprocessor = ColumnTransformer([
    ('cat', OneHotEncoder(handle_unknown='ignore', sparse_output=False), categorical_features),
    ('num', 'passthrough', numeric_features)
    ])

In [6]:
print(X.head())
print(X.columns)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

       home_team   away_team weather neutral_ground
0        Арсенал         ПСЖ   clear              0
1        Спартак      Динамо   clear              0
2  Штурм Грац ll  Ферст Вена  cloudy              0
3      Барселона       Интер   clear              0
4      Локомотив      Ростов   windy              0
Index(['home_team', 'away_team', 'weather', 'neutral_ground'], dtype='object')


In [9]:
rf_pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('classifier', RandomForestClassifier(n_estimators=200, random_state=42))
])

rf_pipeline.fit(X_train, y_train)
y_pred_rf = rf_pipeline.predict(X_test)
acc_rf = accuracy_score(y_test, y_pred_rf)
logging.info(f"Random Forest accuracy: {acc_rf:.4f}")

ValueError: could not convert string to float: 'away_team'

In [None]:
xgb_pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('classifier', XGBClassifier(n_estimators=200, use_label_encoder=False, eval_metric='mlogloss'))
])

xgb_pipeline.fit(X_train, y_train)
y_pred_xgb = xgb_pipeline.predict(X_test)
acc_xgb = accuracy_score(y_test, y_pred_xgb)
logging.info(f"XGBoost accuracy: {acc_xgb:.4f}")

In [None]:
plt.bar(['Random Forest', 'XGBoost'], [acc_rf, acc_xgb], color=['green', 'orange'])
plt.ylabel("Accuracy")
plt.title("Сравнение моделей")
plt.savefig("model_comparison.png")
plt.show()

In [None]:
joblib.dump(rf_pipeline, 'rf_model.pkl')
joblib.dump(preprocessor, 'preprocessor.pkl')
logging.info("Модель сохранена")

In [None]:
def predict_user_input(model, preprocessor):
    home = input("Домашняя команда: ")
    away = input("Гостевая команда: ")
    weather = input("Погода (clear, cloudy, rain, snow, hot): ")
    neutral = int(input("Нейтральное поле(Это значит, что обе команды выехали в другой город, и ни одна из них не играет дома)? (0 - нет, 1 - да): "))

    df_input = pd.DataFrame([{
        'home_team': home,
        'away_team': away,
        'weather': weather,
        'neutral_ground': neutral
    }])

    pred = model.predict(df_input)[0]
    proba = model.predict_proba(df_input)[0]
    labels = model.classes_
    proba_dict = dict(zip(labels, proba))
    
    logging.info(f"Прогноз: {pred}, вероятности: {proba_dict}")
    print(f"\nПобеда {pred.upper()} с вероятностью: {proba_dict[pred]:.2%}")



In [None]:
predict_user_input(rf_pipeline, preprocessor)