In [1]:
!pip install catboost

import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, classification_report
from catboost import CatBoostClassifier
from imblearn.under_sampling import RandomUnderSampler
import matplotlib.pyplot as plt

Collecting catboost
  Downloading catboost-1.2.2-cp310-cp310-win_amd64.whl (101.0 MB)
     -------------------------------------- 101.0/101.0 MB 2.9 MB/s eta 0:00:00
Collecting graphviz
  Downloading graphviz-0.20.1-py3-none-any.whl (47 kB)
     ---------------------------------------- 47.0/47.0 kB ? eta 0:00:00
Installing collected packages: graphviz, catboost
Successfully installed catboost-1.2.2 graphviz-0.20.1


In [5]:
Dataset = pd.read_csv('all_players.csv')  # Замени на путь к файлу
print(Dataset.head())

   Unnamed: 0             Name     Nation             Club Position  Age  \
0           0    Kylian Mbappé     France         Paris SG       ST   24   
1           1   Erling Haaland     Norway  Manchester City       ST   23   
2           2  Kevin De Bruyne    Belgium  Manchester City       CM   32   
3           3     Lionel Messi  Argentina   Inter Miami CF       CF   36   
4           4    Karim Benzema     France       Al Ittihad       CF   35   

   Overall  Pace  Shooting  Passing  ...  Strength  Aggression  Att work rate  \
0       91    97        90       80  ...        77          64           High   
1       91    89        93       66  ...        93          87           High   
2       91    72        88       94  ...        74          75           High   
3       90    80        87       90  ...        68          44            Low   
4       90    79        88       83  ...        82          63         Medium   

   Def work rate  Preferred foot  Weak foot  Skill moves

In [6]:
label_encoder = LabelEncoder()
Dataset['Preferred foot Encoded'] = label_encoder.fit_transform(Dataset['Preferred foot'])
print(Dataset[['Preferred foot', 'Preferred foot Encoded']].head(10))

  Preferred foot  Preferred foot Encoded
0          Right                       1
1           Left                       0
2          Right                       1
3           Left                       0
4          Right                       1
5           Left                       0
6          Right                       1
7          Right                       1
8           Left                       0
9          Right                       1


In [7]:
non_numeric_columns = Dataset.select_dtypes(exclude=['number']).columns.tolist()
Dataset = Dataset.drop(columns=non_numeric_columns)
Dataset = Dataset.drop("GK", axis=1)
Dataset.info()
print(Dataset)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 17326 entries, 0 to 17325
Data columns (total 38 columns):
 #   Column                  Non-Null Count  Dtype
---  ------                  --------------  -----
 0   Unnamed: 0              17326 non-null  int64
 1   Age                     17326 non-null  int64
 2   Overall                 17326 non-null  int64
 3   Pace                    17326 non-null  int64
 4   Shooting                17326 non-null  int64
 5   Passing                 17326 non-null  int64
 6   Dribbling               17326 non-null  int64
 7   Defending               17326 non-null  int64
 8   Physicality             17326 non-null  int64
 9   Acceleration            17326 non-null  int64
 10  Sprint                  17326 non-null  int64
 11  Positioning             17326 non-null  int64
 12  Finishing               17326 non-null  int64
 13  Shot                    17326 non-null  int64
 14  Long                    17326 non-null  int64
 15  Volleys            

In [8]:
X = Dataset.drop(columns=['Preferred foot Encoded'])
y = Dataset['Preferred foot Encoded']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.4, random_state=42)
under_sampler = RandomUnderSampler(random_state=42)

X_resampled, y_resampled = under_sampler.fit_resample(X_train, y_train)
# Параметры для настройки CatBoost
catboost_params = {
    'iterations': [100, 200, 300],
    'learning_rate': [0.01, 0.05, 0.1],
    'depth': [4, 6, 8]
}

# Создание модели CatBoost
catboost_model = CatBoostClassifier(verbose=False)

# Подбор лучших параметров с помощью GridSearchCV
catboost_grid_search = GridSearchCV(catboost_model, catboost_params, cv=5)
catboost_grid_search.fit(X_resampled, y_resampled)

# Лучшие параметры
best_catboost_params = catboost_grid_search.best_params_
print(f"Лучшие параметры для CatBoost: {best_catboost_params}")

Лучшие параметры для CatBoost: {'depth': 8, 'iterations': 300, 'learning_rate': 0.05}


In [9]:
# Переобучение модели с лучшими параметрами на всем тренировочном наборе данных
best_catboost_model = CatBoostClassifier(**best_catboost_params, verbose=False)
best_catboost_model.fit(X_resampled, y_resampled)

# Предсказание на тестовых данных
y_pred_catboost = best_catboost_model.predict(X_test)

# Оценка точности
accuracy_catboost = accuracy_score(y_test, y_pred_catboost)
print(f"Точность CatBoost на тестовых данных: {accuracy_catboost}")

# Отчет о классификации для CatBoost
print("Classification Report для CatBoost:")
print(classification_report(y_test, y_pred_catboost))

Точность CatBoost на тестовых данных: 0.6522868272976482
Classification Report для CatBoost:
              precision    recall  f1-score   support

           0       0.36      0.63      0.46      1643
           1       0.85      0.66      0.74      5288

    accuracy                           0.65      6931
   macro avg       0.61      0.64      0.60      6931
weighted avg       0.74      0.65      0.68      6931

