In [4]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.naive_bayes import GaussianNB, BernoulliNB, MultinomialNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
from imblearn.under_sampling import RandomUnderSampler
import matplotlib.pyplot as plt

In [5]:
Dataset = pd.read_csv('all_players.csv')  # Замени на путь к файлу
print(Dataset.head())

   Unnamed: 0             Name     Nation             Club Position  Age  \
0           0    Kylian Mbappé     France         Paris SG       ST   24   
1           1   Erling Haaland     Norway  Manchester City       ST   23   
2           2  Kevin De Bruyne    Belgium  Manchester City       CM   32   
3           3     Lionel Messi  Argentina   Inter Miami CF       CF   36   
4           4    Karim Benzema     France       Al Ittihad       CF   35   

   Overall  Pace  Shooting  Passing  ...  Strength  Aggression  Att work rate  \
0       91    97        90       80  ...        77          64           High   
1       91    89        93       66  ...        93          87           High   
2       91    72        88       94  ...        74          75           High   
3       90    80        87       90  ...        68          44            Low   
4       90    79        88       83  ...        82          63         Medium   

   Def work rate  Preferred foot  Weak foot  Skill moves

In [6]:
label_encoder = LabelEncoder()
Dataset['Preferred foot Encoded'] = label_encoder.fit_transform(Dataset['Preferred foot'])
print(Dataset[['Preferred foot', 'Preferred foot Encoded']].head(10))

  Preferred foot  Preferred foot Encoded
0          Right                       1
1           Left                       0
2          Right                       1
3           Left                       0
4          Right                       1
5           Left                       0
6          Right                       1
7          Right                       1
8           Left                       0
9          Right                       1


In [10]:
non_numeric_columns = Dataset.select_dtypes(exclude=['number']).columns.tolist()
Dataset = Dataset.drop(columns=non_numeric_columns)
Dataset.info()
print(Dataset)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 17326 entries, 0 to 17325
Data columns (total 38 columns):
 #   Column                  Non-Null Count  Dtype
---  ------                  --------------  -----
 0   Unnamed: 0              17326 non-null  int64
 1   Age                     17326 non-null  int64
 2   Overall                 17326 non-null  int64
 3   Pace                    17326 non-null  int64
 4   Shooting                17326 non-null  int64
 5   Passing                 17326 non-null  int64
 6   Dribbling               17326 non-null  int64
 7   Defending               17326 non-null  int64
 8   Physicality             17326 non-null  int64
 9   Acceleration            17326 non-null  int64
 10  Sprint                  17326 non-null  int64
 11  Positioning             17326 non-null  int64
 12  Finishing               17326 non-null  int64
 13  Shot                    17326 non-null  int64
 14  Long                    17326 non-null  int64
 15  Volleys            

In [11]:
X = Dataset.drop(columns=['Preferred foot Encoded'])
y = Dataset['Preferred foot Encoded']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.4, random_state=42)
under_sampler = RandomUnderSampler(random_state=42)

X_resampled, y_resampled = under_sampler.fit_resample(X_train, y_train)

# Для решающего дерева
decision_tree = DecisionTreeClassifier()
dt_params = {'max_depth': [None, 5, 10, 15, 20]}  # Пример параметров для настройки

dt_grid_search = GridSearchCV(decision_tree, dt_params, cv=5)
dt_grid_search.fit(X_resampled, y_resampled)

y_pred_dt = dt_grid_search.predict(X_test)
accuracy_dt = accuracy_score(y_test, y_pred_dt)

print(f"Decision Tree - Лучшее значение параметра: {dt_grid_search.best_params_}, Точность на тесте: {accuracy_dt}")

# Для случайного леса
random_forest = RandomForestClassifier()
rf_params = {'n_estimators': [50, 100, 150, 200], 'max_depth': [None, 5, 10, 15]}  # Пример параметров для настройки

rf_grid_search = GridSearchCV(random_forest, rf_params, cv=5)
rf_grid_search.fit(X_resampled, y_resampled)

y_pred_rf = rf_grid_search.predict(X_test)
accuracy_rf = accuracy_score(y_test, y_pred_rf)

print(f"Random Forest - Лучшие значения параметров: {rf_grid_search.best_params_}, Точность на тесте: {accuracy_rf}")

Decision Tree - Лучшее значение параметра: {'max_depth': 5}, Точность на тесте: 0.6485355648535565
Random Forest - Лучшие значения параметров: {'max_depth': None, 'n_estimators': 200}, Точность на тесте: 0.6453614197085558


In [14]:
from sklearn.metrics import classification_report

# Для решающего дерева
best_dt_params = {'max_depth': 5}
best_decision_tree = DecisionTreeClassifier(**best_dt_params)
best_decision_tree.fit(X_resampled, y_resampled)

y_pred_best_dt = best_decision_tree.predict(X_test)
accuracy_best_dt = accuracy_score(y_test, y_pred_best_dt)

print(f"Decision Tree с лучшими параметрами - Точность на тесте: {accuracy_best_dt}")
print("Classification Report для Decision Tree:")
print(classification_report(y_test, y_pred_best_dt))

# Для случайного леса
best_rf_params = {'max_depth': None, 'n_estimators': 50}
best_random_forest = RandomForestClassifier(**best_rf_params)
best_random_forest.fit(X_resampled, y_resampled)

y_pred_best_rf = best_random_forest.predict(X_test)
accuracy_best_rf = accuracy_score(y_test, y_pred_best_rf)

print(f"Random Forest с лучшими параметрами - Точность на тесте: {accuracy_best_rf}")
print("Classification Report для Random Forest:")
print(classification_report(y_test, y_pred_best_rf))

Decision Tree с лучшими параметрами - Точность на тесте: 0.6485355648535565
Classification Report для Decision Tree:
              precision    recall  f1-score   support

           0       0.36      0.59      0.44      1643
           1       0.84      0.67      0.74      5288

    accuracy                           0.65      6931
   macro avg       0.60      0.63      0.59      6931
weighted avg       0.73      0.65      0.67      6931

Random Forest с лучшими параметрами - Точность на тесте: 0.6241523589669601
Classification Report для Random Forest:
              precision    recall  f1-score   support

           0       0.34      0.63      0.44      1643
           1       0.84      0.62      0.72      5288

    accuracy                           0.62      6931
   macro avg       0.59      0.63      0.58      6931
weighted avg       0.73      0.62      0.65      6931

