In [142]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.naive_bayes import GaussianNB, BernoulliNB, MultinomialNB
from sklearn.metrics import accuracy_score, classification_report
from imblearn.under_sampling import RandomUnderSampler
from sklearn.metrics import precision_score, recall_score, f1_score

import matplotlib.pyplot as plt

In [143]:
Dataset = pd.read_csv('all_players.csv')  # Замени на путь к файлу
print(Dataset.head())

   Unnamed: 0             Name     Nation             Club Position  Age  \
0           0    Kylian Mbappé     France         Paris SG       ST   24   
1           1   Erling Haaland     Norway  Manchester City       ST   23   
2           2  Kevin De Bruyne    Belgium  Manchester City       CM   32   
3           3     Lionel Messi  Argentina   Inter Miami CF       CF   36   
4           4    Karim Benzema     France       Al Ittihad       CF   35   

   Overall  Pace  Shooting  Passing  ...  Strength  Aggression  Att work rate  \
0       91    97        90       80  ...        77          64           High   
1       91    89        93       66  ...        93          87           High   
2       91    72        88       94  ...        74          75           High   
3       90    80        87       90  ...        68          44            Low   
4       90    79        88       83  ...        82          63         Medium   

   Def work rate  Preferred foot  Weak foot  Skill moves

In [144]:
label_encoder = LabelEncoder()
Dataset['Preferred foot Encoded'] = label_encoder.fit_transform(Dataset['Preferred foot'])
print(Dataset[['Preferred foot', 'Preferred foot Encoded']].head(10))

  Preferred foot  Preferred foot Encoded
0          Right                       1
1           Left                       0
2          Right                       1
3           Left                       0
4          Right                       1
5           Left                       0
6          Right                       1
7          Right                       1
8           Left                       0
9          Right                       1


In [145]:
non_numeric_columns = Dataset.select_dtypes(exclude=['number']).columns.tolist()
Dataset = Dataset.drop(columns=non_numeric_columns)
Dataset = Dataset.drop("GK", axis=1)
Dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 17326 entries, 0 to 17325
Data columns (total 38 columns):
 #   Column                  Non-Null Count  Dtype
---  ------                  --------------  -----
 0   Unnamed: 0              17326 non-null  int64
 1   Age                     17326 non-null  int64
 2   Overall                 17326 non-null  int64
 3   Pace                    17326 non-null  int64
 4   Shooting                17326 non-null  int64
 5   Passing                 17326 non-null  int64
 6   Dribbling               17326 non-null  int64
 7   Defending               17326 non-null  int64
 8   Physicality             17326 non-null  int64
 9   Acceleration            17326 non-null  int64
 10  Sprint                  17326 non-null  int64
 11  Positioning             17326 non-null  int64
 12  Finishing               17326 non-null  int64
 13  Shot                    17326 non-null  int64
 14  Long                    17326 non-null  int64
 15  Volleys            

In [175]:
X = Dataset.drop('Preferred foot Encoded', axis=1) 
y = Dataset['Preferred foot Encoded']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.4, random_state=42)


under_sampler = RandomUnderSampler(random_state=42)


X_resampled, y_resampled = under_sampler.fit_resample(X_train, y_train)


# Определение методов наивного байеса
naive_bayes_models = {
    'GaussianNB': GaussianNB(),
    'MultinomialNB': MultinomialNB(),
    'BernoulliNB': BernoulliNB(),
}

# Параметры для GridSearchCV
param_grid = {
    'GaussianNB': {},
    'MultinomialNB': {'alpha': [0.1, 0.5, 1.0]},
    'BernoulliNB': {'alpha': [0.1, 0.5, 1.0], 'binarize': [0.0, 0.1, 0.2]},
}

# Выбор метрики для оценки моделей
scoring_metric = 'accuracy'

# Обучение и оценка моделей с использованием GridSearchCV
best_models = {}
for model_name, model in naive_bayes_models.items():
    grid_search = GridSearchCV(model, param_grid[model_name], scoring=scoring_metric, cv=5)
    grid_search.fit(X_resampled, y_resampled)
    
    best_models[model_name] = grid_search.best_estimator_
# Оценка наилучшей модели на тестовом наборе
best_model_name = max(best_models, key=lambda k: grid_search.cv_results_['mean_test_score'][grid_search.best_index_])
best_model = best_models[best_model_name]


# Вывод результатов
print(f"Лучшая модель: {best_model_name}")
print(f"Лучшие параметры: {grid_search.best_params_}")


Лучшая модель: GaussianNB
Лучшие параметры: {'alpha': 0.1, 'binarize': 0.0}


In [176]:
# Retrain the best model on the entire dataset
best_model.fit(X_resampled, y_resampled)

# Make predictions on the entire dataset
y_pred_full = best_model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred_full)
classification_rep = classification_report(y_test, y_pred_full)
print(f'Точность модели: {accuracy}')
print(classification_rep)

Точность модели: 0.43226085701918915
              precision    recall  f1-score   support

           0       0.27      0.81      0.40      1643
           1       0.84      0.31      0.46      5288

    accuracy                           0.43      6931
   macro avg       0.56      0.56      0.43      6931
weighted avg       0.71      0.43      0.45      6931

