# Лабораторная работа 2
## ----------------------------------------------------------------------------------------------------------------------------------
## Задание:
1. Провести классификацию найденного датасета, методом машины опорных векторов. В формате Markdown писать пояснения. Объяснить почему были выбраны именно такие гиперпараметры, была ли перекрестная проверка, и т.д.

In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVC
from imblearn.under_sampling import RandomUnderSampler

In [2]:
# Загрузка данных
data = pd.read_csv('all_players.csv')  # Замени на путь к файлу
# Показать первые строки данных для проверки
print(data)

       Unnamed: 0              Name          Nation             Club Position  \
0               0     Kylian Mbappé          France         Paris SG       ST   
1               1    Erling Haaland          Norway  Manchester City       ST   
2               2   Kevin De Bruyne         Belgium  Manchester City       CM   
3               3      Lionel Messi       Argentina   Inter Miami CF       CF   
4               4     Karim Benzema          France       Al Ittihad       CF   
...           ...               ...             ...              ...      ...   
17321        1476  Kateřina Vithová  Czech Republic     Slavia Praha       LM   
17322        1477     Lucie Bendová  Czech Republic     Slavia Praha       CB   
17323        1478    Aleigh Gambone   United States  Glasgow City FC       CM   
17324        1479        Joy Lysser     Switzerland        FC Zürich       CM   
17325        1480      Soraya Wulff     Switzerland        FC Zürich       CB   

       Age  Overall  Pace  

In [3]:
label_encoder = LabelEncoder()
data['Preferred foot Encoded'] = label_encoder.fit_transform(data['Preferred foot'])

non_numeric_columns = data.select_dtypes(exclude=['number']).columns.tolist()

data = data.drop(columns=non_numeric_columns)
data = data.drop("GK", axis=1)
data.info()
print(data)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 17326 entries, 0 to 17325
Data columns (total 38 columns):
 #   Column                  Non-Null Count  Dtype
---  ------                  --------------  -----
 0   Unnamed: 0              17326 non-null  int64
 1   Age                     17326 non-null  int64
 2   Overall                 17326 non-null  int64
 3   Pace                    17326 non-null  int64
 4   Shooting                17326 non-null  int64
 5   Passing                 17326 non-null  int64
 6   Dribbling               17326 non-null  int64
 7   Defending               17326 non-null  int64
 8   Physicality             17326 non-null  int64
 9   Acceleration            17326 non-null  int64
 10  Sprint                  17326 non-null  int64
 11  Positioning             17326 non-null  int64
 12  Finishing               17326 non-null  int64
 13  Shot                    17326 non-null  int64
 14  Long                    17326 non-null  int64
 15  Volleys            

In [14]:
X = data.drop('Preferred foot Encoded', axis=1) 
y = data['Preferred foot Encoded']  
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.4, random_state=42)

scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

under_sampler = RandomUnderSampler(random_state=42)

X_resampled, y_resampled = under_sampler.fit_resample(X_train, y_test)

print(pd.Series(y_resampled).value_counts())
print(y_test.unique())

# Создаем словарь с параметрами для GridSearchCV
param_grid = {'kernel': ['linear', 'rbf', 'sigmoid', 'poly'],
              'C': [0.001, 0.01, 0.1, 1, 10],
              'degree': [1, 2, 3, 4, 5, 6, 7]}  # Добавляем параметр degree от 1 до 7

# Создаем объект GridSearchCV
svm_grid = GridSearchCV(SVC(), param_grid, cv=5, scoring='accuracy', n_jobs=-1)

# Обучаем GridSearch для подбора лучших параметров
svm_grid.fit(X_resampled, y_resampled)

# Получаем лучшие параметры
best_params = svm_grid.best_params_
best_kernel = best_params['kernel']
best_C = best_params['C']
best_degree = best_params['degree']

# Выводим лучшие параметры
print(f"Лучшие параметры: Ядро = {best_kernel}, C = {best_C}, Degree = {best_degree}")

0    1643
1    1643
Name: Preferred foot Encoded, dtype: int64
[1 0]
Лучшие параметры: Ядро = rbf, C = 1, Degree = 1


In [18]:
svm_model = SVC(kernel=best_kernel, C=best_C) 

svm_model.fit(X_resampled, y_resampled)

y_pred = svm_model.predict(X_test)

accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred, zero_division = 1)

print(f'Точность модели: {accuracy}')
print(report)

stratified_kfold = StratifiedKFold(n_splits=2, shuffle=True, random_state=42)
cross_val_scores = cross_val_score(svm_model, X_resampled, y_resampled, cv=stratified_kfold)

print(f'Средняя точность перекрестной проверки: {cross_val_scores.mean()}')

Точность модели: 0.6929735968835666
              precision    recall  f1-score   support

           0       0.41      0.67      0.51      1643
           1       0.87      0.70      0.78      5288

    accuracy                           0.69      6931
   macro avg       0.64      0.68      0.64      6931
weighted avg       0.76      0.69      0.71      6931

Средняя точность перекрестной проверки: 0.6412051125989044
