In [2]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [3]:
# Import Libraries

import pickle
import numpy as np
import pandas as pd
from sklearn.svm import SVR
from sklearn.svm import SVC
import matplotlib.pyplot as plt
from xgboost import XGBRegressor
from sklearn import tree, metrics
from sklearn.impute import SimpleImputer
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score
from sklearn.ensemble import VotingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GridSearchCV, KFold, cross_val_score
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.metrics import make_scorer, mean_absolute_error, mean_squared_error, r2_score


In [4]:
# Data Loading

pd.set_option('display.max_columns', None)

dataset1 = pd.read_csv('/content/drive/My Drive/Roseline/male_players_legacy.csv')

dataset2 = pd.read_csv('/content/drive/My Drive/Roseline/players_22.csv')

  dataset1 = pd.read_csv('/content/drive/My Drive/Roseline/male_players_legacy.csv')
  dataset2 = pd.read_csv('/content/drive/My Drive/Roseline/players_22.csv')


In [5]:
ds1 = pd.DataFrame(dataset1)
ds2 = pd.DataFrame(dataset2)

In [6]:
ds1.columns

Index(['player_id', 'player_url', 'fifa_version', 'fifa_update',
       'fifa_update_date', 'short_name', 'long_name', 'player_positions',
       'overall', 'potential',
       ...
       'cdm', 'rdm', 'rwb', 'lb', 'lcb', 'cb', 'rcb', 'rb', 'gk',
       'player_face_url'],
      dtype='object', length=110)

In [7]:
ds2.columns

Index(['sofifa_id', 'player_url', 'short_name', 'long_name',
       'player_positions', 'overall', 'potential', 'value_eur', 'wage_eur',
       'age',
       ...
       'lcb', 'cb', 'rcb', 'rb', 'gk', 'player_face_url', 'club_logo_url',
       'club_flag_url', 'nation_logo_url', 'nation_flag_url'],
      dtype='object', length=110)

In [8]:
ds1.shape

(161583, 110)

In [8]:
ds2.shape

(19239, 110)

In [9]:
# Select and keep only the columns in where the proportion of missing values is less than or equal to 0.3 (30%)
ds1 = ds1.loc[:, (ds1.isnull().mean() <= 0.3) | (ds1.isnull().sum() == 0)]
ds2 = ds2.loc[:, (ds2.isnull().mean() <= 0.3) | (ds2.isnull().sum() == 0)]

In [10]:
ds1.shape

(161583, 102)

In [11]:
ds2.shape

(19239, 102)

**Check for missing values.**

In [12]:
nan_ds1 = ds1.isna()
nan_columns_ds1 = nan_ds1.any()
print("\nDataset1:\n")
nan_columns_ds1


Dataset1:



player_id           False
player_url          False
fifa_version        False
fifa_update         False
fifa_update_date    False
                    ...  
cb                  False
rcb                 False
rb                  False
gk                  False
player_face_url     False
Length: 102, dtype: bool

In [13]:
nan_ds2 = ds2.isna()
nan_columns_ds2 = nan_ds2.any()
print("\nDataset2:\n")
nan_columns_ds2


Dataset2:



sofifa_id           False
player_url          False
short_name          False
long_name           False
player_positions    False
                    ...  
gk                  False
player_face_url     False
club_logo_url        True
club_flag_url        True
nation_flag_url     False
Length: 102, dtype: bool

In [14]:
missing_ds1 = ds1.isnull().sum()
columns_missing_ds1 = missing_ds1[missing_ds1 > 0].index.tolist()
columns_missing_ds1

['value_eur',
 'wage_eur',
 'league_id',
 'league_name',
 'league_level',
 'club_team_id',
 'club_name',
 'club_position',
 'club_jersey_number',
 'club_joined_date',
 'club_contract_valid_until_year',
 'pace',
 'shooting',
 'passing',
 'dribbling',
 'defending',
 'physic',
 'mentality_composure']

In [15]:
missing_ds2 = ds2.isnull().sum()
columns_missing_ds2 = missing_ds2[missing_ds2 > 0].index.tolist()
columns_missing_ds2

['value_eur',
 'wage_eur',
 'club_team_id',
 'club_name',
 'league_name',
 'league_level',
 'club_position',
 'club_jersey_number',
 'club_joined',
 'club_contract_valid_until',
 'release_clause_eur',
 'pace',
 'shooting',
 'passing',
 'dribbling',
 'defending',
 'physic',
 'club_logo_url',
 'club_flag_url']

In [16]:
# Extract Categorical features
cat_ds1 = ds1.select_dtypes(include=['object']).columns
cat_ds1

Index(['player_url', 'fifa_update_date', 'short_name', 'long_name',
       'player_positions', 'dob', 'league_name', 'club_name', 'club_position',
       'club_joined_date', 'nationality_name', 'preferred_foot', 'work_rate',
       'body_type', 'real_face', 'ls', 'st', 'rs', 'lw', 'lf', 'cf', 'rf',
       'rw', 'lam', 'cam', 'ram', 'lm', 'lcm', 'cm', 'rcm', 'rm', 'lwb', 'ldm',
       'cdm', 'rdm', 'rwb', 'lb', 'lcb', 'cb', 'rcb', 'rb', 'gk',
       'player_face_url'],
      dtype='object')

In [17]:
# Extract Categorical features
cat_ds2 = ds2.select_dtypes(include=['object']).columns
cat_ds2

Index(['player_url', 'short_name', 'long_name', 'player_positions', 'dob',
       'club_name', 'league_name', 'club_position', 'club_joined',
       'nationality_name', 'preferred_foot', 'work_rate', 'body_type',
       'real_face', 'ls', 'st', 'rs', 'lw', 'lf', 'cf', 'rf', 'rw', 'lam',
       'cam', 'ram', 'lm', 'lcm', 'cm', 'rcm', 'rm', 'lwb', 'ldm', 'cdm',
       'rdm', 'rwb', 'lb', 'lcb', 'cb', 'rcb', 'rb', 'gk', 'player_face_url',
       'club_logo_url', 'club_flag_url', 'nation_flag_url'],
      dtype='object')

In [18]:
# Eliminating columns: dropping the categorical features and column IDs
ds1.drop(cat_ds1, axis=1, inplace=True)
ds2.drop(cat_ds2, axis=1, inplace=True)

ds1.drop(['player_id'], axis=1, inplace=True)
ds2.drop(['sofifa_id'], axis=1, inplace=True)

In [19]:
ds1.shape

(161583, 58)

In [20]:
ds2.shape

(19239, 56)

In [21]:
ds1

Unnamed: 0,fifa_version,fifa_update,overall,potential,value_eur,wage_eur,age,height_cm,weight_kg,league_id,league_level,club_team_id,club_jersey_number,club_contract_valid_until_year,nationality_id,weak_foot,skill_moves,international_reputation,pace,shooting,passing,dribbling,defending,physic,attacking_crossing,attacking_finishing,attacking_heading_accuracy,attacking_short_passing,attacking_volleys,skill_dribbling,skill_curve,skill_fk_accuracy,skill_long_passing,skill_ball_control,movement_acceleration,movement_sprint_speed,movement_agility,movement_reactions,movement_balance,power_shot_power,power_jumping,power_stamina,power_strength,power_long_shots,mentality_aggression,mentality_interceptions,mentality_positioning,mentality_vision,mentality_penalties,mentality_composure,defending_marking_awareness,defending_standing_tackle,defending_sliding_tackle,goalkeeping_diving,goalkeeping_handling,goalkeeping_kicking,goalkeeping_positioning,goalkeeping_reflexes
0,15,2,93,95,100500000.0,550000.0,27,169,67,53.0,1.0,241.0,10.0,2018.0,52,3,4,5,93.0,89.0,86.0,96.0,27.0,63.0,84,94,71,89,85,96,89,90,76,96,96,90,94,94,95,80,73,77,60,88,48,22,92,90,76,,25,21,20,6,11,15,14,8
1,15,2,92,92,79000000.0,375000.0,29,185,80,53.0,1.0,243.0,7.0,2018.0,38,4,5,5,93.0,93.0,81.0,91.0,32.0,79.0,83,95,86,82,87,93,88,79,72,92,91,94,93,90,63,94,94,89,79,93,63,24,91,81,85,,22,31,23,7,11,15,14,11
2,15,2,90,90,54500000.0,275000.0,30,180,80,19.0,1.0,21.0,10.0,2017.0,34,2,4,5,93.0,86.0,83.0,92.0,32.0,64.0,80,85,50,86,86,93,85,83,76,90,93,93,93,89,91,86,61,78,65,90,47,39,89,84,80,,29,26,26,10,8,11,5,15
3,15,2,90,90,52500000.0,275000.0,32,195,95,16.0,1.0,73.0,10.0,2016.0,46,4,4,5,76.0,91.0,81.0,86.0,34.0,86.0,76,91,76,84,92,88,80,80,76,90,74,77,86,85,41,93,72,78,93,88,84,20,86,83,91,,25,41,27,13,15,10,9,12
4,15,2,90,90,63500000.0,300000.0,28,193,92,19.0,1.0,21.0,1.0,2019.0,21,4,1,5,,,,,,,25,25,25,42,25,25,25,25,41,31,58,61,43,89,35,42,78,44,83,25,29,30,25,20,37,,25,25,25,87,85,92,90,86
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
161578,23,2,46,61,110000.0,700.0,18,180,73,2012.0,1.0,112978.0,45.0,2024.0,155,3,2,1,63.0,48.0,40.0,47.0,24.0,44.0,48,49,36,38,43,47,39,35,33,45,67,60,54,39,66,64,41,42,52,31,26,21,40,42,54,40.0,23,21,25,9,13,13,12,7
161579,23,2,46,58,110000.0,750.0,19,188,83,2012.0,1.0,112429.0,41.0,2027.0,155,3,2,1,55.0,25.0,29.0,34.0,48.0,57.0,30,19,42,30,25,30,24,26,25,32,54,56,45,42,48,33,56,53,64,25,46,46,31,31,31,35.0,50,51,45,6,14,8,13,14
161580,23,2,46,58,110000.0,500.0,19,181,73,65.0,1.0,563.0,34.0,2023.0,25,2,2,1,65.0,36.0,43.0,46.0,43.0,53.0,37,30,47,51,37,49,30,30,43,35,66,64,58,50,65,51,52,53,53,32,51,43,53,40,37,35.0,36,45,50,8,9,7,14,9
161581,23,2,46,70,150000.0,500.0,17,175,68,65.0,1.0,306.0,28.0,2027.0,25,3,2,1,55.0,50.0,36.0,46.0,20.0,42.0,29,56,45,40,37,44,38,25,26,43,59,52,57,45,60,42,67,59,36,47,27,16,45,44,63,43.0,19,17,14,13,12,14,7,13


In [22]:
ds2

Unnamed: 0,overall,potential,value_eur,wage_eur,age,height_cm,weight_kg,club_team_id,league_level,club_jersey_number,club_contract_valid_until,nationality_id,weak_foot,skill_moves,international_reputation,release_clause_eur,pace,shooting,passing,dribbling,defending,physic,attacking_crossing,attacking_finishing,attacking_heading_accuracy,attacking_short_passing,attacking_volleys,skill_dribbling,skill_curve,skill_fk_accuracy,skill_long_passing,skill_ball_control,movement_acceleration,movement_sprint_speed,movement_agility,movement_reactions,movement_balance,power_shot_power,power_jumping,power_stamina,power_strength,power_long_shots,mentality_aggression,mentality_interceptions,mentality_positioning,mentality_vision,mentality_penalties,mentality_composure,defending_marking_awareness,defending_standing_tackle,defending_sliding_tackle,goalkeeping_diving,goalkeeping_handling,goalkeeping_kicking,goalkeeping_positioning,goalkeeping_reflexes
0,93,93,78000000.0,320000.0,34,170,72,73.0,1.0,30.0,2023.0,52,4,4,5,144300000.0,85.0,92.0,91.0,95.0,34.0,65.0,85,95,70,91,88,96,93,94,91,96,91,80,91,94,95,86,68,72,69,94,44,40,93,95,75,96,20,35,24,6,11,15,14,8
1,92,92,119500000.0,270000.0,32,185,81,21.0,1.0,9.0,2023.0,37,4,4,5,197200000.0,78.0,92.0,79.0,86.0,44.0,82.0,71,95,90,85,89,85,79,85,70,88,77,79,77,93,82,90,85,76,86,87,81,49,95,81,90,88,35,42,19,15,6,12,8,10
2,91,91,45000000.0,270000.0,36,187,83,11.0,1.0,7.0,2023.0,38,4,5,5,83300000.0,87.0,94.0,80.0,88.0,34.0,75.0,87,95,90,80,86,88,81,84,77,88,85,88,86,94,74,94,95,77,77,93,63,29,95,76,88,95,24,32,24,7,11,15,14,11
3,91,91,129000000.0,270000.0,29,175,68,73.0,1.0,10.0,2025.0,54,5,5,5,238700000.0,91.0,83.0,86.0,94.0,37.0,63.0,85,83,63,86,86,95,88,87,81,95,93,89,96,89,84,80,64,81,53,81,63,37,86,90,93,93,35,32,29,9,9,15,15,11
4,91,91,125500000.0,350000.0,30,181,70,10.0,1.0,17.0,2025.0,7,5,4,4,232200000.0,76.0,86.0,93.0,88.0,64.0,78.0,94,82,55,94,82,88,85,83,93,91,76,76,79,91,78,91,63,89,74,91,76,66,88,94,83,89,68,65,53,15,13,5,10,13
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
19234,47,52,70000.0,1000.0,22,180,64,112541.0,1.0,36.0,2021.0,155,3,2,1,114000.0,58.0,35.0,46.0,48.0,42.0,49.0,46,32,48,50,30,45,33,38,48,49,56,60,55,53,70,46,62,51,46,30,52,42,38,43,42,37,38,43,48,6,10,5,15,13
19235,47,59,110000.0,500.0,19,175,70,445.0,1.0,27.0,2021.0,25,3,2,1,193000.0,59.0,39.0,50.0,46.0,41.0,51.0,54,33,46,51,32,41,53,31,50,42,60,58,64,49,69,49,47,63,47,38,49,39,51,49,44,47,37,44,47,11,12,6,8,10
19236,47,55,100000.0,500.0,21,178,72,111131.0,1.0,31.0,2021.0,25,3,2,1,175000.0,60.0,37.0,45.0,49.0,41.0,52.0,39,32,43,49,37,47,37,37,49,49,60,60,58,46,59,50,57,56,50,34,51,38,45,46,39,36,38,44,48,8,6,7,10,6
19237,47,60,110000.0,500.0,19,173,66,111131.0,1.0,12.0,2021.0,25,3,2,1,239000.0,68.0,46.0,36.0,48.0,15.0,42.0,29,49,40,38,34,42,36,34,33,45,69,67,72,48,73,48,50,50,40,41,34,14,47,40,49,47,10,14,11,7,10,7,14,15


In [23]:
missing_ds1 = ds1.isnull().sum()
columns_missing_ds1 = missing_ds1[missing_ds1 > 0].index.tolist()
columns_missing_ds1

['value_eur',
 'wage_eur',
 'league_id',
 'league_level',
 'club_team_id',
 'club_jersey_number',
 'club_contract_valid_until_year',
 'pace',
 'shooting',
 'passing',
 'dribbling',
 'defending',
 'physic',
 'mentality_composure']

In [24]:
missing_ds2 = ds2.isnull().sum()
columns_missing_ds2 = missing_ds2[missing_ds2 > 0].index.tolist()
columns_missing_ds2

['value_eur',
 'wage_eur',
 'club_team_id',
 'league_level',
 'club_jersey_number',
 'club_contract_valid_until',
 'release_clause_eur',
 'pace',
 'shooting',
 'passing',
 'dribbling',
 'defending',
 'physic']

In [25]:
# Impute missing values.

imp_ds1 = SimpleImputer(strategy='most_frequent')
imputed_data_ds1 = imp_ds1.fit_transform(ds1)
ds1 = pd.DataFrame(imputed_data_ds1, columns=ds1.columns)

# Check for missing values
missing_ds1 = ds1.isnull().sum()
columns_missing_ds1 = missing_ds1[missing_ds1 > 0].index.tolist()
columns_missing_ds1

[]

In [26]:
# Impute missing values.

# Check for missing values
imp_ds2 = SimpleImputer(strategy='most_frequent')
imputed_data_ds2 = imp_ds2.fit_transform(ds2)
ds2 = pd.DataFrame(imputed_data_ds2, columns=ds2.columns)

missing_ds2 = ds2.isnull().sum()
columns_missing_ds2 = missing_ds2[missing_ds2 > 0].index.tolist()
columns_missing_ds2

[]

**Feature Selection**

In [27]:
# Check the correlation between each feature and the target variable; overall
corr_mtrx = ds1.corr()
target_corr = corr_mtrx['overall'].drop('overall')

# Select the 10 best features with the highest absolute correlation
features = 15
best_features = target_corr[target_corr > 0.4].abs().nlargest(features).index

# Collect the best correlation feature
selected_features = best_features.tolist()

ds1[selected_features]

Unnamed: 0,movement_reactions,potential,passing,wage_eur,mentality_composure,value_eur,dribbling,attacking_short_passing,mentality_vision,international_reputation,skill_long_passing,power_shot_power,age,skill_ball_control,physic
0,94.0,95.0,86.0,550000.0,60.0,100500000.0,96.0,89.0,90.0,5.0,76.0,80.0,27.0,96.0,63.0
1,90.0,92.0,81.0,375000.0,60.0,79000000.0,91.0,82.0,81.0,5.0,72.0,94.0,29.0,92.0,79.0
2,89.0,90.0,83.0,275000.0,60.0,54500000.0,92.0,86.0,84.0,5.0,76.0,86.0,30.0,90.0,64.0
3,85.0,90.0,81.0,275000.0,60.0,52500000.0,86.0,84.0,83.0,5.0,76.0,93.0,32.0,90.0,86.0
4,89.0,90.0,59.0,300000.0,60.0,63500000.0,65.0,42.0,20.0,5.0,41.0,42.0,28.0,31.0,71.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
161578,39.0,61.0,40.0,700.0,40.0,110000.0,47.0,38.0,42.0,1.0,33.0,64.0,18.0,45.0,44.0
161579,42.0,58.0,29.0,750.0,35.0,110000.0,34.0,30.0,31.0,1.0,25.0,33.0,19.0,32.0,57.0
161580,50.0,58.0,43.0,500.0,35.0,110000.0,46.0,51.0,40.0,1.0,43.0,51.0,19.0,35.0,53.0
161581,45.0,70.0,36.0,500.0,43.0,150000.0,46.0,40.0,44.0,1.0,26.0,42.0,17.0,43.0,42.0


In [28]:
selected_features

['movement_reactions',
 'potential',
 'passing',
 'wage_eur',
 'mentality_composure',
 'value_eur',
 'dribbling',
 'attacking_short_passing',
 'mentality_vision',
 'international_reputation',
 'skill_long_passing',
 'power_shot_power',
 'age',
 'skill_ball_control',
 'physic']

In [29]:
ds1['overall']

0         93.0
1         92.0
2         90.0
3         90.0
4         90.0
          ... 
161578    46.0
161579    46.0
161580    46.0
161581    46.0
161582    46.0
Name: overall, Length: 161583, dtype: float64

In [30]:
X = ds1[selected_features]
y = ds1['overall']

In [31]:
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)
X = pd.DataFrame(X_scaled, columns=X.columns)

**Training and Model Creation**

In [32]:
# Split the data into training and testing sets
x_train, x_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

x_train.shape, y_test.shape, y_train.shape, x_test.shape

((129266, 15), (32317,), (129266,), (32317, 15))

**Models**

In [33]:
# RandomForest, XGBoost and Gradient Boost Regressors with cv and grid search:
models = {
    'RandomForest': RandomForestRegressor(),
    'XGBoost': XGBRegressor(),
    'GradientBoost': GradientBoostingRegressor()
}

params = {
    'RandomForest': {'n_estimators': [4, 5], 'max_depth': [None, 5]},
    'XGBoost': {'n_estimators': [5, 6], 'learning_rate': [0.01, 0.1]},
    'GradientBoost': {'n_estimators': [4, 6], 'learning_rate': [0.01, 0.1]}
}

for name, model in models.items():
    gs = GridSearchCV(model, params[name], cv=5)
    gs.fit(x_train, y_train)
    # Make predictions using the best model from GridSearchCV
    y_pred = gs.predict(x_test)

    # Evaluate the model
    mae = mean_absolute_error(y_test, y_pred)
    mse = mean_squared_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)

    print(f"\nBest parameters for {name}: {gs.best_params_}")
    print(f"Validation score for {name}: {gs.score(x_test, y_test)}")
    print(f"mean_absolute_error for {name}: {mae}")
    print(f"mean_squared_error for {name}: {mse}",)
    print(f"r2_score for {name}: {r2}\n")


Best parameters for RandomForest: {'max_depth': None, 'n_estimators': 5}
Validation score for RandomForest: 0.9816936383983145
mean_absolute_error for RandomForest: 0.5688089859826101
mean_squared_error for RandomForest: 0.9073280316861094
r2_score for RandomForest: 0.9816936383983145


Best parameters for XGBoost: {'learning_rate': 0.1, 'n_estimators': 6}
Validation score for XGBoost: 0.6824578425450534
mean_absolute_error for XGBoost: 3.112163577696679
mean_squared_error for XGBoost: 15.738512489255594
r2_score for XGBoost: 0.6824578425450534


Best parameters for GradientBoost: {'learning_rate': 0.1, 'n_estimators': 6}
Validation score for GradientBoost: 0.6214674233099172
mean_absolute_error for GradientBoost: 3.3707171877981628
mean_squared_error for GradientBoost: 18.76141339334521
r2_score for GradientBoost: 0.6214674233099172



In [34]:
predict_model = RandomForestRegressor(
    n_estimators=100,
    max_depth=None,
    min_samples_split=2,
    min_samples_leaf=1,
    max_features='auto',
    random_state=42
)

# Train the model
predict_model.fit(x_train, y_train)
y_pred = predict_model.predict(x_test)

# Evaluate the model
mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print("mean_absolute_error:",mae)
print("mean_squared_error:",mse)
print("r2_score:",r2)


  warn(


mean_absolute_error: 0.5131893430702109
mean_squared_error: 0.7094564346938143
r2_score: 0.9856859199974098


In [None]:
# Voting Classifier

decision_tree = DecisionTreeClassifier(random_state=42, criterion='entropy')
knn = KNeighborsClassifier(n_neighbors=10)
svm = SVC(probability=True, random_state=42)

voting_classifier = VotingClassifier(estimators=[
    ('decision_tree', decision_tree),
    ('knn', knn),
    ('svm', svm)
], voting='soft')

for model in (decision_tree, knn, svm,voting_classifier):
  model.fit(x_train,y_train)
  y_pred=model.predict(x_test)
  print(model.__class__.__name__,accuracy_score(y_pred,y_test))

DecisionTreeClassifier 0.7024785716495962
KNeighborsClassifier 0.2672277748553393


In [None]:
# RandomForestClassifier

rfc=RandomForestClassifier(n_estimators=15, max_depth=4, criterion='entropy')

# Perform cross-validation
cv_scores = cross_val_score(rfc, x_train, y_train, cv=5)
print(f"\nCross-validation scores: {cv_scores}")
print(f"\nMean cross-validation score: {cv_scores.mean()}")

# Fit the model
rfc.fit(x_train,y_train)
y_pred=rfc.predict(x_test)
accuracy_score(y_pred,y_test)
print('\nAccuracy of the model:',accuracy_score(y_pred,y_test))

# Evaluate the model
mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)
print("\nmean_absolute_error of the model:",mae)
print("\nmean_squared_error of the model:",mse)
print("\nr2_score of the model:",r2)

# Fine-tune the model (RandomForestClassifier) with GridSearchCV
n_estimators_range = list(range(1, 31))

# Create a parameter grid: map the parameter names to the values that should be searched
param_grid = dict(n_estimators=n_estimators_range)

grid = GridSearchCV(RandomForestClassifier(max_depth=3, criterion='entropy'), param_grid, cv=10, scoring='accuracy')
grid.fit(x_train, y_train)

rfc=RandomForestClassifier(n_estimators=grid.best_params_['n_estimators'], max_depth=3, criterion='entropy')
rfc.fit(x_train,y_train)
y_pred=rfc.predict(x_test)
accuracy_score(y_pred,y_test)

# Evaluate the model
mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

# Examine the best model
print("\ngrid.best_score:",grid.best_score_)
print("\ngrid.best_params:",grid.best_params_)
print("\ngrid.best_estimator:",grid.best_estimator_)
print("\nAccuracy of the best model:", accuracy_score(y_pred,y_test))
print("\nmean_absolute_error of the best model:",mae)
print("\nmean_squared_error of the best model:",mse)
print("\nr2_score of the best model:",r2)

In [None]:
# Correlation between variables in selected_features and the target variable; overall
for name, score in zip(x_train.columns, rfc.feature_importances_):
  print(name, score)

### Testing, using players_22

---

In [None]:
test_ds1 = ds2[selected_features]
test_ds2 = ds2['overall']

x_test_22 = test_ds1
y_test_22 = test_ds2

scaler = StandardScaler()
x_test_22_scaled = scaler.fit_transform(x_test_22)
x_test_22 = pd.DataFrame(x_test_22_scaled, columns=x_test_22.columns)

In [None]:
3# RandomForest, XGBoost, Gradient Boost Regressors - testing wit players_22 dataset
models = {
    'RandomForest': RandomForestRegressor(),
    'XGBoost': XGBRegressor(),
    'GradientBoost': GradientBoostingRegressor()
}

params = {
    'RandomForest': {'n_estimators': [4, 5], 'max_depth': [None, 4]},
    'XGBoost': {'n_estimators': [5, 7], 'learning_rate': [0.01, 0.1]},
    'GradientBoost': {'n_estimators': [5, 6], 'learning_rate': [0.01, 0.1]}
}

for name, m in models.items():
    gs = GridSearchCV(m, params[name], cv=5)
    gs.fit(x_train, y_train)
    # Make predictions using the best model from GridSearchCV
    y_pred = gs.predict(x_test_22)

    # Evaluate the model
    mae = mean_absolute_error(y_test_22, y_pred)
    mse = mean_squared_error(y_test_22, y_pred)
    r2 = r2_score(y_test_22, y_pred)

    #print('\nAccuracy: {}'.format(nb_model.score(x_test, y_test)))
    print(f"\nBest parameters for {name}: {gs.best_params_}")
    print(f"Validation score for {name}: {gs.score(x_test_22, y_test_22)}")
    #print()
    print(f"mean_absolute_error for {name}: {mae}")
    print(f"mean_squared_error for {name}: {mse}",)
    print(f"r2_score for {name}: {r2}\n")

**Saving Model**

In [35]:
filename = 'predictor.pkl'
pickle.dump(predict_model, open(filename, 'wb'))