## Loading the dataset

In [None]:
import numpy as np  
import pandas as pd
from matplotlib import pyplot as plt
import seaborn as sns
from google.colab import files


In [None]:
# Mount Google Drive at /content/drive so the dataset stored there can be read by path.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# Load the player dataset from the mounted Drive.
# low_memory=False makes pandas parse the file in a single pass so each column's dtype is
# inferred from all of its rows; the default chunked parsing is what presumably triggers the
# mixed-type DtypeWarning this load otherwise emits.
players_df = pd.read_csv('/content/drive/MyDrive/players_22.csv', low_memory=False)

  exec(code_obj, self.user_global_ns, self.user_ns)


## Exploring the Dataset

In [None]:
# Overall info on the dataset: index range, number of columns, dtype counts and memory usage.
players_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 19239 entries, 0 to 19238
Columns: 110 entries, sofifa_id to nation_flag_url
dtypes: float64(16), int64(44), object(50)
memory usage: 16.1+ MB


In [None]:
# Column labels of the dataset.
players_df.columns

Index(['sofifa_id', 'player_url', 'short_name', 'long_name',
       'player_positions', 'overall', 'potential', 'value_eur', 'wage_eur',
       'age',
       ...
       'lcb', 'cb', 'rcb', 'rb', 'gk', 'player_face_url', 'club_logo_url',
       'club_flag_url', 'nation_logo_url', 'nation_flag_url'],
      dtype='object', length=110)

In [None]:
# Shape of the dataset as (number of rows, number of columns).
players_df.shape

(19239, 110)

In [None]:
# Inspect the dtype of every column; some may need converting before modelling.
players_df.dtypes

sofifa_id            int64
player_url          object
short_name          object
long_name           object
player_positions    object
                     ...  
player_face_url     object
club_logo_url       object
club_flag_url       object
nation_logo_url     object
nation_flag_url     object
Length: 110, dtype: object

In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns.
players_df.describe() 

Unnamed: 0,sofifa_id,overall,potential,value_eur,wage_eur,age,height_cm,weight_kg,club_team_id,league_level,...,mentality_composure,defending_marking_awareness,defending_standing_tackle,defending_sliding_tackle,goalkeeping_diving,goalkeeping_handling,goalkeeping_kicking,goalkeeping_positioning,goalkeeping_reflexes,goalkeeping_speed
count,19239.0,19239.0,19239.0,19165.0,19178.0,19239.0,19239.0,19239.0,19178.0,19178.0,...,19239.0,19239.0,19239.0,19239.0,19239.0,19239.0,19239.0,19239.0,19239.0,2132.0
mean,231468.086959,65.772182,71.07937,2850452.0,9017.989363,25.210822,181.299704,74.943032,50580.498123,1.354364,...,57.92983,46.601746,48.045584,45.9067,16.406102,16.192474,16.055356,16.229274,16.491814,36.439962
std,27039.717497,6.880232,6.086213,7613700.0,19470.176724,4.748235,6.863179,7.069434,54401.868535,0.747865,...,12.159326,20.200807,21.232718,20.755683,17.574028,16.839528,16.564554,17.059779,17.884833,10.751563
min,41.0,47.0,49.0,9000.0,500.0,16.0,155.0,49.0,1.0,1.0,...,12.0,4.0,5.0,5.0,2.0,2.0,2.0,2.0,2.0,15.0
25%,214413.5,61.0,67.0,475000.0,1000.0,21.0,176.0,70.0,479.0,1.0,...,50.0,29.0,28.0,25.0,8.0,8.0,8.0,8.0,8.0,27.0
50%,236543.0,66.0,71.0,975000.0,3000.0,25.0,181.0,75.0,1938.0,1.0,...,59.0,52.0,56.0,53.0,11.0,11.0,11.0,11.0,11.0,36.0
75%,253532.5,70.0,75.0,2000000.0,8000.0,29.0,186.0,80.0,111139.0,1.0,...,66.0,63.0,65.0,63.0,14.0,14.0,14.0,14.0,14.0,45.0
max,264640.0,93.0,95.0,194000000.0,350000.0,54.0,206.0,110.0,115820.0,5.0,...,96.0,93.0,93.0,92.0,91.0,92.0,93.0,92.0,90.0,65.0


## Cleaning the Dataset

In [None]:
# For each column, report whether it contains at least one missing value.
players_df.isnull().any()

sofifa_id           False
player_url          False
short_name          False
long_name           False
player_positions    False
                    ...  
player_face_url     False
club_logo_url        True
club_flag_url        True
nation_logo_url      True
nation_flag_url     False
Length: 110, dtype: bool

In [None]:
# Scan every column and flag the ones where more than 60% of the values are missing.
# `col` collects the flagged column names for the drop step that follows.
col = []
for c in players_df.columns:
    missing_values = np.mean(players_df[c].isna()) * 100
    if not missing_values > 60:
        continue  # enough data present, keep this column
    print('{} - {}%'.format(c, round(missing_values)))
    col.append(c)

print("\n We need to drop these columns: \n \n", col)

club_loaned_from - 94%
nation_team_id - 96%
nation_position - 96%
nation_jersey_number - 96%
player_tags - 93%
goalkeeping_speed - 89%
nation_logo_url - 96%

 We need to drop these columns: 
 
 ['club_loaned_from', 'nation_team_id', 'nation_position', 'nation_jersey_number', 'player_tags', 'goalkeeping_speed', 'nation_logo_url']


In [None]:
# Drop the columns flagged above as more than 60% missing.
# Using the computed `col` list (instead of a hand-copied one) keeps the drop in sync with the
# scan, and errors="ignore" plus reassignment makes the cell safe to re-run: the original
# in-place drop raised KeyError the second time because the columns were already gone.
players_df = players_df.drop(columns=col, errors="ignore")

In [None]:
# Collect the names of all numeric columns, ignoring the object-dtype ones.
# select_dtypes(include='number') picks up both int64 and float64 columns; the previous
# `dtype == 'int64'` check silently left out every float column (e.g. value_eur, wage_eur).
dfCols = players_df.select_dtypes(include='number').columns.tolist()

print(dfCols)

['sofifa_id', 'overall', 'potential', 'age', 'height_cm', 'weight_kg', 'nationality_id', 'weak_foot', 'skill_moves', 'international_reputation', 'attacking_crossing', 'attacking_finishing', 'attacking_heading_accuracy', 'attacking_short_passing', 'attacking_volleys', 'skill_dribbling', 'skill_curve', 'skill_fk_accuracy', 'skill_long_passing', 'skill_ball_control', 'movement_acceleration', 'movement_sprint_speed', 'movement_agility', 'movement_reactions', 'movement_balance', 'power_shot_power', 'power_jumping', 'power_stamina', 'power_strength', 'power_long_shots', 'mentality_aggression', 'mentality_interceptions', 'mentality_positioning', 'mentality_vision', 'mentality_penalties', 'mentality_composure', 'defending_marking_awareness', 'defending_standing_tackle', 'defending_sliding_tackle', 'goalkeeping_diving', 'goalkeeping_handling', 'goalkeeping_kicking', 'goalkeeping_positioning', 'goalkeeping_reflexes']


## Extracting the Features

In [None]:
# Encode preferred foot as a binary flag: 1 for 'Right', 0 for anything else.
# The dtype guard keeps the cell idempotent: once the column is numeric, comparing it with
# 'Right' again would turn every value into 0, so re-runs are skipped.
if not pd.api.types.is_numeric_dtype(players_df['preferred_foot']):
    players_df['preferred_foot'] = (players_df['preferred_foot'] == 'Right').astype('int64')




In [None]:
# Derived feature: the sum of the current 'overall' rating and the 'potential' rating.
players_df['overall_potential'] = players_df['overall'].add(players_df['potential'])

## Feature Selection

In [None]:
# Correlation of every numeric column with every other numeric column.
# The frame still holds many object (string) columns; restricting to numeric dtypes first
# avoids the ValueError that DataFrame.corr() raises on non-numeric data in pandas >= 2.0,
# and works unchanged on older pandas versions.
corr_matrix = players_df.select_dtypes(include='number').corr()
# Rank the features by how strongly they correlate with the target, 'overall'.
corr_matrix['overall'].sort_values(ascending=False)

overall                        1.000000
overall_potential              0.918358
movement_reactions             0.871823
passing                        0.715001
mentality_composure            0.708867
dribbling                      0.666402
potential                      0.644275
wage_eur                       0.601764
power_shot_power               0.561180
value_eur                      0.554684
release_clause_eur             0.545197
physic                         0.529234
mentality_vision               0.523025
attacking_short_passing        0.516788
skill_long_passing             0.498216
shooting                       0.489623
international_reputation       0.470647
skill_ball_control             0.460411
age                            0.459451
skill_curve                    0.420423
power_long_shots               0.410010
attacking_crossing             0.401356
mentality_aggression           0.400161
power_stamina                  0.385150
skill_dribbling                0.384351


In [None]:
# Select top features
#feature_subset = players_df[['age', 'potential', 'value_eur', 'wage_eur', 'international_reputation']]


## Model Training and Performance Measurement

In [None]:
## Build the modelling frame: the target ('overall') plus the numeric features used as predictors.
## .copy() detaches it from players_df so later changes do not touch the source frame.
new_df = players_df.loc[:, [
    # general attributes (the target 'overall' is included here)
    'potential', 'value_eur', 'wage_eur', 'age', 'height_cm', 'weight_kg',
    'pace', 'overall', 'shooting', 'passing', 'dribbling', 'defending', 'physic',
    # attacking
    'attacking_crossing', 'attacking_finishing', 'attacking_heading_accuracy',
    'attacking_short_passing', 'attacking_volleys',
    # skill
    'skill_dribbling', 'skill_curve', 'skill_fk_accuracy',
    'skill_long_passing', 'skill_ball_control',
    # movement
    'movement_acceleration', 'movement_sprint_speed', 'movement_agility',
    'movement_reactions', 'movement_balance',
    # power
    'power_shot_power', 'power_jumping', 'power_stamina',
    'power_strength', 'power_long_shots',
    # mentality
    'mentality_aggression', 'mentality_interceptions', 'mentality_positioning',
    'mentality_vision', 'mentality_penalties', 'mentality_composure',
    # defending
    'defending_marking_awareness', 'defending_standing_tackle',
    'defending_sliding_tackle',
]].copy()

In [None]:
# Confirm that every selected column is numeric (int64 or float64) before converting.
new_df.dtypes

potential                        int64
value_eur                      float64
wage_eur                       float64
age                              int64
height_cm                        int64
weight_kg                        int64
pace                           float64
overall                          int64
shooting                       float64
passing                        float64
dribbling                      float64
defending                      float64
physic                         float64
attacking_crossing               int64
attacking_finishing              int64
attacking_heading_accuracy       int64
attacking_short_passing          int64
attacking_volleys                int64
skill_dribbling                  int64
skill_curve                      int64
skill_fk_accuracy                int64
skill_long_passing               int64
skill_ball_control               int64
movement_acceleration            int64
movement_sprint_speed            int64
movement_agility         

In [None]:
# Cast every column to float64 so the whole frame has one uniform numeric dtype.
new_df = new_df.astype('float64')

In [None]:
# Remove every row that has at least one missing value, then confirm no column has any left.
new_df = new_df.dropna(axis=0, how='any')
new_df.isna().any()

potential                      False
value_eur                      False
wage_eur                       False
age                            False
height_cm                      False
weight_kg                      False
pace                           False
overall                        False
shooting                       False
passing                        False
dribbling                      False
defending                      False
physic                         False
attacking_crossing             False
attacking_finishing            False
attacking_heading_accuracy     False
attacking_short_passing        False
attacking_volleys              False
skill_dribbling                False
skill_curve                    False
skill_fk_accuracy              False
skill_long_passing             False
skill_ball_control             False
movement_acceleration          False
movement_sprint_speed          False
movement_agility               False
movement_reactions             False
m

## Splitting the Data into Training and Testing Sets

In [None]:
# Create a training set and a test set: 20% of the rows are held out for testing,
# and the fixed random_state makes the split reproducible across runs.
from sklearn.model_selection import train_test_split
train_set, test_set= train_test_split(new_df, test_size= 0.2, random_state= 42)


In [None]:
# Split the training rows into predictors (everything except 'overall') and the target.
x_train = train_set.drop(columns=['overall'])
y_train = train_set['overall'].copy()

In [None]:
# Build the baseline model: a random forest regressor fitted on the training split.
# random_state fixes the forest's randomness so the fit is reproducible.
from sklearn.ensemble import RandomForestRegressor
forest = RandomForestRegressor(max_features=22, random_state=30)
forest.fit(X=x_train, y=y_train)

## Measuring the Performance of the Model

In [None]:
# Split the held-out rows into predictors (everything except 'overall') and the target.
x_test = test_set.drop(columns=['overall'])
y_test = test_set['overall'].copy()

In [None]:
# Score the baseline forest on the held-out split: mean squared error and its square root (RMSE).
from sklearn.metrics import mean_squared_error
forestPred = forest.predict(x_test)
forest_mse = mean_squared_error(y_true=y_test, y_pred=forestPred)
forest_rmse = np.sqrt(forest_mse)
(forest_mse, forest_rmse)

(0.21710046934584934, 0.4659404139435099)

In [None]:
# Try a second forest with different hyperparameters (max_features=20, random_state=42).
# It must be fitted on the TRAINING split: fitting on x_test/y_test and then scoring on those
# same rows leaks the labels into the model and produces a meaninglessly low error.
forest_new = RandomForestRegressor(random_state=42, max_features=20)
forest_new.fit(x_train, y_train)

In [None]:
# Check the performance of the second forest on the held-out split (MSE and RMSE).
newForestPred = forest_new.predict(x_test)
newForest_mse = mean_squared_error(y_true=y_test, y_pred=newForestPred)
newForest_rmse = np.sqrt(newForest_mse)
(newForest_mse, newForest_rmse)

(0.06824036374303315, 0.2612285660930541)

In [None]:
## Running the model on a freshly loaded copy of the data
# NOTE(review): this path is the same players_22.csv the model was trained on, so these rows
# are not unseen data. Point it at a different file for a genuine out-of-sample check.
# low_memory=False reads the file in one pass, avoiding the warning the default load emits
# (presumably the mixed-type DtypeWarning).
testingDf = pd.read_csv('/content/drive/MyDrive/players_22.csv', low_memory=False)
# Take the feature columns straight from the training matrix so their names and order always
# match what the forest was fitted on, instead of maintaining a second hand-copied list.
new_test = testingDf[list(x_train.columns)].copy()

  exec(code_obj, self.user_global_ns, self.user_ns)


In [None]:
# Make predictions: keep only the rows with every feature present, since rows containing
# missing values are removed before being passed to the forest.
new_test = new_test.dropna(axis=0, how='any')
newPred = forest_new.predict(new_test)
newPred

array([91.64, 87.59, 87.11, ..., 49.14, 48.  , 49.22])