## Predicting Player Ratings

In [22]:
import sqlite3 as sql
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib
import matplotlib.pyplot as plt
from sklearn.tree import DecisionTreeRegressor
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from math import sqrt

#### Read Data from the database into Pandas

In [35]:
# Open the SQLite file and load the entire Player_Attributes table into a DataFrame.
# The connection stays open because a later cell runs another query through `cnx`.
cnx = sql.connect('database.sqlite')
df = pd.read_sql_query("SELECT * FROM Player_Attributes", con=cnx)

In [36]:
# Quick look at the raw table: (rows, columns) first, then the column names.
print('Shape:', df.shape)
df.columns

Shape: (183978, 42)


Index(['id', 'player_fifa_api_id', 'player_api_id', 'date', 'overall_rating',
       'potential', 'preferred_foot', 'attacking_work_rate',
       'defensive_work_rate', 'crossing', 'finishing', 'heading_accuracy',
       'short_passing', 'volleys', 'dribbling', 'curve', 'free_kick_accuracy',
       'long_passing', 'ball_control', 'acceleration', 'sprint_speed',
       'agility', 'reactions', 'balance', 'shot_power', 'jumping', 'stamina',
       'strength', 'long_shots', 'aggression', 'interceptions', 'positioning',
       'vision', 'penalties', 'marking', 'standing_tackle', 'sliding_tackle',
       'gk_diving', 'gk_handling', 'gk_kicking', 'gk_positioning',
       'gk_reflexes'],
      dtype='object')

In [39]:
# Total number of missing cells across the whole frame (not the number of rows).
np.count_nonzero(df.isna())

47301

In [40]:
# First cut: the dataset is large, so simply discard every row containing any null value.
df = df.dropna(axis=0, how='any')

In [42]:
# Number of rows reduced from 183978 to 180354 after dropping nulls.
print('Shape:', df.shape)

Shape: (180354, 42)


In [57]:
# For every distinct ball_control value, fetch the highest stamina recorded,
# listed from the lowest maximum to the highest.
# The ORDER BY names the aggregate explicitly. Sorting on the bare `stamina`
# column of a GROUP BY query appears to work only through SQLite's special
# handling of bare columns next to max(); other SQL engines reject it or leave
# the order undefined. Result column names are unchanged.
bc_stam = '''
    select ball_control, max(stamina)
    from Player_Attributes
    group by ball_control
    order by max(stamina)
'''
pd.read_sql_query(bc_stam, cnx)

TypeError: __init__() got an unexpected keyword argument 'max_rows'

    ball_control  max(stamina)
0            NaN           NaN
1            5.0          31.0
2            7.0          40.0
3           10.0          62.0
4           13.0          64.0
5            6.0          68.0
6           14.0          68.0
7            8.0          69.0
8           19.0          69.0
9            9.0          70.0
10          18.0          70.0
11          17.0          71.0
12          12.0          76.0
13          27.0          76.0
14          22.0          78.0
15          29.0          78.0
16          15.0          79.0
17          24.0          79.0
18          26.0          80.0
19          28.0          80.0
20          16.0          81.0
21          40.0          81.0
22          11.0          82.0
23          20.0          82.0
24          32.0          82.0
25          23.0          83.0
26          33.0          83.0
27          35.0          83.0
28          38.0          83.0
29          97.0          83.0
..           ...           ...
64      

#### It is evident that the higher the stamina, the higher the ball control

In [68]:
# Integer-encode the three text columns so that every feature is numeric.
categorical = ['preferred_foot', 'attacking_work_rate', 'defensive_work_rate']
df1 = df[categorical].apply(lambda column: pd.factorize(column)[0])

# Replace the original text columns with their encoded versions; the join
# appends them as the last three columns of the frame.
df = df.drop(columns=categorical).join(df1)

# Show one row to confirm the encoding.
df.iloc[1]

id                                       2
player_fifa_api_id                  218353
player_api_id                       505942
date                   2015-11-19 00:00:00
overall_rating                          67
potential                               71
crossing                                49
finishing                               44
heading_accuracy                        71
short_passing                           61
volleys                                 44
dribbling                               51
curve                                   45
free_kick_accuracy                      39
long_passing                            64
ball_control                            49
acceleration                            60
sprint_speed                            64
agility                                 59
reactions                               47
balance                                 65
shot_power                              55
jumping                                 58
stamina    

## Linear Regression

In [70]:
from sklearn import datasets, linear_model
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score

# Features are every column except the target and the raw date string.
# NOTE(review): X still contains the identifier columns (id,
# player_fifa_api_id, player_api_id). They are presumably not meaningful
# predictors and may let a model memorise individual players; consider
# dropping them and confirming the effect on the scores.
X = df.drop(['overall_rating', 'date'], axis=1)
Y = df['overall_rating']

# Hold out 20% for testing. The seed is fixed so that the split, and every
# score printed below, is the same on each run.
X_train, X_test, Y_train, Y_test = train_test_split(
    X, Y, test_size=0.2, random_state=42
)
print (X_train.shape,Y_train.shape)
print (X_test.shape,Y_test.shape)

# Fit ordinary least squares on the training split and predict the test split.
lm = linear_model.LinearRegression()
model = lm.fit(X_train, Y_train)
Y_pred = lm.predict(X_test)

# Compute the test MSE once and reuse it for the RMSE line.
lm_mse = mean_squared_error(Y_test, Y_pred)

print('Intercept,b:',np.round(lm.intercept_,2))
print('Rsq (test score)= ', np.round(r2_score(Y_test, Y_pred),2))
# model.score() on a regressor is also R^2, so this repeats the line above.
print('test score:',np.round(model.score(X_test,Y_test),2))
print('MSE:', np.round(lm_mse,2))
print('Sqrt MSE:',np.round(np.sqrt(lm_mse),2))

(144283, 40) (144283,)
(36071, 40) (36071,)
Intercept,b: 0.43
Rsq (test score)=  0.86
test score: 0.86
MSE: 6.92
Sqrt MSE: 2.63


## Decision Tree

In [75]:
# Depth-capped regression tree, trained and evaluated on the same split as
# the linear model above.
# random_state is fixed because the tree permutes features at each split, so
# equally good splits can otherwise be chosen differently from run to run.
rg = DecisionTreeRegressor(max_depth=20, random_state=42)
rg.fit(X_train, Y_train)
Y_pred = rg.predict(X_test)

# Compute the test MSE once and reuse it for the RMSE line.
tree_mse = mean_squared_error(Y_test, Y_pred)
print('MSE:', np.round(tree_mse,2))
print('Sqrt MSE:',np.round(np.sqrt(tree_mse),2))

MSE: 1.86
Sqrt MSE: 1.36


## Observation: The decision tree achieves a better RMSE (1.36) than the simple linear regression model (2.63)