In [1]:
# Core imports for data access (sqlite3), data handling (pandas/numpy) and plotting.
import warnings
# Silences every warning for the whole notebook.
# NOTE(review): this also hides pandas SettingWithCopy and library deprecation
# warnings that are relevant to later cells — consider a narrower filter.
warnings.filterwarnings("ignore")
import sqlite3
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
# Do not truncate columns when a DataFrame is displayed.
pd.set_option('display.max_columns',None)

In [76]:
# Directory that holds the SQLite file; change this single value to relocate the data.
# (The previous value ended in "null", so `database` pointed at a non-existent file
# and was never used — the connect call carried its own hard-coded copy of the path.)
path = "../input/soccer/"  # must end with a path separator
database = path + 'database.sqlite'

# Connect through the configured path so there is one source of truth.
con = sqlite3.connect(database)

# Catalogue of the tables available in the database (kept for inspection).
tables = pd.read_sql("""SELECT * FROM sqlite_master WHERE type='table';""", con)
# Working frame: one row per record of the Player_Attributes table.
df = pd.read_sql_query("SELECT * FROM Player_Attributes", con)

df.head()

In [10]:
# Dimensions of the loaded Player_Attributes frame.
df.shape  # (rows, columns)

Total Null values:

In [11]:
# Grand total of missing cells: per-column counts first, then summed over columns.
df.isna().sum().sum()

Missing values in each column:

In [12]:
# Number of missing values per column.
df.isna().sum()

Null values in percentage:

In [13]:
# Missing values per column, expressed as a percentage of the row count.
df.isna().sum().div(df.shape[0]).mul(100)

In [14]:
# Percentage of missing values for a hand-picked subset of columns, shown as bars.
cols_to_check = [
    'overall_rating',
    'attacking_work_rate',
    'crossing',
    'heading_accuracy',
    'vision',
    'dribbling',
    'sliding_tackle',
]
df1 = df[cols_to_check].isnull().sum() / df.shape[0] * 100

plt.figure(figsize=(15, 5))
plt.bar(df1.index, df1)
plt.title('Missing Values in %')

Removing null values

In [17]:
# Discard every row that lacks the target (overall_rating) or a volleys value.
# A single call with how='any' removes the same rows as two successive dropna calls.
df = df.dropna(subset=['overall_rating', 'volleys'], how='any')

In [18]:
# Frequency of each attacking_work_rate label (NaN is excluded by value_counts).
df['attacking_work_rate'].value_counts()

In [19]:
# Most frequent attacking_work_rate label; mode() returns a Series, [0] takes the first mode.
df['attacking_work_rate'].mode()[0]

Imputing missing attacking_work_rate values with the column's mode ('medium'):

In [20]:
# Fill the missing attacking_work_rate entries with the most frequent label.
most_common_rate = df['attacking_work_rate'].mode()[0]
df['attacking_work_rate'] = df['attacking_work_rate'].fillna(most_common_rate)

In [21]:
# Rows were dropped above, so rebuild a contiguous 0..n-1 index and discard the old one.
df = df.reset_index(drop=True)

Removing Redundant Features

In [22]:
# Identifier and date columns are removed before modelling.
id_and_date_cols = ['id', 'player_fifa_api_id', 'player_api_id', 'date']
df = df.drop(id_and_date_cols, axis=1)
df.head()

Creating dummy columns for the categorical variables

In [29]:
# Label frequencies of attacking_work_rate before the labels are cleaned up.
df['attacking_work_rate'].value_counts()

In [30]:
 # Label frequencies of defensive_work_rate before the labels are cleaned up.
 df['defensive_work_rate'].value_counts()

Some values were recorded incorrectly when the data was collected.

In [23]:
# Contingency table: attacking_work_rate labels (rows) against defensive_work_rate labels (columns).
pd.crosstab(df['attacking_work_rate'], df['defensive_work_rate'])

In [36]:
# Normalise the malformed work-rate labels.
# The result is assigned back to the column instead of calling
# `df[col].replace(..., inplace=True)`: that form mutates a temporary Series and
# silently leaves `df` unchanged when pandas Copy-on-Write is active.
# NOTE(review): the replacement labels ('lean', 'normal', 'stocky', 'yes') are kept
# exactly as in the original mapping — confirm they are the intended categories.
attacking_fixes = {'le': 'lean', 'norm': 'normal', 'stoc': 'stocky', 'y': 'yes'}
defensive_fixes = {
    'ean': 'lean',
    'ormal': 'normal',
    'tocky': 'stocky',
    'es': 'yes',
    'o': 'None',
    '_0': 'medium',
}

df['attacking_work_rate'] = df['attacking_work_rate'].replace(attacking_fixes)
df['defensive_work_rate'] = df['defensive_work_rate'].replace(defensive_fixes)

# Single-digit strings carry no category information; map them to 'None'.
df.loc[df['defensive_work_rate'].isin(list('0123456789')), 'defensive_work_rate'] = 'None'


In [35]:
# Label frequencies of attacking_work_rate after the clean-up cell has run.
df['attacking_work_rate'].value_counts()

In [37]:
# Label frequencies of defensive_work_rate after the clean-up cell has run.
df['defensive_work_rate'].value_counts()

All the classes now have the same names in both columns.

In [38]:
# One-hot encode the non-numeric columns; drop_first=True omits the first level of
# each encoded column so the dummies are not perfectly collinear.
# The encoded columns are appended at the end of the frame — later cells rely on
# that position when they slice with -15.
df = pd.get_dummies(df, drop_first=True)
df.head()

In [39]:
# Separate the target from the predictors.
x = df.drop(columns='overall_rating')
y = df[['overall_rating']]

# The trailing columns are the dummy (categorical) columns.
# NOTE(review): 15 is hard-coded — confirm it matches the number of dummy columns
# that get_dummies produced for this data.
n_dummy_cols = 15

# Explicit copies: x_num is modified by the outlier-capping cell, and writing into
# a slice of `x` is ambiguous (SettingWithCopy) — a copy makes the intent clear.
x_num = x.iloc[:, :-n_dummy_cols].copy()
x_cat = x.iloc[:, -n_dummy_cols:].copy()

# Boxplots of the first three numeric features, before outlier treatment.
# `x=` is passed by keyword because the meaning of a positional argument differs
# between seaborn versions.
for i in x_num.columns[:3]:
    sns.boxplot(x=x_num[i])
    plt.show()

In [41]:
# Cap outliers in every numeric column using the 1.5*IQR rule.
# Fixes relative to the previous version:
#   * the capping statements are now inside the loop (before, only the last
#     column was capped, using the bounds left over from the final iteration);
#   * the upper cap is active, so `ub` is actually used.
x_num = x_num.copy()  # work on an independent frame, never on a slice of `x`

for i in x_num.columns:
    q1 = x_num[i].quantile(0.25)
    q3 = x_num[i].quantile(0.75)

    iqr = q3 - q1

    ub = q3 + iqr * 1.5
    lb = q1 - iqr * 1.5

    # Replacement values are taken before the column is modified so that the
    # two caps do not influence each other.
    low_cap = x_num[i].quantile(0.01)
    high_cap = x_num[i].quantile(0.99)

    x_num.loc[x_num[i] < lb, i] = low_cap
    x_num.loc[x_num[i] > ub, i] = high_cap

# Boxplots of the first three numeric features, after outlier treatment.
for i in x_num.columns[:3]:
    sns.boxplot(x=x_num[i])
    plt.show()

In [43]:
# Rebuild the full predictor matrix: treated numeric columns followed by the dummy columns.
x = pd.concat([x_num, x_cat], axis=1)

In [48]:
import statsmodels.api as sm
# Predictors without the last 15 columns (the same slice used for x_num); later cells reuse x1.
x1 = x.iloc[:,:-15]
# statsmodels OLS fits no intercept unless a constant column is added explicitly.
xc = sm.add_constant(x)
ols1 = sm.OLS(y, xc)
ols_mod = ols1.fit()
# Coefficient table and fit statistics for the model on all predictors.
ols_mod.summary()

In [45]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score, mean_squared_error
from sklearn.model_selection import KFold, cross_val_score

Cross validation with all the features including categorical:

In [46]:
# 5-fold cross-validated RMSE of a plain linear regression on all predictors.
lr = LinearRegression()
# Shuffled folds with a fixed seed so the result is reproducible.
kf = KFold(n_splits=5, shuffle=True, random_state=42)
# Pass the splitter itself: with cv=5 the shuffled KFold above was never used.
mse = cross_val_score(lr, x, y, cv=kf, scoring='neg_mean_squared_error')
# Scores are negated MSE values; take the magnitude before the square root.
rmse = np.sqrt(abs(mse))
np.mean(rmse), np.std(rmse)

Cross validation without categorical features:

In [49]:
# 5-fold cross-validated RMSE of a plain linear regression on x1
# (the predictors without the trailing dummy columns).
lr = LinearRegression()
# Shuffled folds with a fixed seed so the result is reproducible.
kf = KFold(n_splits=5, shuffle=True, random_state=42)
# Pass the splitter itself: with cv=5 the shuffled KFold above was never used.
mse = cross_val_score(lr, x1, y, cv=kf, scoring='neg_mean_squared_error')
# Scores are negated MSE values; take the magnitude before the square root.
rmse = np.sqrt(abs(mse))
np.mean(rmse), np.std(rmse)

There is very little to no sign of overfitting



Base Linear regression Model

In [50]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score, mean_squared_error

With all features:

In [52]:
# Hold-out evaluation (70/30 split, fixed seed) of linear regression on all predictors.
xtrain, xtest, ytrain, ytest = train_test_split(x, y, test_size=0.3, random_state=42)

lr = LinearRegression()
lr.fit(xtrain, ytrain)

ytrain_pred = lr.predict(xtrain)
ytest_pred = lr.predict(xtest)

print('Train R2: ',r2_score(ytrain, ytrain_pred))
print('Test R2: ',r2_score(ytest, ytest_pred))

# RMSE = square root of the mean squared error.
print('Train RMSE: ',mean_squared_error(ytrain, ytrain_pred)**0.5)
# Label corrected: this line reports the test RMSE (it was printed as "Test R2").
print('Test RMSE: ',mean_squared_error(ytest, ytest_pred)**0.5)

Without categorical features:

In [53]:
# Hold-out evaluation (70/30 split, fixed seed) of linear regression on x1
# (the predictors without the trailing dummy columns).
xtrain, xtest, ytrain, ytest = train_test_split(x1, y, test_size=0.3, random_state=42)

lr = LinearRegression()
lr.fit(xtrain, ytrain)

ytrain_pred = lr.predict(xtrain)
ytest_pred = lr.predict(xtest)

print('Train R2: ',r2_score(ytrain, ytrain_pred))
print('Test R2: ',r2_score(ytest, ytest_pred))

# RMSE = square root of the mean squared error.
print('Train RMSE: ',mean_squared_error(ytrain, ytrain_pred)**0.5)
# Label corrected: this line reports the test RMSE (it was printed as "Test R2").
print('Test RMSE: ',mean_squared_error(ytest, ytest_pred)**0.5)

The R² score and RMSE on the train and test data are almost equal, which indicates that the model is not overfitting.



Regularization

In [54]:
from sklearn.linear_model import Ridge, Lasso, ElasticNet
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import StandardScaler

In [55]:
# Standardize x1 and keep the column names (the result gets a fresh 0..n-1 index).
# NOTE(review): the scaler is fitted on ALL rows, and the scaled x1 is later used for
# cross-validation and for the train/test split, so validation/test rows influence
# the scaling statistics (data leakage). Fitting the scaler inside each fold /
# on the training split only would avoid this.
st = StandardScaler()
x1 = pd.DataFrame(st.fit_transform(x1) , columns=x1.columns)

In [56]:
# Search the ridge penalty strength with 5-fold CV, scored by negated MSE.
rid = Ridge()
param = {
    'alpha': [
        0.00005, 0.00008, 0.0001, 0.001, 0.01, 0.1, 0.5,
        1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 20, 30,
    ]
}

grid = GridSearchCV(
    estimator=rid,
    param_grid=param,
    cv=5,
    scoring='neg_mean_squared_error',
)

rid_mod = grid.fit(x1, y)

# Best alpha, followed by its cross-validated MSE (sign removed).
print(rid_mod.best_params_)
print(abs(rid_mod.best_score_))

In [57]:
# Cross-validated RMSE of ridge regression with the alpha selected by the grid search.
rid = Ridge(**rid_mod.best_params_)
# Evaluate on x1 — the standardized matrix the alpha was tuned on. The ridge penalty
# is scale-dependent, so scoring the tuned alpha on the unscaled `x` (as before)
# measured a different model than the one that was selected.
mse = cross_val_score(rid, x1, y, cv=5, scoring='neg_mean_squared_error')
# Scores are negated MSE values; take the magnitude before the square root.
rmse = np.sqrt(abs(mse))
np.mean(rmse), np.std(rmse)

Splitting into train and test sets

In [58]:
from sklearn.model_selection import train_test_split

In [59]:
# 70/30 split with a fixed seed. x1 is the matrix that was already standardized
# on the full data in an earlier cell.
xtrain, xtest, ytrain, ytest = train_test_split(x1, y, test_size=0.3, random_state=42)
# Sanity check of the four resulting shapes.
xtrain.shape, xtest.shape, ytrain.shape, ytest.shape

In [60]:
# Learn the scaling statistics on the training split only, then apply them to both splits.
st = StandardScaler().fit(xtrain)

feature_names = xtrain.columns
xtrain = pd.DataFrame(st.transform(xtrain), columns=feature_names)
xtest = pd.DataFrame(st.transform(xtest), columns=xtest.columns)

xtrain.head()

In [61]:
# Feature-by-feature covariance matrix of the training data.
# rowvar=False tells np.cov that the variables are in columns (equivalent to passing the transpose).
cov_matrix = np.cov(xtrain, rowvar=False)
cov_matrix.shape

In [62]:
# Eigen-decomposition of the covariance matrix; column i of eig_vecs is the
# eigenvector that belongs to eig_vals[i]. The values are not returned in sorted order.
# NOTE(review): the matrix is symmetric, so np.linalg.eigh would also be applicable — confirm before switching.
eig_vals, eig_vecs = np.linalg.eig(cov_matrix) 
eig_vals.shape, eig_vecs.shape

In [63]:
# Pair every eigenvalue magnitude with its eigenvector (a column of eig_vecs).
eigen_pairs = [(np.abs(eig_vals[i]), eig_vecs[:, i]) for i in range(eig_vecs.shape[0])]
# Sort by eigenvalue only. Without the key, a tie between two eigenvalues makes
# Python compare the ndarray members of the tuples, which raises
# "truth value of an array is ambiguous".
eigen_pairs_sorted = sorted(eigen_pairs, key=lambda pair: pair[0], reverse=True)
# Two leading (largest-eigenvalue) pairs.
eigen_pairs_sorted[:2]

In [64]:
# Share of the total variance (in %, 3 decimals) carried by each eigenvalue, largest first.
tot = sum(eig_vals)
var_exp = []
for eig_val in sorted(eig_vals, reverse=True):
    var_exp.append(round((eig_val / tot) * 100, 3))
print('Variance of each eigen vector:\n', var_exp)

# Running total of the percentages above.
cum_var_exp = np.cumsum(var_exp)
print("\nCumulative Variance Explained", cum_var_exp)

In [65]:
# Cumulative explained variance (%) against the number of components kept.
plt.figure(figsize=(15,6))
# x and y are passed by keyword: seaborn >= 0.12 rejects them as positional arguments.
sns.barplot(x=np.arange(1, cum_var_exp.shape[0] + 1), y=cum_var_exp)

In [66]:
# Unpack the sorted pairs into parallel lists of eigenvalues and eigenvectors.
eigvalues_sort = [pair[0] for pair in eigen_pairs_sorted]
eigvectors_sort = [pair[1] for pair in eigen_pairs_sorted]
# Stack the eigenvectors as rows, then transpose so each column is one eigenvector.
eig_vect = np.array(eigvectors_sort).T

In [67]:
# Loadings of the original features on the first principal component.
pc1_weights = eig_vect[:, 0]
feat_importance = pd.DataFrame(
    {
        'feature': xtrain.columns,
        'PCA1_weight': pc1_weights,
        'PCA1_abs': np.abs(pc1_weights),
    }
)
# Features with the largest absolute loading first.
feat_importance.sort_values(by='PCA1_abs', ascending=False).head()

In [68]:
# Project both splits onto the eigenvector basis (one output column per component).
xtrain_pca = xtrain.to_numpy().dot(eig_vect)
xtest_pca = xtest.to_numpy().dot(eig_vect)

# Component scores of the first training row.
xtrain_pca[0]

In [72]:
from sklearn.decomposition import PCA
# Same projection via scikit-learn, as a cross-check of the manual computation.
# PCA() without arguments keeps all components.
pc = PCA()
# Fitted on the training split only; the test split is transformed with the same components.
xtr_pca = pc.fit_transform(xtrain)
xtt_pca = pc.transform(xtest)
# First training row, to compare with xtrain_pca[0] from the manual projection.
# NOTE(review): individual components may differ in sign between the two methods — confirm.
xtr_pca[0]

The values are exactly the same.

In [73]:
# Linear regression on all principal components of the manual projection.
lr = LinearRegression()
lr.fit(xtrain_pca, ytrain)

ytrain_pred = lr.predict(xtrain_pca)
ytest_pred = lr.predict(xtest_pca)

print('Train R2: ',r2_score(ytrain, ytrain_pred))
print('Test R2: ',r2_score(ytest, ytest_pred))

# RMSE = square root of the mean squared error.
print('Train RMSE: ',mean_squared_error(ytrain, ytrain_pred)**0.5)
# Label corrected: this line reports the test RMSE (it was printed as "Test R2").
print('Test RMSE: ',mean_squared_error(ytest, ytest_pred)**0.5)

In [74]:
# Linear regression on the leading principal components only.
# NOTE(review): 18 was hard-coded in the original cell — presumably chosen from the
# cumulative-variance plot; confirm the intended threshold.
n_components = 18

xtrain_top = xtrain_pca[:, :n_components]
xtest_top = xtest_pca[:, :n_components]

lr = LinearRegression()
lr.fit(xtrain_top, ytrain)

ytrain_pred = lr.predict(xtrain_top)
ytest_pred = lr.predict(xtest_top)

print('Train R2: ',r2_score(ytrain, ytrain_pred))
print('Test R2: ',r2_score(ytest, ytest_pred))

# RMSE = square root of the mean squared error.
print('Train RMSE: ',mean_squared_error(ytrain, ytrain_pred)**0.5)
# Label corrected: this line reports the test RMSE (it was printed as "Test R2").
print('Test RMSE: ',mean_squared_error(ytest, ytest_pred)**0.5)