In [None]:

import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import random
import re

%matplotlib inline

In [None]:
# Load the player table; the CSV's 'ID' column becomes the DataFrame index.
data = pd.read_csv("data.csv", index_col='ID')

In [None]:
# Preview the first rows of the loaded table.
data.head()

In [None]:
# Remove the cap on displayed columns so wide DataFrames are shown in full.
pd.set_option('display.max_columns', None)

In [None]:
# Preview the first rows again, this time with every column visible.
data.head()

In [None]:
# One row of `data` is reported as one player.
print(f"Total number of players in dataset {len(data)}")

In [None]:
!pip install tabulate
from tabulate import tabulate

top = 10
print(tabulate(
    sorted(list(zip(data.columns, data.isnull().sum(), data.isnull().sum() / data.shape[0] * 100)), key=lambda x: -x[2])[:top], 
    headers=['col_name', 'null_cnt', 'null_perc']))

In [None]:
# Report the dtype pandas assigned to the 'Weight' column.
print(f"Weight column type is '{data['Weight'].dtype}'")

In [None]:
POUND_TO_KILO = 0.454  # conversion factor applied to the pound values below

# Extract the digits that precede "lbs"; values that do not match the pattern become NaN.
data['Weight_float'] = data['Weight'].str.extract(r'([0-9]+)lbs', expand=False).astype(float)
# fillna returns a new Series, so the result has to be assigned back —
# otherwise the missing weights stay NaN.
data['Weight_float'] = data['Weight_float'].fillna(data['Weight_float'].median())
# Column-wise multiplication gives the same numbers as a row-wise apply, without the Python loop.
data['Weight_kg'] = data['Weight_float'] * POUND_TO_KILO
data.hist(column='Weight_kg', bins=30)
plt.show()

In [None]:
FOOT_TO_INCH = 12
INCH_TO_METR = 0.0254

# Work on a separate frame holding only players with a known height.
# The "'" separator is replaced by "." so the 'Height' column keeps its "<feet>.<inches>" text form.
Height = data['Height'].dropna().str.replace("'", ".", regex=False).to_frame('Height')
# Parse feet and inches for the whole column at once. This replaces the per-row
# `Height['Height_m'].iloc[x] = ...` loop, which assigned through chained indexing.
# NOTE(review): values that do not look like "<digits>.<digits>" now become NaN instead of raising.
feet_inches = Height['Height'].str.extract(r'^(\d+)\.(\d+)$').astype(float)
Height['Height_m'] = (feet_inches[0] * FOOT_TO_INCH + feet_inches[1]) * INCH_TO_METR
Height.hist(column='Height_m', bins=20)
plt.show()

In [None]:
# Scatter of 'Weight_kg' against 'Strength'; the title is set on the Axes returned by pandas.
strength_ax = data.plot.scatter(x='Weight_kg', y='Strength')
strength_ax.set_title('Dependence of strength on weight')
plt.show()

In [None]:
# Pairwise scatter/histogram grid for four skill columns.
sns.pairplot(data[['ShortPassing', 'Dribbling', 'BallControl', 'Strength']])

In [None]:

# Age buckets: below 20 -> 'young', 20 to 30 inclusive -> 'mature', everything else -> 'masters'.
# Conditions are evaluated in order, so the second one only applies to ages >= 20.
data['age_group'] = np.select(
    [data['Age'] < 20, data['Age'] <= 30],
    ['young', 'mature'],
    default='masters',
)
# Count rows per bucket directly. reindex fixes the bar order and reports 0 for an empty
# bucket instead of failing on a missing label.
distr = data['age_group'].value_counts().reindex(['young', 'mature', 'masters'], fill_value=0)

plt.bar(distr.index, distr.values)
plt.ylabel('Number of players')
plt.title('Distribution of players across age groups')
plt.show()

In [None]:
# Box plot of 'SprintSpeed' per age group; the trailing ';' suppresses the Axes repr in the output.
sns.boxplot(x='age_group', y='SprintSpeed', data=data);

In [None]:
from sklearn.model_selection import train_test_split

# Replace missing values in the feature and the target with the respective column mean.
data = data.fillna({'BallControl': data['BallControl'].mean(), 'Dribbling': data['Dribbling'].mean()})

# Double brackets keep the single feature as a 2-D (n_samples, 1) array, the layout
# scikit-learn estimators expect, so no reshape is needed after the split.
feature_matrix = data[['Dribbling']].values
target = data['BallControl'].values
# A fixed random_state makes the 80/20 split, and every metric computed from it,
# reproducible across kernel restarts.
X_train, X_test, y_train, y_test = train_test_split(
    feature_matrix, target, train_size=0.8, random_state=42
)

In [None]:
from sklearn.linear_model import Ridge

# alpha=0 turns the L2 penalty off, so this fit has no regularisation.
# NOTE(review): scikit-learn's Ridge documentation advises against alpha=0 for numerical
# reasons and points to LinearRegression instead — confirm Ridge is required here.
lr = Ridge(alpha=0)
lr.fit(X=X_train, y=y_train)


In [None]:
# Report the fitted line: w_0 is the intercept, w_1 the first (only printed) coefficient.
print(f'w_0 = {lr.intercept_}, w_1 = {lr.coef_[0]}')

In [None]:
# Model predictions for both partitions of the split.
y_pred_train = lr.predict(X_train)
y_pred_test = lr.predict(X_test)

In [None]:
# Predict for every player; double brackets give the (n_samples, 1) array the model expects.
dribbling_2d = data[['Dribbling']].values
data['predicted_BallControl'] = lr.predict(dribbling_2d)
# Side-by-side look at the feature, the true target and the prediction.
data[['Name', 'Dribbling', 'BallControl', 'predicted_BallControl']].head()

In [None]:
def mse(y_true, y_pred):
    """
    Mean squared error between targets and predictions.
    :param y_true: observed target values, any array-like (list, numpy.ndarray, pandas.Series)
    :param y_pred: predicted values, array-like broadcastable against y_true
    :returns: mean of the squared element-wise differences as a float

    """
    # Coerce to float arrays so plain lists work and pandas objects are compared
    # position by position rather than aligned on their index labels.
    y_true = np.asarray(y_true, dtype=float)
    y_pred = np.asarray(y_pred, dtype=float)
    return np.square(y_true - y_pred).mean()

In [None]:
from sklearn.metrics import mean_squared_error

# Sanity check: the hand-written mse must agree with scikit-learn's implementation
# to 9 decimal places on both the train and the test partition.
assert round(mean_squared_error(y_train, y_pred_train), 9) == round(mse(y_train, y_pred_train), 9)
assert round(mean_squared_error(y_test, y_pred_test), 9) == round(mse(y_test, y_pred_test), 9)

In [None]:
# Compare the error on the data the model was fitted on with the error on held-out data.
print(f'Train MSE {mse(y_train, y_pred_train)}, test MSE {mse(y_test, y_pred_test)}')

In [None]:
# Observed BallControl (red) and model predictions (yellow), both plotted against Dribbling.
x = data['Dribbling']
y1 = data['predicted_BallControl']
y2 = data['BallControl']

fig, ax = plt.subplots()
# Labels are attached to each series so the legend does not depend on plotting order.
ax.scatter(x, y2, c='r', s=2, label='true_score')
ax.scatter(x, y1, c='y', s=2, label='predicted_score')
ax.legend()
ax.set_xlabel('Dribbling')
ax.set_ylabel('Ball Control')
plt.show()

In [None]:
def compute_residuals(w, X, y):
    """
    Residuals of a linear model whose prediction is the product of X and transposed w.
    :param w: linear regression weights, numpy.ndarray: float64[num_features]
    :param X: training features, numpy.ndarray: float64[num_samples, num_features]
    :param y: training target, numpy.ndarray: float64[num_samples]
    :returns: vector of residuals (y_i_hat - y_i) for each sample_i in X

    """
    # Transposing a 1-D weight vector leaves it unchanged; the call is kept so that
    # a 2-D row vector of weights is handled the same way as before.
    y_hat = np.dot(X, np.transpose(w))
    return y_hat - y


In [None]:
from sklearn.base import BaseEstimator
from sklearn.utils.validation import check_X_y, check_array, check_is_fitted
from scipy.optimize import least_squares

class LinearRegression(BaseEstimator):
    """
    Linear regression whose weights are found with scipy.optimize.least_squares.

    :param fit_intercept: when True a constant term is learned next to the per-feature
                          coefficients; when False the intercept is fixed at 0

    Attributes set by fit:
        coef_          - numpy.ndarray: float64[num_features], feature weights
        intercept_     - float, constant term (0.0 when fit_intercept=False)
        n_features_in_ - number of feature columns seen during fit
        X_, y_         - training data as returned by check_X_y
    """

    def __init__(self, fit_intercept=True):
        self.fit_intercept = fit_intercept

    def fit(self, X, y):
        """
        fit model weights given input features and target
        :param X: training features, numpy.ndarray: numeric[num_samples, num_features]
        :param y: training target, numpy.ndarray: numeric[num_samples]
        :returns: self, with coef_ and intercept_ set to the weights found by least_squares
        :raises ValueError: propagated from check_X_y when X and y fail validation

        """
        # Check that X and y have correct shape
        X, y = check_X_y(X, y)
        # Save train data information
        self.X_ = X
        self.y_ = y
        self.n_features_in_ = X.shape[1]
        # Copy arrays and cast them to uniform type
        X_train = X.astype('float64')
        y_train = y.astype('float64')
        # A trailing column of ones lets the optimiser learn the intercept as the
        # last component of the weight vector.
        if self.fit_intercept:
            X_train = np.column_stack((X_train, np.ones(X_train.shape[0])))

        # One optimiser call for both configurations, starting from all-zero weights.
        initial_weights = np.zeros(X_train.shape[1])
        solution = least_squares(compute_residuals, initial_weights, args=(X_train, y_train))

        if self.fit_intercept:
            # Split the optimised vector back into feature weights and the constant term.
            self.coef_ = solution.x[:-1]
            self.intercept_ = solution.x[-1]
        else:
            self.coef_ = solution.x
            self.intercept_ = 0.0  # float, to match the type used in the intercept branch
        return self

    def predict(self, X):
        """
        predict target values for the given samples
        :param X: features, numpy.ndarray: numeric[num_samples, num_features]
        :returns: numpy.ndarray[num_samples] computed as X.dot(coef_) + intercept_
        :raises ValueError: when X has a different number of features than the data passed to fit

        """
        # Check is fit had been called
        check_is_fitted(self)

        # Input validation
        X = check_array(X)
        # Fail with an explicit message instead of numpy's generic shape-alignment error.
        if X.shape[1] != self.n_features_in_:
            raise ValueError(
                f"X has {X.shape[1]} features, but {self.__class__.__name__} "
                f"is expecting {self.n_features_in_} features as input."
            )

        return X.dot(self.coef_) + self.intercept_

In [None]:
# Testing area: validate the custom LinearRegression against scikit-learn's
# estimator checks and against Ridge(alpha=0) predictions.
from sklearn.utils.estimator_checks import check_estimator
from sklearn.linear_model import Ridge

# NOTE: `lr` is rebound here; from this point on it refers to the custom
# LinearRegression, not the Ridge model assigned to `lr` earlier in the notebook.
lr = LinearRegression()
ridge = Ridge(alpha=0)
lr_no_intercept = LinearRegression(fit_intercept=False)
ridge_no_intercept = Ridge(alpha=0, fit_intercept=False)

# Check compatibility with the sklearn framework and apply some specific internal tests
check_estimator(lr)
check_estimator(lr_no_intercept)

# Compare model accuracy with Ridge(0) from sklearn.
# Missing values in the two features and the target are replaced by column means first.
data.fillna({'BallControl': data['BallControl'].mean()
             , 'Dribbling': data['Dribbling'].mean()
             , 'Strength': data['Strength'].mean()}, inplace=True)
X_sample, y_sample = data[['Dribbling', 'Strength']], data['BallControl']
lr.fit(X_sample, y_sample)
ridge.fit(X_sample, y_sample)
# Predictions must agree with Ridge within a relative tolerance of 1e-3.
assert np.allclose(lr.predict(X_sample), ridge.predict(X_sample), rtol=1e-03), "Your model with intercept not accurate enough!"
lr_no_intercept.fit(X_sample, y_sample)
ridge_no_intercept.fit(X_sample, y_sample)
assert np.allclose(lr_no_intercept.predict(X_sample), ridge_no_intercept.predict(X_sample), rtol=1e-03), "Your model without intercept not accurate enough!"