In [50]:
from __future__ import print_function, division
import pandas as pd
import requests
import pickle
from pandas.core import datetools

import statsmodels.api as sm
import statsmodels.formula.api as smf
import patsy
import seaborn as sns
from seaborn import plt
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import RidgeCV
from sklearn.linear_model import LassoCV
from sklearn.linear_model import ElasticNet
import time

from sklearn import preprocessing
from sklearn.preprocessing import MinMaxScaler

%matplotlib inline

In [51]:
# Widen the notebook's DataFrame rendering limits so wide correlation tables
# and long listings are shown in full instead of being truncated.
pd.options.display.max_columns = 50
pd.options.display.max_rows = 500

# Data Scraping

# Clean Data

# Data Exploration

In [88]:
# Load the pre-built points-vs-statistics table from its pickle file.
# The context manager closes the file handle as soon as the load finishes
# (the bare open() call used previously left it open).
# NOTE: pickle.load can execute arbitrary code -- only load files produced by this project.
with open('data/Points_vs_Statistics.p', 'rb') as pickle_file:
    Points_Statistics_df = pickle.load(pickle_file)

In [90]:
# Preview the first rows of the loaded table as a sanity check.
Points_Statistics_df.head()

Unnamed: 0,season,team,points,shots_pg,discipline_yellow,discipline_red,possession,pass_success,aerials_won,shots_conceded_pg,tackles_pg,interceptions_pg,fouls_pg,offsides_pg,shots_ot_pg,dribbles_pg,fouled_pg
0,2010 - 2011,Barcelona,96,15.8,71,2,67.4,89.6,6.2,7.4,18.5,18.2,10.6,3.9,7.3,14.7,13.4
1,2010 - 2011,Real Madrid,92,19.1,94,7,55.0,82.9,10.1,10.2,22.4,21.8,14.2,3.8,8.0,11.7,13.7
2,2010 - 2011,Valencia,71,13.1,112,8,54.6,80.0,11.8,13.4,19.2,23.3,16.0,2.8,5.1,7.9,12.1
3,2010 - 2011,Villarreal,62,13.9,88,5,52.7,82.2,7.8,12.0,22.8,24.6,12.4,2.2,5.4,9.2,16.4
4,2010 - 2011,Sevilla,58,13.8,100,7,52.7,80.1,13.7,12.6,19.3,20.4,14.5,2.2,5.5,7.7,14.9


In [91]:
# List the column labels: the identifiers, the `points` target and the candidate features.
Points_Statistics_df.columns

Index(['season', 'team', 'points', 'shots_pg', 'discipline_yellow',
       'discipline_red', 'possession', 'pass_success', 'aerials_won',
       'shots_conceded_pg', 'tackles_pg', 'interceptions_pg', 'fouls_pg',
       'offsides_pg', 'shots_ot_pg', 'dribbles_pg', 'fouled_pg'],
      dtype='object')

## Correlations between final points and features. 

In [92]:
# Pairwise correlations between the final team points and the other statistics,
# sorted by each variable's correlation with `points`.
# `season` and `team` hold text, so the numeric columns are selected explicitly:
# recent pandas versions raise on non-numeric columns instead of silently dropping them.
Points_Statistics_df.select_dtypes(include='number').corr().sort_values('points')
# points and tackles_pg have a correlation of only 0.016, which is really low. Thus tackles_pg will be excluded from the model.

# Several features are highly correlated with each other, such as shots_pg with shots_ot_pg, discipline_yellow with fouls_pg, and possession with pass_success. We'll come back to this multicollinearity problem later. 

Unnamed: 0,points,shots_pg,discipline_yellow,discipline_red,possession,pass_success,aerials_won,shots_conceded_pg,tackles_pg,interceptions_pg,fouls_pg,offsides_pg,shots_ot_pg,dribbles_pg,fouled_pg
shots_conceded_pg,-0.63655,-0.455999,0.375049,0.289173,-0.564062,-0.506937,0.112322,1.0,0.057106,0.309091,0.35328,-0.287218,-0.538155,-0.540689,-0.111676
discipline_yellow,-0.483785,-0.33355,1.0,0.441729,-0.438955,-0.52949,0.145392,0.375049,0.228712,0.293482,0.74053,-0.17833,-0.451294,-0.444416,-0.006553
fouls_pg,-0.447149,-0.400628,0.74053,0.356949,-0.552007,-0.64446,0.27243,0.35328,0.277587,0.314656,1.0,-0.149707,-0.479082,-0.546391,-0.17847
discipline_red,-0.370104,-0.132238,0.441729,1.0,-0.228531,-0.2835,0.116775,0.289173,0.156889,0.220819,0.356949,-0.14273,-0.250165,-0.335623,0.059555
aerials_won,-0.327054,-0.397959,0.145392,0.116775,-0.467095,-0.555623,1.0,0.112322,-0.009988,-0.239704,0.27243,-0.377915,-0.474952,-0.361278,-0.349827
interceptions_pg,-0.230981,-0.123078,0.293482,0.220819,-0.25296,-0.352741,-0.239704,0.309091,0.477674,1.0,0.314656,0.102373,-0.172496,-0.495694,0.070736
tackles_pg,0.016435,0.036475,0.228712,0.156889,-0.130876,-0.191562,-0.009988,0.057106,1.0,0.477674,0.277587,0.118622,-0.030039,-0.275303,0.167338
fouled_pg,0.235595,0.307255,-0.006553,0.059555,0.394211,0.384467,-0.349827,-0.111676,0.167338,0.070736,-0.17847,0.161226,0.308641,0.205582,1.0
offsides_pg,0.534218,0.583614,-0.17833,-0.14273,0.495478,0.423935,-0.377915,-0.287218,0.118622,0.102373,-0.149707,1.0,0.636384,0.235657,0.161226
dribbles_pg,0.585084,0.524475,-0.444416,-0.335623,0.683674,0.764941,-0.361278,-0.540689,-0.275303,-0.495694,-0.546391,0.235657,0.60589,1.0,0.205582


In [None]:
# Pairwise scatter-plot grid of the table's variables for a visual check of their relationships.
# The trailing semicolon suppresses the textual repr of the returned grid object.
sns.pairplot(Points_Statistics_df);

## Simple linear regression

In [12]:
# Baseline model: regress team points on every feature whose absolute
# correlation with points is above 0.1 (i.e. everything except tackles_pg).

formula_lm1 = (
    'points ~ shots_pg + discipline_yellow + discipline_red + possession '
    '+ pass_success + aerials_won + shots_conceded_pg + interceptions_pg '
    '+ fouls_pg + offsides_pg + shots_ot_pg + dribbles_pg+ fouled_pg'
)
lm1 = smf.ols(formula=formula_lm1, data=Points_Statistics_df)
fit1 = lm1.fit()
fit1.summary()

# Reading the summary:
# - R-squared 0.778: pretty good, the model explains 77.8% of the variance in final points.
# - Adj. R-squared 0.750: good.
# - F-statistic 28.5 (Prob 9.54e-29): the features are jointly significant.

0,1,2,3
Dep. Variable:,points,R-squared:,0.778
Model:,OLS,Adj. R-squared:,0.75
Method:,Least Squares,F-statistic:,28.5
Date:,"Wed, 19 Jul 2017",Prob (F-statistic):,9.54e-29
Time:,16:55:31,Log-Likelihood:,-424.57
No. Observations:,120,AIC:,877.1
Df Residuals:,106,BIC:,916.2
Df Model:,13,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,-26.3104,38.739,-0.679,0.499,-103.115,50.494
shots_pg,-3.6265,1.179,-3.076,0.003,-5.964,-1.289
discipline_yellow,-0.0228,0.083,-0.273,0.785,-0.188,0.142
discipline_red,-0.8091,0.373,-2.171,0.032,-1.548,-0.070
possession,-0.6552,0.437,-1.500,0.136,-1.521,0.211
pass_success,1.0384,0.553,1.877,0.063,-0.059,2.135
aerials_won,0.6117,0.370,1.654,0.101,-0.121,1.345
shots_conceded_pg,-1.9465,0.578,-3.366,0.001,-3.093,-0.800
interceptions_pg,0.1173,0.199,0.589,0.557,-0.278,0.512

0,1,2,3
Omnibus:,1.189,Durbin-Watson:,1.535
Prob(Omnibus):,0.552,Jarque-Bera (JB):,1.088
Skew:,0.231,Prob(JB):,0.58
Kurtosis:,2.941,Cond. No.,6780.0


In [13]:
# Drop the three features with the highest p-values ('offsides_pg',
# 'discipline_yellow' and 'interceptions_pg') and check whether adj. R-squared improves.

formula_lm2 = (
    'points ~ shots_pg + discipline_red + possession + pass_success '
    '+ aerials_won + shots_conceded_pg + fouls_pg + shots_ot_pg '
    '+ dribbles_pg+ fouled_pg'
)
lm2 = smf.ols(formula=formula_lm2, data=Points_Statistics_df)
fit2 = lm2.fit()
fit2.summary()

# Removing the three high-p-value features improves adj. R-squared from 0.750 to 0.756.



0,1,2,3
Dep. Variable:,points,R-squared:,0.776
Model:,OLS,Adj. R-squared:,0.756
Method:,Least Squares,F-statistic:,37.86
Date:,"Wed, 19 Jul 2017",Prob (F-statistic):,5.68e-31
Time:,16:55:50,Log-Likelihood:,-424.87
No. Observations:,120,AIC:,871.7
Df Residuals:,109,BIC:,902.4
Df Model:,10,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,-17.1296,34.571,-0.495,0.621,-85.648,51.389
shots_pg,-3.6986,1.118,-3.308,0.001,-5.915,-1.482
discipline_red,-0.8346,0.363,-2.297,0.024,-1.555,-0.114
possession,-0.6060,0.416,-1.458,0.148,-1.430,0.218
pass_success,0.9526,0.513,1.858,0.066,-0.064,1.969
aerials_won,0.4961,0.290,1.711,0.090,-0.078,1.071
shots_conceded_pg,-1.9369,0.571,-3.390,0.001,-3.069,-0.804
fouls_pg,0.9093,0.628,1.447,0.151,-0.336,2.155
shots_ot_pg,16.1771,2.427,6.667,0.000,11.368,20.987

0,1,2,3
Omnibus:,1.894,Durbin-Watson:,1.518
Prob(Omnibus):,0.388,Jarque-Bera (JB):,1.7
Skew:,0.291,Prob(JB):,0.427
Kurtosis:,2.984,Cond. No.,4190.0


In [14]:
# Drop another two features with the highest p-values ('dribbles_pg' and
# 'fouled_pg') and check whether adj. R-squared improves.

formula_lm3 = (
    'points ~ shots_pg + discipline_red + possession + pass_success '
    '+ aerials_won + shots_conceded_pg + fouls_pg + shots_ot_pg'
)
lm3 = smf.ols(formula=formula_lm3, data=Points_Statistics_df)
fit3 = lm3.fit()
fit3.summary()

# Removing these two high-p-value features further improves adj. R-squared from 0.756 to 0.758.

0,1,2,3
Dep. Variable:,points,R-squared:,0.774
Model:,OLS,Adj. R-squared:,0.758
Method:,Least Squares,F-statistic:,47.63
Date:,"Wed, 19 Jul 2017",Prob (F-statistic):,1.92e-32
Time:,16:55:58,Log-Likelihood:,-425.41
No. Observations:,120,AIC:,868.8
Df Residuals:,111,BIC:,893.9
Df Model:,8,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,-17.5589,33.144,-0.530,0.597,-83.236,48.118
shots_pg,-3.6589,1.112,-3.290,0.001,-5.863,-1.455
discipline_red,-0.8125,0.357,-2.278,0.025,-1.519,-0.106
possession,-0.5742,0.410,-1.401,0.164,-1.387,0.238
pass_success,1.0863,0.475,2.288,0.024,0.146,2.027
aerials_won,0.4769,0.284,1.679,0.096,-0.086,1.040
shots_conceded_pg,-2.0197,0.546,-3.702,0.000,-3.101,-0.938
fouls_pg,0.8710,0.624,1.395,0.166,-0.366,2.108
shots_ot_pg,16.1417,2.414,6.688,0.000,11.359,20.925

0,1,2,3
Omnibus:,1.79,Durbin-Watson:,1.553
Prob(Omnibus):,0.409,Jarque-Bera (JB):,1.561
Skew:,0.279,Prob(JB):,0.458
Kurtosis:,3.012,Cond. No.,3980.0


In [15]:
# Drop another two features with the highest p-values ('fouls_pg' and
# 'possession') and check whether adj. R-squared improves.

formula_lm4 = (
    'points ~ shots_pg + discipline_red  + pass_success '
    '+ aerials_won + shots_conceded_pg + shots_ot_pg'
)
lm4 = smf.ols(formula=formula_lm4, data=Points_Statistics_df)
fit4 = lm4.fit()
fit4.summary()

# Removing these two features reduces adj. R-squared from 0.758 to 0.755. That's not good. I'll keep the 8 features from lm3 (shots_pg + discipline_red + possession + pass_success + aerials_won + shots_conceded_pg + fouls_pg + shots_ot_pg) for further analysis.

0,1,2,3
Dep. Variable:,points,R-squared:,0.767
Model:,OLS,Adj. R-squared:,0.755
Method:,Least Squares,F-statistic:,62.08
Date:,"Wed, 19 Jul 2017",Prob (F-statistic):,1.7e-33
Time:,16:56:08,Log-Likelihood:,-427.29
No. Observations:,120,AIC:,868.6
Df Residuals:,113,BIC:,888.1
Df Model:,6,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,17.4947,25.201,0.694,0.489,-32.433,67.423
shots_pg,-3.5806,1.118,-3.204,0.002,-5.795,-1.366
discipline_red,-0.7145,0.348,-2.051,0.043,-1.405,-0.024
pass_success,0.4031,0.288,1.398,0.165,-0.168,0.974
aerials_won,0.3971,0.283,1.405,0.163,-0.163,0.957
shots_conceded_pg,-1.8829,0.538,-3.500,0.001,-2.949,-0.817
shots_ot_pg,15.8252,2.422,6.535,0.000,11.028,20.623

0,1,2,3
Omnibus:,3.871,Durbin-Watson:,1.558
Prob(Omnibus):,0.144,Jarque-Bera (JB):,3.283
Skew:,0.371,Prob(JB):,0.194
Kurtosis:,3.328,Cond. No.,2520.0


## Regularization