In [108]:
from __future__ import print_function, division
import pandas as pd
import requests
import pickle
from pandas.core import datetools

import statsmodels.api as sm
import statsmodels.formula.api as smf
import patsy
import seaborn as sns
from seaborn import plt
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import RidgeCV
import time

%matplotlib inline

In [103]:
# Raise pandas' display limits: show up to 50 columns and 500 rows.
pd.set_option(
    'display.max_columns', 50,
    'display.max_rows', 500,
)

# Data Scraping

In [33]:
import os

from bs4 import BeautifulSoup as soup
from selenium import webdriver
from selenium.common.exceptions import WebDriverException
from selenium.webdriver.common.keys import Keys

In [34]:
# Location of the chromedriver binary. The CHROMEDRIVER_PATH environment
# variable overrides the machine-specific default, so the notebook can run on
# another machine without editing this cell.
chromedriver_path = os.environ.get("CHROMEDRIVER_PATH", "/Users/Tong/chromedriver")

# Start a Chrome session and open the WhoScored landing page.
driver = webdriver.Chrome(chromedriver_path)
driver.get('https://www.whoscored.com/')
time.sleep(5)  # fixed pause; presumably gives the page time to finish loading

In [35]:
def Laliga_click():
    """Click the link titled "Spain" on the current page, then wait 5 seconds.

    Operates on the module-level ``driver``.
    """
    spain_link_xpath = '//a[@title="Spain"]'
    driver.find_element_by_xpath(spain_link_xpath).click()
    time.sleep(5)

In [36]:
# Choose the La Liga league by clicking the site's "Spain" link.
Laliga_click()

In [37]:
def season_click(season_num):
    """Select a season in the page's ``seasons`` drop-down, then wait 5 seconds.

    Parameters
    ----------
    season_num : int
        Season selector. The option clicked is the one at 1-based XPath
        position ``8 - season_num`` inside the element with id ``seasons``.
        NOTE(review): which season each number maps to depends on the order
        of the site's drop-down -- confirm against the live page.

    Notes
    -----
    Operates on the module-level ``driver``. The click itself is best-effort:
    a ``WebDriverException`` raised by it is reported and skipped, not
    re-raised. A failure to locate the option element still propagates.
    """
    option_xpath = '//*[@id="seasons"]/option[%d]' % (8 - season_num)
    season = driver.find_element_by_xpath(option_xpath)
    try:
        season.click()
    except WebDriverException as exc:
        # Stay best-effort as before, but report the failure instead of
        # silently discarding it.
        print('season_click(%r): click failed: %s' % (season_num, exc))
    time.sleep(5)

# Clean Data

# Data Exploration

In [147]:
# Load the points-vs-statistics table from its pickle file.
# NOTE: unpickling can execute arbitrary code -- only load files you created
# or otherwise trust.
# The context manager closes the file handle once loading finishes.
with open('data/Points_vs_Statistics.p', 'rb') as pickle_file:
    Points_vs_Statistics = pickle.load(pickle_file)

In [148]:
# Preview the first rows to check the loaded columns and values.
Points_vs_Statistics.head()

Unnamed: 0,season,team,points,shots_pg,discipline_yellow,discipline_red,possession,pass_success,aerials_won,shots_conceded_pg,tackles_pg,interceptions_pg,fouls_pg,offsides_pg,shots_ot_pg,dribbles_pg,fouled_pg
0,2010 - 2011,Barcelona,96,15.8,71.0,2.0,67.4,89.6,6.2,7.4,18.5,18.2,10.6,3.9,7.3,14.7,13.4
1,2010 - 2011,Real Madrid,92,19.1,94.0,7.0,55.0,82.9,10.1,10.2,22.4,21.8,14.2,3.8,8.0,11.7,13.7
2,2010 - 2011,Valencia,71,13.1,112.0,8.0,54.6,80.0,11.8,13.4,19.2,23.3,16.0,2.8,5.1,7.9,12.1
3,2010 - 2011,Villarreal,62,13.9,88.0,5.0,52.7,82.2,7.8,12.0,22.8,24.6,12.4,2.2,5.4,9.2,16.4
4,2010 - 2011,Sevilla,58,13.8,100.0,7.0,52.7,80.1,13.7,12.6,19.3,20.4,14.5,2.2,5.5,7.7,14.9


In [154]:
# Show the column labels of the loaded table.
Points_vs_Statistics.columns

Index(['season', 'team', 'points', 'shots_pg', 'discipline_yellow',
       'discipline_red', 'possession', 'pass_success', 'aerials_won',
       'shots_conceded_pg', 'tackles_pg', 'interceptions_pg', 'fouls_pg',
       'offsides_pg', 'shots_ot_pg', 'dribbles_pg', 'fouled_pg'],
      dtype='object')

In [156]:
# Hold out season 2016 - 2017 as the test set; all other seasons (the six
# previous ones) form the training set.
is_holdout_season = Points_vs_Statistics['season'] == '2016 - 2017'
Test_set = Points_vs_Statistics[is_holdout_season]
Train_set = Points_vs_Statistics[~is_holdout_season]
print(Test_set.shape, Train_set.shape, sep='\n')

(20, 17)
(120, 17)


In [157]:
# Look at how the final team points correlate with the different variables:
# pairwise correlations on the training set, rows ordered by their
# correlation with points (most negative first).
stat_correlations = Train_set.corr()
stat_correlations.sort_values('points')

Unnamed: 0,points,shots_pg,discipline_yellow,discipline_red,possession,pass_success,aerials_won,shots_conceded_pg,tackles_pg,interceptions_pg,fouls_pg,offsides_pg,shots_ot_pg,dribbles_pg,fouled_pg
shots_conceded_pg,-0.637968,-0.441121,0.339195,0.268446,-0.572624,-0.528868,0.136452,1.0,0.02822,0.308699,0.354195,-0.289437,-0.530208,-0.574663,-0.089739
discipline_yellow,-0.46032,-0.315475,1.0,0.423842,-0.43837,-0.531298,0.13035,0.339195,0.220248,0.279329,0.752837,-0.182404,-0.435688,-0.434819,0.019041
fouls_pg,-0.423732,-0.3679,0.752837,0.374879,-0.540049,-0.636358,0.24678,0.354195,0.271171,0.316347,1.0,-0.115588,-0.445876,-0.537932,-0.154105
discipline_red,-0.373896,-0.129294,0.423842,1.0,-0.252,-0.282582,0.118715,0.268446,0.146661,0.159666,0.374879,-0.178426,-0.241572,-0.292502,0.071413
aerials_won,-0.341734,-0.41368,0.13035,0.118715,-0.446628,-0.528448,1.0,0.136452,-0.071842,-0.307719,0.24678,-0.402291,-0.48234,-0.328444,-0.31937
interceptions_pg,-0.245912,-0.149958,0.279329,0.159666,-0.257981,-0.32658,-0.307719,0.308699,0.457912,1.0,0.316347,0.078595,-0.173987,-0.461634,0.113026
tackles_pg,0.021334,0.051508,0.220248,0.146661,-0.082118,-0.144963,-0.071842,0.02822,1.0,0.457912,0.271171,0.101538,-0.011313,-0.237091,0.249027
fouled_pg,0.22441,0.289177,0.019041,0.071413,0.33606,0.321857,-0.31937,-0.089739,0.249027,0.113026,-0.154105,0.145649,0.297559,0.175609,1.0
offsides_pg,0.503728,0.553819,-0.182404,-0.178426,0.480089,0.42745,-0.402291,-0.289437,0.101538,0.078595,-0.115588,1.0,0.622238,0.24815,0.145649
dribbles_pg,0.622643,0.54817,-0.434819,-0.292502,0.710172,0.769414,-0.328444,-0.574663,-0.237091,-0.461634,-0.537932,0.24815,0.612996,1.0,0.175609


In [158]:
# tackles_pg is left out of the model: its correlation with points is only
# 0.021, which is really low.
# As a first model, regress points on all the remaining variables, i.e. those
# whose absolute correlation with points is above 0.1.

points_formula = (
    'points ~ shots_pg + discipline_yellow + discipline_red + possession'
    ' + pass_success + aerials_won + shots_conceded_pg + interceptions_pg'
    ' + fouls_pg + offsides_pg + shots_ot_pg + dribbles_pg+ fouled_pg'
)
lm1 = smf.ols(points_formula, data=Train_set)
fit1 = lm1.fit()
fit1.summary()

0,1,2,3
Dep. Variable:,points,R-squared:,0.778
Model:,OLS,Adj. R-squared:,0.75
Method:,Least Squares,F-statistic:,28.5
Date:,"Wed, 19 Jul 2017",Prob (F-statistic):,9.54e-29
Time:,13:59:25,Log-Likelihood:,-424.57
No. Observations:,120,AIC:,877.1
Df Residuals:,106,BIC:,916.2
Df Model:,13,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,-26.3104,38.739,-0.679,0.499,-103.115,50.494
shots_pg,-3.6265,1.179,-3.076,0.003,-5.964,-1.289
discipline_yellow,-0.0228,0.083,-0.273,0.785,-0.188,0.142
discipline_red,-0.8091,0.373,-2.171,0.032,-1.548,-0.070
possession,-0.6552,0.437,-1.500,0.136,-1.521,0.211
pass_success,1.0384,0.553,1.877,0.063,-0.059,2.135
aerials_won,0.6117,0.370,1.654,0.101,-0.121,1.345
shots_conceded_pg,-1.9465,0.578,-3.366,0.001,-3.093,-0.800
interceptions_pg,0.1173,0.199,0.589,0.557,-0.278,0.512

0,1,2,3
Omnibus:,1.189,Durbin-Watson:,1.535
Prob(Omnibus):,0.552,Jarque-Bera (JB):,1.088
Skew:,0.231,Prob(JB):,0.58
Kurtosis:,2.941,Cond. No.,6780.0
