# Simple Linear Regression Project: Predicting NFL Offensive Rankings for 2019-2020 Season

## Imports

In [6]:
from __future__ import print_function, division
import requests
from selenium import webdriver
from bs4 import BeautifulSoup as bs 
import pandas as pd
import numpy as np
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.linear_model import LinearRegression 
from sklearn.linear_model import Lasso, LassoCV, Ridge, RidgeCV
from sklearn.preprocessing import PolynomialFeatures, StandardScaler
import statsmodels.api as sm
import seaborn as sns
import patsy
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
from sklearn.feature_selection import f_regression

## Scraping Function to pull relevant NFL stats

In [7]:
def _stat_column(table, stat, tag='td', drop_last=0):
    """Collect the non-empty text of every ``data-stat == stat`` cell in ``table``.

    Rows without a matching cell, and cells whose text is empty, are skipped.
    When ``drop_last`` is positive that many trailing values are discarded.
    """
    cells = (row.find(tag, {'data-stat': stat}) for row in table.find_all('tr'))
    values = [cell.text for cell in cells if cell is not None and cell.text]
    return values[:-drop_last] if drop_last else values


def offense(url):
    """Scrape per-team offensive stats (plus field goals made) from ``url``.

    The page is loaded through the module-level Selenium ``driver`` (it is not
    created in this part of the file and must exist before calling).

    Parameters
    ----------
    url : str
        Address of the season page containing a ``team_stats`` table and an
        ``all_kicking`` section.

    Returns
    -------
    dict
        Maps the last word of each team name to a list of strings:
        rank, points, total yards, first downs, pass attempts, pass TDs,
        interceptions, net yards per pass attempt, rush attempts, rush TDs,
        score %, turnover %, expected points, and -- when the team is also
        found in the kicking section -- field goals made.

    Raises
    ------
    ValueError
        If the ``team_stats`` table or the ``all_kicking`` section is missing.
    """
    driver.get(url)
    # Name the parser explicitly (the same one positional_spending uses) so
    # the result does not depend on whichever parser bs4 happens to pick.
    sopa = bs(driver.page_source, 'lxml')

    table = sopa.find('table', {'id': 'team_stats'})
    if table is None:
        raise ValueError("no 'team_stats' table found at {}".format(url))

    # The last three entries of most columns are dropped -- presumably the
    # summary/footer rows of the table (TODO confirm against the live page).
    trailing_rows = 3

    # The first 'ranker' cell is dropped -- presumably the header cell.
    ranker = _stat_column(table, 'ranker', tag='th')[1:]
    team = _stat_column(table, 'team', drop_last=trailing_rows)
    team_name_cleaned = [name.split()[-1] for name in team]

    # NOTE: the data-stat names must not carry trailing whitespace, otherwise
    # no cell matches and every column comes back empty.
    stat_names = (
        'points',
        'total_yards',
        'first_down',
        'pass_att',
        'pass_td',
        'pass_int',
        'pass_net_yds_per_att',
        'rush_att',
        'rush_td',
        'score_pct',
        'turnover_pct',
    )
    stat_columns = [
        _stat_column(table, stat, drop_last=trailing_rows) for stat in stat_names
    ]
    # Unlike the other columns, exp_pts_tot is deliberately left untrimmed,
    # as in the original code; zip() below stops at the shortest column.
    exp_pts_tot = _stat_column(table, 'exp_pts_tot')

    stats_zipped = [
        list(row) for row in zip(ranker, *stat_columns, exp_pts_tot)
    ]
    offense_dict = dict(zip(team_name_cleaned, stats_zipped))

    kick_table = sopa.find('div', {'id': 'all_kicking'})
    if kick_table is None:
        raise ValueError("no 'all_kicking' section found at {}".format(url))
    kick_team = _stat_column(kick_table, 'team', drop_last=trailing_rows)
    kick_team_cleaned = [name.split()[-1] for name in kick_team]
    kick_goals = _stat_column(kick_table, 'fgm', drop_last=trailing_rows)

    # Append field goals made to each team via a single dict lookup instead
    # of rescanning the whole kicking list for every team.
    fgm_by_team = dict(zip(kick_team_cleaned, kick_goals))
    for name, stats in offense_dict.items():
        if name in fgm_by_team:
            stats.append(fgm_by_team[name])
    return offense_dict

## Run the Scraping Function for the Past 5 Years

In [None]:
# Scrape one season page per year; each call returns a {team: [stats...]} dict.
offense2018_dict = offense('https://www.pro-football-reference.com/years/2018/index.htm')
offense2017_dict = offense('https://www.pro-football-reference.com/years/2017/index.htm')
offense2016_dict = offense('https://www.pro-football-reference.com/years/2016/index.htm')
# Fixed: the 2015 data was previously scraped from the 2016 URL.
offense2015_dict = offense('https://www.pro-football-reference.com/years/2015/index.htm')
offense2014_dict = offense('https://www.pro-football-reference.com/years/2014/index.htm')

## Scraping Function for Positional Spending

In [9]:
def positional_spending(string):
    """Scrape one section of the overthecap.com positional-spending page.

    Parameters
    ----------
    string : str
        ``id`` of the ``<div>`` holding the wanted table (the callers in this
        file pass values such as ``"y2018"``). Surrounding whitespace is
        ignored, since an element id cannot contain spaces.

    Returns
    -------
    dict
        Maps the first cell of each 12-cell row to a list with the next five
        cells of that row, all as strings.

    Raises
    ------
    requests.HTTPError
        If the page request returns an error status.
    ValueError
        If no ``<div>`` with the requested id exists on the page.
    """
    otc_url = 'https://overthecap.com/positional-spending/'
    response = requests.get(otc_url)
    # Fail loudly on an HTTP error instead of parsing an error page.
    response.raise_for_status()

    # The file imports BeautifulSoup under the alias ``bs``.
    sopa = bs(response.text, "lxml")
    section_id = string.strip()
    pos_list = sopa.find("div", {"id": section_id})
    if pos_list is None:
        raise ValueError(
            "no <div id={!r}> found at {}".format(section_id, otc_url)
        )

    extract_pos = [
        ele.text for ele in pos_list.find_all('td', {"class": "sortable"})
    ]

    # The flat cell list is cut into rows of 12 cells each.
    row_width = 12
    pos_dic = {}
    for start in range(0, len(extract_pos), row_width):
        row = extract_pos[start:start + row_width]
        pos_dic[row[0]] = row[1:6]
    return pos_dic

## Initialize Positional Spending Function

In [None]:
# One positional-spending dict per season. The ids are passed without the
# trailing space they previously carried, which could never match a div id.
pos_spend2014_dic = positional_spending("y2014")
pos_spend2015_dic = positional_spending("y2015")
pos_spend2016_dic = positional_spending("y2016")
pos_spend2017_dic = positional_spending("y2017")
pos_spend2018_dic = positional_spending("y2018")

## Create DataFrames for Offense and Positional Spending

In [10]:
def off_dict(dictionary):
    """Turn a ``{team: [stats...]}`` mapping into a DataFrame indexed by team.

    Each dict key becomes a row label and the values are spread over the
    fourteen offensive-stat columns listed below.
    """
    stat_headers = [
        'Rank', 'Points_For', 'Total_Yds', 'First_Down',
        'Pass_Att', 'Pass_TD', 'Pass_Int', 'PNYPA',
        'Rush_Att', 'Rush_TD', 'Score%', 'TO%',
        'EXP', 'FGM',
    ]
    return pd.DataFrame.from_dict(
        dictionary, orient='index', columns=stat_headers
    )

In [None]:
# Build one offense DataFrame per season, newest first.
(
    offense2018_df,
    offense2017_df,
    offense2016_df,
    offense2015_df,
    offense2014_df,
) = map(
    off_dict,
    (
        offense2018_dict,
        offense2017_dict,
        offense2016_dict,
        offense2015_dict,
        offense2014_dict,
    ),
)

In [None]:
# Build one positional-spending table per season and tag it with its year.
# The year is stored as a string.
# NOTE(review): ps_dict is not defined in the visible part of this file;
# it presumably mirrors off_dict for the spending columns -- confirm it
# exists (and returns a DataFrame) before running this cell.
pos_spend_df2018 = ps_dict(pos_spend2018_dic).assign(Year='2018')
pos_spend_df2017 = ps_dict(pos_spend2017_dic).assign(Year='2017')
pos_spend_df2016 = ps_dict(pos_spend2016_dic).assign(Year='2016')
pos_spend_df2015 = ps_dict(pos_spend2015_dic).assign(Year='2015')
pos_spend_df2014 = ps_dict(pos_spend2014_dic).assign(Year='2014')

## Concatenate DataFrames into One Table

In [None]:
def concat(df1, df2):
    """Join two frames side by side (column-wise), aligning rows on the index."""
    return pd.concat((df1, df2), axis='columns')

In [None]:
# Pair each season's offense table with its spending table, join every pair
# column-wise, then stack the five seasons on top of each other.
final_df = pd.concat([
    concat(season_offense, season_spending)
    for season_offense, season_spending in (
        (offense2018_df, pos_spend_df2018),
        (offense2017_df, pos_spend_df2017),
        (offense2016_df, pos_spend_df2016),
        (offense2015_df, pos_spend_df2015),
        (offense2014_df, pos_spend_df2014),
    )
])

## Clean data

In [None]:
# Salary columns whose values still contain "$" and "," characters.
remove_char = ['QB', 'RB', 'WR', 'TE', 'OL']
# Every column that has to be converted from string to a numeric dtype.
stringToint = [
    'Rank', 'Points_For', 'Total_Yds', 'First_Down',
    'Pass_Att', 'Pass_TD', 'Pass_Int', 'PNYPA',
    'Rush_Att', 'Rush_TD', 'Score%', 'TO%', 'EXP',
    *remove_char,
    'FGM',
]

In [None]:
# Remove "$" and "," from the salary columns.
# The vectorized .str accessor leaves missing cells as NaN, whereas the
# previous per-cell lambda raised AttributeError on any NaN (which the
# column-wise concat produces for a team missing from either table).
for column in remove_char:
    final_df[column] = (
        final_df[column].str.strip('$').str.replace(',', '', regex=False)
    )

# Convert every stat and salary column from string to a numeric dtype.
for column in stringToint:
    final_df[column] = pd.to_numeric(final_df[column])

final_df.head()

## Create features

In [None]:
# Share of offensive play calls that were passes / rushes.
# Column names are written without trailing spaces so they match the columns
# created by off_dict and the names used later when selecting features.
final_df["Pass_Att_Per"] = (
    final_df["Pass_Att"] / (final_df["Pass_Att"] + final_df["Rush_Att"])
).round(2)
final_df["Rush_Att_Per"] = (
    final_df["Rush_Att"] / (final_df["Pass_Att"] + final_df["Rush_Att"])
).round(2)

# Positional spending per touchdown produced.
final_df['PassTD_Cost'] = (final_df["QB"] / final_df["Pass_TD"]).round(2)
# Fixed: RB spending is divided by rushing TDs (it used Pass_TD before).
final_df['RushTD_Cost'] = (final_df["RB"] / final_df["Rush_TD"]).round(2)

## Get general info on Dataframe

In [None]:
# One-hot encode the remaining non-numeric columns (presumably only 'Year',
# which yields the Year_2014 ... Year_2018 indicators used below), then
# print the column/dtype summary.
final_df = pd.get_dummies(data=final_df)
final_df.info()

## Lasso Regression to Predict Target [Rank]

In [None]:

# Feature matrix: spending, play-calling mix, cost per TD and year dummies.
x = final_df.loc[:, [
    'OL', 'WR', 'TE', 'EXP',
    'Pass_Att_Per', 'Pass_Att',
    'Rush_Att_Per', 'Rush_Att',
    'PassTD_Cost', 'RushTD_Cost',
    'Year_2014', 'Year_2015', 'Year_2016', 'Year_2017', 'Year_2018',
]]
# Target: the team's offensive rank.
y = final_df.loc[:, 'Rank']

In [None]:
# Train / validation / test split.
# Hold out 20% of the data for final testing.
x, X_test, y, y_test = train_test_split(x, y, test_size=.2, random_state=10)

# The file imports the estimator as LinearRegression; there is no `lr` alias.
lin = LinearRegression()

scaler = StandardScaler()

# Split the remaining 80% again into training and validation sets.
# (The call is kept on one logical line; breaking right after "=" without
# parentheses is a SyntaxError.)
X_train, X_val, y_train, y_val = train_test_split(
    x, y, test_size=.20, random_state=3
)

In [None]:
# Cross-validated Lasso model that is fitted in the next cell.
lm_reg = LassoCV()

# Standardize the features before fitting: the scaler is fitted on the
# training split only and then applied unchanged to validation and test.
X_train_scaled = scaler.fit_transform(X_train)
X_val_scaled, X_test_scaled = scaler.transform(X_val), scaler.transform(X_test)

In [None]:
# Fit on the scaled training split (fit() returns the estimator itself, so
# rebinding keeps the same object), then report R^2 on the held-out test set.
lm_reg = lm_reg.fit(X_train_scaled, y_train)
print(f'Lasso Regression (Test) R^2: {lm_reg.score(X_test_scaled, y_test):.3f}')