In [1]:
import pandas as pd
import numpy as np

## Feature Analysis

In [357]:
qb = pd.read_csv("data/finalized_data/finalized_QB.csv", index_col=0)
de = pd.read_csv("data/finalized_data/finalized_DEF.csv", index_col=0)
rb = pd.read_csv("data/finalized_data/finalized_RB.csv", index_col=0)
te = pd.read_csv("data/finalized_data/finalized_TE.csv", index_col=0)
wr = pd.read_csv("data/finalized_data/finalized_WR.csv", index_col=0)


### VARIABLES 

#### Very important variables
- week (for time series)
- Outcome
- Score

#### Useless variables
- Season 
- Position (same as file name, except for FB and RB in RB)
- Game Date is probably irrelevant if we have the weeks?
- Also all Years are 2019
- All Games Played are 1
- All Years Played are nan

#### All
- week (1) #integer
- opponent ('PIT') #string
- team ('det') #string
- salary (4500) #integer
- name ('Gabriel, Taylor' or 'Minnesota') #string
- Player Id

#### ALL BUT DEF AND QB
- Longest Reception ('5T' '-8' '--') #string
- Longest Rushing Run ('--' '4T' '9') #string
- Receptions ('--' or '1') #string
- Yards per Reception ('--' '6.0') #string

#### All BUT DEF
- Year (2019) #integer
- Game Date ('09/08') #string
- Home or Away ('Away') #string
- Outcome ('W') #string
- Score ('59 to 10') #string -> x to y --> x - y
- Games Started (1 or 0) #integer
- Rushing Attempts ('7' or '--') #string
- Yards Per Carry ('10.7' or '--') #string
- Rushing TDs ('1' or '--') #string
- Rushing Yards ('32' or '--' ) #string
- Fumbles ('--' or '1') #string
- Fumbles Lost ('--' or '1') #string
- Age (21.) #float?
- Birth Place ('Gainesville , GA') #string
- Birthday ('1/7/1997') #string
- College ('Louisville') #string
- Current Status ('Injured reserve') #string
- Current Team ('Buffalo Bills') #string WHAT IS THE DIFFERENCE BETWEEN THIS AND team???
- Experience ('10th season') #string
- Height (inches) (71.) #float
- High School ('Grand Blanc HS') #string
- High School Location ('Powder Springs, GA' or 'GA') #string
- Number (nan or 17. ) #float
- Weight (lbs) (260.) #float

#### ONLY ON QB 
- Passes Completed ('17' or '--') #string
- Passes Attempted ('17' or '--') #string
- Completion Percentage ('57.7' or '--') #string
- Passing Yards ('308' or '--') #string
- Passing Yards Per Attempt ('9.1' or '--') #string
- TD Passes ('3' or '--') #string
- Ints ('2' or '--') #string
- Sacks ('3' or '--') #string
- Sacked Yards Lost ('17' or '--') #string
- Passer Rating (71.9) #float

### Next Steps

1. Do analysis for the rest of features (CHECK)
2. One-hot-encode the strings and convert all ints to floats
3. Create one feature vector per instance
4. Separate the time variable, and labels
5. Start experimenting with regressors

In [332]:
all_cols = list(qb.columns) + list(rb.columns) + list(wr.columns) + list(te.columns)
all_cols = set(all_cols)
def_cols = list(de.columns)
len(all_cols), def_cols

(52, ['name', 'team', 'opponent', 'points', 'salary', 'week'])

## Data Engineering

In [333]:
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline

In [358]:
### Remove columns
to_remove = ["Season", "Position", "Game Date", "Year", "Games Played", "Years Played", "Player Id", "name"]

targets = [qb, wr, te, rb]

for target in targets:
    target.drop(to_remove, axis=1, inplace=True)

In [359]:
all_numerical_cols = ["week", "salary"]

not_def_numerical_cols = ["Games Started", "Rushing Attempts", "Yards Per Carry", "Rushing TDs", 
                          "Rushing Yards", "Fumbles", "Fumbles Lost", "Age", "Height (inches)", 
                          "Weight (lbs)", "Number"]

not_def_qb_numerical_cols = ["Receptions", "Yards Per Reception"]

only_qb = ["Passes Completed", "Passes Attempted", "Completion Percentage", "Passing Yards", "Sacked Yards Lost",
           "Passing Yards Per Attempt", "TD Passes", "Ints", "Sacks", "Passer Rating"]

In [360]:
# Convert all numerical values to floats
for target in [qb, wr, te, rb, de]:
    for col in all_numerical_cols:
        target[col] = target[col].replace("--", "0").astype("float64")

for target in targets:
    for col in not_def_numerical_cols:
        target[col] = target[col].replace("--", "0").astype("float64")

for target in [wr, te, rb]:
    for col in not_def_qb_numerical_cols:
        target[col] = target[col].replace("--", "0").astype("float64")
        
for col in only_qb:
    qb[col] = qb[col].replace("--", "0").astype("float64")

In [361]:
qb.columns

Index(['week', 'Home or Away', 'opponent', 'Outcome', 'Score', 'Games Started',
       'Passes Completed', 'Passes Attempted', 'Completion Percentage',
       'Passing Yards', 'Passing Yards Per Attempt', 'TD Passes', 'Ints',
       'Sacks', 'Sacked Yards Lost', 'Passer Rating', 'Rushing Attempts',
       'Rushing Yards', 'Yards Per Carry', 'Rushing TDs', 'Fumbles',
       'Fumbles Lost', 'Unnamed: 0.1', 'team', 'points', 'salary', 'Age',
       'Birth Place', 'Birthday', 'College', 'Current Status', 'Current Team',
       'Experience', 'Height (inches)', 'High School', 'High School Location',
       'Number', 'Weight (lbs)'],
      dtype='object')

In [362]:
### Transform data
final_dfs = []

def transform(target):
    # Retrieve columns w/ categorical data
    cat = target.select_dtypes(include=['object']).copy()
    
    #Return df with onehot encoding applied to the columns which are categorical
    one_hot = pd.get_dummies(target, columns=cat.columns)
    one_hot = one_hot.replace(np.nan, 0.0)
    
    return one_hot

qb = transform(qb)
wr = transform(wr)
te = transform(te)
rb = transform(rb)

In [365]:
# Turn into X and Y
qb_X = qb.drop("points", axis=1).values
qb_y = qb["points"].values

In [366]:
qb_X

array([[ 1.,  1., 17., ...,  0.,  0.,  0.],
       [ 5.,  1., 19., ...,  0.,  0.,  0.],
       [ 7.,  1.,  9., ...,  0.,  0.,  0.],
       ...,
       [ 4.,  1., 18., ...,  0.,  0.,  0.],
       [ 5.,  1., 28., ...,  0.,  0.,  0.],
       [ 7.,  1., 31., ...,  0.,  0.,  0.]])

In [367]:
len(qb_X)

103

In [368]:
# Scale/normalize values
scaler = StandardScaler()
# Fit on training set only.
scaler.fit(qb_X)
# Apply transform to both the training set and the test set.
qb_X = scaler.transform(qb_X)

In [369]:
len(qb_X)

103

In [375]:
# Perform Grid Search on PCA
pca = PCA(n_components=0.95)
pca.fit(qb_X)
trans_qb_X = pca.transform(qb_X)

# param_grid = {
#     'n_components': [20, 35, 50, 65, 0.95]
# }
# search = GridSearchCV(pca, param_grid, n_jobs=-1)
# search.fit(qb_X, qb_y)
# print("Best parameter (CV score=%0.3f):" % search.best_score_)
# print(search.best_params_)


In [376]:
len(trans_qb_X[0])

72

## Regression

In [381]:
# Apply simple Ridge Regression for testing purposes
from sklearn.linear_model import Ridge
from sklearn.model_selection import train_test_split

In [382]:
X_train, X_test, y_train, y_test=train_test_split(qb_X, qb_y, test_size=0.2,random_state=69420)

In [383]:
ridge = Ridge()
param_grid = {
    'alpha': [0.01, 0.1, 0.2, 1, 10, 100]
}
search = GridSearchCV(ridge, param_grid, n_jobs=-1)
search.fit(X_train, y_train)
print("Best parameter (CV score=%0.3f):" % search.best_score_)
print(search.best_params_)



Best parameter (CV score=0.611):
{'alpha': 10}


In [386]:
ridge_opt = Ridge(alpha=10, random_state=69420)
ridge_opt.fit(X_train, y_train)

Ridge(alpha=10, copy_X=True, fit_intercept=True, max_iter=None, normalize=False,
      random_state=69420, solver='auto', tol=0.001)

In [387]:
preds = ridge_opt.predict(X_test)

In [392]:
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score

In [406]:
max(y_test), min(y_test)

(28.9, 1.16)

In [393]:
mean_absolute_error(y_test, preds)

4.002270756503693

In [394]:
mean_squared_error(y_test, preds)

25.827383540169226

In [395]:
r2_score(y_test, preds)

0.5532005272426592

In [401]:
np.sqrt(mean_squared_error(y_test, preds))

5.082064889409542

In [405]:
len(X_test), len(y_test), len(preds)

(21, 21, 21)