## Import and combine all csv files

In [1]:
import pandas as pd
import numpy as np

PATH_TOTAL = ("data/csv/totals/")
PATH_ADVANCED = ("data/csv/advanced/")

In [2]:
import glob
allFiles = glob.glob(PATH_ADVANCED + "leagues_NBA_*_advanced.csv")
frame = pd.DataFrame()
list_ = []
for file_ in allFiles:
    df = pd.read_csv(file_,index_col=None)
    list_.append(df)
frame = pd.concat(list_,  ignore_index=True)

## Get an overview of the data 

In [3]:
frame.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 14636 entries, 0 to 14635
Data columns (total 31 columns):
0              7281 non-null float64
0.1            7281 non-null float64
3PAr           14616 non-null object
AST%           14636 non-null object
Age            14636 non-null object
BLK%           14636 non-null object
BPM            14636 non-null object
DBPM           14636 non-null object
DRB%           14636 non-null object
DWS            14636 non-null object
FTr            14616 non-null object
G              14636 non-null object
MP             14636 non-null object
OBPM           14636 non-null object
ORB%           14636 non-null object
OWS            14636 non-null object
PER            14636 non-null object
Player         14636 non-null object
Pos            14636 non-null object
Rk             14636 non-null object
STL%           14636 non-null object
TOV%           14625 non-null object
TRB%           14636 non-null object
TS%            14622 non-null object
Tm 

# Data Cleaning 

## TODO
Implement a Pipeline

## Remove duplicates

Considering the data is from 1981 to 2015 it will contain duplicates for each player.
One intereseting approach would be to compute the average of each player, and calucate career PER, 
but for the sake of simplicity I compute the PER for their first season

non_duplicates = frame.drop_duplicates(subset=['Player'])

Update: Not necessary, because eliminates a lot of relevant data - rather, delete det "Player" column
Resulted in 0.1 increase 

## Need the numerical values

In [4]:
numerical_data_frames = ['3PAr', 'AST%', 'BLK%', 'BPM', 'DBPM', 'DWS', 'FTr', 'G', 'MP', 'OBPM', 'ORB%', 'OWS', 
                         'PER', 'Rk', 'STL%', 'TOV%', 'TRB%', 'TS%', 'USG%', 'VORP', 'WS', 'WS/48']
num_data = frame[numerical_data_frames]
player_data = frame[['Player']]

## If there are any strings, replace with 0

In [5]:
cols = num_data.columns
num_data[cols] = num_data[cols].apply(pd.to_numeric, errors='coerce').fillna(0)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self[k1] = value[k2]


In [6]:
from sklearn.preprocessing import Imputer
imputer = Imputer(strategy='median')
imputer.fit(num_data)

Imputer(axis=0, copy=True, missing_values='NaN', strategy='median', verbose=0)

In [7]:
num_data_impute = imputer.transform(num_data)
df = pd.DataFrame(num_data_impute, columns=num_data.columns)

## Handling Text and Categorical Attributes - using LabelBinarizer
Converting text labels to numbers 

# TODO
Find out relevant use-cases for these

In [8]:
from sklearn.preprocessing import LabelBinarizer
encoder = LabelBinarizer()
player_data_1hot = encoder.fit_transform(player_data)
player_data_1hot

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ..., 
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]])

In [9]:
print(encoder.classes_)

['A.C. Green' 'A.J. Bramlett' 'A.J. English' ..., 'Zoran Dragic'
 'Zoran Planinic' 'Zydrunas Ilgauskas']


Gets one "class" for each player 

## Find columns with missing values (Not necessary, because of Imputer)

num_data.isnull().any()
num_data = num_data.fillna(0)
num_data.isnull().any()

## Dividing into training and test set

In [10]:
from sklearn.model_selection import train_test_split
X = num_data.drop('PER', axis=1)
y = num_data[['PER']]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

# TODO
## Feature Scaling

Standardization --> Less affected by outliers, and not necessary to have bound values to a specific range (yet)

from sklearn import preprocessing
X_scaled = preprocessing 

## Standard train and fit, using a LinearRegression model

In [11]:
from sklearn.linear_model import LinearRegression
lin_reg = LinearRegression()
lin_reg.fit(X_train, y_train)

lin_reg.predict(X_test)
lin_reg.score(X_test, y_test)

0.97838478881571622

## Trying an example from GitHub

In [12]:
from sklearn.pipeline import make_pipeline                                                                                               
from sklearn.preprocessing import StandardScaler                                                
from sklearn.neural_network import MLPRegressor                         

pipeline = make_pipeline(StandardScaler(),                                                      
                         MLPRegressor(solver='lbfgs', hidden_layer_sizes=[25, 50, 75]))                   
pipeline.fit(X_train, y_train)                                                                  
pipeline.score(X_test, y_test)

  y = column_or_1d(y, warn=True)


0.98020251276593207

# TODO

Check for under/overfitting (using RMSE)

## Trying out a more complex model (Decision Tree)
### and using RMSE as evaluation metric

In [13]:
from sklearn.tree import DecisionTreeRegressor
tree_reg = DecisionTreeRegressor()
tree_reg.fit(X_train, y_train)

DecisionTreeRegressor(criterion='mse', max_depth=None, max_features=None,
           max_leaf_nodes=None, min_impurity_decrease=0.0,
           min_impurity_split=None, min_samples_leaf=1,
           min_samples_split=2, min_weight_fraction_leaf=0.0,
           presort=False, random_state=None, splitter='best')

In [15]:
from sklearn.metrics import mean_squared_error
df_predictions = tree_reg.predict(X_test)
tree_mse = mean_squared_error(y_test, df_predictions)
tree_rmse = np.sqrt(tree_mse)
tree_rmse

1.9252643349843785

## Evaluating Using Cross Validation

In [29]:
from sklearn.model_selection import cross_val_score
scores = cross_val_score(tree_reg, X, y, scoring='neg_mean_squared_error', cv=10)
rmse_scores = np.sqrt(-scores)

In [30]:
def display_scores(scores):
    print('Scores:', scores)
    print('Mean: ', scores.mean())
    print('Standard Deviation: ', scores.std())


In [31]:
display_scores(rmse_scores)

Scores: [ 1.90910781  1.97130369  2.76861805  2.34538263  1.68692335  1.62831
  1.73707118  2.42982015  1.87919865  1.89715584]
Mean:  2.02528913367
Standard Deviation:  0.350423971598


## Checking the cross-validation score against the Linear Regression model

In [32]:
lin_scores = cross_val_score(lin_reg, X, y, scoring='neg_mean_squared_error', cv=10)
lin_rmse_scores = np.sqrt(-lin_scores)
display_scores(lin_rmse_scores)

Scores: [ 0.89929063  0.85827018  1.28257257  1.01522299  0.91963505  0.82478086
  0.89494013  1.01042731  0.97537369  1.03278666]
Mean:  0.971330006492
Standard Deviation:  0.123319515935


# TODO
Figure out what these numbers actually mean (Low score = Good)

## Random Forest Regressor

In [35]:
from sklearn.ensemble import RandomForestRegressor
forest_reg = RandomForestRegressor()
forest_reg.fit(X_train, y_train)

predictions = forest_reg.predict(X_test)
forest_mse = mean_squared_error(y_test, predictions)
forest_rmse = np.sqrt(forest_mse)

display_scores(forest_rmse)

  This is separate from the ipykernel package so we can avoid doing imports until


1.5703171775954583

Improvement from Decision Tree, but not Linear Regression?

## Grid Search 
Hyperparameter tuning for RandomForestRegressor

NOTE: Considering relatively small space ignoring Randomized Search 

In [43]:
from sklearn.model_selection import GridSearchCV

param_grid = [
{'n_estimators': [3, 10, 30, 60, 100], 'max_features': [2, 4, 6, 8, 10, 12, 14]},
{'bootstrap': [False], 'n_estimators': [3, 10], 'max_features': [2, 3, 4]},
]

forest_reg = RandomForestRegressor()
grid_search = GridSearchCV(forest_reg, param_grid, cv=5,
scoring='neg_mean_squared_error')
grid_search.fit(X_train, y_train)

  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_

  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_

  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_

  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_

  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_

  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_

  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_

  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  self.best_estimator_.fit(X, y, **fit_params)


GridSearchCV(cv=5, error_score='raise',
       estimator=RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
           oob_score=False, random_state=None, verbose=0, warm_start=False),
       fit_params=None, iid=True, n_jobs=1,
       param_grid=[{'n_estimators': [3, 10, 30, 60, 100], 'max_features': [2, 4, 6, 8, 10, 12, 14]}, {'bootstrap': [False], 'n_estimators': [3, 10], 'max_features': [2, 3, 4]}],
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring='neg_mean_squared_error', verbose=0)

### Best parameters for the Random Forest Regressor Model

In [47]:
cvres = grid_search.cv_results_
for mean_score, params in zip(cvres['mean_test_score'], cvres['params']):
    print(np.sqrt(-mean_score), params)

2.27128163121 {'max_features': 2, 'n_estimators': 3}
1.88975815648 {'max_features': 2, 'n_estimators': 10}
1.83228367525 {'max_features': 2, 'n_estimators': 30}
1.77034637557 {'max_features': 2, 'n_estimators': 60}
1.76677280523 {'max_features': 2, 'n_estimators': 100}
2.05989489304 {'max_features': 4, 'n_estimators': 3}
1.73220493173 {'max_features': 4, 'n_estimators': 10}
1.57833306667 {'max_features': 4, 'n_estimators': 30}
1.59616060924 {'max_features': 4, 'n_estimators': 60}
1.57481024307 {'max_features': 4, 'n_estimators': 100}
1.94064199983 {'max_features': 6, 'n_estimators': 3}
1.5676670554 {'max_features': 6, 'n_estimators': 10}
1.52530778456 {'max_features': 6, 'n_estimators': 30}
1.49792688026 {'max_features': 6, 'n_estimators': 60}
1.52306202074 {'max_features': 6, 'n_estimators': 100}
1.78520951429 {'max_features': 8, 'n_estimators': 3}
1.63079842204 {'max_features': 8, 'n_estimators': 10}
1.47158459726 {'max_features': 8, 'n_estimators': 30}
1.52527668099 {'max_features':

Best: 1.44240919996 {'max_features': 10, 'n_estimators': 100}  

## Analyzing the importance of the features 

In [49]:
sorted(zip(feature_importances, num_data), reverse=True)

[(0.37889987285674137, 'WS'),
 (0.13577379778949264, 'TRB%'),
 (0.12682931266415895, 'OBPM'),
 (0.076957862816252026, 'OWS'),
 (0.075163837227855967, 'TS%'),
 (0.046702969574813701, 'VORP'),
 (0.043548951587347166, 'BPM'),
 (0.023614481423272052, 'MP'),
 (0.016512189880423724, 'TOV%'),
 (0.012238365471565016, 'DBPM'),
 (0.012165834155296864, 'USG%'),
 (0.0076489288378868434, 'ORB%'),
 (0.0070430630980230138, '3PAr'),
 (0.0069839714877451041, 'Rk'),
 (0.0069345827354256248, 'AST%'),
 (0.0057428855153554611, 'BLK%'),
 (0.0044746384849858132, 'FTr'),
 (0.0038308111805793977, 'STL%'),
 (0.0035648737022867505, 'PER'),
 (0.0035115123368214705, 'G'),
 (0.0018572571736710311, 'DWS')]