## Import and combine all csv files

In [1]:
import pandas as pd
import numpy as np

PATH_TOTAL = ("data/csv/totals/")
PATH_ADVANCED = ("data/csv/advanced/")

In [2]:
import glob
allFiles = glob.glob(PATH_ADVANCED + "leagues_NBA_*_advanced.csv")
frame = pd.DataFrame()
list_ = []
for file_ in allFiles:
    df = pd.read_csv(file_,index_col=None)
    list_.append(df)
frame = pd.concat(list_,  ignore_index=True)

## Get an overview of the data 

In [3]:
frame.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 14636 entries, 0 to 14635
Data columns (total 31 columns):
0              7281 non-null float64
0.1            7281 non-null float64
3PAr           14616 non-null object
AST%           14636 non-null object
Age            14636 non-null object
BLK%           14636 non-null object
BPM            14636 non-null object
DBPM           14636 non-null object
DRB%           14636 non-null object
DWS            14636 non-null object
FTr            14616 non-null object
G              14636 non-null object
MP             14636 non-null object
OBPM           14636 non-null object
ORB%           14636 non-null object
OWS            14636 non-null object
PER            14636 non-null object
Player         14636 non-null object
Pos            14636 non-null object
Rk             14636 non-null object
STL%           14636 non-null object
TOV%           14625 non-null object
TRB%           14636 non-null object
TS%            14622 non-null object
Tm 

In [4]:
frame.head()

Unnamed: 0,0,0.1,3PAr,AST%,Age,BLK%,BPM,DBPM,DRB%,DWS,...,TOV%,TRB%,TS%,Tm,USG%,Unnamed: 19,Unnamed: 24,VORP,WS,WS/48
0,,,0.001,13.6,33,4.0,5.3,1.4,21.5,4.6,...,12.8,15.0,0.616,LAL,26.3,,,5.4,14.3,0.23
1,,,0.017,8.0,26,0.6,-3.2,-0.7,10.6,0.2,...,10.4,8.8,0.459,TOT,10.3,,,-0.1,0.4,0.062
2,,,0.0,24.5,26,1.9,5.3,3.3,20.5,4.5,...,18.7,14.7,0.567,PHO,23.0,,,3.8,7.7,0.18
3,,,0.015,12.5,22,1.7,-5.6,0.4,19.7,0.2,...,23.1,13.4,0.385,DAL,15.1,,,-0.2,-0.3,-0.054
4,,,0.012,28.5,32,0.4,-1.3,-1.6,5.4,2.3,...,21.8,3.6,0.582,BOS,17.5,,,0.5,6.9,0.118


## Remove duplicates

Considering the data is from 1981 to 2015 it will contain duplicates for each player.
One intereseting approach would be to compute the average of each player, and calucate career PER, 
but for the sake of simplicity I compute the PER for their first season

non_duplicates = frame.drop_duplicates(subset=['Player'])

Update: Not necessary, because eliminates a lot of relevant data - rather, delete det "Player" column
Resulted in 0.1 increase 

## Need the numerical values

In [6]:
numerical_data_frames = ['3PAr', 'AST%', 'BLK%', 'BPM', 'DBPM', 'DWS', 'FTr', 'G', 'MP', 'OBPM', 'ORB%', 'OWS', 
                         'PER', 'Rk', 'STL%', 'TOV%', 'TRB%', 'TS%', 'USG%', 'VORP', 'WS', 'WS/48']
num_data = frame[numerical_data_frames]

## Find columns with missing values 

In [7]:
num_data.isnull().any()

3PAr      True
AST%     False
BLK%     False
BPM      False
DBPM     False
DWS      False
FTr       True
G        False
MP       False
OBPM     False
ORB%     False
OWS      False
PER      False
Rk       False
STL%     False
TOV%      True
TRB%     False
TS%       True
USG%     False
VORP     False
WS       False
WS/48    False
dtype: bool

## Replacing missing values with the number 0 and verifying that no values are missing 

In [8]:
num_data = num_data.fillna(0)
num_data.isnull().any()

3PAr     False
AST%     False
BLK%     False
BPM      False
DBPM     False
DWS      False
FTr      False
G        False
MP       False
OBPM     False
ORB%     False
OWS      False
PER      False
Rk       False
STL%     False
TOV%     False
TRB%     False
TS%      False
USG%     False
VORP     False
WS       False
WS/48    False
dtype: bool

## If there are any strings, replace with 0

In [9]:
cols = num_data.columns
num_data[cols] = num_data[cols].apply(pd.to_numeric, errors='coerce').fillna(0)

## Standard train and fit, using a LinearRegression model

In [10]:
from sklearn.model_selection import train_test_split
X = num_data.drop('PER', axis=1)
y = num_data[['PER']]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

In [11]:
from sklearn.linear_model import LinearRegression
lin_reg = LinearRegression()
lin_reg.fit(X_train, y_train)

lin_reg.predict(X_test)
lin_reg.score(X_test, y_test)

0.97233623078908282

## Trying an example from GitHub

In [12]:
from sklearn.pipeline import make_pipeline                                                                                               
from sklearn.preprocessing import StandardScaler                                                
from sklearn.neural_network import MLPRegressor                         

pipeline = make_pipeline(StandardScaler(),                                                      
                         MLPRegressor(solver='lbfgs', hidden_layer_sizes=50))                   
pipeline.fit(X_train, y_train)                                                                  
pipeline.score(X_test, y_test)

  y = column_or_1d(y, warn=True)


0.98051806184336787