# A. Reading the Data

In [15]:
import pandas as pd
import numpy as np

players = pd.read_csv("nba_logreg.csv")

players.head()

Unnamed: 0,name,gp,min,pts,fgm,fga,fg,3p_made,3pa,3p,...,fta,ft,oreb,dreb,reb,ast,stl,blk,tov,target_5yrs
0,Brandon Ingram,36,27.4,7.4,2.6,7.6,34.7,0.5,2.1,25.0,...,2.3,69.9,0.7,3.4,4.1,1.9,0.4,0.4,1.3,0.0
1,Andrew Harrison,35,26.9,7.2,2.0,6.7,29.6,0.7,2.8,23.5,...,3.4,76.5,0.5,2.0,2.4,3.7,1.1,0.5,1.6,0.0
2,JaKarr Sampson,74,15.3,5.2,2.0,4.7,42.2,0.4,1.7,24.4,...,1.3,67.0,0.5,1.7,2.2,1.0,0.5,0.3,1.0,0.0
3,Malik Sealy,58,11.6,5.7,2.3,5.5,42.6,0.1,0.5,22.6,...,1.3,68.9,1.0,0.9,1.9,0.8,0.6,0.1,1.0,1.0
4,Matt Geiger,48,11.5,4.5,1.6,3.0,52.4,0.0,0.1,0.0,...,1.9,67.4,1.0,1.5,2.5,0.3,0.3,0.4,0.8,1.0


# B. Formatting the Features

In [16]:
# Total assists made, total minutes played etc do not take into account the number of games played as well. ie, it
# may be possible that a certain player has more minutes in one game while he has less in the other.

# So, it's better to have metrics like minutes, rebounds, assists, steals etc seen per game.

players["min/game"] = players["min"]/players["gp"]
players["rebound/game"] = players["reb"]/players["gp"]
players["assist/game"] = players["ast"]/players["gp"]
players["steals/game"] = players["stl"]/players["gp"]
players["blocks/game"] = players["blk"]/players["gp"]
players["turnovers/game"] = players["tov"]/players["gp"]
players["5_years?"] = players["target_5yrs"]

# It also helps in reducing variance in the results because the number of features decrease!!

players.head()

Unnamed: 0,name,gp,min,pts,fgm,fga,fg,3p_made,3pa,3p,...,blk,tov,target_5yrs,min/game,rebound/game,assist/game,steals/game,blocks/game,turnovers/game,5_years?
0,Brandon Ingram,36,27.4,7.4,2.6,7.6,34.7,0.5,2.1,25.0,...,0.4,1.3,0.0,0.761111,0.113889,0.052778,0.011111,0.011111,0.036111,0.0
1,Andrew Harrison,35,26.9,7.2,2.0,6.7,29.6,0.7,2.8,23.5,...,0.5,1.6,0.0,0.768571,0.068571,0.105714,0.031429,0.014286,0.045714,0.0
2,JaKarr Sampson,74,15.3,5.2,2.0,4.7,42.2,0.4,1.7,24.4,...,0.3,1.0,0.0,0.206757,0.02973,0.013514,0.006757,0.004054,0.013514,0.0
3,Malik Sealy,58,11.6,5.7,2.3,5.5,42.6,0.1,0.5,22.6,...,0.1,1.0,1.0,0.2,0.032759,0.013793,0.010345,0.001724,0.017241,1.0
4,Matt Geiger,48,11.5,4.5,1.6,3.0,52.4,0.0,0.1,0.0,...,0.4,0.8,1.0,0.239583,0.052083,0.00625,0.00625,0.008333,0.016667,1.0


### Removing columns not needed for the model

In [17]:
players = players.drop(columns=["gp", "min", "fgm", "fga", "3p_made", "3pa", "ftm", "fta", "oreb", "dreb", "reb", "ast", "stl", "blk", "tov", "target_5yrs"])
players.head()

Unnamed: 0,name,pts,fg,3p,ft,min/game,rebound/game,assist/game,steals/game,blocks/game,turnovers/game,5_years?
0,Brandon Ingram,7.4,34.7,25.0,69.9,0.761111,0.113889,0.052778,0.011111,0.011111,0.036111,0.0
1,Andrew Harrison,7.2,29.6,23.5,76.5,0.768571,0.068571,0.105714,0.031429,0.014286,0.045714,0.0
2,JaKarr Sampson,5.2,42.2,24.4,67.0,0.206757,0.02973,0.013514,0.006757,0.004054,0.013514,0.0
3,Malik Sealy,5.7,42.6,22.6,68.9,0.2,0.032759,0.013793,0.010345,0.001724,0.017241,1.0
4,Matt Geiger,4.5,52.4,0.0,67.4,0.239583,0.052083,0.00625,0.00625,0.008333,0.016667,1.0


### Giving the columns convenient names

In [18]:
players = players.rename(columns={"pts": "points/game", "fg": "field_goal_%", "3p": "3_point_%", "ft": "free_throw_%"})

players.head()

Unnamed: 0,name,points/game,field_goal_%,3_point_%,free_throw_%,min/game,rebound/game,assist/game,steals/game,blocks/game,turnovers/game,5_years?
0,Brandon Ingram,7.4,34.7,25.0,69.9,0.761111,0.113889,0.052778,0.011111,0.011111,0.036111,0.0
1,Andrew Harrison,7.2,29.6,23.5,76.5,0.768571,0.068571,0.105714,0.031429,0.014286,0.045714,0.0
2,JaKarr Sampson,5.2,42.2,24.4,67.0,0.206757,0.02973,0.013514,0.006757,0.004054,0.013514,0.0
3,Malik Sealy,5.7,42.6,22.6,68.9,0.2,0.032759,0.013793,0.010345,0.001724,0.017241,1.0
4,Matt Geiger,4.5,52.4,0.0,67.4,0.239583,0.052083,0.00625,0.00625,0.008333,0.016667,1.0


### Shuffling rows

In [19]:
shuffled_rows = np.random.permutation(players.index)

players = players.iloc[shuffled_rows]

players.head()

players.isnull().sum()

name               0
points/game        0
field_goal_%       0
3_point_%         11
free_throw_%       0
min/game           0
rebound/game       0
assist/game        0
steals/game        0
blocks/game        0
turnovers/game     0
5_years?           0
dtype: int64

In [20]:
players.dropna(subset=["3_point_%"], inplace=True)

players.corr()["points/game"]

points/game       1.000000
field_goal_%      0.261587
3_point_%         0.151072
free_throw_%      0.254404
min/game          0.525724
rebound/game      0.416978
assist/game       0.381396
steals/game       0.399350
blocks/game       0.246219
turnovers/game    0.581167
5_years?          0.314632
Name: points/game, dtype: float64

# C. Creating train and test dataframes

In [21]:
A = int(len(players)*0.75)
train = players[:A]
test = players[A:]

In [22]:
cols = players.columns.drop(["name", "5_years?"])
cols

Index(['points/game', 'field_goal_%', '3_point_%', 'free_throw_%', 'min/game',
       'rebound/game', 'assist/game', 'steals/game', 'blocks/game',
       'turnovers/game'],
      dtype='object')

# D. Training model on 'Train' set

In [23]:
from sklearn.linear_model import LogisticRegression

accuracies = []

lr = LogisticRegression()
    
lr.fit(train[cols], train["5_years?"])

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

#### Predicting on Test set

In [24]:
labels = lr.predict(test[cols])
        
test["pred_labels"] = labels

test["pred_labels"].head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  This is separate from the ipykernel package so we can avoid doing imports until


261     1.0
1171    1.0
688     0.0
1001    1.0
970     1.0
Name: pred_labels, dtype: float64

In [25]:
### Predicting Accuracy
    
test["actual_labels"] = test["5_years?"]
    
matches = test["actual_labels"] == test["pred_labels"]
    
correct_pred = test[matches]
    
accuracy = len(correct_pred)/len(test)
        
accuracy

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  This is separate from the ipykernel package so we can avoid doing imports until


0.6996996996996997

In [26]:
test["pred_labels"].value_counts()

1.0    243
0.0     90
Name: pred_labels, dtype: int64

#### Therefore, our model is preidcting career length by 69.96% accuracy.

#### Predicting on train set to check overfitting

In [27]:
labels_train = lr.predict(train[cols])
        
train["pred_labels"] = labels_train

train["actual_labels"] = train["5_years?"]
    
matches = train["actual_labels"] == train["pred_labels"]
    
correct_pred = train[matches]
    
accuracy = len(correct_pred)/len(train)
        
accuracy

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  This is separate from the ipykernel package so we can avoid doing imports until
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """


0.6877510040160643

#### Therefore, our model is preidcting career length on the training set by by 68.77% accuracy.

#### Our model is very slightly overfitting, as the prediction accuracy differs by only about 1.2%. This maybe due to a our features differing only moderately with respect to our target variable.

### Thus, our model is acceptable.