# Random Forest
 
Use the Random Forest Algorithm to train and test a model.

In [1]:
import pandas as pd
from sklearn.tree import DecisionTreeClassifier 
from sklearn.model_selection import train_test_split 
from sklearn import metrics

nba_data = pd.read_csv('../output/new_nba_data.csv')
nba_data.sample(10, random_state=13)

Unnamed: 0,Name,GamesPlayed,MinutesPlayed,PointsPerGame,FieldGoalsMade,FieldGoalAttempts,FieldGoalPercent,3PointMade,3PointAttempts,3PointAttemptsPercent,...,FreeThrowAttempts,FreeThrowPercent,OffensiveRebounds,DefensiveRebounds,Rebounds,Assists,Steals,Blocks,Turnovers,CareerLongerThan5Years
1243,Tyler Johnson,32,18.8,5.9,2.2,5.2,41.9,0.6,1.5,37.5,...,1.5,68.1,0.4,2.1,2.5,1.3,1.0,0.3,0.9,0.0
1134,Eric Washington,66,23.3,7.7,3.0,7.5,40.4,0.7,2.1,32.1,...,1.3,78.3,0.7,1.2,1.9,1.2,0.8,0.4,1.1,0.0
507,Sam Williams,59,18.2,6.1,2.6,4.7,55.6,0.0,0.0,0.0,...,1.5,55.1,1.5,3.7,5.2,0.6,0.8,1.3,1.1,0.0
279,John Williams,78,22.7,9.2,3.6,8.0,45.4,0.1,0.5,22.2,...,2.9,64.6,1.7,3.0,4.7,2.4,1.6,0.4,1.6,1.0
512,Jim Brogan,63,16.3,6.3,2.6,5.8,45.3,0.1,0.5,28.1,...,1.3,72.6,1.0,0.9,1.9,2.5,0.8,0.2,1.3,0.0
103,Tate George,56,10.6,3.4,1.4,3.4,41.5,0.0,0.0,0.0,...,0.7,80.0,0.3,0.5,0.8,1.9,0.5,0.1,0.8,0.0
33,Bison Dele,48,18.9,9.1,3.6,6.8,52.8,0.0,0.0,0.0,...,3.0,66.9,2.4,3.3,5.7,0.7,0.8,1.1,1.8,1.0
783,Corey Brewer,79,22.8,5.8,2.3,6.2,37.4,0.1,0.5,19.4,...,1.4,80.0,1.0,2.7,3.7,1.4,1.0,0.3,1.1,1.0
1254,Monty Williams,41,12.3,3.3,1.5,3.2,45.1,0.0,0.2,0.0,...,0.9,44.7,1.0,1.4,2.4,1.2,0.5,0.1,1.0,1.0
291,Dennis Rodman*,77,15.0,6.5,2.8,5.1,54.5,0.0,0.0,0.0,...,1.6,58.7,2.1,2.2,4.3,0.7,0.5,0.6,1.2,1.0


The names of the players are not required for decisions, so we will be ignoring it

In [2]:
nba_data.drop('Name', inplace=True, axis=1)

In [3]:
X = nba_data[nba_data.columns[:-1]]
y = nba_data['CareerLongerThan5Years']
train_X, test_X, train_y, test_y = train_test_split(X, y, random_state=52,stratify=y)
dt_model = DecisionTreeClassifier(random_state=52)
dt_model.fit(train_X, train_y)
pred_y = dt_model.predict(test_X)

print('Accuracy: {:.2%}'.format(metrics.accuracy_score(test_y, pred_y))) 
print('Recall: {:.2%}'.format(metrics.recall_score(test_y, pred_y))) 
print('Precision: {:.2%}'.format(metrics.precision_score(test_y, pred_y))) 
print('F1 Score: {:.2%}'.format(metrics.f1_score(test_y, pred_y)))

Accuracy: 65.07%
Recall: 68.75%
Precision: 73.33%
F1 Score: 70.97%


The accuracy of 61% is pretty low. So let's take a better look at the data.

In [4]:
X = nba_data[nba_data.columns[:-1]]
y = nba_data['CareerLongerThan5Years']
train_X, test_X, train_y, test_y = train_test_split(X, y, random_state=120,stratify=y)
dt_model = DecisionTreeClassifier(random_state=120)
dt_model.fit(train_X, train_y)
pred_y = dt_model.predict(test_X)

print('Accuracy: {:.2%}'.format(metrics.accuracy_score(test_y, pred_y))) 
print('Recall: {:.2%}'.format(metrics.recall_score(test_y, pred_y))) 
print('Precision: {:.2%}'.format(metrics.precision_score(test_y, pred_y))) 
print('F1 Score: {:.2%}'.format(metrics.f1_score(test_y, pred_y)))

Accuracy: 61.19%
Recall: 69.71%
Precision: 68.40%
F1 Score: 69.05%
