# Random Forest
 
Use the Random Forest Algorithm to train and test a model.

In [3]:
import pandas as pd
from sklearn.tree import DecisionTreeClassifier 
from sklearn.model_selection import train_test_split 
from sklearn import metrics

# Load the prepared NBA dataset and preview ten rows; the fixed random_state
# makes the preview show the same rows on every run.
nba_data = pd.read_csv(filepath_or_buffer='../output/new_nba_data.csv')
nba_data.sample(n=10, random_state=13)

Unnamed: 0,Name,GamesPlayed,MinutesPlayed,PointsPerGame,FieldGoalsMade,FieldGoalAttempts,FieldGoalPercent,3PointMade,3PointAttempts,3PointAttemptsPercent,...,FreeThrowAttempts,FreeThrowPercent,OffensiveRebounds,DefensiveRebounds,Rebounds,Assists,Steals,Blocks,Turnovers,CareerLongerThan5Years
394,Steve Burtt,47,8.9,4.2,1.5,4.0,38.3,0.0,0.0,0.0,...,1.6,68.8,0.2,0.4,0.6,0.4,0.5,0.1,0.7,0.0
881,Antoine Wright,39,9.5,1.8,0.7,2.1,35.8,0.0,0.4,6.7,...,0.6,50.0,0.2,0.6,0.8,0.3,0.1,0.1,0.5,1.0
358,Melvin Turpin,79,24.7,10.6,4.6,9.0,51.1,0.0,0.0,0.0,...,1.8,78.4,2.0,3.8,5.7,0.5,0.5,1.1,1.5,1.0
367,Charles Jones,29,16.4,3.7,1.3,4.2,31.7,0.7,2.1,31.1,...,0.8,50.0,0.3,1.1,1.4,1.4,0.6,0.2,1.0,1.0
259,Michael Jackson,58,13.1,2.7,1.1,2.9,37.4,0.1,0.4,24.0,...,0.6,71.9,0.3,0.7,1.0,3.1,0.3,0.1,1.0,0.0
874,Ike Diogu,69,14.9,7.0,2.5,4.8,52.4,0.0,0.0,0.0,...,2.4,81.0,1.4,1.9,3.3,0.4,0.2,0.4,1.1,1.0
1203,Alan Henderson,79,17.9,6.4,2.4,5.5,44.2,0.0,0.0,0.0,...,2.5,59.5,2.1,2.4,4.5,0.6,0.6,0.5,1.1,1.0
1221,Elijah Millsap,47,19.7,5.3,1.7,5.1,34.0,0.6,1.9,31.1,...,1.8,67.4,0.6,2.6,3.2,1.2,1.2,0.3,1.4,0.0
900,Matt Freije,23,19.2,4.0,1.6,5.5,29.1,0.6,2.3,25.9,...,0.3,62.5,0.7,2.0,2.7,0.9,0.6,0.1,0.7,0.0
710,DeMar DeRozan,77,21.6,8.6,3.3,6.6,49.8,0.1,0.2,25.0,...,2.5,76.3,0.9,2.0,2.9,0.7,0.6,0.2,0.8,1.0


The players' names are not needed for the prediction, so we drop that column.

In [6]:
# Rebind the result instead of dropping in place, and tolerate an already
# missing column, so re-running this cell does not raise KeyError once
# 'Name' has been removed.
nba_data = nba_data.drop(columns='Name', errors='ignore')

In [10]:
# Target is the named label column; features are every column except the last.
y = nba_data['CareerLongerThan5Years']
X = nba_data.loc[:, nba_data.columns[:-1]]

# Stratified split so the label proportions are kept in both partitions;
# the same seed (52) is used for the split and for the tree.
train_X, test_X, train_y, test_y = train_test_split(
    X, y, stratify=y, random_state=52
)

# Fit a decision tree on the training partition and predict the held-out rows.
dt_model = DecisionTreeClassifier(random_state=52).fit(train_X, train_y)
pred_y = dt_model.predict(test_X)

# Report each metric as a percentage with two decimals.
for template, score in (
    ('Accuracy: {:.2%}', metrics.accuracy_score),
    ('Recall: {:.2%}', metrics.recall_score),
    ('Precision: {:.2%}', metrics.precision_score),
    ('F1 Score: {:.2%}', metrics.f1_score),
):
    print(template.format(score(test_y, pred_y)))

Accuracy: 65.07%
Recall: 68.75%
Precision: 73.33%
F1 Score: 70.97%


The accuracy of about 65% is pretty low. So let's take a better look at the data.

In [18]:
# Rebuild the label vector and the feature matrix (all columns but the last).
y = nba_data['CareerLongerThan5Years']
X = nba_data.loc[:, nba_data.columns[:-1]]

# Same stratified split and decision tree as the earlier run, but seeded with
# 120 for both the split and the classifier.
train_X, test_X, train_y, test_y = train_test_split(
    X, y, stratify=y, random_state=120
)
dt_model = DecisionTreeClassifier(random_state=120).fit(train_X, train_y)
pred_y = dt_model.predict(test_X)

# Print the evaluation metrics for the held-out rows, formatted as percentages.
for template, score in (
    ('Accuracy: {:.2%}', metrics.accuracy_score),
    ('Recall: {:.2%}', metrics.recall_score),
    ('Precision: {:.2%}', metrics.precision_score),
    ('F1 Score: {:.2%}', metrics.f1_score),
):
    print(template.format(score(test_y, pred_y)))

Accuracy: 61.19%
Recall: 69.71%
Precision: 68.40%
F1 Score: 69.05%
