In [1]:
# import dependencies
import joblib
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import LabelEncoder, MinMaxScaler

In [2]:
# Read the dataset CSV from the S3 bucket into a DataFrame, then show the
# distinct labels present in the target column.
# The URL is kept in a named constant so the data source is easy to spot/change.
DATA_URL = "https://uci-dataproject3.s3-us-west-1.amazonaws.com/AllTimeNbaSeason4Categories1990.csv"
cleaned_df = pd.read_csv(DATA_URL)

# Only the last expression of a cell is displayed, so the label listing is the
# single trailing expression here.
cleaned_df["NBA_PER_Range"].unique()

array(['End of the Bench', 'Starter', 'MVP candidate', 'All-Star'],
      dtype=object)

In [3]:
# Feature matrix: the eight stat columns used as model inputs.
feature_columns = ['PPG', 'APG', 'RPG', 'SPG', 'BPG', 'FG%', 'FT%', '3P%']
X = cleaned_df[feature_columns]

# Keep the column labels around for reporting (e.g. feature importances).
X_names = X.columns

# Target vector: the PER-range category, cast to string.
y = cleaned_df['NBA_PER_Range'].astype('str')

# Class labels, in the same order unique() reports them for the target column.
# NOTE(review): this is not alphabetical order, so it may not line up with the
# order a fitted classifier stores its classes in -- confirm before using it
# to index predictions.
y_names = ["End of the Bench", "Starter", "MVP candidate", "All-Star"]

In [4]:
# Partition features and target into train and test subsets.
# test_size=0.25 is what train_test_split uses when neither size is given;
# it is spelled out here only for readability. The fixed random_state keeps
# the partition the same from run to run.
# NOTE(review): the split is not stratified on y; consider whether class
# frequencies are even enough for that to be acceptable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=1,
)

In [5]:
# Build a min-max scaler, fit it on the training features only, and apply
# that same fitted scaling to both the training and the test features.
X_scaler = MinMaxScaler()
X_train_scaled = X_scaler.fit_transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

In [6]:
# Train a 200-tree random forest classifier on the scaled training features,
# then report its mean accuracy on the scaled test features.
# random_state is pinned so the forest (and therefore the score and the
# feature importances derived from it) is repeatable on Restart & Run All;
# without it every run trains a different forest.
rf = RandomForestClassifier(n_estimators=200, random_state=1)

# fit() trains the estimator in place, so no rebinding of `rf` is needed.
rf.fit(X_train_scaled, y_train)
rf.score(X_test_scaled, y_test)

0.8405199746353836

In [7]:
# Pair each feature's importance score with its column name and list the
# pairs from most to least important.
importance_pairs = list(zip(rf.feature_importances_, X_names))
importance_pairs.sort(reverse=True)
importance_pairs

[(0.26456247363325947, 'PPG'),
 (0.1524913323305852, 'FG%'),
 (0.13238187154630424, 'RPG'),
 (0.1143720572398016, 'APG'),
 (0.1110747345123404, 'SPG'),
 (0.09085098009654521, 'BPG'),
 (0.07904575108836377, 'FT%'),
 (0.055220799552800064, '3P%')]

In [8]:
# Persist the fitted artifacts to disk with joblib.
# The forest was trained on min-max-scaled features, so the fitted scaler is
# saved alongside it: anything that loads the model has to apply the same
# scaling to raw inputs before calling predict.
scaler_filename = 'MinMax_Scaler.sav'
joblib.dump(X_scaler, scaler_filename)

# The model dump stays as the last expression so the cell still displays the
# list of file names written for the model.
filename = 'Random_Forest_Model.sav'
joblib.dump(rf, filename)

['Random_Forest_Model.sav']