In [13]:
# Import our dependencies
import pandas as pd
import tensorflow as tf
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.svm import SVC

In [3]:
# Read the games dataset from the CSV kept in the Resources folder
GAMES_CSV_PATH = "../Resources/games_final.csv"
games_df = pd.read_csv(GAMES_CSV_PATH)

# Show the first few rows as a quick sanity check of the load
games_df.head()

Unnamed: 0.1,Unnamed: 0,SEASON_ID,HOME_TEAM_ID,GAME_ID,GAME_DATE,HOME_PTS,HOME_FGM,HOME_FGA,HOME_FG_PCT,HOME_FG3M,...,AWAY_FT_PCT,AWAY_OREB,AWAY_DREB,AWAY_REB,AWAY_AST,AWAY_STL,AWAY_BLK,AWAY_TOV,AWAY_PF,AWAY_PLUS_MINUS
0,0,22020.0,1610613000.0,22000904,2021-04-24,120.0,42.0,75.0,0.56,16.0,...,0.737,5.0,28.0,33.0,31.0,6.0,3.0,7.0,19.0,-6.0
1,1,22020.0,1610613000.0,22000906,2021-04-24,56.0,20.0,44.0,0.455,6.0,...,0.75,10.0,19.0,29.0,11.0,1.0,3.0,10.0,11.0,-10.4
2,2,22020.0,1610613000.0,22000905,2021-04-24,132.0,49.0,95.0,0.516,18.0,...,0.735,10.0,31.0,41.0,15.0,11.0,4.0,17.0,20.0,-38.0
3,3,22020.0,1610613000.0,22000902,2021-04-24,109.0,39.0,93.0,0.419,10.0,...,0.938,12.0,38.0,50.0,25.0,7.0,7.0,20.0,23.0,20.0
4,4,22020.0,1610613000.0,22000900,2021-04-24,108.0,40.0,94.0,0.426,14.0,...,0.714,11.0,36.0,47.0,27.0,4.0,1.0,16.0,20.0,-6.0


In [4]:
# Inspect the dtype of every column; "object" columns are the non-numeric
# ones that the following cells collect and one-hot encode
games_df.dtypes

Unnamed: 0           int64
SEASON_ID          float64
HOME_TEAM_ID       float64
GAME_ID              int64
GAME_DATE           object
HOME_PTS           float64
HOME_FGM           float64
HOME_FGA           float64
HOME_FG_PCT        float64
HOME_FG3M          float64
HOME_FG3A          float64
HOME_FG3_PCT       float64
HOME_FTM           float64
HOME_FTA           float64
HOME_FT_PCT        float64
HOME_OREB          float64
HOME_DREB          float64
HOME_REB           float64
HOME_AST           float64
HOME_STL           float64
HOME_BLK           float64
HOME_TOV           float64
HOME_PF            float64
HOME_PLUS_MINUS    float64
HOME_WIN             int64
AWAY_TEAM_ID       float64
AWAY_PTS           float64
AWAY_FGM           float64
AWAY_FGA           float64
AWAY_FG_PCT        float64
AWAY_FG3M          float64
AWAY_FG3A          float64
AWAY_FG3_PCT       float64
AWAY_FTM           float64
AWAY_FTA           float64
AWAY_FT_PCT        float64
AWAY_OREB          float64
A

In [5]:
# Collect the names of all object-typed (categorical) columns, in frame order
games_cat = [
    column for column, dtype in games_df.dtypes.items() if dtype == "object"
]

# Count the distinct values per categorical column to gauge how many
# indicator columns an encoding would create
games_df.loc[:, games_cat].nunique()

GAME_DATE    900
dtype: int64

In [6]:
# Create a OneHotEncoder instance.
# The dense-output flag was renamed between scikit-learn releases
# (`sparse` -> `sparse_output`), so keep the encoder's default sparse output
# and densify explicitly below; this runs on old and new versions alike.
enc = OneHotEncoder()

# Fit and transform the categorical columns, then convert to a dense array
# so the result can be wrapped in a regular DataFrame.
# NOTE(review): per the nunique() output above, GAME_DATE has 900 distinct
# values, so this yields 900 indicator columns - confirm that encoding the raw
# date this way is intended.
encoded_values = enc.fit_transform(games_df[games_cat]).toarray()

# Newer scikit-learn exposes `get_feature_names_out`; older releases only have
# `get_feature_names`. Prefer the new accessor and fall back to the old one.
feature_names_fn = getattr(enc, "get_feature_names_out", None) or enc.get_feature_names

# Build the encoded frame with the generated "<column>_<value>" names
encode_df = pd.DataFrame(encoded_values, columns=feature_names_fn(games_cat))
encode_df.head()

Unnamed: 0,GAME_DATE_2018-01-07,GAME_DATE_2018-01-08,GAME_DATE_2018-01-09,GAME_DATE_2018-01-10,GAME_DATE_2018-01-11,GAME_DATE_2018-01-12,GAME_DATE_2018-01-13,GAME_DATE_2018-01-14,GAME_DATE_2018-01-15,GAME_DATE_2018-01-16,...,GAME_DATE_2021-04-15,GAME_DATE_2021-04-16,GAME_DATE_2021-04-17,GAME_DATE_2021-04-18,GAME_DATE_2021-04-19,GAME_DATE_2021-04-20,GAME_DATE_2021-04-21,GAME_DATE_2021-04-22,GAME_DATE_2021-04-23,GAME_DATE_2021-04-24
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0


In [7]:
# Merge one-hot encoded features and drop the originals.
# Both frames share the same default row index, so the merge aligns row by row.
# NOTE: this cell overwrites `games_df`; re-running it on its own will fail
# because the categorical columns are already gone - re-run from the load cell.
games_df = games_df.merge(encode_df, left_index=True, right_index=True)

# Use the explicit `columns=` keyword: passing the axis positionally
# (`drop(labels, 1)`) is deprecated and rejected by pandas 2.x.
games_df = games_df.drop(columns=games_cat)
games_df.head()

Unnamed: 0.1,Unnamed: 0,SEASON_ID,HOME_TEAM_ID,GAME_ID,HOME_PTS,HOME_FGM,HOME_FGA,HOME_FG_PCT,HOME_FG3M,HOME_FG3A,...,GAME_DATE_2021-04-15,GAME_DATE_2021-04-16,GAME_DATE_2021-04-17,GAME_DATE_2021-04-18,GAME_DATE_2021-04-19,GAME_DATE_2021-04-20,GAME_DATE_2021-04-21,GAME_DATE_2021-04-22,GAME_DATE_2021-04-23,GAME_DATE_2021-04-24
0,0,22020.0,1610613000.0,22000904,120.0,42.0,75.0,0.56,16.0,31.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
1,1,22020.0,1610613000.0,22000906,56.0,20.0,44.0,0.455,6.0,16.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
2,2,22020.0,1610613000.0,22000905,132.0,49.0,95.0,0.516,18.0,40.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
3,3,22020.0,1610613000.0,22000902,109.0,39.0,93.0,0.419,10.0,33.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
4,4,22020.0,1610613000.0,22000900,108.0,40.0,94.0,0.426,14.0,40.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0


In [10]:
# Separate the HOME_WIN target from the feature matrix
# NOTE(review): X keeps every remaining column, including the saved index
# column (`Unnamed: 0`), GAME_ID and same-game columns such as HOME_PTS /
# AWAY_PTS / *_PLUS_MINUS. These look like they could leak the outcome into
# the features - confirm before trusting the reported accuracies.
y = games_df["HOME_WIN"]
X = games_df.drop(columns=["HOME_WIN"])

# Stratified train/test split with a fixed seed so the partition is repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X, y, stratify=y, random_state=42
)

# Learn the scaling statistics from the training rows only
scaler = StandardScaler()
X_scaler = scaler.fit(X_train)

# Apply the fitted scaler to both partitions
X_train_scaled, X_test_scaled = map(X_scaler.transform, (X_train, X_test))

In [11]:
# Random forest baseline.
# RandomForestClassifier is imported in the notebook's top import cell so all
# dependencies are declared in one place.

# Create a random forest classifier with a fixed seed for repeatable results.
rf_model = RandomForestClassifier(n_estimators=128, random_state=78)

# Fit on the scaled training data (fit returns the estimator itself)
rf_model = rf_model.fit(X_train_scaled, y_train)

# Evaluate on the held-out test data
y_pred = rf_model.predict(X_test_scaled)
print(f" Random forest predictive accuracy: {accuracy_score(y_test,y_pred):.3f}")

 Random forest predictive accuracy: 0.996


In [14]:
# Deep neural net: two ReLU hidden layers and a sigmoid output that predicts
# the binary HOME_WIN target.

# Seed TensorFlow so weight initialisation and batch shuffling are repeatable
# between runs; without it the reported loss/accuracy change on every run.
# (Exact bit-for-bit determinism may still depend on hardware/ops.)
tf.random.set_seed(42)

# Layer sizes: one input per scaled feature column
number_input_features = X_train_scaled.shape[1]
hidden_nodes_layer1 = 24
hidden_nodes_layer2 = 12

nn = tf.keras.models.Sequential()

# First hidden layer (also declares the input width)
nn.add(
    tf.keras.layers.Dense(
        units=hidden_nodes_layer1,
        input_dim=number_input_features,
        activation="relu",
    )
)

# Second hidden layer
nn.add(tf.keras.layers.Dense(units=hidden_nodes_layer2, activation="relu"))

# Output layer: a single sigmoid unit for the binary target
nn.add(tf.keras.layers.Dense(units=1, activation="sigmoid"))

# Compile the Sequential model together and customize metrics
nn.compile(loss="binary_crossentropy", optimizer="adam", metrics=["accuracy"])

# Train the model; `fit_model` holds the per-epoch training history
fit_model = nn.fit(X_train_scaled, y_train, epochs=50)

# Evaluate the model using the test data
model_loss, model_accuracy = nn.evaluate(X_test_scaled, y_test, verbose=2)
print(f"Loss: {model_loss}, Accuracy: {model_accuracy}")


Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50
117/117 - 0s - loss: 0.1258 - accuracy: 0.9750
Loss: 0.12580901384353638, Accuracy: 0.9750066995620728
