In [1]:
# Import our dependencies
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler,OneHotEncoder
from sklearn.metrics import accuracy_score
from sklearn.svm import SVC
import pandas as pd
import tensorflow as t

In [2]:
# Import our input dataset
games_df = pd.read_csv("games.csv")

games_df.head()

Unnamed: 0,GAME_DATE_EST,GAME_ID,GAME_STATUS_TEXT,HOME_TEAM_ID,VISITOR_TEAM_ID,SEASON,TEAM_ID_home,PTS_home,FG_PCT_home,FT_PCT_home,...,AST_home,REB_home,TEAM_ID_away,PTS_away,FG_PCT_away,FT_PCT_away,FG3_PCT_away,AST_away,REB_away,HOME_TEAM_WINS
0,2021-03-21,22000645,Final,1610612748,1610612754,2020,1610612748,106.0,0.402,0.826,...,27.0,51.0,1610612754,109.0,0.388,0.9,0.333,27.0,50.0,0
1,2021-03-21,22000016,Final,1610612745,1610612760,2020,1610612745,112.0,0.494,0.741,...,22.0,38.0,1610612760,114.0,0.462,0.68,0.333,22.0,52.0,0
2,2021-03-21,22000646,Final,1610612743,1610612740,2020,1610612743,108.0,0.5,0.8,...,30.0,37.0,1610612740,113.0,0.482,0.767,0.286,26.0,45.0,0
3,2021-03-21,22000167,Final,1610612738,1610612753,2020,1610612738,112.0,0.455,1.0,...,27.0,45.0,1610612753,96.0,0.381,0.7,0.324,20.0,44.0,1
4,2021-03-21,22000647,Final,1610612751,1610612764,2020,1610612751,113.0,0.489,0.727,...,24.0,35.0,1610612764,106.0,0.532,0.789,0.333,24.0,43.0,1


In [3]:
# Get datatypes
games_df.dtypes

GAME_DATE_EST        object
GAME_ID               int64
GAME_STATUS_TEXT     object
HOME_TEAM_ID          int64
VISITOR_TEAM_ID       int64
SEASON                int64
TEAM_ID_home          int64
PTS_home            float64
FG_PCT_home         float64
FT_PCT_home         float64
FG3_PCT_home        float64
AST_home            float64
REB_home            float64
TEAM_ID_away          int64
PTS_away            float64
FG_PCT_away         float64
FT_PCT_away         float64
FG3_PCT_away        float64
AST_away            float64
REB_away            float64
HOME_TEAM_WINS        int64
dtype: object

In [4]:
# Generate our categorical variable list
games_cat = games_df.dtypes[games_df.dtypes == "object"].index.tolist()

# Check the number of unique values in each column
games_df[games_cat].nunique()

GAME_DATE_EST       3892
GAME_STATUS_TEXT       1
dtype: int64

In [5]:
# Create a OneHotEncoder instance
enc = OneHotEncoder(sparse=False)

# Fit and transform the OneHotEncoder using the categorical variable list
encode_df = pd.DataFrame(enc.fit_transform(games_df[games_cat]))

# Add the encoded variable names to the DataFrame
encode_df.columns = enc.get_feature_names(games_cat)
encode_df.head()

Unnamed: 0,GAME_DATE_EST_2003-10-05,GAME_DATE_EST_2003-10-06,GAME_DATE_EST_2003-10-07,GAME_DATE_EST_2003-10-08,GAME_DATE_EST_2003-10-09,GAME_DATE_EST_2003-10-10,GAME_DATE_EST_2003-10-11,GAME_DATE_EST_2003-10-12,GAME_DATE_EST_2003-10-13,GAME_DATE_EST_2003-10-14,...,GAME_DATE_EST_2021-03-13,GAME_DATE_EST_2021-03-14,GAME_DATE_EST_2021-03-15,GAME_DATE_EST_2021-03-16,GAME_DATE_EST_2021-03-17,GAME_DATE_EST_2021-03-18,GAME_DATE_EST_2021-03-19,GAME_DATE_EST_2021-03-20,GAME_DATE_EST_2021-03-21,GAME_STATUS_TEXT_Final
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0


In [6]:
# Merge one-hot encoded features and drop the originals
games_df = games_df.merge(encode_df,left_index=True, right_index=True)
games_df = games_df.drop(games_cat,1)
games_df.head()

Unnamed: 0,GAME_ID,HOME_TEAM_ID,VISITOR_TEAM_ID,SEASON,TEAM_ID_home,PTS_home,FG_PCT_home,FT_PCT_home,FG3_PCT_home,AST_home,...,GAME_DATE_EST_2021-03-13,GAME_DATE_EST_2021-03-14,GAME_DATE_EST_2021-03-15,GAME_DATE_EST_2021-03-16,GAME_DATE_EST_2021-03-17,GAME_DATE_EST_2021-03-18,GAME_DATE_EST_2021-03-19,GAME_DATE_EST_2021-03-20,GAME_DATE_EST_2021-03-21,GAME_STATUS_TEXT_Final
0,22000645,1610612748,1610612754,2020,1610612748,106.0,0.402,0.826,0.243,27.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0
1,22000016,1610612745,1610612760,2020,1610612745,112.0,0.494,0.741,0.368,22.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0
2,22000646,1610612743,1610612740,2020,1610612743,108.0,0.5,0.8,0.353,30.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0
3,22000167,1610612738,1610612753,2020,1610612738,112.0,0.455,1.0,0.426,27.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0
4,22000647,1610612751,1610612764,2020,1610612751,113.0,0.489,0.727,0.31,24.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0


In [7]:
# Remove loan status target from features data
y = games_df.HOME_TEAM_WINS
X = games_df.drop(columns=["HOME_TEAM_WINS"])

# Split training/test datasets
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42, stratify=y)

# Create a StandardScaler instance
scaler = StandardScaler()

# Fit the StandardScaler
X_scaler = scaler.fit(X_train)

# Scale the data
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

In [8]:
from sklearn.ensemble import RandomForestClassifier

# Create a random forest classifier.
rf_model = RandomForestClassifier(n_estimators=128, random_state=78)

# Fitting the model
rf_model = rf_model.fit(X_train_scaled, y_train)

# Evaluate the model
y_pred = rf_model.predict(X_test_scaled)
print(f" Random forest predictive accuracy: {accuracy_score(y_test,y_pred):.3f}")

ValueError: Input contains NaN, infinity or a value too large for dtype('float32').

In [9]:
# Define the model - deep neural net
number_input_features = len(X_train_scaled[0])
hidden_nodes_layer1 =  24
hidden_nodes_layer2 = 12

nn = tf.keras.models.Sequential()

# First hidden layer
nn.add(
    tf.keras.layers.Dense(units=hidden_nodes_layer1, input_dim=number_input_features, activation="relu")
)

# Second hidden layer
nn.add(tf.keras.layers.Dense(units=hidden_nodes_layer2, activation="relu"))


# Output layer
nn.add(tf.keras.layers.Dense(units=1, activation="sigmoid"))

# Compile the Sequential model together and customize metrics
nn.compile(loss="binary_crossentropy", optimizer="adam", metrics=["accuracy"])

# Train the model
fit_model = nn.fit(X_train_scaled, y_train, epochs=50)

# Evaluate the model using the test data
model_loss, model_accuracy = nn.evaluate(X_test_scaled,y_test,verbose=2)
print(f"Loss: {model_loss}, Accuracy: {model_accuracy}")


NameError: name 'tf' is not defined