In [1]:
# Import the modules
import numpy as np
import pandas as pd
from pathlib import Path
from sklearn.metrics import balanced_accuracy_score, confusion_matrix, classification_report

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import tensorflow as tf

## Split the Data into Training and Testing Sets

In [3]:
# Read the CSV file 
nba_df = pd.read_csv("C:/Users/tsswi/OneDrive/Desktop/DU_Classwork/Module_23_Project4/merged_df.csv")

# Review the DataFrame
nba_df.head()

Unnamed: 0,GAME_DATE_EST,game_id,home_team,away_team,SEASON,PTS_home,FG_PCT_home,FT_PCT_home,FG3_PCT_home,AST_home,...,over_odds,PTS_comb_actual,book_name_under,book_id_under,total1_under,under_odds,point_average_last10,point_againts_average_last10,away_point_average_last10,away_point_againts_average_last10
0,2017-09-30,11700001,1610612744,1610612743,2017,102.0,0.411,0.833,0.121,17.0,...,-110.0,210.0,Pinnacle Sports,238,222.5,108.0,116.226415,104.320755,111.176893,110.534142
1,2017-09-30,11700002,1610612747,1610612750,2017,99.0,0.441,0.706,0.167,27.0,...,-110.0,207.0,Bookmaker,93,216.0,-102.0,104.43554,111.298407,105.205303,105.693687
2,2017-11-01,21700111,1610612752,1610612745,2017,97.0,0.433,0.846,0.308,23.0,...,-107.0,216.0,Pinnacle Sports,238,215.5,-102.0,99.916667,107.183333,110.787037,104.050926
3,2017-11-01,21700107,1610612755,1610612737,2017,119.0,0.465,0.56,0.371,36.0,...,-104.0,228.0,5Dimes,19,209.0,-105.0,105.028571,107.771429,101.091912,104.283088
4,2017-11-01,21700108,1610612764,1610612756,2017,116.0,0.446,0.8,0.333,20.0,...,-105.0,238.0,Heritage,169,228.0,-105.0,109.972222,104.277778,105.366667,115.866667


In [4]:
# Rename columns 
nba_df.rename(columns = {'point_againts_average_last10':'point_against_average_last10', 'away_point_againts_average_last10':'away_point_against_average_last10'}, inplace = True)

In [6]:
# Drop non-beneficial columns if applicable 
# nba_df.drop(columns=[''], inplace=True)

In [7]:
# Determine the number of unique values in each column.
nba_df.nunique()

GAME_DATE_EST                         198
game_id                              1210
home_team                              30
away_team                              30
SEASON                                  1
PTS_home                               68
FG_PCT_home                           223
FT_PCT_home                           174
FG3_PCT_home                          219
AST_home                               29
REB_home                               37
PTS_away                               68
FG_PCT_away                           225
FT_PCT_away                           181
FG3_PCT_away                          207
AST_away                               33
REB_away                               42
HOME_TEAM_WINS                          2
book_name_over                          9
book_id_over                            9
total1_over                            92
over_odds                              21
PTS_comb_actual                       106
book_name_under                   

In [14]:
# Look at XXXX value counts for binning
nba_df['point_average_last10'].value_counts()

107.050000    10
101.050000     9
102.150000     9
105.650000     9
102.250000     9
              ..
108.225000     1
93.337500      1
98.750000      1
102.011111     1
101.300000     1
Name: point_average_last10, Length: 491, dtype: int64

In [21]:
# Choose a cutoff value and create a list of XXXX to be replaced
# use the variable name `XXXX_types_to_replace`
last10_types = pd.DataFrame(nba_df[['point_average_last10', 'point_against_average_last10', 'away_point_average_last10', 'away_point_against_average_last10'].value_counts().reset_index()

last10_types_to_replace = []

for index, row in last10_types.iterrows():
    if row['point_average_last10','point_against_average_last10', 'away_point_average_last10', 'away_point_against_average_last10'] < 10:
        last10_types_to_replace.append(row['index'])
last10_types_to_replace 


# Replace in dataframe
for last10_types in last10_types_to_replace:
    nba_df[['point_average_last10'], ['point_against_average_last10'], ['away_point_average_last10'], ['away_point_against_average_last10']] = nba_df[['point_average_last10'], ['point_against_average_last10'], ['away_point_average_last10'], ['away_point_against_average_last10']].replace(last10_types,"Other")

# Check to make sure binning was successful
nba_df[['point_average_last10'], ['point_against_average_last10'], ['away_point_average_last10'], ['away_point_against_average_last10']].value_counts()

SyntaxError: invalid syntax (1168585527.py, line 5)

In [None]:
# Look at YYYY value counts for binning
yyyy_counts = nba_df['YYYY'].value_counts()
yyyy_counts

In [None]:
# Choose a cutoff value and create a list of YYYY to be replaced
# use the variable name `yyyy_to_replace`
yyyy_types = pd.DataFrame(nba_df['YYYY'].value_counts()).reset_index()

yyyy_to_replace = []

for index, row in yyyy.iterrows():
    if row['YYYY'] < #y:
        yyyy_to_replace.append(row['index'])
yyyy_to_replace 

# Replace in dataframe
for yyyy in yyyy_to_replace:
    nba_df['YYYY'] = nba_df['YYYY'].replace(yyyy,"Other")
    
# Check to make sure binning was successful
nba_df['YYYY'].value_counts()

In [None]:
# Convert categorical data to numeric with `pd.get_dummies`
nba_df = pd.get_dummies(nba_df)

In [None]:
# Split our preprocessed data into our features and target arrays
y = nba_df['']
X = nba_df.drop(columns='')

In [None]:
# Split the preprocessed data into a training and testing dataset
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state = 1)

In [None]:
# Create a StandardScaler instances
scaler = StandardScaler()

# Fit the StandardScaler
X_scaler = scaler.fit(X_train)

# Scale the data
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

## Compile, Train and Evaluate the Model

In [None]:
# Define the model - deep neural net, i.e., the number of input features and hidden nodes for each layer.
from tensorflow import keras
from tensorflow.keras import layers

nn = tf.keras.models.Sequential()

# First hidden layer
nn.add(layers.Dense(units = 100, activation = 'relu', input_dim = ))

# Second hidden layer
nn.add(layers.Dense(units = 50, activation = 'elu'))

# Output layer
nn.add(layers.Dense(units = 1, activation = 'sigmoid'))

# Check the structure of the model
nn.summary()

In [None]:
# Compile the model
nn.compile(loss ='binary_crossentropy', optimizer = 'adam', metrics = ['accuracy'])

In [None]:
# Train the model
fit_modelnn = nn.fit(X_train_scaled, y_train, epochs = 100)

In [None]:
# Evaluate the model using the test data
model_loss, model_accuracy = nn.evaluate(X_test_scaled,y_test,verbose=2)
print(f"Loss: {model_loss}, Accuracy: {model_accuracy}")