In [77]:
from urllib.parse import quote_plus

import pandas as pd
import psycopg2
import sqlalchemy as db
from imblearn.ensemble import EasyEnsembleClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sqlalchemy import create_engine

from config import db_password

In [78]:
# Build the SQLAlchemy connection URL for the local PostgreSQL database.
# - The dialect name must be "postgresql"; the legacy "postgres" alias was
#   removed in SQLAlchemy 1.4 and fails to load there.
# - The password is URL-encoded so characters such as "@", "/" or ":" in it
#   cannot corrupt the URL.
db_string = f"postgresql://postgres:{quote_plus(db_password)}@127.0.0.1:5432/NBA_Analysis_Final_Project"

In [79]:
# Prepare an empty metadata catalog, build the engine from the connection
# string, and open a live connection for the queries below
metadata = db.MetaData()
engine = create_engine(db_string)
connection = engine.connect()

In [80]:
# Reflect the "games" table definition from the database.
# `autoload_with` on its own triggers reflection; the separate
# `autoload=True` flag is deprecated in SQLAlchemy 1.4 and rejected by 2.0.
games_data = db.Table('games', metadata, autoload_with=engine)

In [81]:
# Build a SELECT of every column in the reflected "games" table.
# Table.select() is available on all SQLAlchemy releases, whereas the
# list-style select([table]) call is rejected by SQLAlchemy 2.0.
query = games_data.select()

In [82]:
# Run the SELECT on the open connection and pull every row into memory
results = (
    connection
    .execute(query)
    .fetchall()
)

In [83]:
# Transform the queried rows into a DataFrame.
# Column names are taken from the reflected table instead of
# results[0].keys(): an empty result set therefore yields an empty, correctly
# labelled frame rather than an IndexError, and the cell no longer relies on
# Row.keys(), which has been deprecated since SQLAlchemy 1.4.
games_df = pd.DataFrame(results, columns=games_data.columns.keys())

games_df.head()

Unnamed: 0,game_id,season,team_id_home,pts_home,fg_pct_home,ft_pct_home,fg3_pct_home,ast_home,reb_home,team_id_away,pts_away,fg_pct_away,ft_pct_away,fg3_pct_away,ast_away,reb_away,home_team_wins
0,21900895,2019,1610612766,85.0,0.354,0.9,0.2289999999999999,22.0,47.0,1610612749,93.0,0.402,0.762,0.226,20.0,61.0,0
1,21900896,2019,1610612750,91.0,0.364,0.4,0.31,19.0,57.0,1610612742,111.0,0.4679999999999999,0.632,0.275,28.0,56.0,0
2,21900897,2019,1610612746,136.0,0.5920000000000001,0.805,0.542,25.0,37.0,1610612755,130.0,0.505,0.65,0.488,27.0,37.0,1
3,21900898,2019,1610612743,133.0,0.5660000000000001,0.7,0.5,38.0,41.0,1610612761,118.0,0.461,0.897,0.263,24.0,36.0,1
4,21900899,2019,1610612758,106.0,0.407,0.885,0.257,18.0,51.0,1610612765,100.0,0.413,0.667,0.429,23.0,42.0,1


In [84]:
# Check each column's dtype to see which ones still need numeric conversion
games_df.dtypes

game_id            object
season             object
team_id_home       object
pts_home          float64
fg_pct_home        object
ft_pct_home        object
fg3_pct_home       object
ast_home          float64
reb_home          float64
team_id_away       object
pts_away          float64
fg_pct_away        object
ft_pct_away        object
fg3_pct_away       object
ast_away          float64
reb_away          float64
home_team_wins     object
dtype: object

In [85]:
# Cast every shooting-percentage column (home side first, then away) to float
pct_columns = [
    'fg_pct_home', 'ft_pct_home', 'fg3_pct_home',
    'fg_pct_away', 'ft_pct_away', 'fg3_pct_away',
]
for pct_col in pct_columns:
    games_df[pct_col] = games_df[pct_col].astype(float)

In [86]:
# Display the frame again to inspect the values after the float conversion
games_df

Unnamed: 0,game_id,season,team_id_home,pts_home,fg_pct_home,ft_pct_home,fg3_pct_home,ast_home,reb_home,team_id_away,pts_away,fg_pct_away,ft_pct_away,fg3_pct_away,ast_away,reb_away,home_team_wins
0,21900895,2019,1610612766,85.0,0.354,0.900,0.229,22.0,47.0,1610612749,93.0,0.402,0.762,0.226,20.0,61.0,0
1,21900896,2019,1610612750,91.0,0.364,0.400,0.310,19.0,57.0,1610612742,111.0,0.468,0.632,0.275,28.0,56.0,0
2,21900897,2019,1610612746,136.0,0.592,0.805,0.542,25.0,37.0,1610612755,130.0,0.505,0.650,0.488,27.0,37.0,1
3,21900898,2019,1610612743,133.0,0.566,0.700,0.500,38.0,41.0,1610612761,118.0,0.461,0.897,0.263,24.0,36.0,1
4,21900899,2019,1610612758,106.0,0.407,0.885,0.257,18.0,51.0,1610612765,100.0,0.413,0.667,0.429,23.0,42.0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
23091,11400007,2014,1610612737,93.0,0.419,0.821,0.421,24.0,50.0,1610612740,87.0,0.366,0.643,0.375,17.0,43.0,1
23092,11400004,2014,1610612741,81.0,0.338,0.719,0.381,18.0,40.0,1610612764,85.0,0.411,0.636,0.267,17.0,47.0,0
23093,11400005,2014,1610612747,98.0,0.448,0.682,0.500,29.0,45.0,1610612743,95.0,0.387,0.659,0.500,19.0,43.0,1
23094,11400002,2014,1610612761,99.0,0.440,0.771,0.333,21.0,30.0,1610612758,94.0,0.469,0.725,0.385,18.0,45.0,1


In [87]:
# Pick the model inputs: the same five box-score stats for the home side and
# then the away side, followed by the target column
feature_stats = ['fg_pct', 'ft_pct', 'fg3_pct', 'ast', 'reb']
model_df_columns = [
    f"{stat}_{side}"
    for side in ('home', 'away')
    for stat in feature_stats
]
model_df_columns.append('home_team_wins')

model_df = games_df.loc[:, model_df_columns]
model_df.head()

Unnamed: 0,fg_pct_home,ft_pct_home,fg3_pct_home,ast_home,reb_home,fg_pct_away,ft_pct_away,fg3_pct_away,ast_away,reb_away,home_team_wins
0,0.354,0.9,0.229,22.0,47.0,0.402,0.762,0.226,20.0,61.0,0
1,0.364,0.4,0.31,19.0,57.0,0.468,0.632,0.275,28.0,56.0,0
2,0.592,0.805,0.542,25.0,37.0,0.505,0.65,0.488,27.0,37.0,1
3,0.566,0.7,0.5,38.0,41.0,0.461,0.897,0.263,24.0,36.0,1
4,0.407,0.885,0.257,18.0,51.0,0.413,0.667,0.429,23.0,42.0,1


In [88]:
# Split the frame into the feature matrix (X) and the label vector (y)
X = model_df.drop("home_team_wins", axis=1)
y = model_df.loc[:, "home_team_wins"]

In [89]:
# Hold out a test partition; stratifying on y keeps the win/loss ratio the
# same in both partitions, and the fixed seed makes the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    stratify=y,
    random_state=1,
)

## SVC Model

In [90]:
# Instantiate a support vector classifier with a linear kernel
svcmodel = SVC(kernel="linear")

# Fit it on the training partition
svcmodel.fit(X=X_train, y=y_train)

SVC(kernel='linear')

In [91]:
# Predict the outcome of each held-out game with the fitted SVC
y_pred_svc = svcmodel.predict(X=X_test)

In [92]:
# Fraction of held-out games the SVC classified correctly
accuracy_score(y_true=y_test, y_pred=y_pred_svc)

0.8325251125736058

In [93]:
# Confusion matrix for the SVC (rows: actual class, columns: predicted class)
confusion_matrix(y_true=y_test, y_pred=y_pred_svc)

array([[1819,  520],
       [ 447, 2988]], dtype=int64)

In [94]:
# Per-class precision, recall and F1 for the SVC predictions
print(classification_report(y_true=y_test, y_pred=y_pred_svc))

              precision    recall  f1-score   support

           0       0.80      0.78      0.79      2339
           1       0.85      0.87      0.86      3435

    accuracy                           0.83      5774
   macro avg       0.83      0.82      0.83      5774
weighted avg       0.83      0.83      0.83      5774



## Logistic Regression

In [95]:
# Instantiate the logistic regression classifier (lbfgs solver, up to 200
# iterations, fixed seed)
lrmodel = LogisticRegression(solver="lbfgs", max_iter=200, random_state=1)

# Fit it on the training partition
lrmodel.fit(X=X_train, y=y_train)

LogisticRegression(max_iter=200, random_state=1)

In [96]:
# Predict the outcome of each held-out game with the fitted logistic regression
y_pred_lr = lrmodel.predict(X=X_test)

In [97]:
# Fraction of held-out games the logistic regression classified correctly
accuracy_score(y_true=y_test, y_pred=y_pred_lr)

0.8321787322480083

In [98]:
# Confusion matrix for the logistic regression (rows: actual, columns: predicted)
confusion_matrix(y_true=y_test, y_pred=y_pred_lr)

array([[1806,  533],
       [ 436, 2999]], dtype=int64)

In [99]:
# Per-class precision, recall and F1 for the logistic regression predictions
print(classification_report(y_true=y_test, y_pred=y_pred_lr))

              precision    recall  f1-score   support

           0       0.81      0.77      0.79      2339
           1       0.85      0.87      0.86      3435

    accuracy                           0.83      5774
   macro avg       0.83      0.82      0.82      5774
weighted avg       0.83      0.83      0.83      5774



## Random Forest

In [100]:
# Instantiate a random forest of 128 trees with a fixed seed
rfmodel = RandomForestClassifier(random_state=1, n_estimators=128)

# Fit it on the training partition
rfmodel.fit(X=X_train, y=y_train)

RandomForestClassifier(n_estimators=128, random_state=1)

In [101]:
# Predict the outcome of each held-out game with the fitted random forest
y_pred_rf = rfmodel.predict(X=X_test)

In [102]:
# Fraction of held-out games the random forest classified correctly
accuracy_score(y_true=y_test, y_pred=y_pred_rf)

0.8278489781780395

In [103]:
# Confusion matrix for the random forest (rows: actual, columns: predicted)
confusion_matrix(y_true=y_test, y_pred=y_pred_rf)

array([[1810,  529],
       [ 465, 2970]], dtype=int64)

In [104]:
# Per-class precision, recall and F1 for the random forest predictions
print(classification_report(y_true=y_test, y_pred=y_pred_rf))

              precision    recall  f1-score   support

           0       0.80      0.77      0.78      2339
           1       0.85      0.86      0.86      3435

    accuracy                           0.83      5774
   macro avg       0.82      0.82      0.82      5774
weighted avg       0.83      0.83      0.83      5774



## Easy Ensemble AdaBoost Classifier

In [105]:
# Instantiate an Easy Ensemble classifier with 100 estimators and a fixed seed
eamodel = EasyEnsembleClassifier(random_state=1, n_estimators=100)

# Fit it on the training partition
eamodel.fit(X=X_train, y=y_train)

EasyEnsembleClassifier(n_estimators=100, random_state=1)

In [106]:
# Predict the outcome of each held-out game with the fitted Easy Ensemble model
y_pred_ea = eamodel.predict(X=X_test)

In [107]:
# Fraction of held-out games the Easy Ensemble model classified correctly
accuracy_score(y_true=y_test, y_pred=y_pred_ea)

0.826117076550052

In [108]:
# Confusion matrix for the Easy Ensemble model (rows: actual, columns: predicted)
confusion_matrix(y_true=y_test, y_pred=y_pred_ea)

array([[1958,  381],
       [ 623, 2812]], dtype=int64)

In [109]:
# Per-class precision, recall and F1 for the Easy Ensemble predictions
print(classification_report(y_true=y_test, y_pred=y_pred_ea))

              precision    recall  f1-score   support

           0       0.76      0.84      0.80      2339
           1       0.88      0.82      0.85      3435

    accuracy                           0.83      5774
   macro avg       0.82      0.83      0.82      5774
weighted avg       0.83      0.83      0.83      5774

