In [1409]:
#standard ds imports
import pandas as pd
import numpy as np
#viz and stats
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
# .py imports
#import wranglerer as wr
#import modeling as md
import os
#sklearn imports
from sklearn.model_selection import train_test_split,cross_val_score, GridSearchCV
import sklearn.preprocessing
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import classification_report, accuracy_score,confusion_matrix, plot_confusion_matrix
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
#CATboost imports
from catboost import CatBoostClassifier
from sklearn.preprocessing import MinMaxScaler
from sklearn.neural_network import MLPClassifier
import explore_r as ex
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.naive_bayes import BernoulliNB

from sklearn.feature_selection import SelectKBest, f_classif

from sklearn.cluster import DBSCAN
from sklearn.cluster import KMeans

import warnings
warnings.filterwarnings('ignore')

In [4]:
df = pd.read_csv('prepped_data.csv')

In [11]:
df.columns

Index(['day_of_week', 'start_time', 'week_num', 'stadium', 'temp', 'humidity',
       'wind', 'spread', 'ou', 'is_under', 'abnormal_start', 'is_playoff',
       'playoff_implications', 'is_turf', 'is_outdoor'],
      dtype='object')

In [6]:
df = df.drop(columns=['date','home_score',
       'home_wins', 'away_score', 'away_wins','total_scores'])
df['spread'] = abs(df['spread'])

In [7]:
X_train, y_train, X_validate, y_validate, X_test, y_test = ex.train_validate_test(df,'is_under')

In [8]:
X_train.shape, y_train.shape, X_validate.shape, y_validate.shape, X_test.shape, y_test.shape 

((6471, 14), (6471,), (2394, 14), (2394,), (1946, 14), (1946,))

In [None]:
X_train.head()

# MODELING
## CATBoost

In [9]:
# Create and fit the thing
CATb = CatBoostClassifier(verbose=False,depth=5)
CATb.fit(X_train,y_train,cat_features=['day_of_week','start_time','stadium'])
CATb_preds = CATb.predict(X_train)
pd.crosstab(CATb_preds,y_train) # a confusion matrix with ACTUALS as columns and PREDICTIONS as rows

is_under,0,1
row_0,Unnamed: 1_level_1,Unnamed: 2_level_1
0,2111,1137
1,1069,2154


In [10]:
print(f'Accuracy-Train {round(CATb.score(X_train,y_train),4)}')
print(f'Accuracy-Validate {round(CATb.score(X_validate,y_validate),4)}')
print(classification_report(y_train,CATb_preds))
print(classification_report(y_validate,CATb.predict(X_validate)))

Accuracy-Train 0.6591
Accuracy-Validate 0.5063
              precision    recall  f1-score   support

           0       0.65      0.66      0.66      3180
           1       0.67      0.65      0.66      3291

    accuracy                           0.66      6471
   macro avg       0.66      0.66      0.66      6471
weighted avg       0.66      0.66      0.66      6471

              precision    recall  f1-score   support

           0       0.50      0.51      0.50      1178
           1       0.51      0.51      0.51      1216

    accuracy                           0.51      2394
   macro avg       0.51      0.51      0.51      2394
weighted avg       0.51      0.51      0.51      2394



## CATboost grid_search CROSS_VALIDATION k=5

In [14]:
param_grid = {
    'verbose': [False],
    'depth': [5, 10,15]
}
gr_search = GridSearchCV(CatBoostClassifier(),
                      param_grid)

In [15]:
gr_search

In [16]:
gr_search.fit(X_train, y_train,cat_features=['day_of_week','start_time','stadium'])

In [17]:
results = gr_search.cv_results_
results_df_init = pd.DataFrame(results)
results_df_init.shape

(3, 15)

In [18]:
params = pd.DataFrame(results['params'])
results_df_init.sort_values(by='rank_test_score').head()

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_depth,param_verbose,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
0,2.767949,0.020285,0.002929,0.00022,5,False,"{'depth': 5, 'verbose': False}",0.528185,0.512365,0.51391,0.515456,0.525502,0.519084,0.006467,1
1,6.124596,0.038884,0.004495,0.000132,10,False,"{'depth': 10, 'verbose': False}",0.498842,0.503864,0.518547,0.507728,0.513138,0.508424,0.006896,2
2,87.300167,1.279427,0.039538,0.034079,15,False,"{'depth': 15, 'verbose': False}",0.495753,0.513138,0.50541,0.512365,0.510046,0.507342,0.00639,3


# start removing features 
### Decision Tree

In [227]:
df = pd.read_csv('prepped_data.csv')

In [228]:
df.columns

Index(['date', 'day_of_week', 'start_time', 'week_num', 'home_score',
       'home_wins', 'away_score', 'away_wins', 'stadium', 'temp', 'humidity',
       'wind', 'spread', 'ou', 'is_under', 'abnormal_start', 'total_scores',
       'is_playoff', 'playoff_implications', 'is_turf', 'is_outdoor'],
      dtype='object')

In [229]:
df.drop(columns=['date', 'day_of_week', 'spread', 'home_score',
       'home_wins', 'away_score', 'away_wins', 'stadium',
        'abnormal_start', 'total_scores', 'is_outdoor'], inplace=True)

In [230]:
df

Unnamed: 0,start_time,week_num,temp,humidity,wind,ou,is_under,is_playoff,playoff_implications,is_turf
0,6,19,72,0,0,51.0,0,1,1,0
1,3,19,52,48,14,45.5,1,1,1,0
2,6,19,22,55,13,48.0,1,1,1,0
3,3,19,32,10,0,49.0,1,1,1,1
4,6,19,55,47,19,47.0,1,1,1,0
...,...,...,...,...,...,...,...,...,...,...
10806,4,1,76,71,8,37.0,1,0,0,1
10807,4,1,73,76,10,41.0,0,0,0,0
10808,4,1,70,77,10,36.5,0,0,0,0
10809,4,1,72,0,0,42.5,0,0,0,1


In [231]:
X_train, y_train, X_validate, y_validate, X_test, y_test = ex.train_validate_test(df,'is_under')

In [259]:
depth = [i for i in range(1, 21, 1)] + [None]
param_grid = {
    'criterion':['gini', "entropy", "log_loss"],
    'splitter':['best'],
    'max_depth': depth,
    'min_samples_split':[i for i in range(1, 21, 1)],
    'min_samples_leaf':[i for i in range(1, 21, 1)],
}
gr_search = GridSearchCV(DecisionTreeClassifier(),
                      param_grid)

In [260]:
gr_search

In [261]:
gr_search.fit(X_train, y_train)

In [262]:
results = gr_search.cv_results_
results_df_init = pd.DataFrame(results)
results_df_init.shape

(25200, 18)

In [263]:
results_df_init.loc[results_df_init['mean_test_score'].idxmax()]

mean_fit_time                                                       0.005544
std_fit_time                                                        0.000097
mean_score_time                                                       0.0005
std_score_time                                                      0.000006
param_criterion                                                      entropy
param_max_depth                                                            6
param_min_samples_leaf                                                     5
param_min_samples_split                                                    2
param_splitter                                                          best
params                     {'criterion': 'entropy', 'max_depth': 6, 'min_...
split0_test_score                                                   0.505792
split1_test_score                                                   0.528594
split2_test_score                                                   0.523184

In [264]:
results_df_init[results_df_init['mean_test_score'] == results_df_init['mean_test_score'].max()]

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_criterion,param_max_depth,param_min_samples_leaf,param_min_samples_split,param_splitter,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
10481,0.005544,9.7e-05,0.0005,6e-06,entropy,6,5,2,best,"{'criterion': 'entropy', 'max_depth': 6, 'min_...",0.505792,0.528594,0.523184,0.528594,0.523184,0.521869,0.008395,1
10482,0.005527,3.4e-05,0.000503,6e-06,entropy,6,5,3,best,"{'criterion': 'entropy', 'max_depth': 6, 'min_...",0.505792,0.528594,0.523184,0.528594,0.523184,0.521869,0.008395,1
10485,0.005484,3.9e-05,0.00055,9.4e-05,entropy,6,5,6,best,"{'criterion': 'entropy', 'max_depth': 6, 'min_...",0.505792,0.528594,0.523184,0.528594,0.523184,0.521869,0.008395,1
10487,0.005513,4.8e-05,0.000503,7e-06,entropy,6,5,8,best,"{'criterion': 'entropy', 'max_depth': 6, 'min_...",0.505792,0.528594,0.523184,0.528594,0.523184,0.521869,0.008395,1
10489,0.005523,6.2e-05,0.000506,1.5e-05,entropy,6,5,10,best,"{'criterion': 'entropy', 'max_depth': 6, 'min_...",0.505792,0.528594,0.523184,0.528594,0.523184,0.521869,0.008395,1
10490,0.005552,7.5e-05,0.000535,4.3e-05,entropy,6,5,11,best,"{'criterion': 'entropy', 'max_depth': 6, 'min_...",0.505792,0.528594,0.523184,0.528594,0.523184,0.521869,0.008395,1
10491,0.005484,2.9e-05,0.000503,3e-06,entropy,6,5,12,best,"{'criterion': 'entropy', 'max_depth': 6, 'min_...",0.505792,0.528594,0.523184,0.528594,0.523184,0.521869,0.008395,1
18882,0.005751,0.000134,0.000619,8.9e-05,log_loss,6,5,3,best,"{'criterion': 'log_loss', 'max_depth': 6, 'min...",0.505792,0.528594,0.523184,0.528594,0.523184,0.521869,0.008395,1
18883,0.005602,0.000119,0.00055,3.5e-05,log_loss,6,5,4,best,"{'criterion': 'log_loss', 'max_depth': 6, 'min...",0.505792,0.528594,0.523184,0.528594,0.523184,0.521869,0.008395,1
18884,0.005513,5.9e-05,0.000508,1.5e-05,log_loss,6,5,5,best,"{'criterion': 'log_loss', 'max_depth': 6, 'min...",0.505792,0.528594,0.523184,0.528594,0.523184,0.521869,0.008395,1


In [265]:
param_grid = {
    'criterion': ['gini'],
    'max_depth': [6],
      'min_samples_leaf': [5],
      'min_samples_split': [2],
      'splitter': ['best']}
gr_search1 = GridSearchCV(DecisionTreeClassifier(),
                      param_grid)

In [266]:
gr_search1


In [267]:
gr_search1.fit(X_train, y_train)

In [268]:
results1 = gr_search1.cv_results_
results_df_init1 = pd.DataFrame(results1)
results_df_init1.shape

(1, 18)

In [269]:
results_df_init1

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_criterion,param_max_depth,param_min_samples_leaf,param_min_samples_split,param_splitter,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
0,0.010926,0.00622,0.001102,0.000564,gini,6,5,2,best,"{'criterion': 'gini', 'max_depth': 6, 'min_sam...",0.495753,0.507728,0.508501,0.517774,0.527048,0.511361,0.010509,1


# Trying something different

In [1479]:
 def train_validate_test(df,target):
    """
    Splits data into 3 segments and stratifies on target
    requires the dataframe and target as args
    """
    train_val, test = train_test_split(df,
                                       train_size=0.8,
                                       random_state=706,
                                       stratify=df[target])
    train, validate = train_test_split(train_val,
                                       train_size=0.7,
                                       random_state=706,
                                       stratify=train_val[target])
    return train, validate, test

In [1488]:
df = pd.read_csv('prepped_data.csv')

In [1489]:
# df = pd.get_dummies(df, columns=['day_of_week', 'stadium'])

In [1490]:
columns = df.columns.to_list()

In [1491]:
columns.remove('is_under')
columns.remove('date')
columns.remove('total_scores')
columns.remove('home_score')
columns.remove('away_score')
columns.remove('day_of_week')
columns.remove('stadium')

In [1492]:
def scale_data(train, val, test):
    x_cols = columns
    split = [train, val, test]
    scale_list= []
    scaler = MinMaxScaler()
    scaler.fit(train[x_cols])
    for cut in split:
        cut_copy = cut.copy()
        cut_copy[x_cols] = scaler.transform(cut_copy[x_cols])
        scale_list.append(cut_copy)
    return scale_list[0], scale_list[1], scale_list[2] 

In [1493]:
train, val, test = train_validate_test(df, 'is_under')

In [1494]:
train.shape, val.shape, test.shape

((6053, 21), (2595, 21), (2163, 21))

In [1495]:
train_scaled, val_scaled, test_scaled = scale_data(train, val, test)



In [1496]:
baseline_accuracy = train.is_under.value_counts(normalize=True)[1]

In [1497]:
baseline_accuracy

0.5073517264166529

In [669]:
def get_target_and_features(train_scale, val_scale, test_scale):    
    x_cols = columns
    y_cols = 'is_under'

    x_train = train_scale[x_cols]
    y_train = train_scale[y_cols]

    x_val = val_scale[x_cols]
    y_val = val_scale[y_cols]

    x_test = test_scale[x_cols]
    y_test = test_scale[y_cols]
    return x_train, y_train, x_val, y_val, x_test, y_test

In [1498]:
x_train, y_train, x_val, y_val, x_test, y_test = get_target_and_features(train_scaled, val_scaled, test_scaled)

In [1500]:
def get_decisionTree_model(depth):
    """
    Returns a decision treen model with a max depth arg
    prints out the Accuracy of train and validate and the 
    classification report
    """
    clf = DecisionTreeClassifier(max_depth=depth, random_state=706)
    #class_weight='balanced'
    # fit the thing
    clf.fit(x_train, y_train)

    model_proba = clf.predict_proba(x_train)
    model_preds = clf.predict(x_train)

    model_score = clf.score(x_train, y_train)

    #classification report:
    print(
        classification_report(y_train,
                          model_preds))
    print('Accuracy of Random Tree classifier on training set: {:.3f}'
     .format(clf.score(x_train, y_train)))
    print('Accuracy of Random Tree classifier on validation set: {:.3f}'
     .format(clf.score(x_val, y_val)))
    return clf, model_preds

In [1656]:
get_decisionTree_model(3)

              precision    recall  f1-score   support

           0       0.50      0.90      0.65      2982
           1       0.60      0.14      0.22      3071

    accuracy                           0.52      6053
   macro avg       0.55      0.52      0.44      6053
weighted avg       0.55      0.52      0.43      6053

Accuracy of Random Tree classifier on training set: 0.515
Accuracy of Random Tree classifier on validation set: 0.503


(DecisionTreeClassifier(max_depth=3, random_state=706),
 array([0, 0, 0, ..., 0, 0, 0]))

In [790]:
x_train

Unnamed: 0,start_time,week_num,home_wins,away_wins,temp,humidity,wind,spread,ou,abnormal_start,...,stadium_The Coliseum,stadium_Three Rivers Stadium,stadium_Tiger Stadium (LSU),stadium_Tottenham Stadium,stadium_Twickenham Stadium,stadium_U.S. Bank Stadium,stadium_University of Phoenix Stadium,stadium_Vanderbilt Stadium,stadium_Veterans Stadium,stadium_Wembley Stadium
10538,0.000000,0.166667,0.15,0.10,0.742268,0.000000,0.000000,0.226415,0.422535,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1913,0.272727,0.888889,0.50,0.35,0.381443,0.565657,0.128571,0.264151,0.422535,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9647,0.000000,0.444444,0.00,0.10,0.742268,0.000000,0.000000,0.169811,0.338028,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9828,0.272727,0.666667,0.50,0.25,0.443299,0.616162,0.114286,0.377358,0.746479,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8253,0.000000,0.833333,0.50,0.30,0.082474,0.686869,0.171429,0.245283,0.197183,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5080,0.000000,0.777778,0.45,0.30,0.381443,0.898990,0.142857,0.188679,0.253521,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4856,0.000000,0.611111,0.30,0.30,0.742268,0.000000,0.000000,0.226415,0.492958,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6428,0.272727,0.555556,0.30,0.25,0.597938,0.656566,0.142857,0.056604,0.281690,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9,0.272727,1.000000,0.65,0.45,0.742268,0.000000,0.000000,0.113208,0.563380,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0


In [560]:
def get_random_forest():
    """
    Runs through two for loops from range 1 - 5 each time increasing the max depth 
    and min sample leaf
    puts all of the models in a pandas data frame and sorts for the hightes valadation 
    Prints out the classification report on the best model
    """
    
    model_list = []

    for j in range (1, 15):
        for i in range(2, 15):
            rf = RandomForestClassifier(n_estimators=101 ,max_depth=i, min_samples_leaf=j, random_state=706)

            rf = rf.fit(x_train, y_train)
            train_accuracy = rf.score(x_train, y_train)
            validate_accuracy = rf.score(x_val, y_val)
            model_preds = rf.predict(x_train)

            output = {
                "min_samples_per_leaf": j,
                "max_depth": i,
                "train_accuracy": train_accuracy,
                "validate_accuracy": validate_accuracy,
                'model_preds': model_preds
            }
            model_list.append(output)
            
    df = pd.DataFrame(model_list)
    df["difference"] = df.train_accuracy - df.validate_accuracy
    df["baseline_accuracy"] = baseline_accuracy
    # df[df.validate_accuracy > df.baseline_accuracy + .05].sort_values(by=['difference'], ascending=True).head(15)
    df.sort_values(by=['validate_accuracy'], ascending=False).head(1)
    
    #classification report:
    print(classification_report(y_train, df['model_preds'][1]))
    return df.sort_values(by=['validate_accuracy'], ascending=False).head(1)
    


In [980]:
get_random_forest()

              precision    recall  f1-score   support

           0       0.74      0.16      0.26      2982
           1       0.54      0.95      0.68      3071

    accuracy                           0.56      6053
   macro avg       0.64      0.55      0.47      6053
weighted avg       0.63      0.56      0.47      6053



Unnamed: 0,min_samples_per_leaf,max_depth,train_accuracy,validate_accuracy,model_preds,difference,baseline_accuracy
4,1,6,0.616884,0.532948,"[0, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 1, 0, 1, 0, ...",0.083936,0.507352


In [1637]:
def get_logReg_model(data):
    """
    build a logistical regression model and prints out the accuracy on training and validation along with the classification report. 
    Must type in train_val as your data arrg to get the train val result.
    Type test if you want to test the model
    if you want a csv of the model preds and preds proba then un comment all of the stuff at the bottom
    """
    logit = LogisticRegression(random_state=706)
    logit.fit(x_train, y_train)
    y_pred = logit.predict(x_train)
    y_pred_val = logit.predict(x_val)
    y_proba = logit.predict_proba(x_train)
    logit_val = logit.predict(x_val)
    if data == 'train_val':
        print('Accuracy of Logistic Regression classifier on training set: {:.3f}'
         .format(logit.score(x_train, y_train)))
        print('Accuracy of Logistic Regression classifier on validation set: {:.3f}'
         .format(logit.score(x_val, y_val)))
        print(
        classification_report(y_train,
                          y_pred))
        print(
        classification_report(y_val,
                          y_pred_val))
    else: 
        print('Accuracy of logistic regression classifier on test set: {:.3f}'
         .format(logit.score(x_test, y_test)))

In [1653]:
get_logReg_model('train_val')


Accuracy of Logistic Regression classifier on training set: 0.556
Accuracy of Logistic Regression classifier on validation set: 0.525
              precision    recall  f1-score   support

           0       0.55      0.54      0.54      2982
           1       0.56      0.57      0.57      3071

    accuracy                           0.56      6053
   macro avg       0.56      0.56      0.56      6053
weighted avg       0.56      0.56      0.56      6053

              precision    recall  f1-score   support

           0       0.52      0.48      0.50      1278
           1       0.53      0.57      0.55      1317

    accuracy                           0.52      2595
   macro avg       0.52      0.52      0.52      2595
weighted avg       0.52      0.52      0.52      2595



In [1644]:
bnb = BernoulliNB()
# Fit the model using the training data
bnb.fit(x_train, y_train)

# Make predictions on the testing data
y_pred = bnb.predict(x_train)
y_pred_val = bnb.predict(x_val)

# Calculate the accuracy of the model
print(classification_report(y_train,
                          y_pred))
print('Accuracy of Logistic Regression classifier on training set: {:.3f}'
 .format(bnb.score(x_train, y_train)))
print('Accuracy of Logistic Regression classifier on validation set: {:.3f}'
 .format(bnb.score(x_val, y_val)))
print(classification_report(y_val,
                          y_pred_val))

              precision    recall  f1-score   support

           0       0.55      0.26      0.35      2982
           1       0.53      0.80      0.63      3071

    accuracy                           0.53      6053
   macro avg       0.54      0.53      0.49      6053
weighted avg       0.54      0.53      0.49      6053

Accuracy of Logistic Regression classifier on training set: 0.531
Accuracy of Logistic Regression classifier on validation set: 0.514
              precision    recall  f1-score   support

           0       0.51      0.24      0.33      1278
           1       0.51      0.78      0.62      1317

    accuracy                           0.51      2595
   macro avg       0.51      0.51      0.48      2595
weighted avg       0.51      0.51      0.48      2595



In [1502]:
k = 8

x_train.columns

x = train[['start_time', 'week_num', 'home_wins', 'away_wins', 'temp', 'humidity',
       'wind', 'spread', 'ou', 'abnormal_start', 'is_playoff',
       'playoff_implications', 'is_turf', 'is_outdoor']]

y = y_train

y_train

selector = SelectKBest(score_func=f_classif, k=k)

selector.fit(x, y)

mask = selector.get_support()

X_selected = x.iloc[:, mask]


In [1244]:
X_selected.columns.to_list()

['start_time', 'home_wins', 'wind', 'ou', 'is_turf', 'is_outdoor']

In [761]:
columns = X_selected.columns.to_list()

In [555]:
df[columns]

Unnamed: 0,start_time,home_wins,humidity,wind,ou,is_turf,is_outdoor
0,6,16,0,0,51.0,0,0
1,3,15,48,14,45.5,0,1
2,6,15,55,13,48.0,0,1
3,3,14,10,0,49.0,1,1
4,6,14,47,19,47.0,0,1
...,...,...,...,...,...,...,...
10806,4,0,71,8,37.0,1,1
10807,4,0,76,10,41.0,0,1
10808,4,0,77,10,36.5,0,1
10809,4,0,0,0,42.5,1,0


## Neural Net sklearn

In [565]:
from sklearn.neural_network import MLPClassifier

In [1516]:
x_train

Unnamed: 0,start_time,week_num,home_wins,away_wins,temp,humidity,wind,spread,ou,abnormal_start,is_playoff,playoff_implications,is_turf,is_outdoor
10538,0.000000,0.166667,0.15,0.10,0.742268,0.000000,0.000000,0.226415,0.422535,0.0,0.0,0.0,1.0,0.0
1913,0.272727,0.888889,0.50,0.35,0.381443,0.565657,0.128571,0.264151,0.422535,0.0,0.0,1.0,0.0,1.0
9647,0.000000,0.444444,0.00,0.10,0.742268,0.000000,0.000000,0.169811,0.338028,0.0,0.0,0.0,1.0,0.0
9828,0.272727,0.666667,0.50,0.25,0.443299,0.616162,0.114286,0.377358,0.746479,1.0,0.0,1.0,1.0,1.0
8253,0.000000,0.833333,0.50,0.30,0.082474,0.686869,0.171429,0.245283,0.197183,0.0,0.0,1.0,0.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5080,0.000000,0.777778,0.45,0.30,0.381443,0.898990,0.142857,0.188679,0.253521,0.0,0.0,1.0,0.0,1.0
4856,0.000000,0.611111,0.30,0.30,0.742268,0.000000,0.000000,0.226415,0.492958,0.0,0.0,1.0,1.0,0.0
6428,0.272727,0.555556,0.30,0.25,0.597938,0.656566,0.142857,0.056604,0.281690,0.0,0.0,1.0,0.0,1.0
9,0.272727,1.000000,0.65,0.45,0.742268,0.000000,0.000000,0.113208,0.563380,0.0,1.0,1.0,1.0,0.0


In [996]:
# x_train.columns.to_list()

In [1721]:
# Create an MLP classifier object
clf = MLPClassifier(hidden_layer_sizes=(120, 60, 30), activation='relu', solver='adam', learning_rate='constant', random_state=706)

# Fit the model to the training data
clf.fit(x_train, y_train)


model_proba = clf.predict_proba(x_train)
model_preds = clf.predict(x_train)
model_preds_val = clf.predict(x_val)
model_score = clf.score(x_train, y_train)

#classification report:
print(
    classification_report(y_train,
                      model_preds))
print('Accuracy of Random Tree classifier on training set: {:.3f}'
 .format(clf.score(x_train, y_train)))
print('Accuracy of Random Tree classifier on validation set: {:.3f}'
 .format(clf.score(x_val, y_val)))
print(
    classification_report(y_val,
                      model_preds_val))

              precision    recall  f1-score   support

           0       0.57      0.45      0.50      2982
           1       0.56      0.68      0.61      3071

    accuracy                           0.56      6053
   macro avg       0.57      0.56      0.56      6053
weighted avg       0.57      0.56      0.56      6053

Accuracy of Random Tree classifier on training set: 0.564
Accuracy of Random Tree classifier on validation set: 0.505
              precision    recall  f1-score   support

           0       0.50      0.38      0.43      1278
           1       0.51      0.63      0.56      1317

    accuracy                           0.50      2595
   macro avg       0.50      0.50      0.50      2595
weighted avg       0.50      0.50      0.50      2595



## Knn

In [1720]:
k = 3
knn = KNeighborsClassifier(n_neighbors=k)

knn.fit(x_train, y_train)

y_pred = knn.predict(x_val)

print('Accuracy of Logistic Regression classifier on training set: {:.2f}'
 .format(knn.score(x_train, y_train)))
print('Accuracy of Logistic Regression classifier on validation set: {:.2f}'
 .format(knn.score(x_val, y_val)))

print(classification_report(y_val,
                          y_pred))

Accuracy of Logistic Regression classifier on training set: 0.68
Accuracy of Logistic Regression classifier on validation set: 0.49
              precision    recall  f1-score   support

           0       0.48      0.36      0.41      1278
           1       0.50      0.62      0.55      1317

    accuracy                           0.49      2595
   macro avg       0.49      0.49      0.48      2595
weighted avg       0.49      0.49      0.48      2595



In [1754]:
def scale_df(df):
    x_cols = columns
    scaler = StandardScaler()
    scaler.fit(df[x_cols])
    df[x_cols] = scaler.transform(df[x_cols])
    return df
    

# DBScan

In [1755]:
df2 = df.copy()

df2.drop(columns=['date', 'total_scores', 'home_score', 'away_score'], inplace=True)

columns = df2.columns.to_list()

columns.remove('is_under')
# columns.remove('total_scores')
# columns.remove('home_score')
# columns.remove('away_score')
columns.remove('week_num')
columns.remove('day_of_week')
columns.remove('stadium')
# columns.remove('start_time')

df2_scaled = scale_df(df2)


drop_cols = ['start_time', 'is_under']
cols = ['start_time', 'is_under', 'humidity','wind', 'temp']
df2_scaled = df2_scaled[cols]


dbscan = DBSCAN(eps=0.05, min_samples=5)

# Fit the DBSCAN model to the data
dbscan.fit(df2_scaled.drop(columns=drop_cols))

# Access the labels assigned to each data point
labels = dbscan.labels_

# Access the core samples identified by DBSCAN
core_samples = dbscan.core_sample_indices_

# Print the number of clusters (excluding noise points)
num_clusters = len(set(labels)) - (1 if -1 in labels else 0)
print("Number of clusters:", num_clusters)

# Print the cluster labels and core sample indices
print("Cluster labels:", labels)
print("Core samples indices:", core_samples)

df2_scaled['cluster']= labels

# kmeans = KMeans(n_clusters=7)

# # Fit the K-means model to your data
# kmeans.fit(df2_scaled.drop(columns=['is_under']))

# # Obtain the cluster labels for each data point
# labels = kmeans.labels_

# # Add the cluster labels as a new column to the DataFrame
# df2_scaled['cluster'] = labels

df2_scaled = pd.get_dummies(df2_scaled ,columns=['cluster'])


train, val, test = train_validate_test(df2_scaled, 'is_under')


train.shape, val.shape, test.shape

columns = df2_scaled.columns.to_list()

columns.remove('is_under')

x_train, y_train, x_val, y_val, x_test, y_test = get_target_and_features(train, val, test)

Number of clusters: 64
Cluster labels: [ 0 -1 -1 ... -1  0 -1]
Core samples indices: [    0     9    13 ... 10790 10804 10809]


In [1756]:
x_train

Unnamed: 0,start_time,humidity,wind,temp,cluster_-1,cluster_0,cluster_1,cluster_2,cluster_3,cluster_4,...,cluster_54,cluster_55,cluster_56,cluster_57,cluster_58,cluster_59,cluster_60,cluster_61,cluster_62,cluster_63
10538,-0.781333,-1.319991,-1.128770,0.736348,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1913,0.270771,0.382945,0.348826,-1.472806,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
9647,-0.781333,-1.319991,-1.128770,0.736348,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
9828,0.270771,0.534993,0.184648,-1.094094,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
8253,-0.781333,0.747860,0.841357,-3.303248,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5080,-0.781333,1.386461,0.513003,-1.472806,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4856,-0.781333,-1.319991,-1.128770,0.736348,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
6428,0.270771,0.656631,0.513003,-0.147314,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
9,0.270771,-1.319991,-1.128770,0.736348,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [1757]:
get_logReg_model('train_val')

Accuracy of Logistic Regression classifier on training set: 0.519
Accuracy of Logistic Regression classifier on validation set: 0.528
              precision    recall  f1-score   support

           0       0.51      0.50      0.50      2982
           1       0.53      0.54      0.53      3071

    accuracy                           0.52      6053
   macro avg       0.52      0.52      0.52      6053
weighted avg       0.52      0.52      0.52      6053

              precision    recall  f1-score   support

           0       0.52      0.48      0.50      1278
           1       0.53      0.57      0.55      1317

    accuracy                           0.53      2595
   macro avg       0.53      0.53      0.53      2595
weighted avg       0.53      0.53      0.53      2595



In [None]:
# Create a DBSCAN object
dbscan = DBSCAN(eps=0.3, min_samples=10, algorithm='brute')

# Fit the DBSCAN model to the data
dbscan.fit(df2_scaled)

# Access the labels assigned to each data point
labels = dbscan.labels_

# Access the core samples identified by DBSCAN
core_samples = dbscan.core_sample_indices_

# Print the number of clusters (excluding noise points)
num_clusters = len(set(labels)) - (1 if -1 in labels else 0)
print("Number of clusters:", num_clusters)

# Print the cluster labels and core sample indices
print("Cluster labels:", labels)
print("Core samples indices:", core_samples)

df2_scaled['cluster']= labels