In [1]:
from sklearn import tree
import numpy as np
import pandas as pd
import os

# Seed for NumPy's global RNG; the experiment cells pass it to np.random.seed
# before shuffling, so the train/test splits are repeatable.
rng_seed = 0

In [2]:
# Model inputs, in order: FGA, FGP, TPP, FT, DRB, TRB, TOV, PF
feature_cols = [
    "attempted_field_goals",
    "field_goal_percentage",
    "three_point_percentage",
    "made_free_throws",
    "defensive_rebounds",
    "total_rebounds",
    "turnovers",
    "personal_fouls",
]

# Target column name, kept as a one-element list.
label_col = ["outcome"]

# Season start years, newest first (2019 down to 2008).
# Wider alternative: range(2019, 1999, -1)
year_range = range(2019, 2007, -1)

# Calendar months scanned per season, in season order -- SKIPPING PLAYOFFS.
month_range = [10, 11, 12] + [1, 2, 3]

# Every candidate day-of-month (1..31 inclusive).
day_range = range(1, 32)

# Root folder holding one sub-folder of daily CSVs per season.
processed_file_dir = "data_preprocessed/team_box_scores/"

# Accumulator for the combined team box scores; starts out empty.
team_full_df = pd.DataFrame()


#
# Collects all team box score data into single pd.DataFrame
#
# Gather every daily frame in a list and concatenate once after the loops:
# calling pd.concat inside the loop re-copies all accumulated rows each time.
daily_frames = []

for season_start_year in year_range:
    season_str = str(season_start_year) + "_" + str(season_start_year + 1)
    season_dir = os.path.join(processed_file_dir, season_str)
    # Ensure the season directory exists; makedirs also creates missing
    # parent directories and does not fail when the directory is already there.
    os.makedirs(season_dir, exist_ok=True)

    year = season_start_year
    # Not read anywhere in this cell; assignment kept in case other cells use it.
    season_file_path = season_str + "_season_schedule.csv"

    print("\nLoading game data for season " + str(year)
          + "-" + str(year + 1) + ": ", end='')
    for month in month_range:
        print(str(month) + " ", end='')
        # Files dated January onward carry the following calendar year.
        if month == 1:
            year = season_start_year + 1
        for day in day_range:
            # File names use zero-padded month and day: YYYY_MM_DD_...
            file_name = "%d_%02d_%02d_team_box_scores.csv" % (year, month, day)
            processed_temp_file_path = os.path.join(season_dir, file_name)

            # Dates without a file (no games, or impossible dates such as
            # day 31 of a 30-day month) are simply skipped.
            if os.path.exists(processed_temp_file_path):
                daily_frames.append(pd.read_csv(processed_temp_file_path))

# pd.concat raises on an empty list, so fall back to an empty frame when
# no file was found at all.
if daily_frames:
    team_full_df = pd.concat(daily_frames, ignore_index=True)
else:
    team_full_df = pd.DataFrame()

                
#            
# Adds derived features to pd.DataFrame
#
# A shooting percentage is makes divided by attempts. The attempted_* columns
# are used directly as the denominator; adding the makes to them would
# understate every percentage.
# NOTE: a row with zero attempts yields NaN (0 / 0) and would need handling
# before being passed to a model.
if "field_goal_percentage" in feature_cols:
    team_full_df["field_goal_percentage"] = (
        team_full_df["made_field_goals"]
        / team_full_df["attempted_field_goals"]
    )
if "three_point_percentage" in feature_cols:
    team_full_df["three_point_percentage"] = (
        team_full_df["made_three_point_field_goals"]
        / team_full_df["attempted_three_point_field_goals"]
    )
# Total rebounds is the sum of offensive and defensive rebounds.
if "total_rebounds" in feature_cols:
    team_full_df["total_rebounds"] = (
        team_full_df["offensive_rebounds"]
        + team_full_df["defensive_rebounds"]
    )
 


Loading game data for season 2019-2019: 10 11 12 1 2 3 
Loading game data for season 2018-2018: 10 11 12 1 2 3 
Loading game data for season 2017-2017: 10 11 12 1 2 3 
Loading game data for season 2016-2016: 10 11 12 1 2 3 
Loading game data for season 2015-2015: 10 11 12 1 2 3 
Loading game data for season 2014-2014: 10 11 12 1 2 3 
Loading game data for season 2013-2013: 10 11 12 1 2 3 
Loading game data for season 2012-2012: 10 11 12 1 2 3 
Loading game data for season 2011-2011: 10 11 12 1 2 3 
Loading game data for season 2010-2010: 10 11 12 1 2 3 
Loading game data for season 2009-2009: 10 11 12 1 2 3 
Loading game data for season 2008-2008: 10 11 12 1 2 3 

In [None]:
def TrainAndTestModel(model_type=None, X_train=None, y_train=None,
                      X_test=None, y_test=None):
    """Fit a decision-tree classifier and return its accuracy on the test set.

    Defaults are ``None`` rather than notebook globals: default values are
    evaluated when the ``def`` runs, so referring to ``X_train`` etc. there
    raises ``NameError`` on a fresh kernel (or silently captures stale data).

    Parameters
    ----------
    model_type : any, optional
        Accepted for call compatibility; currently ignored. A
        ``tree.DecisionTreeClassifier`` is always used.
    X_train, X_test : array-like of shape (n_samples, n_features)
        Training and test feature matrices.
    y_train, y_test : array-like
        Training and test labels. A single-column DataFrame is accepted and
        flattened to one dimension.

    Returns
    -------
    float
        Fraction of test rows whose predicted label equals the true label.

    Raises
    ------
    ValueError
        If any data argument is missing, the test set is empty, or the number
        of predictions does not match the number of test labels.
    """
    required = (("X_train", X_train), ("y_train", y_train),
                ("X_test", X_test), ("y_test", y_test))
    missing = [name for name, value in required if value is None]
    if missing:
        raise ValueError("TrainAndTestModel is missing: " + ", ".join(missing))

    # Flatten (n, 1) label frames to 1-D so fitting and comparison are
    # element-wise on plain label arrays.
    y_train_flat = np.asarray(y_train).ravel()
    y_test_flat = np.asarray(y_test).ravel()
    if y_test_flat.size == 0:
        raise ValueError("y_test is empty; accuracy is undefined")

    clf = tree.DecisionTreeClassifier()
    clf = clf.fit(X_train, y_train_flat)

    # One vectorised predict call instead of one call per test row.
    predictions = np.asarray(clf.predict(X_test)).ravel()
    if predictions.shape != y_test_flat.shape:
        raise ValueError("X_test and y_test have different numbers of rows")

    return float(np.mean(predictions == y_test_flat))

In [11]:
#
# DECISION TREE: CLASSIFICATION - Train and Test
#
np.random.seed(seed=rng_seed)
n_models = 10
train_ratio = 0.80
accuracy_arr = np.zeros((n_models,))

# The feature matrix, labels and split sizes are identical for every
# repetition, so they are computed once instead of inside the loop.
X = team_full_df.loc[:, feature_cols]
y = team_full_df.loc[:, label_col]
n_samples = X.shape[0]
n_train = int(np.round(train_ratio*n_samples))
n_test = n_samples - n_train

# Repeat the random 80/20 split n_models times and record each accuracy.
for idx_model in range(0, n_models):
    rand_idx = np.arange(n_samples)
    np.random.shuffle(rand_idx)
    train_idx = rand_idx[:n_train]
    test_idx = rand_idx[n_train:]

    # rand_idx holds row positions, so select positionally with iloc.
    X_train = X.iloc[train_idx, :]
    X_test = X.iloc[test_idx, :]
    y_train = y.iloc[train_idx, :]
    y_test = y.iloc[test_idx, :]

    # Train / fit model
    print("Training and testing model :", idx_model+1, "of", n_models)
    clf = tree.DecisionTreeClassifier()
    # Labels are single-column frames; flatten them to 1-D for fitting.
    clf = clf.fit(X_train, np.asarray(y_train).ravel())

    # Test accuracy: computed here, in this cell, from this split's model so
    # the result never depends on an n_correct left over from another cell.
    predictions = clf.predict(X_test)
    n_correct = int(np.sum(predictions == np.asarray(y_test).ravel()))

    accuracy_arr[idx_model] = n_correct / n_test

print("Mean : % 1.6f" %np.mean(accuracy_arr))
print("Var  : % 1.6f" %np.var(accuracy_arr))

Training and testing model :  1  of  10
Training and testing model :  2  of  10
Training and testing model :  3  of  10
Training and testing model :  4  of  10
Training and testing model :  5  of  10
Training and testing model :  6  of  10
Training and testing model :  7  of  10
Training and testing model :  8  of  10
Training and testing model :  9  of  10
Training and testing model :  10  of  10
Mean :  0.716003
Var  :  0.000028


In [6]:
#
# DECISION TREE: REGRESSION - Train and Test
#
np.random.seed(seed=rng_seed)
n_models = 10
train_ratio = 0.80
accuracy_arr = np.zeros((n_models,))

# Numeric encoding of the outcome label used as the regression target.
outcome_type = {'win': 1, 'loss': 0}

# The feature matrix, labels and split sizes are identical for every
# repetition, so they are computed once instead of inside the loop.
X = team_full_df.loc[:, feature_cols]
y = team_full_df.loc[:, label_col]
n_samples = X.shape[0]
n_train = int(np.round(train_ratio*n_samples))
n_test = n_samples - n_train

# Repeat the random 80/20 split n_models times and record each accuracy.
for idx_model in range(0, n_models):
    rand_idx = np.arange(n_samples)
    np.random.shuffle(rand_idx)
    train_idx = rand_idx[:n_train]
    test_idx = rand_idx[n_train:]

    # rand_idx holds row positions, so select positionally with iloc.
    X_train = X.iloc[train_idx, :]
    X_test = X.iloc[test_idx, :]
    y_train = y.iloc[train_idx, :]
    y_test = y.iloc[test_idx, :]

    # Train / fit model
    print("Training model : ", idx_model+1)
    # Dictionary lookup (not .map) so an unexpected label raises KeyError
    # instead of silently becoming NaN.
    y_train_binary = np.array([outcome_type[item] for item in y_train.outcome])
    y_test_binary = np.array([outcome_type[item] for item in y_test.outcome])

    clf = tree.DecisionTreeRegressor()
    clf = clf.fit(X_train, y_train_binary)

    # Test accuracy: a prediction counts as correct when it lies strictly
    # within 0.5 of the 0/1 target. One vectorised predict call replaces a
    # separate call per test row.
    print("Testing model  : ", idx_model+1)
    predictions = clf.predict(X_test)
    n_correct = int(np.sum(np.abs(predictions - y_test_binary) < 0.5))

    accuracy_arr[idx_model] = n_correct / n_test

print("Mean : % 1.6f" %np.mean(accuracy_arr))
print("Var  : % 1.6f" %np.var(accuracy_arr))

Training model :  1
Testing model  :   1
Training model :  2
Testing model  :   2
Training model :  3
Testing model  :   3
Training model :  4
Testing model  :   4
Training model :  5
Testing model  :   5
Training model :  6
Testing model  :   6
Training model :  7
Testing model  :   7
Training model :  8
Testing model  :   8
Training model :  9
Testing model  :   9
Training model :  10
Testing model  :   10
Mean :  0.716291
Var  :  0.000025
