## Defensive Team Classifiers Stats

In [13]:
# some useful mysklearn package import statements and reloads
import importlib

import mysklearn.myutils
importlib.reload(mysklearn.myutils)
import mysklearn.myutils as myutils

# MyPyTable (mysklearn/mypytable.py): table class used below to load the CSV data
import mysklearn.mypytable
importlib.reload(mysklearn.mypytable)
from mysklearn.mypytable import MyPyTable 

# classifier implementations from mysklearn/myclassifiers.py
import mysklearn.myclassifiers
importlib.reload(mysklearn.myclassifiers)
from mysklearn.myclassifiers import (MyKNeighborsClassifier, MyDummyClassifier, 
                                     MyNaiveBayesClassifier, MyDecisionTreeClassifier,
                                     MyRandomForestClassifier)

import mysklearn.myevaluation
importlib.reload(mysklearn.myevaluation)
import mysklearn.myevaluation as myevaluation

In [14]:
import numpy as np

# Step 1: read the sampled defensive-team dataset into a MyPyTable.
table = MyPyTable()
table.load_from_file("data/Balanced_Defensive_Team_Data_Sampled.csv")

# Step 2: columns that serve as model inputs.
feature_names = [
    "drb_per_game",
    "stl_per_game",
    "blk_per_game",
    "stl_percent",
    "blk_percent",
    "dws",
    "dbpm",
    "drb_percent",
]

# Step 3: fill missing entries of every feature column with that column's
# average (per the helper's name) so the columns are usable as numbers.
for feature in feature_names:
    table.replace_missing_values_with_column_average(feature)

# Step 4: build X by projecting every table row onto the selected columns,
# in the same order as feature_names.
col_indices = [table.column_names.index(name) for name in feature_names]
X = [[row[i] for i in col_indices] for row in table.data]

# Step 5: build y from the "voted" column, cast to int so labels are 0/1
# rather than 0.0/1.0.
y_col = table.get_column("voted")
y = list(map(int, y_col))

# Step 6: preprocessing.
# NOTE(review): normalization is applied to the full dataset here, before any
# train/test split. If normalize_table derives its scaling from the data it is
# given, test-fold statistics influence the training features -- confirm this
# is acceptable for the evaluation below.
# A discretized variant (discretize_data(X, n_bins=10)) is currently disabled.
X_normalized = myutils.normalize_table(X)

print(f"Loaded {len(X)} samples using MyPyTable.")
print(f"Features: {feature_names}")
print(f"Sample Raw: {X[0]}")
print(f"Sample Norm: {X_normalized[0]}")

Loaded 3412 samples using MyPyTable.
Features: ['drb_per_game', 'stl_per_game', 'blk_per_game', 'stl_percent', 'blk_percent', 'dws', 'dbpm', 'drb_percent']
Sample Raw: [2.0, 0.9, 0.3, 1.4, 0.6, 1.5, -1.3, 7.1]
Sample Norm: [0.16260162601626016, 0.24324324324324323, 0.06, 0.08, 0.009538950715421303, 0.19999999999999998, 0.5548098434004474, 0.071]


In [15]:
# Classifier configurations: (display name, class, constructor kwargs, dataset key).
# Classes are stored instead of instances so that a fresh, untrained model is
# built for every fold.
classifier_configs = [
    ("Dummy Classifier", MyDummyClassifier, {}, "X_normalized"),
    ("kNN (k=5)", MyKNeighborsClassifier, {"n_neighbors": 5}, "X_normalized"),
    ("Decision Tree (D=5)", MyDecisionTreeClassifier, {"max_depth": 5}, "X_normalized"),
    ("Random Forest (20T, D=5, max_f=3)", MyRandomForestClassifier, {"n_trees": 20, "max_depth": 5, "max_features": 3}, "X_normalized")
]

# Feature matrices selectable through the dataset key of each configuration.
# Previously the key was ignored and X_normalized was always used.
datasets = {
    "X": X,
    "X_normalized": X_normalized,
}

n_splits = 10
# The folds are computed once and reused, so every classifier is scored on the
# same train/test index splits. Only indices are produced here, so they apply
# equally to any row-aligned dataset in `datasets`.
folds = myevaluation.stratified_kfold_split(X, y, n_splits=n_splits, shuffle=True, random_state=42)

for clf_name, clf_class, clf_params, dataset_name in classifier_configs:
    print(f"--- Evaluating {clf_name} ---")

    # Fail loudly on a misspelled dataset key instead of silently evaluating
    # on the wrong features.
    if dataset_name not in datasets:
        raise KeyError(
            f"Unknown dataset '{dataset_name}' for classifier '{clf_name}'; "
            f"expected one of {sorted(datasets)}"
        )
    X_dataset = datasets[dataset_name]

    # Per-fold metric values; the reported numbers are their unweighted means.
    accuracies = []
    precisions = []
    recalls = []
    f1_scores = []

    # Labels and predictions pooled over all folds for one overall confusion matrix.
    all_y_true = []
    all_y_pred = []

    for train_idx, test_idx in folds:
        X_train = [X_dataset[i] for i in train_idx]
        y_train = [y[i] for i in train_idx]
        X_test = [X_dataset[i] for i in test_idx]
        y_test = [y[i] for i in test_idx]

        # Create a fresh classifier instance for each fold
        clf = clf_class(**clf_params)
        clf.fit(X_train, y_train)
        y_pred = clf.predict(X_test)

        # Class 1 is treated as the positive class for the binary metrics.
        accuracies.append(myevaluation.accuracy_score(y_test, y_pred))
        precisions.append(myevaluation.binary_precision_score(y_test, y_pred, pos_label=1))
        recalls.append(myevaluation.binary_recall_score(y_test, y_pred, pos_label=1))
        f1_scores.append(myevaluation.binary_f1_score(y_test, y_pred, pos_label=1))

        all_y_true.extend(y_test)
        all_y_pred.extend(y_pred)

    print(f"Accuracy:  {sum(accuracies)/len(accuracies):.3f}")
    print(f"Precision: {sum(precisions)/len(precisions):.3f}")
    print(f"Recall:    {sum(recalls)/len(recalls):.3f}")
    print(f"F1 Score:  {sum(f1_scores)/len(f1_scores):.3f}")

    # The matrix is indexed below as matrix[true label][predicted label],
    # with labels ordered [0, 1].
    matrix = myevaluation.confusion_matrix(all_y_true, all_y_pred, labels=[0, 1])
    print("Confusion Matrix (0=No, 1=Yes):")
    print("     Pred 0   Pred 1")
    print(f"True 0:  {matrix[0][0]}      {matrix[0][1]}")
    print(f"True 1:  {matrix[1][0]}      {matrix[1][1]}")
    print("\n")

--- Evaluating Dummy Classifier ---
Accuracy:  0.481
Precision: 0.140
Recall:    0.300
F1 Score:  0.191
Confusion Matrix (0=No, 1=Yes):
     Pred 0   Pred 1
True 0:  1160      546
True 1:  1226      480


--- Evaluating kNN (k=5) ---
Accuracy:  0.888
Precision: 0.858
Recall:    0.929
F1 Score:  0.892
Confusion Matrix (0=No, 1=Yes):
     Pred 0   Pred 1
True 0:  1444      262
True 1:  121      1585


--- Evaluating Decision Tree (D=5) ---
Accuracy:  0.864
Precision: 0.826
Recall:    0.923
F1 Score:  0.871
Confusion Matrix (0=No, 1=Yes):
     Pred 0   Pred 1
True 0:  1374      332
True 1:  133      1573


--- Evaluating Random Forest (20T, D=5, max_f=3) ---
Accuracy:  0.880
Precision: 0.836
Recall:    0.946
F1 Score:  0.887
Confusion Matrix (0=No, 1=Yes):
     Pred 0   Pred 1
True 0:  1389      317
True 1:  93      1613


