In [1]:
from pyspark.ml.classification import MultilayerPerceptronClassifier
from pyspark.sql.functions import col
from pyspark.sql import SparkSession
from pyspark.sql import functions as F
from pyspark.sql.window import Window
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import confusion_matrix
import numpy as np


spark = SparkSession.builder.config("spark.driver.host", "localhost").config("spark.python.worker.reuse", "false").getOrCreate()
file_path = 'file:///C:/Users/seanz/VSCode_WS/BigData/NFL_PBP_V1.csv'
df = spark.read.csv(file_path, header=True, inferSchema=True)

df = df.withColumn(
    'play_success', 
    F.when(
        (F.col('play_type') == 'run') & (F.col('rush_attempt') == 1) & (F.col('yards_gained') >= 4), 1
    ).when(
        (F.col('play_type') == 'pass') & (F.col('pass_attempt') == 1) & (F.col('yards_gained') >= 4), 1
    ).otherwise(0)
)

window_spec = Window.partitionBy('game_id', 'posteam').orderBy('play_id')
df = df.withColumn(
    'cumulative_rush_attempts', 
    F.sum(F.when((F.col('play_type') == 'run') & (F.col('rush_attempt') == 1), 1).otherwise(0)).over(window_spec)
)

df = df.withColumn(
    'cumulative_pass_attempts', 
    F.sum(F.when((F.col('play_type') == 'pass') & (F.col('pass_attempt') == 1), 1).otherwise(0)).over(window_spec)
)

df = df.withColumn(
    'cumulative_rush_successes', 
    F.sum(F.when(
        (F.col('play_type') == 'run') & (F.col('rush_attempt') == 1) & (F.col('yards_gained') >= 4), 1).otherwise(0)
    ).over(window_spec)
)

df = df.withColumn(
    'cumulative_pass_successes', 
    F.sum(F.when(
        (F.col('play_type') == 'pass') & (F.col('pass_attempt') == 1) & (F.col('yards_gained') >= 4), 1).otherwise(0)
    ).over(window_spec)
)

df = df.withColumn(
    'rush_success_rate', 
    F.when(F.col('cumulative_rush_attempts') > 0, F.col('cumulative_rush_successes') / F.col('cumulative_rush_attempts')).otherwise(0)
)

df = df.withColumn(
    'pass_success_rate', 
    F.when(F.col('cumulative_pass_attempts') > 0, F.col('cumulative_pass_successes') / F.col('cumulative_pass_attempts')).otherwise(0)
)

df = df.withColumn('posteam_leading', F.col('score_differential_post') > 0)
df = df.withColumn('posteam_trailing', F.col('score_differential_post') < 0)
df = df.withColumn('yards_gained', F.col('yards_gained').cast('float'))
df = df.withColumn('shotgun', F.col('shotgun').cast('float'))
df = df.withColumn('no_huddle', F.col('no_huddle').cast('float'))
df = df.withColumn('timeout', F.col('timeout').cast('float'))
df = df.withColumn('posteam_timeouts_remaining', F.col('posteam_timeouts_remaining').cast('float'))
df = df.withColumn('defteam_timeouts_remaining', F.col('defteam_timeouts_remaining').cast('float'))
offensive_playtypes = ['field_goal', 'run', 'punt', 'pass']
df = df.filter(df.play_type.isin(offensive_playtypes))

df_indexed = df.withColumn('play_type_index', 
                           F.when(F.col('play_type') == 'pass', 0)
                            .when(F.col('play_type') == 'run', 1)
                            .when(F.col('play_type') == 'punt', 2)
                            .when(F.col('play_type') == 'field_goal', 3)
                            .otherwise(-1))  

feature_columns = [ 'yardline_100', 'game_seconds_remaining', 'qtr', 'down', 'goal_to_go', 'ydstogo', 'yards_gained', 
                   'first_down_penalty', 'third_down_converted', 'third_down_failed', 'shotgun', 'no_huddle', 'timeout', 
                   'posteam_timeouts_remaining','defteam_timeouts_remaining', 'score_differential_post', 'posteam_score_post', 
                   'play_success', 'cumulative_rush_attempts', 'cumulative_pass_attempts', 'cumulative_rush_successes', 
                   'cumulative_pass_successes', 'rush_success_rate', 'pass_success_rate', 'posteam_leading', 'posteam_trailing']


assembler = VectorAssembler(inputCols=feature_columns, outputCol='features')
df_vector = assembler.transform(df_indexed)

games = df_vector.select("game_id").distinct().collect()  
train_games = games[:int(0.8 * len(games))] 
test_games = games[int(0.8 * len(games)):]

train_df = df_vector.filter(col("game_id").isin([game["game_id"] for game in train_games]))
test_df = df_vector.filter(col("game_id").isin([game["game_id"] for game in test_games]))

mlp = MultilayerPerceptronClassifier(
    blockSize=128,
    featuresCol="features",
    labelCol="play_type_index",
    maxIter=100,
    predictionCol="prediction",
    probabilityCol="probability",
    rawPredictionCol="rawPrediction",
    seed=42,
    solver="l-bfgs",
    stepSize=0.03,
    tol=1e-06,
    layers=[26, 32, 32, 4]  # 26 input features, two hidden layers of 32 neurons, 4 output classes
)


mlp_model = mlp.fit(train_df)
predictions = mlp_model.transform(test_df)
predictions.select("play_type_index", "prediction", "probability").show(50, truncate=False)

evaluator_accuracy = MulticlassClassificationEvaluator(labelCol='play_type_index', predictionCol='prediction', metricName='accuracy')
evaluator_precision = MulticlassClassificationEvaluator(labelCol='play_type_index', predictionCol='prediction', metricName='weightedPrecision')
evaluator_recall = MulticlassClassificationEvaluator(labelCol='play_type_index', predictionCol='prediction', metricName='weightedRecall')
evaluator_f1 = MulticlassClassificationEvaluator(labelCol='play_type_index', predictionCol='prediction', metricName='f1')

accuracy = evaluator_accuracy.evaluate(predictions)
precision = evaluator_precision.evaluate(predictions)
recall = evaluator_recall.evaluate(predictions)
f1_score = evaluator_f1.evaluate(predictions)

print(f"Multilayer Perceptron Classifier Results:")
print(f"  Accuracy: {accuracy:.4f}")
print(f"  Precision: {precision:.4f}")
print(f"  Recall: {recall:.4f}")
print(f"  F1 Score: {f1_score:.4f}")

+---------------+----------+----------------------------------------------------------------------------------+
|play_type_index|prediction|probability                                                                       |
+---------------+----------+----------------------------------------------------------------------------------+
|0              |0.0       |[0.5600219803045725,0.2982996536321055,0.09202397198505541,0.04965439407826663]   |
|1              |0.0       |[0.47240880841006744,0.3982236570074158,0.08163699429180632,0.04773054029071044]  |
|0              |0.0       |[0.5144287773751747,0.34981281238574813,0.08685890852171457,0.04889950171736271]  |
|2              |0.0       |[0.5379895350726587,0.32305175580258383,0.08959887708406908,0.049359832040688334] |
|1              |0.0       |[0.48353663871793556,0.3889577262126407,0.08187683739333891,0.04562879767608491]  |
|0              |0.0       |[0.5618461476703132,0.3057824806702052,0.08891406914599641,0.043457302513485

In [11]:
from sklearn.metrics import classification_report, f1_score

manual_thresholds = np.array([0.55, 0.45, 0.05, 0.02])  # Run, Pass, Punt, FG

# Extract true labels
y_test_true = np.array(predictions.select("play_type_index").collect()).flatten()

# Extract probability predictions
y_test_proba = np.array(predictions.select("probability").collect())  # Extract probabilities
y_test_proba = np.stack(y_test_proba[:, 0])  # Convert nested structure to matrix (samples x num_classes)

# Apply manually set thresholds
y_test_pred = np.argmax((y_test_proba >= manual_thresholds).astype(int), axis=1)

# Print classification report
print("\n=== Test Set (After Manually Adjusting Thresholds) ===")
print(classification_report(y_test_true, y_test_pred, zero_division=0))


=== Test Set (After Manually Adjusting Thresholds) ===
              precision    recall  f1-score   support

           0       0.54      0.33      0.41     41452
           1       0.38      0.26      0.31     29021
           2       0.07      0.41      0.11      5139
           3       0.00      0.00      0.00      2175

    accuracy                           0.30     77787
   macro avg       0.24      0.25      0.21     77787
weighted avg       0.43      0.30      0.34     77787



In [13]:
df_indexed.columns

['play_id',
 'game_id',
 'home_team',
 'away_team',
 'posteam',
 'posteam_type',
 'defteam',
 'side_of_field',
 'yardline_100',
 'game_date',
 'quarter_seconds_remaining',
 'half_seconds_remaining',
 'game_seconds_remaining',
 'game_half',
 'quarter_end',
 'drive',
 'sp',
 'qtr',
 'down',
 'goal_to_go',
 'time',
 'yrdln',
 'ydstogo',
 'ydsnet',
 'desc',
 'play_type',
 'yards_gained',
 'shotgun',
 'no_huddle',
 'qb_dropback',
 'qb_kneel',
 'qb_spike',
 'qb_scramble',
 'pass_length',
 'pass_location',
 'air_yards',
 'yards_after_catch',
 'run_location',
 'run_gap',
 'field_goal_result',
 'kick_distance',
 'extra_point_result',
 'two_point_conv_result',
 'home_timeouts_remaining',
 'away_timeouts_remaining',
 'timeout',
 'timeout_team',
 'td_team',
 'posteam_timeouts_remaining',
 'defteam_timeouts_remaining',
 'total_home_score',
 'total_away_score',
 'posteam_score',
 'defteam_score',
 'score_differential',
 'posteam_score_post',
 'defteam_score_post',
 'score_differential_post',
 'punt_