In [1]:
from pyspark.sql import SparkSession
from pyspark.sql import functions as F
from pyspark.sql.window import Window
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.classification import RandomForestClassifier
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

spark = SparkSession.builder.config("spark.driver.host", "localhost").config("spark.python.worker.reuse", "false").getOrCreate()
file_path = 'file:///C:/Users/seanz/VSCode_WS/BigData/NFL_PBP_V1.csv'
df = spark.read.csv(file_path, header=True, inferSchema=True)

df = df.withColumn(
    'play_success', 
    F.when(
        (F.col('play_type') == 'run') & (F.col('rush_attempt') == 1) & (F.col('yards_gained') >= 4), 1
    ).when(
        (F.col('play_type') == 'pass') & (F.col('pass_attempt') == 1) & (F.col('yards_gained') >= 4), 1
    ).otherwise(0)
)

window_spec = Window.partitionBy('game_id', 'posteam').orderBy('play_id')
df = df.withColumn(
    'cumulative_rush_attempts', 
    F.sum(F.when((F.col('play_type') == 'run') & (F.col('rush_attempt') == 1), 1).otherwise(0)).over(window_spec)
)

df = df.withColumn(
    'cumulative_pass_attempts', 
    F.sum(F.when((F.col('play_type') == 'pass') & (F.col('pass_attempt') == 1), 1).otherwise(0)).over(window_spec)
)

df = df.withColumn(
    'cumulative_rush_successes', 
    F.sum(F.when(
        (F.col('play_type') == 'run') & (F.col('rush_attempt') == 1) & (F.col('yards_gained') >= 4), 1).otherwise(0)
    ).over(window_spec)
)

df = df.withColumn(
    'cumulative_pass_successes', 
    F.sum(F.when(
        (F.col('play_type') == 'pass') & (F.col('pass_attempt') == 1) & (F.col('yards_gained') >= 4), 1).otherwise(0)
    ).over(window_spec)
)

df = df.withColumn(
    'rush_success_rate', 
    F.when(F.col('cumulative_rush_attempts') > 0, F.col('cumulative_rush_successes') / F.col('cumulative_rush_attempts')).otherwise(0)
)

df = df.withColumn(
    'pass_success_rate', 
    F.when(F.col('cumulative_pass_attempts') > 0, F.col('cumulative_pass_successes') / F.col('cumulative_pass_attempts')).otherwise(0)
)

df = df.withColumn('posteam_leading', F.col('score_differential_post') > 0)
df = df.withColumn('posteam_trailing', F.col('score_differential_post') < 0)
df = df.withColumn('yards_gained', F.col('yards_gained').cast('float'))
df = df.withColumn('shotgun', F.col('shotgun').cast('float'))
df = df.withColumn('no_huddle', F.col('no_huddle').cast('float'))
df = df.withColumn('timeout', F.col('timeout').cast('float'))
df = df.withColumn('posteam_timeouts_remaining', F.col('posteam_timeouts_remaining').cast('float'))
df = df.withColumn('defteam_timeouts_remaining', F.col('defteam_timeouts_remaining').cast('float'))
offensive_playtypes = ['field_goal', 'run', 'punt', 'pass', 'qb_kneel', 'qb_spike']
df = df.filter(df.play_type.isin(offensive_playtypes))

df_indexed = df.withColumn('play_type_index', 
                           F.when(F.col('play_type') == 'pass', 0)
                            .when(F.col('play_type') == 'run', 1)
                            .when(F.col('play_type') == 'punt', 2)
                            .when(F.col('play_type') == 'field_goal', 3)
                            .when(F.col('play_type') == 'qb_kneel', 4)
                            .when(F.col('play_type') == 'qb_spike', 5)
                            .otherwise(-1))  

feature_columns = [ 'yardline_100', 'game_seconds_remaining', 'qtr', 'down', 'goal_to_go', 'ydstogo', 'yards_gained',
                   'shotgun', 'no_huddle', 'qb_kneel', 'qb_spike', 'timeout', 'posteam_timeouts_remaining', 
                   'defteam_timeouts_remaining', 'score_differential_post', 'rush_attempt', 'pass_attempt', 
                   'field_goal_attempt', 'punt_attempt']

In [None]:
'''
from pyspark.sql.functions import col, udf

assembler = VectorAssembler(inputCols=feature_columns, outputCol='features')
df_vector = assembler.transform(df_indexed)

class_counts = df_vector.groupBy('play_type_index').count().collect()
total_counts = df_vector.count()
class_weights = {row['play_type_index']: total_counts / row['count'] for row in class_counts}

def get_weight(play_type_index):
    return float(class_weights.get(play_type_index, 1.0))

weights_data = [(row['play_type_index'], float(total_counts / row['count'])) for row in class_counts]
weights_df = spark.createDataFrame(weights_data, schema=["play_type_index", "class_weight"])
df_weighted = df_vector.join(weights_df, on="play_type_index", how="left")
df_weighted = df_weighted.withColumn("class_weight", col("class_weight").cast("double"))

games = df_weighted.select("game_id").distinct().collect()  
train_games = games[:int(0.8 * len(games))] 
test_games = games[int(0.8 * len(games)):]

train_df = df_weighted.filter(df_vector.game_id.isin([game["game_id"] for game in train_games]))
test_df = df_weighted.filter(df_vector.game_id.isin([game["game_id"] for game in test_games]))

rf = RandomForestClassifier(labelCol='play_type_index', featuresCol='features', maxDepth=10, weightCol='class_weight')
rf_model = rf.fit(train_df)

predictions = rf_model.transform(test_df)
predictions.select('play_type_index', 'prediction').show(50)

print("Confusion Matrix:")
predictions.groupBy("play_type_index", "prediction").count().show()

evaluator = MulticlassClassificationEvaluator(labelCol='play_type_index', predictionCol='prediction', metricName='accuracy')
evaluator_precision = MulticlassClassificationEvaluator(labelCol="play_type_index", predictionCol="prediction", metricName="weightedPrecision")
evaluator_recall = MulticlassClassificationEvaluator(labelCol="play_type_index", predictionCol="prediction", metricName="weightedRecall")
evaluator_f1 = MulticlassClassificationEvaluator(labelCol="play_type_index", predictionCol="prediction", metricName="f1")

precision = evaluator_precision.evaluate(predictions)
recall = evaluator_recall.evaluate(predictions)
f1_score = evaluator_f1.evaluate(predictions)
accuracy = evaluator.evaluate(predictions)

print(f"Precision: {precision}")
print(f"Recall: {recall}")
print(f"F1 Score: {f1_score}")
print(f"Accuracy: {accuracy}")
'''
#Weights implemented, could not get it to work


Training RandomForest with maxDepth = 5, numTrees = 10
Results for maxDepth = 5, numTrees = 10:
  Accuracy: 0.9999
  Precision: 0.9999
  Recall: 0.9999
  F1 Score: 0.9999

Training RandomForest with maxDepth = 5, numTrees = 50
Results for maxDepth = 5, numTrees = 50:
  Accuracy: 0.9996
  Precision: 0.9996
  Recall: 0.9996
  F1 Score: 0.9996

Training RandomForest with maxDepth = 5, numTrees = 100
Results for maxDepth = 5, numTrees = 100:
  Accuracy: 0.9997
  Precision: 0.9997
  Recall: 0.9997
  F1 Score: 0.9997

Training RandomForest with maxDepth = 10, numTrees = 10
Results for maxDepth = 10, numTrees = 10:
  Accuracy: 1.0000
  Precision: 1.0000
  Recall: 1.0000
  F1 Score: 1.0000

Training RandomForest with maxDepth = 10, numTrees = 50
Results for maxDepth = 10, numTrees = 50:
  Accuracy: 1.0000
  Precision: 1.0000
  Recall: 1.0000
  F1 Score: 1.0000

Training RandomForest with maxDepth = 10, numTrees = 100
Results for maxDepth = 10, numTrees = 100:
  Accuracy: 1.0000
  Precision: 1.0000
  Recall: 1.0000
  F1 Score: 1.0000

Training RandomForest with maxDepth = 15, numTrees = 10
Results for maxDepth = 15, numTrees = 10:
  Accuracy: 1.0000
  Precision: 1.0000
  Recall: 1.0000
  F1 Score: 1.0000

Training RandomForest with maxDepth = 15, numTrees = 50
Results for maxDepth = 15, numTrees = 50:
  Accuracy: 1.0000
  Precision: 1.0000
  Recall: 1.0000
  F1 Score: 1.0000

Training RandomForest with maxDepth = 15, numTrees = 100
Results for maxDepth = 15, numTrees = 100:
  Accuracy: 1.0000
  Precision: 1.0000
  Recall: 1.0000
  F1 Score: 1.0000

In [None]:
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.classification import RandomForestClassifier
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.sql.functions import col

assembler = VectorAssembler(inputCols=feature_columns, outputCol='features')
df_vector = assembler.transform(df_indexed)

games = df_vector.select("game_id").distinct().collect()  
train_games = games[:int(0.8 * len(games))] 
test_games = games[int(0.8 * len(games)):]

train_df = df_vector.filter(col("game_id").isin([game["game_id"] for game in train_games]))
test_df = df_vector.filter(col("game_id").isin([game["game_id"] for game in test_games]))

max_depths = [5, 10, 15]
num_trees_list = [10, 50, 100]

evaluator_accuracy = MulticlassClassificationEvaluator(labelCol='play_type_index', predictionCol='prediction', metricName='accuracy')
evaluator_precision = MulticlassClassificationEvaluator(labelCol='play_type_index', predictionCol='prediction', metricName='weightedPrecision')
evaluator_recall = MulticlassClassificationEvaluator(labelCol='play_type_index', predictionCol='prediction', metricName='weightedRecall')
evaluator_f1 = MulticlassClassificationEvaluator(labelCol='play_type_index', predictionCol='prediction', metricName='f1')

for depth in max_depths:
    for num_trees in num_trees_list:
        print(f"\nTraining RandomForest with maxDepth = {depth}, numTrees = {num_trees}")
        rf = RandomForestClassifier(labelCol='play_type_index', featuresCol='features', 
                                    maxDepth=depth, numTrees=num_trees)
        rf_model = rf.fit(train_df)
        predictions = rf_model.transform(test_df)
        accuracy = evaluator_accuracy.evaluate(predictions)
        precision = evaluator_precision.evaluate(predictions)
        recall = evaluator_recall.evaluate(predictions)
        f1_score = evaluator_f1.evaluate(predictions)
        
        print(f"Results for maxDepth = {depth}, numTrees = {num_trees}:")
        print(f"  Accuracy: {accuracy:.4f}")
        print(f"  Precision: {precision:.4f}")
        print(f"  Recall: {recall:.4f}")
        print(f"  F1 Score: {f1_score:.4f}")

In [2]:

assembler = VectorAssembler(inputCols=feature_columns, outputCol='features')
df_vector = assembler.transform(df_indexed)

classes = [row["play_type_index"] for row in df_vector.select("play_type_index").distinct().collect()]
fractions = {c: 0.8 for c in classes}
train_df = df_vector.sampleBy("play_type_index", fractions, seed=42)
test_df = df_vector.subtract(train_df)

rf = RandomForestClassifier(labelCol='play_type_index', featuresCol='features', maxDepth=10)
rf_model = rf.fit(train_df)

predictions = rf_model.transform(test_df)
predictions.select("play_type_index", "prediction").show(50)

evaluator_accuracy = MulticlassClassificationEvaluator(labelCol='play_type_index', predictionCol='prediction', metricName='accuracy')
evaluator_precision = MulticlassClassificationEvaluator(labelCol='play_type_index', predictionCol='prediction', metricName='weightedPrecision')
evaluator_recall = MulticlassClassificationEvaluator(labelCol='play_type_index', predictionCol='prediction', metricName='weightedRecall')
evaluator_f1 = MulticlassClassificationEvaluator(labelCol='play_type_index', predictionCol='prediction', metricName='f1')

accuracy = evaluator_accuracy.evaluate(predictions)
precision = evaluator_precision.evaluate(predictions)
recall = evaluator_recall.evaluate(predictions)
f1_score = evaluator_f1.evaluate(predictions)

print("Stratified Sampling Results:")
print(f"  Accuracy: {accuracy:.4f}")
print(f"  Precision: {precision:.4f}")
print(f"  Recall: {recall:.4f}")
print(f"  F1 Score: {f1_score:.4f}")

+---------------+----------+
|play_type_index|prediction|
+---------------+----------+
|              0|       0.0|
|              1|       1.0|
|              0|       0.0|
|              0|       0.0|
|              1|       1.0|
|              0|       0.0|
|              0|       0.0|
|              1|       1.0|
|              1|       1.0|
|              0|       0.0|
|              1|       1.0|
|              2|       2.0|
|              1|       1.0|
|              0|       0.0|
|              0|       0.0|
|              0|       0.0|
|              0|       0.0|
|              0|       0.0|
|              0|       0.0|
|              0|       0.0|
|              0|       0.0|
|              0|       0.0|
|              0|       0.0|
|              0|       0.0|
|              0|       0.0|
|              1|       1.0|
|              3|       3.0|
|              0|       0.0|
|              0|       0.0|
|              1|       1.0|
|              1|       1.0|
|             

In [3]:
from pyspark.ml.classification import MultilayerPerceptronClassifier
from pyspark.sql.functions import col

assembler = VectorAssembler(inputCols=feature_columns, outputCol='features')
df_vector = assembler.transform(df_indexed)

games = df_vector.select("game_id").distinct().collect()  
train_games = games[:int(0.8 * len(games))] 
test_games = games[int(0.8 * len(games)):]

train_df = df_vector.filter(col("game_id").isin([game["game_id"] for game in train_games]))
test_df = df_vector.filter(col("game_id").isin([game["game_id"] for game in test_games]))

input_size = len(feature_columns)
num_classes = df_vector.select("play_type_index").distinct().count()

layers = [input_size, 10, num_classes]
print(f"Network layers: {layers}")

mlp = MultilayerPerceptronClassifier(labelCol="play_type_index",
                                     featuresCol="features",
                                     layers=layers,
                                     blockSize=128,
                                     seed=42,
                                     maxIter=100)
mlp_model = mlp.fit(train_df)
predictions = mlp_model.transform(test_df)
predictions.select("play_type_index", "prediction", "probability").show(50, truncate=False)

evaluator_accuracy = MulticlassClassificationEvaluator(labelCol='play_type_index', predictionCol='prediction', metricName='accuracy')
evaluator_precision = MulticlassClassificationEvaluator(labelCol='play_type_index', predictionCol='prediction', metricName='weightedPrecision')
evaluator_recall = MulticlassClassificationEvaluator(labelCol='play_type_index', predictionCol='prediction', metricName='weightedRecall')
evaluator_f1 = MulticlassClassificationEvaluator(labelCol='play_type_index', predictionCol='prediction', metricName='f1')

accuracy = evaluator_accuracy.evaluate(predictions)
precision = evaluator_precision.evaluate(predictions)
recall = evaluator_recall.evaluate(predictions)
f1_score = evaluator_f1.evaluate(predictions)

print(f"Multilayer Perceptron Classifier Results:")
print(f"  Accuracy: {accuracy:.4f}")
print(f"  Precision: {precision:.4f}")
print(f"  Recall: {recall:.4f}")
print(f"  F1 Score: {f1_score:.4f}")

Network layers: [19, 10, 6]
+---------------+----------+------------------------------------------------------------------------------------------------------------------------------+
|play_type_index|prediction|probability                                                                                                                   |
+---------------+----------+------------------------------------------------------------------------------------------------------------------------------+
|0              |1.0       |[0.41407416218425996,0.4461948507901783,0.09438924081115807,0.042372412982437005,0.002701288808602586,2.680444233641358E-4]   |
|0              |0.0       |[0.7494328850903264,0.21752925779114066,0.01850829909024133,0.01416320358527597,2.5792818752864034E-4,1.0842625548683016E-4]  |
|0              |0.0       |[0.749341212830947,0.21760199587417975,0.018520359517154656,0.014169801291514218,2.581609262277899E-4,1.084695599766474E-4]   |
|1              |1.0       |[0.41407