In [13]:
from pyspark.sql import SparkSession
from pyspark.sql import functions as F
from pyspark.sql.window import Window
from pyspark.ml.feature import StringIndexer, VectorAssembler
from pyspark.ml.classification import RandomForestClassifier
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

spark = SparkSession.builder.config("spark.driver.host", "localhost").getOrCreate()
file_path = 'file:///C:/Users/seanz/VSCode_WS/BigData/NFL_PBP_V1.csv'
df = spark.read.csv(file_path, header=True, inferSchema=True)
df.head(10)


[Row(play_id=35, game_id=2019090500, home_team='CHI', away_team='GB', posteam='GB', posteam_type='away', defteam='CHI', side_of_field='CHI', yardline_100=35.0, game_date=datetime.date(2019, 9, 5), quarter_seconds_remaining=900.0, half_seconds_remaining=1800.0, game_seconds_remaining=3600.0, game_half='Half1', quarter_end=0, drive=1, sp=0, qtr=1, down=0.0, goal_to_go=0.0, time=datetime.datetime(2025, 2, 2, 15, 0), yrdln='CHI 35', ydstogo=0, ydsnet=-10, desc='E.Pineiro kicks 65 yards from CHI 35 to end zone, Touchback.', play_type='kickoff', yards_gained='0.0', shotgun='0', no_huddle='0', qb_dropback=0.0, qb_kneel=0.0, qb_spike=0, qb_scramble=0.0, pass_length='None', pass_location='None', air_yards='0.0', yards_after_catch='0.0', run_location='None', run_gap='None', field_goal_result='None', kick_distance='0.0', extra_point_result='None', two_point_conv_result='None', home_timeouts_remaining='3', away_timeouts_remaining='3', timeout='0.0', timeout_team='None', td_team='None', posteam_tim

In [14]:

df = df.withColumn(
    'play_success', 
    F.when(
        (F.col('play_type') == 'run') & (F.col('rush_attempt') == 1) & (F.col('yards_gained') >= 4), 1
    ).when(
        (F.col('play_type') == 'pass') & (F.col('pass_attempt') == 1) & (F.col('yards_gained') >= 4), 1
    ).otherwise(0)
)

window_spec = Window.partitionBy('game_id', 'posteam').orderBy('play_id')
df = df.withColumn(
    'cumulative_rush_attempts', 
    F.sum(F.when((F.col('play_type') == 'run') & (F.col('rush_attempt') == 1), 1).otherwise(0)).over(window_spec)
)

df = df.withColumn(
    'cumulative_pass_attempts', 
    F.sum(F.when((F.col('play_type') == 'pass') & (F.col('pass_attempt') == 1), 1).otherwise(0)).over(window_spec)
)

df = df.withColumn(
    'cumulative_rush_successes', 
    F.sum(F.when(
        (F.col('play_type') == 'run') & (F.col('rush_attempt') == 1) & (F.col('yards_gained') >= 4), 1).otherwise(0)
    ).over(window_spec)
)

df = df.withColumn(
    'cumulative_pass_successes', 
    F.sum(F.when(
        (F.col('play_type') == 'pass') & (F.col('pass_attempt') == 1) & (F.col('yards_gained') >= 4), 1).otherwise(0)
    ).over(window_spec)
)

df = df.withColumn(
    'rush_success_rate', 
    F.when(F.col('cumulative_rush_attempts') > 0, F.col('cumulative_rush_successes') / F.col('cumulative_rush_attempts')).otherwise(0)
)

df = df.withColumn(
    'pass_success_rate', 
    F.when(F.col('cumulative_pass_attempts') > 0, F.col('cumulative_pass_successes') / F.col('cumulative_pass_attempts')).otherwise(0)
)

df = df.withColumn('posteam_leading', F.col('score_differential_post') > 0)
df = df.withColumn('posteam_trailing', F.col('score_differential_post') < 0)
df = df.withColumn('yards_gained', F.col('yards_gained').cast('float'))
df = df.withColumn('shotgun', F.col('shotgun').cast('float'))
df = df.withColumn('no_huddle', F.col('no_huddle').cast('float'))
df = df.withColumn('timeout', F.col('timeout').cast('float'))
df = df.withColumn('posteam_timeouts_remaining', F.col('posteam_timeouts_remaining').cast('float'))
df = df.withColumn('defteam_timeouts_remaining', F.col('defteam_timeouts_remaining').cast('float'))
offensive_playtypes = ['field_goal', 'run', 'punt', 'pass', 'qb_kneel', 'qb_spike']
df = df.filter(df.play_type.isin(offensive_playtypes))


In [None]:
df_indexed = df.withColumn('play_type_index', 
                           F.when(F.col('play_type') == 'pass', 0)
                            .when(F.col('play_type') == 'run', 1)
                            .when(F.col('play_type') == 'punt', 2)
                            .when(F.col('play_type') == 'field_goal', 3)
                            .when(F.col('play_type') == 'qb_kneel', 4)
                            .when(F.col('play_type') == 'qb_spike', 5)
                            .otherwise(-1))  

feature_columns = [ 'yardline_100', 'game_seconds_remaining', 'qtr', 'down', 'goal_to_go', 'ydstogo', 'yards_gained',
                   'shotgun', 'no_huddle', 'qb_kneel', 'qb_spike', 'timeout', 'posteam_timeouts_remaining', 
                   'defteam_timeouts_remaining', 'score_differential_post', 'rush_attempt', 'pass_attempt', 
                   'field_goal_attempt', 'punt_attempt']

print("Class distribution:")
df_indexed.groupBy('play_type_index').count().show()

assembler = VectorAssembler(inputCols=feature_columns, outputCol='features')
df_vector = assembler.transform(df_indexed)

rf = RandomForestClassifier(labelCol='play_type_index', featuresCol='features', maxDepth=10)
train_df, test_df = df_vector.randomSplit([0.8, 0.2], seed=42)
rf_model = rf.fit(train_df)
predictions = rf_model.transform(test_df)

predictions.select('play_type_index', 'prediction').show(50)
print("Confusion Matrix:")
predictions.groupBy("play_type_index", "prediction").count().show()

evaluator = MulticlassClassificationEvaluator(labelCol='play_type_index', predictionCol='prediction', metricName='accuracy')
evaluator_precision = MulticlassClassificationEvaluator(labelCol="play_type_index", predictionCol="prediction", metricName="weightedPrecision")
evaluator_recall = MulticlassClassificationEvaluator(labelCol="play_type_index", predictionCol="prediction", metricName="weightedRecall")
evaluator_f1 = MulticlassClassificationEvaluator(labelCol="play_type_index", predictionCol="prediction", metricName="f1")

precision = evaluator_precision.evaluate(predictions)
recall = evaluator_recall.evaluate(predictions)
f1_score = evaluator_f1.evaluate(predictions)
accuracy = evaluator.evaluate(predictions)

print(f"Model Accuracy: {accuracy}")
print(f"Model Precision: {precision}")
print(f"Model Recall: {recall}")
print(f"Model F1 Score: {f1_score}")

Class distribution:
+---------------+------+
|play_type_index| count|
+---------------+------+
|              1|146702|
|              3| 10819|
|              5|   762|
|              4|  4271|
|              2| 26260|
|              0|207184|
+---------------+------+

+---------------+----------+
|play_type_index|prediction|
+---------------+----------+
|              1|       1.0|
|              1|       1.0|
|              1|       1.0|
|              0|       0.0|
|              0|       0.0|
|              0|       0.0|
|              0|       0.0|
|              1|       1.0|
|              1|       1.0|
|              0|       0.0|
|              1|       1.0|
|              1|       1.0|
|              1|       1.0|
|              0|       0.0|
|              1|       1.0|
|              0|       0.0|
|              0|       0.0|
|              0|       0.0|
|              0|       0.0|
|              1|       1.0|
|              1|       1.0|
|              0|       0.0|
|   

Completed:
1)Went back to original dataset from one hot encoded, couldnt get it to work
2)Feature engineering same features but in not one hot encoded
3)Labeled prediction classes - Offensive Play Types - with integers 0 - 5
4)Chose to work with Random Forest Classifier over Gradient Boosted Tree, i couldnt get it to work for anything other than binary classification
5)Split data randomly 80-20
6)Trained and evaluated model - Results are too accurate so somethings wrong like overfitting due to data imbalance

Plan Next Week:
1)Balancing method - Undersample pass and run? Oversample synthetic data using SMOTE or something else? Modify weights in loss function?
2)Cross Validation Evaluation
3)Hyper Parameter Tuning
4)Possibly try logistic regression model with Softmax - Did some research and seems like a good fit if results are still bad after trying the above options


Do single class analysis thats being predicted before balancing!!!

Project Proposal Schedule:
• Week 1: Research and Planning ************Complete
• Week 2: Hadoop/Spark setup for data ingestion pipeline ***********Complete
• Week 3: Data Cleaning, Pre-Processing, and Feature Engineering *************Complete
• Week 4: Training of Machine Learning Model **************Complete-Ish
• Week 5: Model optimization  
• Week 6: Build some type of visualization dashboard 
• Week 7: Analysis and Evaluation 
• Week 8: Report and Presentation