<h2>Regex Explanations</h2>

In [24]:
# If you installed Spark on Windows, you may need the findspark package,
# and you may need to initialize it before you can use pyspark.
# You may also need to initialize the SparkContext yourself.
#import findspark
#findspark.find()
#findspark.init()
import pyspark
from pyspark.sql import SparkSession, SQLContext
from pyspark.ml.feature import Imputer
from pyspark.sql.functions import *

appName = "Big Data Analytics"
master = "local"

# Build the Spark configuration: pin the driver host to loopback, then apply the
# application name and master URL defined above.
conf = (
    pyspark.SparkConf()
    .set('spark.driver.host', '127.0.0.1')
    .setAppName(appName)
    .setMaster(master)
)

# Create (or reuse) the SparkContext with this configuration rather than the default one.
# SparkContext is reached through the `pyspark` module because this notebook never
# imports the bare name explicitly; the unqualified form only resolves when a wildcard
# import or leftover kernel state happens to provide it.
sc = pyspark.SparkContext.getOrCreate(conf=conf)

# A SQLContext can be created from the SparkContext for some database operations;
# it is left disabled here.
#sqlContext = SQLContext(sc)

# Get the SparkSession (an existing one is returned if present, otherwise one is built).
spark = SparkSession.builder.getOrCreate()

# Ingest data from the plays CSV into a Spark DataFrame, taking column names from the
# header row and letting Spark infer the column types.
plays_df = (
    spark.read
    .format("csv")
    .option("inferSchema", "true")
    .option("header", "true")
    .load("/home/bigdata/Big-Data-Bowl/Data/plays.csv")
)

plays_df.printSchema()

root
 |-- gameId: integer (nullable = true)
 |-- playId: integer (nullable = true)
 |-- quarter: integer (nullable = true)
 |-- GameClock: string (nullable = true)
 |-- down: integer (nullable = true)
 |-- yardsToGo: integer (nullable = true)
 |-- possessionTeam: string (nullable = true)
 |-- yardlineSide: string (nullable = true)
 |-- yardlineNumber: string (nullable = true)
 |-- offenseFormation: string (nullable = true)
 |-- personnel.offense: string (nullable = true)
 |-- defendersInTheBox: string (nullable = true)
 |-- numberOfPassRushers: string (nullable = true)
 |-- personnel.defense: string (nullable = true)
 |-- HomeScoreBeforePlay: integer (nullable = true)
 |-- VisitorScoreBeforePlay: integer (nullable = true)
 |-- HomeScoreAfterPlay: integer (nullable = true)
 |-- VisitorScoreAfterPlay: integer (nullable = true)
 |-- isPenalty: boolean (nullable = true)
 |-- isSTPlay: boolean (nullable = true)
 |-- SpecialTeamsPlayType: string (nullable = true)
 |-- KickReturnYardage: stri

In [25]:
# Count the rows loaded from plays.csv; the bare name on the last line displays the value.
num_rows = plays_df.count()
num_rows

14193

In [26]:
# Number of columns in the plays DataFrame, taken from the length of its column-name list.
num_columns = len(plays_df.columns)
num_columns

27

In [27]:
# Peek at the first 10 free-text play descriptions. show() truncates long strings by
# default, which is why the displayed values end in "...".
plays_df.select("playDescription").show(10)

+--------------------+
|     playDescription|
+--------------------+
|K.Redfern kicks 6...|
|(14:54) (Shotgun)...|
|(14:16) Da.Johnso...|
|(13:34) (Shotgun)...|
|(12:51) (Shotgun)...|
|(12:13) C.Palmer ...|
|(12:08) (Shotgun)...|
|(12:00) A.Abdulla...|
|(11:29) (Shotgun)...|
|P.Dawson extra po...|
+--------------------+
only showing top 10 rows



<h2>Process Regex</h2>

In [28]:
# Capture group 1 of the first "<initials>.<LastName>" token in the description, i.e. the
# first player's initial(s). The pattern is a raw string so that \w and \. reach the regex
# engine without relying on Python passing unknown escape sequences through.
# The DataFrame is assigned first and displayed separately: show() returns None, so
# chaining it into the assignment would leave plays_df_example bound to None.
plays_df_example = plays_df.withColumn(
    'Player_1',
    regexp_extract(col('playDescription'), r'(\w+)(\.)(\w+)', 1),
).select("Player_1")
plays_df_example.show(40)

+--------+
|Player_1|
+--------+
|       K|
|       C|
|      Da|
|       C|
|      Da|
|       C|
|       C|
|       A|
|       M|
|       P|
|       P|
|       A|
|       M|
|       M|
|       K|
|       C|
|      Da|
|      Da|
|       A|
|       M|
|       M|
|       M|
|       K|
|      Da|
|      Da|
|       C|
|       P|
|       C|
|      Da|
|       C|
|       P|
|       P|
|       A|
|       M|
|       M|
|       T|
|       M|
|       M|
|       A|
|       M|
+--------+
only showing top 40 rows



In [29]:
# Two "<initials>.<LastName>" tokens separated by a lazy gap; capture group 5 is the
# initial(s) of the second token. The pattern is a raw string so that \w and \. reach the
# regex engine without relying on Python passing unknown escape sequences through.
# The DataFrame is assigned first and displayed separately: show() returns None, so
# chaining it into the assignment would leave plays_df_example bound to None.
plays_df_example = plays_df.withColumn(
    'Player_2',
    regexp_extract(col('playDescription'), r'(\w+)(\.)(\w+)(.*?)(\w+)(\.)(\w+)', 5),
).select("Player_2")
plays_df_example.show(40)

+--------+
|Player_2|
+--------+
|       K|
|       L|
|       E|
|      Da|
|       A|
|      Da|
|      Da|
|       H|
|       G|
|       A|
|       D|
|       K|
|       A|
|       K|
|       D|
|      Da|
|       N|
|       T|
|       A|
|       C|
|       T|
|       M|
|       K|
|       G|
|       T|
|       L|
|       A|
|      Da|
|       A|
|       J|
|       A|
|       D|
|       T|
|       G|
|       T|
|       C|
|       K|
|       G|
|       K|
|       G|
+--------+
only showing top 40 rows



In [30]:
# Two "<initials>.<LastName>" tokens separated by a lazy gap; capture group 7 is the
# last name of the second token. The pattern is a raw string so that \w and \. reach the
# regex engine without relying on Python passing unknown escape sequences through.
# The DataFrame is assigned first and displayed separately: show() returns None, so
# chaining it into the assignment would leave plays_df_example bound to None.
plays_df_example = plays_df.withColumn(
    'Player_2',
    regexp_extract(col('playDescription'), r'(\w+)(\.)(\w+)(.*?)(\w+)(\.)(\w+)', 7),
).select("Player_2")
plays_df_example.show(40)

+----------+
|  Player_2|
+----------+
|  Williams|
|Fitzgerald|
|     Ansah|
|   Johnson|
|    Zettel|
|   Johnson|
|   Johnson|
|   Reddick|
|      Tate|
|    Brewer|
|Washington|
|    Dansby|
|  Abdullah|
|  Golladay|
|  Muhlbach|
|   Johnson|
|    Lawson|
| Whitehead|
|    Brewer|
|     Jones|
|   Riddick|
|  Stafford|
|   Redfern|
|      Quin|
|    Wilson|
|Fitzgerald|
|    Brewer|
|   Johnson|
|    Zettel|
|   Gresham|
|    Brewer|
|Washington|
|   Mathieu|
|      Tate|
|   Riddick|
|     Jones|
|  Golladay|
|      Tate|
|    Dansby|
|      Tate|
+----------+
only showing top 40 rows



In [31]:
# Building blocks for the player-name patterns (raw strings, so \w and \. are passed to
# the regex engine verbatim instead of relying on Python tolerating unknown escapes).
#   player_token : "<initials>.<LastName>"  -> 3 capture groups (initials, dot, last name)
#   lazy_gap     : shortest possible run of characters between tokens -> 1 capture group
player_token = r'(\w+)(\.)(\w+)'
lazy_gap = r'(.*?)'

# Each pattern extends the previous one by a gap plus one more token, so the last-name
# group of the Nth player is group 3, 7 and 11 respectively.
first_player_pattern = player_token
second_player_pattern = first_player_pattern + lazy_gap + player_token
third_player_pattern = second_player_pattern + lazy_gap + player_token

# Add one last-name column per player mentioned in the description. Rows with fewer
# matching tokens get an empty string in the corresponding column.
plays_df_with_players_columns = (
    plays_df
    .withColumn('Player_1', regexp_extract(col('playDescription'), first_player_pattern, 3))
    .withColumn('Player_2', regexp_extract(col('playDescription'), second_player_pattern, 7))
    .withColumn('Player_3', regexp_extract(col('playDescription'), third_player_pattern, 11))
)


In [32]:
# Print the schema of the plays DataFrame after the Player_1/Player_2/Player_3 columns were added.
plays_df_with_players_columns.printSchema()

root
 |-- gameId: integer (nullable = true)
 |-- playId: integer (nullable = true)
 |-- quarter: integer (nullable = true)
 |-- GameClock: string (nullable = true)
 |-- down: integer (nullable = true)
 |-- yardsToGo: integer (nullable = true)
 |-- possessionTeam: string (nullable = true)
 |-- yardlineSide: string (nullable = true)
 |-- yardlineNumber: string (nullable = true)
 |-- offenseFormation: string (nullable = true)
 |-- personnel.offense: string (nullable = true)
 |-- defendersInTheBox: string (nullable = true)
 |-- numberOfPassRushers: string (nullable = true)
 |-- personnel.defense: string (nullable = true)
 |-- HomeScoreBeforePlay: integer (nullable = true)
 |-- VisitorScoreBeforePlay: integer (nullable = true)
 |-- HomeScoreAfterPlay: integer (nullable = true)
 |-- VisitorScoreAfterPlay: integer (nullable = true)
 |-- isPenalty: boolean (nullable = true)
 |-- isSTPlay: boolean (nullable = true)
 |-- SpecialTeamsPlayType: string (nullable = true)
 |-- KickReturnYardage: stri

In [33]:
# Spot-check the three extracted last-name columns on the first 5 rows.
plays_df_with_players_columns.select("Player_1","Player_2","Player_3").show(5)

+--------+----------+---------+
|Player_1|  Player_2| Player_3|
+--------+----------+---------+
| Redfern|  Williams|Killebrew|
|  Palmer|Fitzgerald|    Diggs|
| Johnson|     Ansah|         |
|  Palmer|   Johnson|     Quin|
| Johnson|    Zettel|         |
+--------+----------+---------+
only showing top 5 rows



In [34]:
# Configure a CSV reader that takes column names from the header row and infers
# column types, then load the players file with it.
players_csv_reader = (
    spark.read
    .format("csv")
    .option("inferSchema", "true")
    .option("header", "true")
)
players_df = players_csv_reader.load("/home/bigdata/Big-Data-Bowl/Data/players.csv")

# Display the inferred schema of the players table.
players_df.printSchema()

root
 |-- nflId: integer (nullable = true)
 |-- FirstName: string (nullable = true)
 |-- LastName: string (nullable = true)
 |-- PositionAbbr: string (nullable = true)
 |-- EntryYear: integer (nullable = true)
 |-- DraftRound: string (nullable = true)
 |-- DraftNumber: string (nullable = true)
 |-- Height: string (nullable = true)
 |-- Weight: integer (nullable = true)
 |-- College: string (nullable = true)



In [35]:
# Keep only the player id and last name, then remove duplicate (nflId, LastName) pairs.
player_id_and_last_name = players_df.select("nflId", "LastName")
players_id_and_lname_df = player_id_and_last_name.distinct()

In [36]:
# Number of distinct (nflId, LastName) pairs.
players_id_and_lname_df.count()

1713

In [15]:
# Count how many distinct players share each last name. The column is referenced with the
# same casing as the schema ("LastName") so the lookup does not depend on Spark's
# case-insensitive column resolution being enabled.
players_id_and_lname_df.groupBy("LastName").count().show()

+-----------+-----+
|   lastName|count|
+-----------+-----+
|     Grimes|    1|
|     Uzomah|    1|
|Artis-Payne|    1|
|    Gilliam|    1|
|    Parnell|    1|
|      Moats|    1|
|     McPhee|    1|
|   Brinkley|    1|
|    Mihalik|    1|
|    Pouncey|    2|
|   Fackrell|    1|
|   Harrison|    3|
|   Thornton|    1|
|       Ealy|    1|
|     Porter|    1|
|   Golladay|    1|
|     Wilson|    8|
|   McGovern|    1|
|   Abdullah|    1|
|     Elston|    1|
+-----------+-----+
only showing top 20 rows



In [37]:
from pyspark.sql.window import Window
from pyspark.sql.functions import col, row_number

# Partition players by last name and order each partition by ascending nflId.
w2 = Window.partitionBy("LastName").orderBy(col("nflId"))

# Number the rows inside each last-name partition, keep only row 1 (the smallest nflId
# for that name), then discard the helper column.
# NOTE(review): players who share a last name collapse to a single id here, so a later
# join on last name alone may attribute a play to a different player -- confirm this
# is acceptable.
numbered_by_last_name = players_id_and_lname_df.withColumn("row", row_number().over(w2))
single_last_name_players_df = numbered_by_last_name.filter(col("row") == 1).drop("row")
                                    

In [38]:
# Number of rows left after keeping one player per LastName.
single_last_name_players_df.count()

1187

In [39]:
# Join condition: the extracted Player_1 last name equals the player's LastName.
player_1_matches_last_name = (
    plays_df_with_players_columns["Player_1"] == single_last_name_players_df["LastName"]
)

# Inner join (the default join type), so plays whose Player_1 has no matching
# LastName are not kept in the result.
plays_with_player_1 = plays_df_with_players_columns.join(
    single_last_name_players_df,
    on=player_1_matches_last_name,
    how="inner",
)

In [40]:
# Print the schema of the joined result (plays columns plus the columns of single_last_name_players_df).
plays_with_player_1.printSchema()

root
 |-- gameId: integer (nullable = true)
 |-- playId: integer (nullable = true)
 |-- quarter: integer (nullable = true)
 |-- GameClock: string (nullable = true)
 |-- down: integer (nullable = true)
 |-- yardsToGo: integer (nullable = true)
 |-- possessionTeam: string (nullable = true)
 |-- yardlineSide: string (nullable = true)
 |-- yardlineNumber: string (nullable = true)
 |-- offenseFormation: string (nullable = true)
 |-- personnel.offense: string (nullable = true)
 |-- defendersInTheBox: string (nullable = true)
 |-- numberOfPassRushers: string (nullable = true)
 |-- personnel.defense: string (nullable = true)
 |-- HomeScoreBeforePlay: integer (nullable = true)
 |-- VisitorScoreBeforePlay: integer (nullable = true)
 |-- HomeScoreAfterPlay: integer (nullable = true)
 |-- VisitorScoreAfterPlay: integer (nullable = true)
 |-- isPenalty: boolean (nullable = true)
 |-- isSTPlay: boolean (nullable = true)
 |-- SpecialTeamsPlayType: string (nullable = true)
 |-- KickReturnYardage: stri

In [41]:
# Show each play's extracted Player_1 last name next to the nflId it was matched to.
plays_with_player_1.select("Player_1","nflId").show()

+--------+-------+
|Player_1|  nflId|
+--------+-------+
| Redfern|2550848|
|  Palmer|2505245|
| Johnson|    262|
|  Palmer|2505245|
| Johnson|    262|
|  Palmer|2505245|
|  Palmer|2505245|
|Abdullah|2552374|
|Stafford|  79860|
|  Dawson|2500351|
|  Dawson|2500351|
|Abdullah|2552374|
|Stafford|  79860|
|Stafford|  79860|
| Redfern|2550848|
|  Palmer|2505245|
| Johnson|    262|
| Johnson|    262|
|     Lee| 496937|
|Stafford|  79860|
+--------+-------+
only showing top 20 rows



In [42]:
# Replace the textual Player_1 column with the matched id: drop the last-name text,
# then expose nflId under the name Player1.
playes_df_with_player_1_final = (
    plays_with_player_1
    .drop("Player_1")
    .withColumnRenamed("nflId", "Player1")
)

In [43]:
# Confirm that the Player1 column now holds the numeric player id for the first 5 rows.
playes_df_with_player_1_final.select("Player1").show(5)

+-------+
|Player1|
+-------+
|2550848|
|2505245|
|    262|
|2505245|
|    262|
+-------+
only showing top 5 rows

