In [1]:
import pyspark.sql.functions as F
import pyspark.sql.types as T

In [2]:
# `import pyspark.sql.SparkSession` fails with: No module named "pyspark.sql.SparkSession".
# Why: the `import a.b.c` form only works when `c` is a module, and naming conventions hint that it is not.
# PEP 8: modules should have short, all-lowercase names (underscores may be used if they improve readability);
# packages should also have short, all-lowercase names, although the use of underscores is discouraged.
# Class names should normally use the CapWords convention.
# SparkSession is a class defined in the pyspark.sql module,
# whereas pyspark.sql.functions and pyspark.sql.types are both standalone modules.
# spark = pyspark.sql.SparkSession.builder.appName("Feature Engineering -- UFC Data").getOrCreate()
from pyspark.sql import SparkSession

In [3]:
# Obtain the Spark session for this notebook under a descriptive application
# name; getOrCreate() hands back a session built from this builder.
spark = (
    SparkSession.builder
    .appName("Feature Engineering -- UFC Data")
    .getOrCreate()
)
# Bare expression so the notebook renders the session summary.
spark

In [4]:
import os
from os.path import isfile, join


In [5]:
# Locate the input files: they live in an "ignoreFiles" folder directly under
# the current working directory (os.path.abspath("") resolves to the cwd).
current_loc = os.path.abspath("")
currnet_loc = current_loc  # original (misspelled) name, kept so any cell that still reads it keeps working

# The trailing "" makes join() end the path with a separator, so code that
# appends a file name directly to data_loc keeps producing a valid path.
data_loc = join(current_loc, "ignoreFiles", "")

# os.listdir() returns entries in arbitrary order; sort them so the listing
# (and everything derived from it) is reproducible. Sub-directories are skipped.
files = sorted(f for f in os.listdir(data_loc) if isfile(join(data_loc, f)))
files

['data.csv', 'raw_fighter_details.csv', 'raw_total_fight_data.csv']

In [6]:
# Load every CSV found in data_loc into a Spark DataFrame.
# `data` maps the file name without its extension to the DataFrame, and each
# DataFrame is also registered as a global temp view named "data_<name>".
data = {}
for file in files:
    # splitext() replaces the old file[:-4] slice, which silently mangled any
    # name whose extension was not exactly four characters long.
    name, ext = os.path.splitext(file)
    if ext.lower() != ".csv":
        # Stray non-CSV files (editor/OS artifacts) must not be parsed as CSV.
        continue
    frame = spark.read.csv(os.path.join(data_loc, file), inferSchema=True, header=True)
    frame.createOrReplaceGlobalTempView(f"data_{name}")
    data[name] = frame

In [7]:
# Sanity check on the loader: print the container type, then the loaded table names.
for summary in (type(data), data.keys()):
    print(summary)

<class 'dict'>
dict_keys(['data', 'raw_fighter_details', 'raw_total_fight_data'])


In [8]:
# Keep only fights dated after 2017-01-01 and attach the winner's and loser's
# names. `date` is compared against a string literal; the sample row shown in
# this notebook has it as an ISO "YYYY-MM-DD" string.
# NOTE(review): otherwise() treats every Winner value other than "Red" as a
# Blue win. If the dataset encodes draws / no-contests in `Winner`, those
# bouts are counted as a Blue win and a Red loss - confirm against the data.
red_won = F.col("Winner") == "Red"
data["data"] = (
    data["data"]
    .where(F.col("date") > "2017-01-01")
    .withColumn("Winner_name", F.when(red_won, F.col("R_fighter")).otherwise(F.col("B_fighter")))
    .withColumn("Loser_name", F.when(red_won, F.col("B_fighter")).otherwise(F.col("R_fighter")))
)

In [9]:
# Confirm the filtered fight table is still a Spark DataFrame.
type(data["data"])

pyspark.sql.dataframe.DataFrame

In [10]:
# Pull a single Row to the driver to eyeball the available columns.
data["data"].take(1)

[Row(R_fighter='Henry Cejudo', B_fighter='Marlon Moraes', Referee='Marc Goddard', date='2019-06-08', location='Chicago, Illinois, USA', Winner='Red', title_bout=True, weight_class='Bantamweight', no_of_rounds=5, B_current_lose_streak=0.0, B_current_win_streak=4.0, B_draw=0.0, B_avg_BODY_att=9.2, B_avg_BODY_landed=6.0, B_avg_CLINCH_att=0.2, B_avg_CLINCH_landed=0.0, B_avg_DISTANCE_att=62.6, B_avg_DISTANCE_landed=20.6, B_avg_GROUND_att=2.6, B_avg_GROUND_landed=2.0, B_avg_HEAD_att=48.6, B_avg_HEAD_landed=11.2, B_avg_KD=0.8, B_avg_LEG_att=7.6, B_avg_LEG_landed=5.4, B_avg_PASS=0.4, B_avg_REV=0.0, B_avg_SIG_STR_att=65.4, B_avg_SIG_STR_landed=22.6, B_avg_SIG_STR_pct=0.466, B_avg_SUB_ATT=0.4, B_avg_TD_att=0.8, B_avg_TD_landed=0.2, B_avg_TD_pct=0.1, B_avg_TOTAL_STR_att=66.4, B_avg_TOTAL_STR_landed=23.6, B_longest_win_streak=4.0, B_losses=1.0, B_avg_opp_BODY_att=6.4, B_avg_opp_BODY_landed=4.0, B_avg_opp_CLINCH_att=1.0, B_avg_opp_CLINCH_landed=0.6, B_avg_opp_DISTANCE_att=51.2, B_avg_opp_DISTANCE_l

In [11]:
# Count the distinct fighter names appearing in either corner: stack the
# red-corner and blue-corner name columns, de-duplicate, and count.
fights = data['data']
all_red_fighters = fights.select(F.col('R_fighter'))
all_blue_fighters = fights.select(F.col('B_fighter'))
(
    all_red_fighters
    .union(all_blue_fighters)
    .distinct()
    .count()
)

763

In [12]:
# Tally wins: one row per distinct Winner_name with the number of fights won.
wins_table = (
    data['data']
    .groupBy('Winner_name')
    .agg(F.count('*').alias('Wins'))
)

In [13]:
# Preview the per-fighter win counts.
wins_table.show()

+-------------------+----+
|        Winner_name|Wins|
+-------------------+----+
|        Chad Mendes|   1|
|          Zak Ottow|   3|
|Deiveson Figueiredo|   3|
|       Marcelo Golm|   1|
|           Jon Tuck|   1|
|    Danielle Taylor|   1|
|     Montel Jackson|   2|
| Ovince Saint Preux|   4|
| Alexander Yakovlev|   1|
|         Polo Reyes|   1|
|     Corey Anderson|   3|
|   Stephen Thompson|   1|
|     Augusto Mendes|   1|
|       Matt Schnell|   3|
|      Gilbert Burns|   4|
|        Josh Emmett|   3|
|         Juan Adams|   1|
|      Ricardo Lamas|   2|
|       Alex Caceres|   2|
|          Kevin Lee|   3|
+-------------------+----+
only showing top 20 rows



In [14]:
# Tally losses: one row per distinct Loser_name with the number of fights lost.
losses_table = (
    data['data']
    .groupBy('Loser_name')
    .agg(F.count('*').alias('Losses'))
)

In [15]:
# Preview the per-fighter loss counts.
losses_table.show()

+-------------------+------+
|         Loser_name|Losses|
+-------------------+------+
|        Kyle Nelson|     2|
|        Chad Mendes|     1|
|       Marcelo Golm|     3|
|Deiveson Figueiredo|     1|
|          Zak Ottow|     3|
|        Artem Lobov|     3|
|           Jon Tuck|     1|
|    Danielle Taylor|     2|
|        Sung Bin Jo|     1|
| Ovince Saint Preux|     4|
|     Montel Jackson|     1|
|        Johnny Case|     1|
|        Davey Grant|     1|
|     Johnny Eduardo|     2|
| Antonio Braga Neto|     1|
|        Jason Saggo|     1|
|         Polo Reyes|     2|
|       Scott Askham|     1|
|   Stephen Thompson|     3|
|    Marcin Prachnio|     2|
+-------------------+------+
only showing top 20 rows



In [16]:
# Full outer join of the two tallies on the fighter's name, so fighters who
# appear in only one of the tables are kept (with nulls on the missing side).
same_fighter = wins_table['Winner_name'] == losses_table['Loser_name']
full_join = wins_table.join(losses_table, on=same_fighter, how='full')

In [17]:
# Preview the joined tallies; a null name/count marks a fighter missing from that side.
full_join.show()

+-------------------+----+-------------------+------+
|        Winner_name|Wins|         Loser_name|Losses|
+-------------------+----+-------------------+------+
|        Chad Mendes|   1|        Chad Mendes|     1|
|               null|null|        Kyle Nelson|     2|
|               null|null|        Artem Lobov|     3|
|    Danielle Taylor|   1|    Danielle Taylor|     2|
|Deiveson Figueiredo|   3|Deiveson Figueiredo|     1|
|           Jon Tuck|   1|           Jon Tuck|     1|
|       Marcelo Golm|   1|       Marcelo Golm|     3|
|          Zak Ottow|   3|          Zak Ottow|     3|
|               null|null|        Johnny Case|     1|
|     Montel Jackson|   2|     Montel Jackson|     1|
| Ovince Saint Preux|   4| Ovince Saint Preux|     4|
|               null|null|        Sung Bin Jo|     1|
| Alexander Yakovlev|   1|               null|  null|
|               null|null| Antonio Braga Neto|     1|
|               null|null|        Davey Grant|     1|
|               null|null|  

In [18]:
# Merge the two name columns into a single "Player" column. After the full
# join, a fighter with no wins has a null Winner_name and a fighter with no
# losses has a null Loser_name. coalesce() returns its first non-null
# argument, which is the same rule as the when(isNotNull)/otherwise form.
full_join = full_join.withColumn(
    "Player",
    F.coalesce(F.col("Winner_name"), F.col("Loser_name")),
)

In [19]:
# Keep only the merged name and the two counters (this also discards
# Winner_name / Loser_name), then replace the null counters left by the
# full join with 0.
tally_columns = ['Wins', 'Losses']
full_join = (
    full_join
    .select('Player', *tally_columns)
    .fillna(0, subset=tally_columns)
)
full_join.show()

+-------------------+----+------+
|             Player|Wins|Losses|
+-------------------+----+------+
|        Chad Mendes|   1|     1|
|        Kyle Nelson|   0|     2|
|        Artem Lobov|   0|     3|
|    Danielle Taylor|   1|     2|
|Deiveson Figueiredo|   3|     1|
|           Jon Tuck|   1|     1|
|       Marcelo Golm|   1|     3|
|          Zak Ottow|   3|     3|
|        Johnny Case|   0|     1|
|     Montel Jackson|   2|     1|
| Ovince Saint Preux|   4|     4|
|        Sung Bin Jo|   0|     1|
| Alexander Yakovlev|   1|     0|
| Antonio Braga Neto|   0|     1|
|        Davey Grant|   0|     1|
|        Jason Saggo|   0|     1|
|     Johnny Eduardo|   0|     2|
|         Polo Reyes|   1|     2|
|       Scott Askham|   0|     1|
|     Augusto Mendes|   1|     1|
+-------------------+----+------+
only showing top 20 rows



In [20]:
# Win rate = wins divided by the total number of fights (wins + losses).
total_fights = F.col("Wins") + F.col("Losses")
df_final = full_join.withColumn("Win_rate", F.col("Wins") / total_fights)

In [21]:
# Rank fighters: most wins first, ties broken by the higher win rate.
df_final = df_final.sort(F.desc('Wins'), F.desc('Win_rate'))
df_final.show()

+--------------------+----+------+------------------+
|              Player|Wins|Losses|          Win_rate|
+--------------------+----+------+------------------+
|       Thiago Santos|   8|     1|0.8888888888888888|
|      Dustin Poirier|   6|     0|               1.0|
|Alexander Volkano...|   6|     0|               1.0|
|     Israel Adesanya|   6|     0|               1.0|
|        Kamaru Usman|   6|     0|               1.0|
|      Curtis Blaydes|   6|     1|0.8571428571428571|
|     Jack Hermansson|   6|     1|0.8571428571428571|
|Anthony Rocco Martin|   6|     1|0.8571428571428571|
|    Charles Oliveira|   6|     1|0.8571428571428571|
|       Anthony Smith|   6|     2|              0.75|
|   Aljamain Sterling|   6|     2|              0.75|
|            Petr Yan|   5|     0|               1.0|
|Elizeu Zaleski do...|   5|     0|               1.0|
|Zabit Magomedshar...|   5|     0|               1.0|
|        Henry Cejudo|   5|     0|               1.0|
|    Gregor Gillespie|   5| 

In [22]:
# Confirm the fighter-details table was loaded as a Spark DataFrame.
type(data['raw_fighter_details'])

pyspark.sql.dataframe.DataFrame

In [23]:
# Fighter attribute table, taken from the loaded "raw_fighter_details" CSV.
# NOTE(review): "palyer" is a typo for "player"; renaming it means updating
# every cell that reads this name in the same change.
df_palyer_detail = data['raw_fighter_details']
# Give the frame the DataFrame alias "pd" for use in the join that follows.
# NOTE(review): the Python name `pd` is the conventional alias for pandas and
# would be shadowed if pandas is ever imported in this notebook.
pd = df_palyer_detail.alias("pd")

In [24]:
# Attach fighter attributes to the ranking. A left join keeps every ranked
# player even when no details row carries the same name.
name_matches = pd.fighter_name == df_final.Player
df_with_detail = df_final.join(pd, on=name_matches, how='left')

In [25]:
# Preview the ranking enriched with the fighter-detail columns.
df_with_detail.show()

+-------------------+----+------+------------------+-------------------+-------+--------+-----+--------+------------+
|             Player|Wins|Losses|          Win_rate|       fighter_name| Height|  Weight|Reach|  Stance|         DOB|
+-------------------+----+------+------------------+-------------------+-------+--------+-----+--------+------------+
|        Chad Mendes|   1|     1|               0.5|        Chad Mendes| 5' 6""|145 lbs.| 66""|Orthodox|May 01, 1985|
|        Kyle Nelson|   0|     2|               0.0|        Kyle Nelson|5' 11""|155 lbs.| 71""|  Switch|Apr 20, 1991|
|        Artem Lobov|   0|     3|               0.0|        Artem Lobov| 5' 9""|145 lbs.| 65""|Southpaw|Aug 11, 1986|
|    Danielle Taylor|   1|     2|0.3333333333333333|    Danielle Taylor| 5' 0""|115 lbs.| 60""|Orthodox|Oct 25, 1989|
|Deiveson Figueiredo|   3|     1|              0.75|Deiveson Figueiredo| 5' 5""|125 lbs.| 68""|Orthodox|Dec 18, 1987|
|           Jon Tuck|   1|     1|               0.5|    

In [26]:
# Row count after the left join. It is compared by eye with df_final.count():
# a larger number would mean some player matched more than one details row.
df_with_detail.count()

763

In [27]:
# Row count before the join (one row per player), the baseline for the check above.
df_final.count()

763