In [35]:
import org.apache.spark.sql.functions._

In [17]:
val players = spark.read.format("csv").option("header","true").load("/Users/pulkit/data/player.csv")

In [18]:
players.printSchema

root
 |-- id: string (nullable = true)
 |-- player_api_id: string (nullable = true)
 |-- player_name: string (nullable = true)
 |-- player_fifa_api_id: string (nullable = true)
 |-- birthday: string (nullable = true)
 |-- height: string (nullable = true)
 |-- weight: string (nullable = true)



In [19]:
players.show(5)

+---+-------------+------------------+------------------+-------------------+------+------+
| id|player_api_id|       player_name|player_fifa_api_id|           birthday|height|weight|
+---+-------------+------------------+------------------+-------------------+------+------+
|  1|       505942|Aaron Appindangoye|            218353|1992-02-29 00:00:00|182.88|   187|
|  2|       155782|   Aaron Cresswell|            189615|1989-12-15 00:00:00|170.18|   146|
|  3|       162549|       Aaron Doran|            186170|1991-05-13 00:00:00|170.18|   163|
|  4|        30572|     Aaron Galindo|            140161|1982-05-08 00:00:00|182.88|   198|
|  5|        23780|      Aaron Hughes|             17725|1979-11-08 00:00:00|182.88|   154|
+---+-------------+------------------+------------------+-------------------+------+------+
only showing top 5 rows



In [11]:
val playerAttributes = spark.read.format("csv").option("header","true").load("/Users/pulkit/data/Player_Attributes.csv")

In [12]:
playerAttributes.printSchema

root
 |-- id: string (nullable = true)
 |-- player_fifa_api_id: string (nullable = true)
 |-- player_api_id: string (nullable = true)
 |-- date: string (nullable = true)
 |-- overall_rating: string (nullable = true)
 |-- potential: string (nullable = true)
 |-- preferred_foot: string (nullable = true)
 |-- attacking_work_rate: string (nullable = true)
 |-- defensive_work_rate: string (nullable = true)
 |-- crossing: string (nullable = true)
 |-- finishing: string (nullable = true)
 |-- heading_accuracy: string (nullable = true)
 |-- short_passing: string (nullable = true)
 |-- volleys: string (nullable = true)
 |-- dribbling: string (nullable = true)
 |-- curve: string (nullable = true)
 |-- free_kick_accuracy: string (nullable = true)
 |-- long_passing: string (nullable = true)
 |-- ball_control: string (nullable = true)
 |-- acceleration: string (nullable = true)
 |-- sprint_speed: string (nullable = true)
 |-- agility: string (nullable = true)
 |-- reactions: string (nullable = true

In [25]:
val playersd = players.drop("id","player_fifa_api_id")
playersd.columns

Name: Compile Error
Message: <console>:21: error: value dropna is not a member of org.apache.spark.sql.DataFrame
       val playersd = players.drop("id","player_fifa_api_id").dropna()
                                                              ^
StackTrace: 

In [27]:
val playerAttributesd = playerAttributes.drop(
"id",
"player_fifa_api_id",
"preferred_foot",
"attacking_work_rate",
"defensive_work_rate",
"crossing",
"crossing",
"jumping","aggression","balance","sprint_speed","potential","short_passing")

In [32]:
val playerAttributesdna = playerAttributesd.na.drop("all")
val playersna = playersd.na.drop("all")

In [33]:
playerAttributesdna.show(4)

+-------------+-------------------+--------------+---------+----------------+-------+---------+-----+------------------+------------+------------+------------+-------+---------+----------+-------+--------+----------+-------------+-----------+------+---------+-------+---------------+--------------+---------+-----------+----------+--------------+-----------+
|player_api_id|               date|overall_rating|finishing|heading_accuracy|volleys|dribbling|curve|free_kick_accuracy|long_passing|ball_control|acceleration|agility|reactions|shot_power|stamina|strength|long_shots|interceptions|positioning|vision|penalties|marking|standing_tackle|sliding_tackle|gk_diving|gk_handling|gk_kicking|gk_positioning|gk_reflexes|
+-------------+-------------------+--------------+---------+----------------+-------+---------+-----+------------------+------------+------------+------------+-------+---------+----------+-------+--------+----------+-------------+-----------+------+---------+-------+---------------

In [53]:
val sp: String => Int = _.split("-")(0).toInt

val yearExtractUDF = udf(sp)

In [54]:
val playerAttributesdnay = playerAttributesdna.withColumn("year",yearExtractUDF(col("date"))).drop("date")

In [55]:
playerAttributesdnay.show(4)

+-------------+--------------+---------+----------------+-------+---------+-----+------------------+------------+------------+------------+-------+---------+----------+-------+--------+----------+-------------+-----------+------+---------+-------+---------------+--------------+---------+-----------+----------+--------------+-----------+----+
|player_api_id|overall_rating|finishing|heading_accuracy|volleys|dribbling|curve|free_kick_accuracy|long_passing|ball_control|acceleration|agility|reactions|shot_power|stamina|strength|long_shots|interceptions|positioning|vision|penalties|marking|standing_tackle|sliding_tackle|gk_diving|gk_handling|gk_kicking|gk_positioning|gk_reflexes|year|
+-------------+--------------+---------+----------------+-------+---------+-----+------------------+------------+------------+------------+-------+---------+----------+-------+--------+----------+-------------+-----------+------+---------+-------+---------------+--------------+---------+-----------+----------+-

In [59]:
val pa2016 = playerAttributesdnay.where(col("year") ===2016)

In [60]:
pa2016.count()

14103

In [61]:
pa2016.select("player_api_id").distinct().count()

5586

In [62]:
val paStriker2016 = pa2016.groupBy("player_api_id").
                            agg(avg("finishing"), avg("shot_power"),avg("acceleration"))

In [63]:
paStriker2016.count()

5586

In [64]:
paStriker2016.show(5)

+-------------+-----------------+---------------+-----------------+
|player_api_id|   avg(finishing)|avg(shot_power)|avg(acceleration)|
+-------------+-----------------+---------------+-----------------+
|       309726|75.44444444444444|           76.0|74.11111111111111|
|        26112|             53.0|           76.0|             51.0|
|        38433|            68.25|           74.0|             74.0|
|       295060|             25.0|           40.0|             62.0|
|       161396|             29.0|           69.0|             72.0|
+-------------+-----------------+---------------+-----------------+
only showing top 5 rows



In [75]:
val weight_f : Int = 1
val weight_s : Int = 2
val weight_a  : Int = 1
val totalWeight : Int = weight_f + weight_s + weight_a
totalWeight

4

In [83]:
val strikers = paStriker2016.withColumn("strikerGrade", col("avg(finishing)") +
                                        col("avg(shot_power)") * weight_s +
                                        col("avg(acceleration)") * weight_a / totalWeight)

In [88]:
val strikera = strikers.drop("avg(finishing)","avg(shot_power)","avg(acceleration)")

In [89]:
strikera.show

+-------------+------------------+
|player_api_id|      strikerGrade|
+-------------+------------------+
|       309726|245.97222222222223|
|        26112|            217.75|
|        38433|            234.75|
|       295060|             120.5|
|       161396|             185.0|
|        37774|             213.0|
|        41157|            262.75|
|        40740|           226.375|
|        31432|            158.75|
|       109653|            245.25|
|       282680|             68.25|
|       210428|            146.25|
|       190851|             241.5|
|       419238|148.00000000000003|
|       664587|             198.5|
|       239352|            211.75|
|       190801|          191.9375|
|       196957|236.33333333333334|
|       173922|             216.5|
|       121080|             132.0|
+-------------+------------------+
only showing top 20 rows

