## Spark conf

In [1]:
from pyspark.sql import  SparkSession
from pyspark.sql import functions as F
from pyspark import SparkConf

In [20]:
from pyspark.sql import Window

In [2]:
from pyspark.sql.types import StructType, StructField, StringType, IntegerType

In [3]:
sparkConf = SparkConf()
sparkConf.setAll(
    [
        ("spark.app.name", "olmypics spark app"),
        ("spark.master", "local[3]")
    ]
)

<pyspark.conf.SparkConf at 0x230bd1f2f30>

In [4]:
spark = SparkSession.builder.config(conf=sparkConf).getOrCreate()

## Loading data

In [5]:
athletesSchema = StructType(
    [
        StructField("id", IntegerType(), False),
        StructField("name", StringType(), False),
        StructField("sex", StringType(), False),
        StructField("height", IntegerType(), True),
        StructField("weight", IntegerType(), True),
        StructField("team", StringType(), False)
    ]
)
# id,name,sex,height,weight,team

In [6]:
athletes_df = spark.read.format("csv").option("header", True).schema(athletesSchema).option("enforceSchema", True).option("path", r"D:\SQL telegram\Project 2 olympic history\athletes.csv").load()

In [7]:
athletes_df.show(truncate=False)

+---+--------------------------------------+---+------+------+--------------+
|id |name                                  |sex|height|weight|team          |
+---+--------------------------------------+---+------+------+--------------+
|1  |A Dijiang                             |M  |180   |80    |China         |
|2  |A Lamusi                              |M  |170   |60    |China         |
|3  |Gunnar Nielsen Aaby                   |M  |NULL  |NULL  |Denmark       |
|4  |Edgar Lindenau Aabye                  |M  |NULL  |NULL  |Denmark/Sweden|
|5  |Christine Jacoba Aaftink              |F  |185   |82    |Netherlands   |
|6  |Per Knut Aaland                       |M  |188   |75    |United States |
|7  |John Aalberg                          |M  |183   |72    |United States |
|8  |"Cornelia ""Cor"" Aalten (-Strannood)"|F  |168   |NULL  |Netherlands   |
|9  |Antti Sami Aalto                      |M  |186   |96    |Finland       |
|10 |"Einar Ferdinand ""Einari"" Aalto"    |M  |NULL  |NULL  |Fi

In [8]:
print(athletes_df.printSchema())
athletes_df.columns

root
 |-- id: integer (nullable = true)
 |-- name: string (nullable = true)
 |-- sex: string (nullable = true)
 |-- height: integer (nullable = true)
 |-- weight: integer (nullable = true)
 |-- team: string (nullable = true)

None


['id', 'name', 'sex', 'height', 'weight', 'team']

In [9]:
for col in athletes_df.columns:
    print("============================================================================\n")
    print(athletes_df.select(col).agg(F.count(col).alias("total_"+col+"s")).show(), type(athletes_df.groupBy(F.col(col)).count()))


+---------+
|total_ids|
+---------+
|   135571|
+---------+

None <class 'pyspark.sql.dataframe.DataFrame'>

+-----------+
|total_names|
+-----------+
|     135571|
+-----------+

None <class 'pyspark.sql.dataframe.DataFrame'>

+----------+
|total_sexs|
+----------+
|    135571|
+----------+

None <class 'pyspark.sql.dataframe.DataFrame'>

+-------------+
|total_heights|
+-------------+
|       101394|
+-------------+

None <class 'pyspark.sql.dataframe.DataFrame'>

+-------------+
|total_weights|
+-------------+
|       100294|
+-------------+

None <class 'pyspark.sql.dataframe.DataFrame'>

+-----------+
|total_teams|
+-----------+
|     135476|
+-----------+

None <class 'pyspark.sql.dataframe.DataFrame'>


In [10]:
athletes_df.filter("team is null").orderBy("name", "id").show(10000, truncate=False)
# no team name, sex, height and only a few weight data - > remove these nan values

+------+--------------------------------------------------------+---------------+------+------+----+
|id    |name                                                    |sex            |height|weight|team|
+------+--------------------------------------------------------+---------------+------+------+----+
|127653|"Alena ""ja"" Vrzov (-Zanov                             | -Steindler)"  |NULL  |NULL  |NULL|
|42414 |"Alexander ""Alex"" Grant                               | Jr."          |NULL  |NULL  |NULL|
|123231|"Alvin Edmund ""Al"" Ulbrickson                         | Jr."          |NULL  |NULL  |NULL|
|100754|"Anthony ""Tony"" Ries                                  | Jr."          |NULL  |NULL  |NULL|
|89805 |"Arnold Carl ""Arnie"" Oss                              | Jr."          |NULL  |NULL  |NULL|
|58337 |"Arthur Hayward ""Art"" Keay                            | Jr."          |NULL  |NULL  |NULL|
|71240 |"Arthur Matthew ""Art"" Longsjo                         | Jr."          |NULL  |NUL

In [11]:
athletes_processed_df = athletes_df.filter("team is not null")

In [12]:
athletes_processed_df.show(truncate=False)

+---+--------------------------------------+---+------+------+--------------+
|id |name                                  |sex|height|weight|team          |
+---+--------------------------------------+---+------+------+--------------+
|1  |A Dijiang                             |M  |180   |80    |China         |
|2  |A Lamusi                              |M  |170   |60    |China         |
|3  |Gunnar Nielsen Aaby                   |M  |NULL  |NULL  |Denmark       |
|4  |Edgar Lindenau Aabye                  |M  |NULL  |NULL  |Denmark/Sweden|
|5  |Christine Jacoba Aaftink              |F  |185   |82    |Netherlands   |
|6  |Per Knut Aaland                       |M  |188   |75    |United States |
|7  |John Aalberg                          |M  |183   |72    |United States |
|8  |"Cornelia ""Cor"" Aalten (-Strannood)"|F  |168   |NULL  |Netherlands   |
|9  |Antti Sami Aalto                      |M  |186   |96    |Finland       |
|10 |"Einar Ferdinand ""Einari"" Aalto"    |M  |NULL  |NULL  |Fi

In [13]:
athlete_events_df = spark.read.format("csv").option("header", True).option("inferSchema", True).option("path", r"D:\SQL telegram\Project 2 olympic history\athlete_events.csv").load()

In [14]:
athlete_events_df.printSchema()

root
 |-- athlete_id: integer (nullable = true)
 |-- games: string (nullable = true)
 |-- year: integer (nullable = true)
 |-- season: string (nullable = true)
 |-- city: string (nullable = true)
 |-- sport: string (nullable = true)
 |-- event: string (nullable = true)
 |-- medal: string (nullable = true)



In [15]:
for col in athlete_events_df.columns:
    print("=================================================\n")
    print(athlete_events_df.agg(F.count(F.col(col)).alias(col+"_count")).show())


+----------------+
|athlete_id_count|
+----------------+
|          271116|
+----------------+

None

+-----------+
|games_count|
+-----------+
|     271116|
+-----------+

None

+----------+
|year_count|
+----------+
|    271116|
+----------+

None

+------------+
|season_count|
+------------+
|      271116|
+------------+

None

+----------+
|city_count|
+----------+
|    271116|
+----------+

None

+-----------+
|sport_count|
+-----------+
|     271116|
+-----------+

None

+-----------+
|event_count|
+-----------+
|     271116|
+-----------+

None

+-----------+
|medal_count|
+-----------+
|     271116|
+-----------+

None


In [16]:
athlete_events_df.show()

+----------+-----------+----+------+-----------+--------------------+--------------------+-----+
|athlete_id|      games|year|season|       city|               sport|               event|medal|
+----------+-----------+----+------+-----------+--------------------+--------------------+-----+
|         1|1992 Summer|1992|Summer|  Barcelona|          Basketball|Basketball Men's ...|   NA|
|         2|2012 Summer|2012|Summer|     London|                Judo|Judo Men's Extra-...|   NA|
|         3|1920 Summer|1920|Summer|  Antwerpen|            Football|Football Men's Fo...|   NA|
|         4|1900 Summer|1900|Summer|      Paris|          Tug-Of-War|Tug-Of-War Men's ...| Gold|
|         5|1988 Winter|1988|Winter|    Calgary|       Speed Skating|Speed Skating Wom...|   NA|
|         5|1988 Winter|1988|Winter|    Calgary|       Speed Skating|Speed Skating Wom...|   NA|
|         5|1992 Winter|1992|Winter|Albertville|       Speed Skating|Speed Skating Wom...|   NA|
|         5|1992 Winter|1992|W

## Processing

### which team has won the maximum gold medals over the years.

In [17]:
joined_athletes_and_events_df = athlete_events_df.join(athletes_processed_df, athletes_processed_df.id==athlete_events_df.athlete_id, "right")

In [37]:
joined_athletes_and_events_df.cache()

DataFrame[athlete_id: int, games: string, year: int, season: string, city: string, sport: string, event: string, medal: string, id: int, name: string, sex: string, height: int, weight: int, team: string]

In [18]:
joined_athletes_and_events_df.select("team", "medal").filter(F.col("medal") != "NA").groupBy("team").agg(F.count("medal").alias("medal_count")).orderBy(F.col("medal_count").desc()).limit(5).show()

+-------------+-----------+
|         team|medal_count|
+-------------+-----------+
|United States|       5131|
| Soviet Union|       2604|
|      Germany|       1887|
|Great Britain|       1698|
|       France|       1558|
+-------------+-----------+



In [30]:
print(athlete_events_df.rdd.getNumPartitions())
print(athletes_processed_df.rdd.getNumPartitions())
print(joined_athletes_and_events_df.rdd.getNumPartitions())

3
2
3


In [19]:
### for each team print total silver medals and year in which they won maximum silver medal..output 3 columns
# team,total_silver_medals, year_of_max_silver

In [21]:
year_of_max_silver_wind = Window.partitionBy("team").orderBy(F.desc("silver_medal_count"))

In [23]:
team_wise_silver_df = joined_athletes_and_events_df.select("team", "year", "medal")\
    .filter(F.col("medal") == "Silver")\
    .groupBy("team", "year")\
    .agg(F.count("medal").alias("silver_medal_count"))\
    .withColumn("year_of_max_silver", F.first_value("year").over(year_of_max_silver_wind))\
    .groupBy("team")\
    .agg(F.sum("silver_medal_count").alias("total_silver_medals"), F.min("year_of_max_silver").alias("year_of_max_silver"))

In [27]:
team_wise_silver_df.orderBy(F.desc("total_silver_medals")).show(truncate=False)

+--------------+-------------------+------------------+
|team          |total_silver_medals|year_of_max_silver|
+--------------+-------------------+------------------+
|United States |1507               |1984              |
|Soviet Union  |766                |1980              |
|Germany       |603                |1936              |
|Great Britain |590                |1908              |
|France        |524                |1920              |
|Italy         |507                |2016              |
|Sweden        |479                |1912              |
|Australia     |453                |2004              |
|Canada        |405                |1984              |
|China         |332                |1996              |
|Norway        |330                |1920              |
|Hungary       |329                |1972              |
|East Germany  |325                |1980              |
|Netherlands   |319                |2004              |
|Japan         |307                |2012        

In [39]:
# which player has won maximum gold medals  amongst the players 
# which have won only gold medal (never won silver or bronze) over the years

In [47]:
athlete_with_silver_or_bronze = athlete_events_df.where(F.col("medal").isin("Silver", "Bronze")).select("athlete_id")

In [None]:
result = athlete_with_silver_or_bronze.collect()
temp = set([result[i].asDict()["athlete_id"] for i in range(len(result))])

In [94]:
athlete_with_gold_only = athlete_events_df.select("athlete_id", "medal")\
    .where((F.col("medal") == "Gold") & (~F.col("athlete_id").isin(temp)))

In [97]:
athlete_with_gold_only.cache()

DataFrame[athlete_id: int, medal: string]

In [95]:
athlete_with_max_gold_over_years = athlete_with_gold_only.groupBy("athlete_id").agg(F.count("medal").alias("gold_count")).orderBy(F.desc("gold_count")).limit(3)

In [103]:
athlete_with_max_gold_over_years.show()

+----------+----------+
|athlete_id|gold_count|
+----------+----------+
|     33557|        10|
|     13029|         8|
|     84026|         6|
+----------+----------+



In [None]:
# in each year which player has won maximum gold medal . Write a query to print year,player name 
# and no of golds won in that year . In case of a tie print comma separated player names.


In [104]:
spark.stop()