## Create spark context

In [1]:
# prompt: mount drive

from google.colab import drive
drive.mount('/content/drive')


Mounted at /content/drive


In [2]:
#Entrypoint 2.x
from pyspark.sql import SparkSession
from pyspark.sql.types import *
from pyspark.sql.functions import *
spark = SparkSession.builder.config("spark.sql.shuffle.partitions", "2").appName("InjestionProcessing").master("local[2]").getOrCreate()

# On yarn:
# spark = SparkSession.builder.appName("Spark SQL basic example").enableHiveSupport().master("yarn").getOrCreate()
# specify .master("yarn")

sc = spark.sparkContext

## Read files

In [4]:
processed_folder_path = "/content/drive/MyDrive/DBDA_PROJECT/Formula1DataAnalytics-main/Main/processed_folder_path"
results = spark.read.option("header", True).parquet(f"{processed_folder_path}/results")
races = spark.read.option("header", True).parquet(f"{processed_folder_path}/races")
drivers = spark.read.option("header", True).parquet(f"{processed_folder_path}/drivers")
constructors = spark.read.option("header", True).parquet(f"{processed_folder_path}/constructors")

In [5]:
results.show(2)

+---------+-------+---------+--------------+------+----+--------+-------------+--------------+------+----+-----------+------------+-----------+----+----------------+-----------------+
|result_id|race_id|driver_id|constructor_id|number|grid|position|position_text|position_order|points|laps|       time|milliseconds|fastest_lap|rank|fastest_lap_time|fastest_lap_speed|
+---------+-------+---------+--------------+------+----+--------+-------------+--------------+------+----+-----------+------------+-----------+----+----------------+-----------------+
|        1|     18|        1|             1|    22|   1|       1|            1|             1|  10.0|  58|1:34:50.616|     5690616|         39|   2|        1:27.452|            218.3|
|        2|     18|        2|             2|     3|   5|       2|            2|             2|   8.0|  58|     +5.478|     5696094|         41|   3|        1:27.739|          217.586|
+---------+-------+---------+--------------+------+----+--------+-------------+-

In [6]:
races.show(2)

+-------+---------+-----+----------+--------------------+
|race_id|race_year|round|circuit_id|                name|
+-------+---------+-----+----------+--------------------+
|      1|     2009|    1|         1|Australian Grand ...|
|      2|     2009|    2|         2|Malaysian Grand Prix|
+-------+---------+-----+----------+--------------------+
only showing top 2 rows



In [7]:
drivers.show(2)

+---------+----------+------+----+--------+--------+----------+-----------+--------------+
|driver_id|driver_ref|number|code|forename| surname|       dob|nationality|          name|
+---------+----------+------+----+--------+--------+----------+-----------+--------------+
|        1|  hamilton|    44| HAM|   Lewis|Hamilton|1985-01-07|    British|Lewis Hamilton|
|        2|  heidfeld|    \N| HEI|    Nick|Heidfeld|1977-05-10|     German| Nick Heidfeld|
+---------+----------+------+----+--------+--------+----------+-----------+--------------+
only showing top 2 rows



In [8]:
constructors.show(2)

+--------------+---------------+----------+-----------+--------------------+
|constructor_id|constructor_ref|      name|nationality|                 url|
+--------------+---------------+----------+-----------+--------------------+
|             1|        mclaren|   McLaren|    British|http://en.wikiped...|
|             2|     bmw_sauber|BMW Sauber|     German|http://en.wikiped...|
+--------------+---------------+----------+-----------+--------------------+
only showing top 2 rows



In [9]:
results.createOrReplaceTempView("results")
races.createOrReplaceTempView("races")
drivers.createOrReplaceTempView("drivers")
constructors.createOrReplaceTempView("constructors")

In [10]:
query = """SELECT races.race_year,
constructors.name,
drivers.name,
results.position,
results.points
FROM results
JOIN drivers ON (results.driver_id = drivers.driver_id)
JOIN constructors ON (results.constructor_id = constructors.constructor_id)
JOIN races ON (results.race_id = races.race_id)
"""

In [11]:
spark.sql(query).show(10)

+---------+----------+------------------+--------+------+
|race_year|      name|              name|position|points|
+---------+----------+------------------+--------+------+
|     2008|   McLaren|    Lewis Hamilton|       1|  10.0|
|     2008|BMW Sauber|     Nick Heidfeld|       2|   8.0|
|     2008|  Williams|      Nico Rosberg|       3|   6.0|
|     2008|   Renault|   Fernando Alonso|       4|   5.0|
|     2008|   McLaren| Heikki Kovalainen|       5|   4.0|
|     2008|  Williams|   Kazuki Nakajima|       6|   3.0|
|     2008|Toro Rosso|Sébastien Bourdais|       7|   2.0|
|     2008|   Ferrari|    Kimi Räikkönen|       8|   1.0|
|     2008|BMW Sauber|     Robert Kubica|    NULL|   0.0|
|     2008|    Toyota|        Timo Glock|    NULL|   0.0|
+---------+----------+------------------+--------+------+
only showing top 10 rows



In [12]:
query = """
SELECT races.race_year,
    constructors.name AS team_name,
    drivers.name AS driver_name,
    results.position,
    results.points,
    11 - results.position AS calculated_points
FROM results
JOIN drivers ON (results.driver_id = drivers.driver_id)
JOIN constructors ON (results.constructor_id = constructors.constructor_id)
JOIN races ON (results.race_id = races.race_id)
WHERE results.position <= 10
"""

In [13]:
calculated_race_results = spark.sql(query)

In [14]:
calculated_race_results.show(2)

+---------+----------+--------------+--------+------+-----------------+
|race_year| team_name|   driver_name|position|points|calculated_points|
+---------+----------+--------------+--------+------+-----------------+
|     2008|   McLaren|Lewis Hamilton|       1|  10.0|               10|
|     2008|BMW Sauber| Nick Heidfeld|       2|   8.0|                9|
+---------+----------+--------------+--------+------+-----------------+
only showing top 2 rows



In [16]:
calculated_race_results.write.mode('overwrite').parquet(f"{processed_folder_path}/calculated_race_results")

In [17]:
!ls /content/drive/MyDrive/DBDA_PROJECT/Formula1DataAnalytics-main/Main/processed_folder_path/calculated_race_results


part-00000-18bee818-93c6-4c78-8e2e-86b32de4c510-c000.snappy.parquet  _SUCCESS
