<a href="https://colab.research.google.com/github/PBuenoc/f1ProjectInGoogleColab/blob/main/F1ProjectInColab.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Prepare spark environment

In [None]:
!apt-get install openjdk-8-jdk-headless -qq > /dev/null
!wget -q https://archive.apache.org/dist/spark/spark-3.1.2/spark-3.1.2-bin-hadoop2.7.tgz
!tar xf spark-3.1.2-bin-hadoop2.7.tgz
!pip install -q findspark

In [None]:
import os
os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-8-openjdk-amd64"
os.environ["SPARK_HOME"] = "/content/spark-3.1.2-bin-hadoop2.7"

In [123]:
import findspark
findspark.init()

In [124]:
from pyspark.sql import SparkSession
spark = SparkSession.builder.master('local[*]').appName("Iniciando com Spark").getOrCreate()

# Ingest cirtuits.csv file

In [125]:
circuits_df = spark.read.csv('/content/formula1/data/raw/circuits2.csv',inferSchema=True, header=True, sep=';')

In [126]:
circuits_df.show()

+---------+--------------+--------------------+------------+---------+-------+-------+---+--------------------+
|circuitId|    circuitRef|                name|    location|  country|    lat|    lng|alt|                 url|
+---------+--------------+--------------------+------------+---------+-------+-------+---+--------------------+
|        1|   albert_park|Albert Park Grand...|   Melbourne|Australia|-378497| 144968| 10|http://en.wikiped...|
|        2|        sepang|Sepang Internatio...|Kuala Lumpur| Malaysia| 276083| 101738| 18|http://en.wikiped...|
|        3|       bahrain|Bahrain Internati...|      Sakhir|  Bahrain| 260325| 505106|  7|http://en.wikiped...|
|        4|     catalunya|Circuit de Barcel...|    Montmeló|    Spain|   4157| 226111|109|http://en.wikiped...|
|        5|      istanbul|       Istanbul Park|    Istanbul|   Turkey| 409517|  29405|130|http://en.wikiped...|
|        6|        monaco|   Circuit de Monaco| Monte-Carlo|   Monaco| 437347| 742056|  7|http://en.wiki

### Select only the columns required

In [127]:
circuits_df = circuits_df.select('circuitId', 'circuitRef', 'name', 'location', 'country', 'lat', 'lng', 'alt')
circuits_df.show()

+---------+--------------+--------------------+------------+---------+-------+-------+---+
|circuitId|    circuitRef|                name|    location|  country|    lat|    lng|alt|
+---------+--------------+--------------------+------------+---------+-------+-------+---+
|        1|   albert_park|Albert Park Grand...|   Melbourne|Australia|-378497| 144968| 10|
|        2|        sepang|Sepang Internatio...|Kuala Lumpur| Malaysia| 276083| 101738| 18|
|        3|       bahrain|Bahrain Internati...|      Sakhir|  Bahrain| 260325| 505106|  7|
|        4|     catalunya|Circuit de Barcel...|    Montmeló|    Spain|   4157| 226111|109|
|        5|      istanbul|       Istanbul Park|    Istanbul|   Turkey| 409517|  29405|130|
|        6|        monaco|   Circuit de Monaco| Monte-Carlo|   Monaco| 437347| 742056|  7|
|        7|    villeneuve|Circuit Gilles Vi...|    Montreal|   Canada|    455|-735228| 13|
|        8|   magny_cours|Circuit de Nevers...| Magny Cours|   France| 468642| 316361|228|

### Renamed the columns as required

In [128]:
circuits_df = circuits_df.withColumnRenamed('circuitId', 'circuit_id') \
.withColumnRenamed('circuitRef', 'circuit_ref') \
.withColumnRenamed('lat','latitude') \
.withColumnRenamed('lng','longitude') \
.withColumnRenamed('alt','altitude')
circuits_df.show()

+----------+--------------+--------------------+------------+---------+--------+---------+--------+
|circuit_id|   circuit_ref|                name|    location|  country|latitude|longitude|altitude|
+----------+--------------+--------------------+------------+---------+--------+---------+--------+
|         1|   albert_park|Albert Park Grand...|   Melbourne|Australia| -378497|   144968|      10|
|         2|        sepang|Sepang Internatio...|Kuala Lumpur| Malaysia|  276083|   101738|      18|
|         3|       bahrain|Bahrain Internati...|      Sakhir|  Bahrain|  260325|   505106|       7|
|         4|     catalunya|Circuit de Barcel...|    Montmeló|    Spain|    4157|   226111|     109|
|         5|      istanbul|       Istanbul Park|    Istanbul|   Turkey|  409517|    29405|     130|
|         6|        monaco|   Circuit de Monaco| Monte-Carlo|   Monaco|  437347|   742056|       7|
|         7|    villeneuve|Circuit Gilles Vi...|    Montreal|   Canada|     455|  -735228|      13|


### Add ingestion_date column

In [129]:
from pyspark.sql.functions import current_timestamp

In [130]:
circuits_df = circuits_df.withColumn('ingestion_date', current_timestamp())
circuits_df.show()

+----------+--------------+--------------------+------------+---------+--------+---------+--------+--------------------+
|circuit_id|   circuit_ref|                name|    location|  country|latitude|longitude|altitude|      ingestion_date|
+----------+--------------+--------------------+------------+---------+--------+---------+--------+--------------------+
|         1|   albert_park|Albert Park Grand...|   Melbourne|Australia| -378497|   144968|      10|2023-03-15 20:48:...|
|         2|        sepang|Sepang Internatio...|Kuala Lumpur| Malaysia|  276083|   101738|      18|2023-03-15 20:48:...|
|         3|       bahrain|Bahrain Internati...|      Sakhir|  Bahrain|  260325|   505106|       7|2023-03-15 20:48:...|
|         4|     catalunya|Circuit de Barcel...|    Montmeló|    Spain|    4157|   226111|     109|2023-03-15 20:48:...|
|         5|      istanbul|       Istanbul Park|    Istanbul|   Turkey|  409517|    29405|     130|2023-03-15 20:48:...|
|         6|        monaco|   Ci

In [131]:
circuits_df.printSchema()

root
 |-- circuit_id: integer (nullable = true)
 |-- circuit_ref: string (nullable = true)
 |-- name: string (nullable = true)
 |-- location: string (nullable = true)
 |-- country: string (nullable = true)
 |-- latitude: integer (nullable = true)
 |-- longitude: integer (nullable = true)
 |-- altitude: integer (nullable = true)
 |-- ingestion_date: timestamp (nullable = false)



### Specify the types as required

In [132]:
circuits_df = circuits_df.withColumn('latitude', circuits_df['latitude'].cast('double')) \
.withColumn('longitude', circuits_df['longitude'].cast('double')) \
.withColumn('altitude', circuits_df['altitude'].cast('integer'))
circuits_df.printSchema()

root
 |-- circuit_id: integer (nullable = true)
 |-- circuit_ref: string (nullable = true)
 |-- name: string (nullable = true)
 |-- location: string (nullable = true)
 |-- country: string (nullable = true)
 |-- latitude: double (nullable = true)
 |-- longitude: double (nullable = true)
 |-- altitude: integer (nullable = true)
 |-- ingestion_date: timestamp (nullable = false)



### Write the data in parquet format on processed folder

In [133]:
circuits_df.write.mode('overwrite').parquet('/content/formula1/data/processed/cirtuits')

In [134]:
spark.read.parquet('/content/formula1/data/processed/cirtuits').show()

+----------+--------------+--------------------+------------+---------+---------+---------+--------+--------------------+
|circuit_id|   circuit_ref|                name|    location|  country| latitude|longitude|altitude|      ingestion_date|
+----------+--------------+--------------------+------------+---------+---------+---------+--------+--------------------+
|         1|   albert_park|Albert Park Grand...|   Melbourne|Australia|-378497.0| 144968.0|      10|2023-03-15 20:48:...|
|         2|        sepang|Sepang Internatio...|Kuala Lumpur| Malaysia| 276083.0| 101738.0|      18|2023-03-15 20:48:...|
|         3|       bahrain|Bahrain Internati...|      Sakhir|  Bahrain| 260325.0| 505106.0|       7|2023-03-15 20:48:...|
|         4|     catalunya|Circuit de Barcel...|    Montmeló|    Spain|   4157.0| 226111.0|     109|2023-03-15 20:48:...|
|         5|      istanbul|       Istanbul Park|    Istanbul|   Turkey| 409517.0|  29405.0|     130|2023-03-15 20:48:...|
|         6|        mona

In [135]:
spark.read.parquet('/content/formula1/data/processed/cirtuits').printSchema()

root
 |-- circuit_id: integer (nullable = true)
 |-- circuit_ref: string (nullable = true)
 |-- name: string (nullable = true)
 |-- location: string (nullable = true)
 |-- country: string (nullable = true)
 |-- latitude: double (nullable = true)
 |-- longitude: double (nullable = true)
 |-- altitude: integer (nullable = true)
 |-- ingestion_date: timestamp (nullable = true)



# Ingest races.csv

In [136]:
races_df = spark.read.csv('/content/formula1/data/raw/races2.csv', header=True, inferSchema=True, sep=';')
races_df.show()

+------+----+-----+---------+--------------------+----------+--------+--------------------+
|raceId|year|round|circuitId|                name|      date|    time|                 url|
+------+----+-----+---------+--------------------+----------+--------+--------------------+
|     1|2009|    1|        1|Australian Grand ...|29/03/2009|06:00:00|http://en.wikiped...|
|     2|2009|    2|        2|Malaysian Grand Prix|05/04/2009|09:00:00|http://en.wikiped...|
|     3|2009|    3|       17|  Chinese Grand Prix|19/04/2009|07:00:00|http://en.wikiped...|
|     4|2009|    4|        3|  Bahrain Grand Prix|26/04/2009|12:00:00|http://en.wikiped...|
|     5|2009|    5|        4|  Spanish Grand Prix|10/05/2009|12:00:00|http://en.wikiped...|
|     6|2009|    6|        6|   Monaco Grand Prix|24/05/2009|12:00:00|http://en.wikiped...|
|     7|2009|    7|        5|  Turkish Grand Prix|07/06/2009|12:00:00|http://en.wikiped...|
|     8|2009|    8|        9|  British Grand Prix|21/06/2009|12:00:00|http://en.

### Add the required columns

In [137]:
from pyspark.sql.functions import col, concat, lit

In [138]:
races_df = races_df.withColumn('ingestion_date', current_timestamp()) \
.withColumn('race_timestamp',concat(col('date'), lit(' '), col('time')))
races_df.show()

+------+----+-----+---------+--------------------+----------+--------+--------------------+--------------------+-------------------+
|raceId|year|round|circuitId|                name|      date|    time|                 url|      ingestion_date|     race_timestamp|
+------+----+-----+---------+--------------------+----------+--------+--------------------+--------------------+-------------------+
|     1|2009|    1|        1|Australian Grand ...|29/03/2009|06:00:00|http://en.wikiped...|2023-03-15 20:48:...|29/03/2009 06:00:00|
|     2|2009|    2|        2|Malaysian Grand Prix|05/04/2009|09:00:00|http://en.wikiped...|2023-03-15 20:48:...|05/04/2009 09:00:00|
|     3|2009|    3|       17|  Chinese Grand Prix|19/04/2009|07:00:00|http://en.wikiped...|2023-03-15 20:48:...|19/04/2009 07:00:00|
|     4|2009|    4|        3|  Bahrain Grand Prix|26/04/2009|12:00:00|http://en.wikiped...|2023-03-15 20:48:...|26/04/2009 12:00:00|
|     5|2009|    5|        4|  Spanish Grand Prix|10/05/2009|12:00:00

### Select only the required columns

In [139]:
races_df = races_df.select(col('raceId').alias('race_id'),
                           col('year').alias('race_year'), 
                           col('round'), 
                           col('circuitId').alias('circuit_id'),
                           col('name'),
                           col('ingestion_date'),
                           col('race_timestamp'))

In [140]:
races_df.show()

+-------+---------+-----+----------+--------------------+--------------------+-------------------+
|race_id|race_year|round|circuit_id|                name|      ingestion_date|     race_timestamp|
+-------+---------+-----+----------+--------------------+--------------------+-------------------+
|      1|     2009|    1|         1|Australian Grand ...|2023-03-15 20:49:...|29/03/2009 06:00:00|
|      2|     2009|    2|         2|Malaysian Grand Prix|2023-03-15 20:49:...|05/04/2009 09:00:00|
|      3|     2009|    3|        17|  Chinese Grand Prix|2023-03-15 20:49:...|19/04/2009 07:00:00|
|      4|     2009|    4|         3|  Bahrain Grand Prix|2023-03-15 20:49:...|26/04/2009 12:00:00|
|      5|     2009|    5|         4|  Spanish Grand Prix|2023-03-15 20:49:...|10/05/2009 12:00:00|
|      6|     2009|    6|         6|   Monaco Grand Prix|2023-03-15 20:49:...|24/05/2009 12:00:00|
|      7|     2009|    7|         5|  Turkish Grand Prix|2023-03-15 20:49:...|07/06/2009 12:00:00|
|      8| 

In [141]:
races_df.printSchema()

root
 |-- race_id: integer (nullable = true)
 |-- race_year: integer (nullable = true)
 |-- round: integer (nullable = true)
 |-- circuit_id: integer (nullable = true)
 |-- name: string (nullable = true)
 |-- ingestion_date: timestamp (nullable = false)
 |-- race_timestamp: string (nullable = true)



### Write the data in parquet format on processed folder with partitionBy

In [143]:
races_df.write.mode('overwrite').partitionBy('race_year').parquet('/content/formula1/data/processed/races')

In [144]:
races_df.show()

+-------+---------+-----+----------+--------------------+--------------------+-------------------+
|race_id|race_year|round|circuit_id|                name|      ingestion_date|     race_timestamp|
+-------+---------+-----+----------+--------------------+--------------------+-------------------+
|      1|     2009|    1|         1|Australian Grand ...|2023-03-15 20:49:...|29/03/2009 06:00:00|
|      2|     2009|    2|         2|Malaysian Grand Prix|2023-03-15 20:49:...|05/04/2009 09:00:00|
|      3|     2009|    3|        17|  Chinese Grand Prix|2023-03-15 20:49:...|19/04/2009 07:00:00|
|      4|     2009|    4|         3|  Bahrain Grand Prix|2023-03-15 20:49:...|26/04/2009 12:00:00|
|      5|     2009|    5|         4|  Spanish Grand Prix|2023-03-15 20:49:...|10/05/2009 12:00:00|
|      6|     2009|    6|         6|   Monaco Grand Prix|2023-03-15 20:49:...|24/05/2009 12:00:00|
|      7|     2009|    7|         5|  Turkish Grand Prix|2023-03-15 20:49:...|07/06/2009 12:00:00|
|      8| 

In [145]:
spark.read.parquet('/content/formula1/data/processed/races').show()

+-------+-----+----------+--------------------+--------------------+-------------------+---------+
|race_id|round|circuit_id|                name|      ingestion_date|     race_timestamp|race_year|
+-------+-----+----------+--------------------+--------------------+-------------------+---------+
|   1053|    2|        21|Emilia Romagna Gr...|2023-03-15 20:49:...|18/04/2021 13:00:00|     2021|
|   1052|    1|         3|  Bahrain Grand Prix|2023-03-15 20:49:...|28/03/2021 15:00:00|     2021|
|   1051|   21|         1|Australian Grand ...|2023-03-15 20:49:...|21/11/2021 06:00:00|     2021|
|   1054|    3|        20|                 TBC|2023-03-15 20:49:...|      02/05/2021 \N|     2021|
|   1055|    4|         4|  Spanish Grand Prix|2023-03-15 20:49:...|09/05/2021 13:00:00|     2021|
|   1056|    5|         6|   Monaco Grand Prix|2023-03-15 20:49:...|23/05/2021 13:00:00|     2021|
|   1057|    6|        73|Azerbaijan Grand ...|2023-03-15 20:49:...|06/06/2021 12:00:00|     2021|
|   1058| 

# Ingest constructors.json

In [147]:
constructors_schema = "constructorId INT, constructorRef STRING, name STRING, nationality STRING, url STRING"

In [150]:
constructors_df = spark.read.json('/content/formula1/data/raw/constructors.json', schema=constructors_schema)
constructors_df.show()

+-------------+--------------+-----------+-----------+--------------------+
|constructorId|constructorRef|       name|nationality|                 url|
+-------------+--------------+-----------+-----------+--------------------+
|            1|       mclaren|    McLaren|    British|http://en.wikiped...|
|            2|    bmw_sauber| BMW Sauber|     German|http://en.wikiped...|
|            3|      williams|   Williams|    British|http://en.wikiped...|
|            4|       renault|    Renault|     French|http://en.wikiped...|
|            5|    toro_rosso| Toro Rosso|    Italian|http://en.wikiped...|
|            6|       ferrari|    Ferrari|    Italian|http://en.wikiped...|
|            7|        toyota|     Toyota|   Japanese|http://en.wikiped...|
|            8|   super_aguri|Super Aguri|   Japanese|http://en.wikiped...|
|            9|      red_bull|   Red Bull|   Austrian|http://en.wikiped...|
|           10|   force_india|Force India|     Indian|http://en.wikiped...|
|           

In [152]:
print('hello')

hello
