<a href="https://colab.research.google.com/github/PBuenoc/f1ProjectInGoogleColab/blob/main/F1ProjectInColab.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Prepare the Spark environment

In [None]:
!apt-get install openjdk-8-jdk-headless -qq > /dev/null
!wget -q https://archive.apache.org/dist/spark/spark-3.1.2/spark-3.1.2-bin-hadoop2.7.tgz
!tar xf spark-3.1.2-bin-hadoop2.7.tgz
!pip install -q findspark

In [None]:
import os

# Tell findspark/PySpark where the JDK and the Spark distribution live.
os.environ.update({
    "JAVA_HOME": "/usr/lib/jvm/java-8-openjdk-amd64",
    "SPARK_HOME": "/content/spark-3.1.2-bin-hadoop2.7",
})

In [None]:
import findspark
# Locate the Spark installation (via SPARK_HOME) and make pyspark importable in this kernel.
findspark.init()

In [None]:
from pyspark.sql import SparkSession

# Build (or reuse) a local Spark session; local[*] uses every available core.
spark = (
    SparkSession.builder
    .master('local[*]')
    .appName("Iniciando com Spark")
    .getOrCreate()
)

# Ingest CSV files

## Ingest circuits.csv file

In [None]:
# Semicolon-delimited file with a header row; column types are inferred by Spark.
circuits_df = (
    spark.read
    .option('header', True)
    .option('inferSchema', True)
    .option('sep', ';')
    .csv('/content/formula1/data/raw/circuits2.csv')
)

In [None]:
# Preview the raw circuits rows to sanity-check the load.
circuits_df.show()

### Select only the columns required

In [None]:
# Columns kept from the raw file; everything else is discarded.
required_circuit_columns = [
    'circuitId', 'circuitRef', 'name', 'location',
    'country', 'lat', 'lng', 'alt',
]
circuits_df = circuits_df.select(*required_circuit_columns)
circuits_df.show()

### Rename the columns as required

In [None]:
# Source column -> snake_case / descriptive target name.
circuit_renames = {
    'circuitId': 'circuit_id',
    'circuitRef': 'circuit_ref',
    'lat': 'latitude',
    'lng': 'longitude',
    'alt': 'altitude',
}
for old_name, new_name in circuit_renames.items():
    circuits_df = circuits_df.withColumnRenamed(old_name, new_name)
circuits_df.show()

### Add ingestion_date column

In [None]:
from pyspark.sql.functions import current_timestamp

In [None]:
# Stamp every row with the time at which this ingestion query runs.
circuits_ingestion_ts = current_timestamp()
circuits_df = circuits_df.withColumn('ingestion_date', circuits_ingestion_ts)
circuits_df.show()

In [None]:
# Inspect the inferred column types before casting them explicitly.
circuits_df.printSchema()

### Specify the types as required

In [None]:
# Explicit target type for each geographic column.
circuit_column_types = {
    'latitude': 'double',
    'longitude': 'double',
    'altitude': 'integer',
}
for column_name, spark_type in circuit_column_types.items():
    circuits_df = circuits_df.withColumn(column_name, circuits_df[column_name].cast(spark_type))
circuits_df.printSchema()

### Write the data in Parquet format to the processed folder

In [None]:
# Overwrite mode replaces the output of any earlier run, so the cell can be re-executed.
circuits_df.write.parquet('/content/formula1/data/processed/circuits', mode='overwrite')

In [None]:
# Read the Parquet output back to verify the write.
spark.read.parquet('/content/formula1/data/processed/circuits').show()

In [None]:
# Confirm the column types survived the round trip through Parquet.
spark.read.parquet('/content/formula1/data/processed/circuits').printSchema()

## Ingest races.csv

In [None]:
# Semicolon-delimited file with a header row; column types are inferred by Spark.
races_df = (
    spark.read
    .option('header', True)
    .option('inferSchema', True)
    .option('sep', ';')
    .csv('/content/formula1/data/raw/races2.csv')
)
races_df.show()

### Add the required columns

In [None]:
from pyspark.sql.functions import col, concat, lit

In [None]:
# Join the date and time columns into a single "<date> <time>" value.
# NOTE(review): concat produces a string column, not a timestamp type — confirm whether
# a cast (e.g. to_timestamp) is wanted once the inferred types of date/time are known.
race_timestamp_expr = concat(col('date'), lit(' '), col('time'))
races_df = (
    races_df
    .withColumn('ingestion_date', current_timestamp())
    .withColumn('race_timestamp', race_timestamp_expr)
)
races_df.show()

### Select only the required columns

In [None]:
# (source column, output column) pairs, in output order; identical names are kept as-is.
race_column_map = [
    ('raceId', 'race_id'),
    ('year', 'race_year'),
    ('round', 'round'),
    ('circuitId', 'circuit_id'),
    ('name', 'name'),
    ('ingestion_date', 'ingestion_date'),
    ('race_timestamp', 'race_timestamp'),
]
races_df = races_df.select(
    *[col(src) if src == dst else col(src).alias(dst) for src, dst in race_column_map]
)

In [None]:
# Preview the selected and renamed race columns.
races_df.show()

In [None]:
# Check the resulting column types before writing.
races_df.printSchema()

### Write the data in Parquet format to the processed folder, partitioned by race year

In [None]:
# One sub-directory per race_year; overwrite replaces the output of any earlier run.
races_df.write.parquet(
    '/content/formula1/data/processed/races',
    mode='overwrite',
    partitionBy='race_year',
)

In [None]:
# Show the in-memory frame for comparison with the Parquet read-back below.
races_df.show()

In [None]:
# Read the partitioned Parquet output back to verify the write.
spark.read.parquet('/content/formula1/data/processed/races').show()

# Ingest JSON files

## Ingest constructors.json

In [None]:
# DDL-style schema string, assembled one column definition per line for readability.
constructors_schema = ", ".join([
    "constructorId INT",
    "constructorRef STRING",
    "name STRING",
    "nationality STRING",
    "url STRING",
])

In [None]:
# Apply the explicit schema instead of letting Spark infer one from the JSON.
constructors_df = (
    spark.read
    .schema(constructors_schema)
    .json('/content/formula1/data/raw/constructors.json')
)
constructors_df.show()

### Drop unwanted columns from the dataframe

In [None]:
# The url column is dropped from the processed data set.
constructors_df = constructors_df.drop('url')
constructors_df.show()

### Rename columns and add ingestion date

In [None]:
# camelCase source names -> snake_case, then stamp each row with the ingestion time.
constructor_renames = {
    'constructorId': 'constructor_id',
    'constructorRef': 'constructor_ref',
}
for old_name, new_name in constructor_renames.items():
    constructors_df = constructors_df.withColumnRenamed(old_name, new_name)
constructors_df = constructors_df.withColumn('ingestion_date', current_timestamp())
constructors_df.show()

In [None]:
# Overwrite mode replaces the output of any earlier run, so the cell can be re-executed.
constructors_df.write.parquet('/content/formula1/data/processed/constructors', mode='overwrite')

In [None]:
# Read the Parquet output back to verify the write.
spark.read.parquet('/content/formula1/data/processed/constructors').show()

## Ingest drivers.json - Nested JSON

In [None]:
from pyspark.sql.types import StructType, StructField, IntegerType, StringType, DateType

In [None]:
# Schema of the nested "name" object inside each driver record.
name_schema = StructType([
    StructField("forename", StringType(), True),
    StructField("surname", StringType(), True),
])

In [None]:
# Top-level driver record; "name" is a nested struct described by name_schema.
drivers_schema = StructType([
    StructField("driverId", IntegerType(), False),
    StructField("driverRef", StringType(), True),
    StructField("number", IntegerType(), True),
    StructField("code", StringType(), True),
    StructField("name", name_schema),
    StructField("dob", DateType(), True),
    StructField("nationality", StringType(), True),
    StructField("url", StringType(), True),
])

In [None]:
# Read the drivers file with the explicit (nested) schema.
drivers_df = spark.read.json(
    '/content/formula1/data/raw/drivers.json',
    schema=drivers_schema,
)

In [None]:
# Preview the raw driver rows, including the nested name struct.
drivers_df.show()

In [None]:
# Confirm the nested schema was applied as declared.
drivers_df.printSchema()

### Rename and add new columns

In [None]:
from pyspark.sql.functions import col, concat, lit

In [None]:
# Replace the nested name struct with a single "forename surname" string.
full_name_expr = concat(col("name.forename"), lit(" "), col("name.surname"))
drivers_df = (
    drivers_df
    .withColumnRenamed("driverId", "driver_id")
    .withColumnRenamed("driverRef", "driver_ref")
    .withColumn("name", full_name_expr)
)

In [242]:
from pyspark.sql.functions import current_timestamp

In [None]:
# Stamp every row with the time at which this ingestion query runs.
drivers_ingestion_ts = current_timestamp()
drivers_df = drivers_df.withColumn('ingestion_date', drivers_ingestion_ts)
drivers_df.show()

### Drop unwanted columns
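- `url`
- `name.forename`
- `name.surname`

(The nested `name.forename` and `name.surname` fields are already gone: the rename step replaced the `name` struct with the concatenated full name, so only `url` needs an explicit drop.)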

In [None]:
# Drop the url column and preview the result.
drivers_df = drivers_df.drop('url')
drivers_df.show()

In [None]:
# Check the final column set and types before writing.
drivers_df.printSchema()

### Write the output to the processed folder in Parquet format


In [None]:
# Overwrite mode replaces the output of any earlier run, so the cell can be re-executed.
drivers_df.write.parquet('/content/formula1/data/processed/drivers', mode='overwrite')

In [None]:
# Read the Parquet output back to verify the write.
spark.read.parquet('/content/formula1/data/processed/drivers').show()

## Ingest results.json

In [243]:
from pyspark.sql.types import StructType, StructField, IntegerType, StringType, FloatType

In [244]:
# (field name, Spark type, nullable) for every column of a results record, in file order.
results_field_specs = [
    ("resultId", IntegerType(), False),
    ("raceId", IntegerType(), True),
    ("driverId", IntegerType(), True),
    ("constructorId", IntegerType(), True),
    ("number", IntegerType(), True),
    ("grid", IntegerType(), True),
    ("position", IntegerType(), True),
    ("positionText", StringType(), True),
    ("positionOrder", IntegerType(), True),
    ("points", FloatType(), True),
    ("laps", IntegerType(), True),
    ("time", StringType(), True),
    ("milliseconds", IntegerType(), True),
    ("fastestLap", IntegerType(), True),
    ("rank", IntegerType(), True),
    ("fastestLapTime", StringType(), True),
    ("fastestLapSpeed", FloatType(), True),
    ("statusId", StringType(), True),
]
results_schema = StructType([
    StructField(field_name, field_type, is_nullable)
    for field_name, field_type, is_nullable in results_field_specs
])

In [245]:
# Read the results file with the explicit schema and preview it.
results_df = spark.read.json(
    '/content/formula1/data/raw/results.json',
    schema=results_schema,
)
results_df.show()

+--------+------+--------+-------------+------+----+--------+------------+-------------+------+----+-----------+------------+----------+----+--------------+---------------+--------+
|resultId|raceId|driverId|constructorId|number|grid|position|positionText|positionOrder|points|laps|       time|milliseconds|fastestLap|rank|fastestLapTime|fastestLapSpeed|statusId|
+--------+------+--------+-------------+------+----+--------+------------+-------------+------+----+-----------+------------+----------+----+--------------+---------------+--------+
|       1|    18|       1|            1|    22|   1|       1|           1|            1|  10.0|  58|1:34:50.616|     5690616|        39|   2|      1:27.452|          218.3|       1|
|       2|    18|       2|            2|     3|   5|       2|           2|            2|   8.0|  58|     +5.478|     5696094|        41|   3|      1:27.739|        217.586|       1|
|       3|    18|       3|            3|     7|   7|       3|           3|            3|  

### Drop, rename and add required columns

In [246]:
# statusId is dropped from the processed data set.
results_df = results_df.drop('statusId')

In [247]:
# camelCase source names -> snake_case target names.
result_renames = {
    'resultId': 'result_id',
    'raceId': 'race_id',
    'driverId': 'driver_id',
    'constructorId': 'constructor_id',
    'positionText': 'position_text',
    'positionOrder': 'position_order',
    'fastestLap': 'fastest_lap',
    'fastestLapTime': 'fastest_lap_time',
    'fastestLapSpeed': 'fastest_lap_speed',
}
for old_name, new_name in result_renames.items():
    results_df = results_df.withColumnRenamed(old_name, new_name)

In [251]:
from pyspark.sql.functions import current_timestamp

In [252]:
# Stamp every row with the time at which this ingestion query runs.
results_ingestion_ts = current_timestamp()
results_df = results_df.withColumn('ingestion_date', results_ingestion_ts)
results_df.show()

+---------+-------+---------+--------------+------+----+--------+-------------+--------------+------+----+-----------+------------+-----------+----+----------------+-----------------+--------------------+
|result_id|race_id|driver_id|constructor_id|number|grid|position|position_text|position_order|points|laps|       time|milliseconds|fastest_lap|rank|fastest_lap_time|fastest_lap_speed|      ingestion_date|
+---------+-------+---------+--------------+------+----+--------+-------------+--------------+------+----+-----------+------------+-----------+----+----------------+-----------------+--------------------+
|        1|     18|        1|             1|    22|   1|       1|            1|             1|  10.0|  58|1:34:50.616|     5690616|         39|   2|        1:27.452|            218.3|2023-03-16 01:43:...|
|        2|     18|        2|             2|     3|   5|       2|            2|             2|   8.0|  58|     +5.478|     5696094|         41|   3|        1:27.739|          217.5

### Write the output to the processed folder in Parquet format

In [253]:
# One sub-directory per race_id; overwrite replaces the output of any earlier run.
results_df.write.parquet(
    '/content/formula1/data/processed/results',
    mode='overwrite',
    partitionBy='race_id',
)


In [254]:
# Read the partitioned Parquet output back to verify the write.
spark.read.parquet('/content/formula1/data/processed/results').show()

+---------+---------+--------------+------+----+--------+-------------+--------------+------+----+-----------+------------+-----------+----+----------------+-----------------+--------------------+-------+
|result_id|driver_id|constructor_id|number|grid|position|position_text|position_order|points|laps|       time|milliseconds|fastest_lap|rank|fastest_lap_time|fastest_lap_speed|      ingestion_date|race_id|
+---------+---------+--------------+------+----+--------+-------------+--------------+------+----+-----------+------------+-----------+----+----------------+-----------------+--------------------+-------+
|    21544|       20|             9|     1|   3|       1|            1|             1|  25.0|  59|2:00:26.144|     7226144|         50|   3|        1:52.134|          162.865|2023-03-16 01:45:...|    873|
|    21545|       18|             1|     3|   4|       2|            2|             2|  18.0|  59| +8.959 sec|     7235103|         47|   4|        1:52.625|          162.155|2023-

## Ingest pit_stops.json

In [255]:
from pyspark.sql.types import StructType, StructField, IntegerType, StringType

In [256]:
# (field name, Spark type, nullable) for every column of a pit-stop record, in file order.
pit_stop_field_specs = [
    ("raceId", IntegerType(), False),
    ("driverId", IntegerType(), True),
    ("stop", StringType(), True),
    ("lap", IntegerType(), True),
    ("time", StringType(), True),
    ("duration", StringType(), True),
    ("milliseconds", IntegerType(), True),
]
pit_stops_schema = StructType([
    StructField(field_name, field_type, is_nullable)
    for field_name, field_type, is_nullable in pit_stop_field_specs
])

In [258]:
# The multi-line option lets a single JSON record span several lines of the file.
pit_stops_df = spark.read.json(
    '/content/formula1/data/raw/pit_stops.json',
    schema=pit_stops_schema,
    multiLine=True,
)

### Rename and add columns as required

In [259]:
# camelCase source names -> snake_case target names.
pit_stop_renames = {'raceId': 'race_id', 'driverId': 'driver_id'}
for old_name, new_name in pit_stop_renames.items():
    pit_stops_df = pit_stops_df.withColumnRenamed(old_name, new_name)

In [261]:
# Stamp every row with the time at which this ingestion query runs.
pit_stops_df = pit_stops_df.withColumn('ingestion_date', current_timestamp())

In [262]:
# Preview the renamed pit-stop rows with the ingestion timestamp.
pit_stops_df.show()

+-------+---------+----+---+--------+--------+------------+--------------------+
|race_id|driver_id|stop|lap|    time|duration|milliseconds|      ingestion_date|
+-------+---------+----+---+--------+--------+------------+--------------------+
|    841|      153|   1|  1|17:05:23|  26.898|       26898|2023-03-16 01:56:...|
|    841|       30|   1|  1|17:05:52|  25.021|       25021|2023-03-16 01:56:...|
|    841|       17|   1| 11|17:20:48|  23.426|       23426|2023-03-16 01:56:...|
|    841|        4|   1| 12|17:22:34|  23.251|       23251|2023-03-16 01:56:...|
|    841|       13|   1| 13|17:24:10|  23.842|       23842|2023-03-16 01:56:...|
|    841|       22|   1| 13|17:24:29|  23.643|       23643|2023-03-16 01:56:...|
|    841|       20|   1| 14|17:25:17|  22.603|       22603|2023-03-16 01:56:...|
|    841|      814|   1| 14|17:26:03|  24.863|       24863|2023-03-16 01:56:...|
|    841|      816|   1| 14|17:26:50|  25.259|       25259|2023-03-16 01:56:...|
|    841|       67|   1| 15|

### Write the output to the processed folder in Parquet format

In [267]:
# Overwrite mode replaces the output of any earlier run, so the cell can be re-executed.
pit_stops_df.write.parquet('/content/formula1/data/processed/pit_stops', mode='overwrite')

In [268]:
# Read the Parquet output back to verify the write.
spark.read.parquet('/content/formula1/data/processed/pit_stops').show()

+-------+---------+----+---+--------+--------+------------+--------------------+
|race_id|driver_id|stop|lap|    time|duration|milliseconds|      ingestion_date|
+-------+---------+----+---+--------+--------+------------+--------------------+
|    841|      153|   1|  1|17:05:23|  26.898|       26898|2023-03-16 02:00:...|
|    841|       30|   1|  1|17:05:52|  25.021|       25021|2023-03-16 02:00:...|
|    841|       17|   1| 11|17:20:48|  23.426|       23426|2023-03-16 02:00:...|
|    841|        4|   1| 12|17:22:34|  23.251|       23251|2023-03-16 02:00:...|
|    841|       13|   1| 13|17:24:10|  23.842|       23842|2023-03-16 02:00:...|
|    841|       22|   1| 13|17:24:29|  23.643|       23643|2023-03-16 02:00:...|
|    841|       20|   1| 14|17:25:17|  22.603|       22603|2023-03-16 02:00:...|
|    841|      814|   1| 14|17:26:03|  24.863|       24863|2023-03-16 02:00:...|
|    841|      816|   1| 14|17:26:50|  25.259|       25259|2023-03-16 02:00:...|
|    841|       67|   1| 15|