In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.types import *
from pyspark.sql.functions import *
spark = SparkSession.builder.config("spark.sql.shuffle.partitions", "2").appName("InjestionProcessing").master("local[2]").getOrCreate()

# On yarn:
# spark = SparkSession.builder.appName("Spark SQL basic example").enableHiveSupport().master("yarn").getOrCreate()
# specify .master("yarn")

sc = spark.sparkContext

In [2]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
results = spark.read.option("header", True).csv("/content/drive/MyDrive/Formula1DataAnalytics-main/Formula1DataAnalytics-main/data/results.csv")
races = spark.read.option("header", True).csv("/content/drive/MyDrive/Formula1DataAnalytics-main/Formula1DataAnalytics-main/data/races.csv")
qualifying = spark.read.option("header", True).csv("/content/drive/MyDrive/Formula1DataAnalytics-main/Formula1DataAnalytics-main/data/qualifying.csv")
drivers = spark.read.option("header", True).csv("/content/drive/MyDrive/Formula1DataAnalytics-main/Formula1DataAnalytics-main/data/drivers.csv")
constructors = spark.read.option("header", True).csv("/content/drive/MyDrive/Formula1DataAnalytics-main/Formula1DataAnalytics-main/data/constructors.csv")
circuits = spark.read.option("header", True).csv("/content/drive/MyDrive/Formula1DataAnalytics-main/Formula1DataAnalytics-main/data/circuits.csv")

In [4]:
results = results.withColumnRenamed("resultId", "result_id").withColumnRenamed("raceId", "race_id").withColumnRenamed("constructorId", "constructor_id").withColumnRenamed("statusId", "status_id").withColumnRenamed("number", "results_number").withColumnRenamed("time", "results_time").withColumnRenamed("driverId", "driver_id").withColumnRenamed("position", "result_position")

In [5]:
races = races.withColumnRenamed("raceId", "race_id").withColumnRenamed("circuitId", "circuit_id").withColumnRenamed("url", "race_url").withColumnRenamed("time", "race_time").withColumnRenamed("name", "race_name").withColumnRenamed("raceId", "race_id").withColumnRenamed("circuitId", "circuit_id")



In [6]:
qualifying = qualifying.withColumnRenamed("number", "qualifying_number").withColumnRenamed("qualifyingId", "qualifying_id").withColumnRenamed("raceId", "race_id").withColumnRenamed("driverId", "driver_id").withColumnRenamed("constructorId", "constructor_id").withColumnRenamed("position", "qualifying_position")

In [7]:
drivers = drivers.withColumnRenamed("number", "driver_number").withColumnRenamed("nationality", "driver_nationality").withColumnRenamed("url", "driver_url").withColumnRenamed("driverId", "driver_id").withColumnRenamed("driverRef", "driver_ref")

In [8]:
constructors = constructors.withColumnRenamed("name", "constructor_name").withColumnRenamed("nationality", "constructor_nationality").withColumnRenamed("url", "constructor_url").withColumnRenamed("constructorId", "constructor_id").withColumnRenamed("constructorRef", "constructor_ref")

In [9]:
circuits = circuits.withColumnRenamed("circuitId", "circuit_id").withColumnRenamed("circuitRef", "circuit_ref").withColumnRenamed("name", "circuit_name").withColumnRenamed("location", "circuit_location").withColumnRenamed("country", "circuit_country").withColumnRenamed("url", "circuit_url")

In [10]:
df1 = races.join(results, "race_id", "inner")

In [11]:
df2 = df1.join(qualifying, ["race_id", "driver_id", "constructor_id"], "inner")

In [12]:
df3 = df2.join(drivers, "driver_id", "inner")

In [13]:
df4 = df3.join(constructors, "constructor_id", "inner")

In [14]:
df5 = df4.join(circuits, "circuit_id", "inner")

In [15]:
df5

DataFrame[circuit_id: string, constructor_id: string, driver_id: string, race_id: string, year: string, round: string, race_name: string, date: string, race_time: string, race_url: string, result_id: string, results_number: string, grid: string, result_position: string, positionText: string, positionOrder: string, points: string, laps: string, results_time: string, milliseconds: string, fastestLap: string, rank: string, fastestLapTime: string, fastestLapSpeed: string, status_id: string, qualifyId: string, qualifying_number: string, qualifying_position: string, q1: string, q2: string, q3: string, driver_ref: string, driver_number: string, code: string, forename: string, surname: string, dob: string, driver_nationality: string, driver_url: string, constructor_ref: string, constructor_name: string, constructor_nationality: string, constructor_url: string, circuit_ref: string, circuit_name: string, circuit_location: string, circuit_country: string, lat: string, lng: string, alt: string, ci

In [16]:

df5.show()

+----------+--------------+---------+-------+----+-----+--------------------+----------+---------+--------------------+---------+--------------+----+---------------+------------+-------------+------+----+------------+------------+----------+----+--------------+---------------+---------+---------+-----------------+-------------------+--------+--------+--------+----------+-------------+----+---------+----------+----------+------------------+--------------------+---------------+----------------+-----------------------+--------------------+-----------+--------------------+----------------+---------------+--------+-------+---+--------------------+
|circuit_id|constructor_id|driver_id|race_id|year|round|           race_name|      date|race_time|            race_url|result_id|results_number|grid|result_position|positionText|positionOrder|points|laps|results_time|milliseconds|fastestLap|rank|fastestLapTime|fastestLapSpeed|status_id|qualifyId|qualifying_number|qualifying_position|      q1|     

In [17]:
df5.columns

['circuit_id',
 'constructor_id',
 'driver_id',
 'race_id',
 'year',
 'round',
 'race_name',
 'date',
 'race_time',
 'race_url',
 'result_id',
 'results_number',
 'grid',
 'result_position',
 'positionText',
 'positionOrder',
 'points',
 'laps',
 'results_time',
 'milliseconds',
 'fastestLap',
 'rank',
 'fastestLapTime',
 'fastestLapSpeed',
 'status_id',
 'qualifyId',
 'qualifying_number',
 'qualifying_position',
 'q1',
 'q2',
 'q3',
 'driver_ref',
 'driver_number',
 'code',
 'forename',
 'surname',
 'dob',
 'driver_nationality',
 'driver_url',
 'constructor_ref',
 'constructor_name',
 'constructor_nationality',
 'constructor_url',
 'circuit_ref',
 'circuit_name',
 'circuit_location',
 'circuit_country',
 'lat',
 'lng',
 'alt',
 'circuit_url']

In [18]:
data = df5.select(['year', 'date', 'grid', 'status_id', 'qualifying_position', 'forename', 'surname', 'dob', 'driver_nationality', 'constructor_name', 'constructor_nationality', 'race_name', 'circuit_country'])


In [19]:
data.columns


['year',
 'date',
 'grid',
 'status_id',
 'qualifying_position',
 'forename',
 'surname',
 'dob',
 'driver_nationality',
 'constructor_name',
 'constructor_nationality',
 'race_name',
 'circuit_country']

In [20]:
#considering data points from 2010
data = data[data['year']>=2010]

In [21]:
data = data.withColumnRenamed("race_name", "GP_name").withColumnRenamed("circuit_country", "country").withColumnRenamed("qualifying_position", "position").withColumnRenamed("grid", "quali_pos").withColumnRenamed("constructor_name", "constructor").withColumn("date", to_date(col("date"))).withColumn("dob", to_date(col("dob"))).withColumn("driver", concat(col("forename"), lit(" "), col("surname")))


In [22]:
# Creating driver age parameter
data = data.withColumn("age_at_gp_in_days", datediff(col("date"), col("dob")))
data = data.withColumn("age_at_gp_in_days", expr("CAST(age_at_gp_in_days AS STRING)"))

In [23]:
data = data.withColumn("constructor", when(col("constructor") == "Force India", "Racing Point")
                                   .when(col("constructor") == "Sauber", "Alfa Romeo")
                                   .when(col("constructor") == "Lotus F1", "Renault")
                                   .when(col("constructor") == "Toro Rosso", "AlphaTauri")
                                   .otherwise(col("constructor")))

In [24]:
data = data.withColumn('driver_nationality', data['driver_nationality'].substr(1, 3))
data = data.withColumn('constructor_nationality', data['constructor_nationality'].substr(1, 3))
data = data.withColumn('country', when(data['country'] == 'UK', 'Bri').otherwise(data['country']))
data = data.withColumn('country', when(data['country'] == 'USA', 'Ame').otherwise(data['country']))
data = data.withColumn('country', when(data['country'] == 'Fra', 'Fre').otherwise(data['country']))
data = data.withColumn('country', data['country'].substr(1, 3))
data = data.withColumn('driver_home', (data['driver_nationality'] == data['country']).cast("int"))
data = data.withColumn('constructor_home', (data['constructor_nationality'] == data['country']).cast("int"))

In [25]:
dnf_statuses = [3, 4, 20, 29, 31, 41, 68, 73, 81, 97, 82, 104, 107, 130, 137]
data = data.withColumn('driver_dnf', when(col('status_id').isin(dnf_statuses), 1).otherwise(0))
data = data.withColumn('constructor_dnf', when(~col('status_id').isin(dnf_statuses + [1]), 1).otherwise(0))
data = data.drop('forename', 'surname')

In [26]:
# Calculate DNF count by driver
dnf_by_driver = data.groupBy('driver').agg({'driver_dnf': 'sum'})

# Calculate race entered count by driver
driver_race_entered = data.groupBy('driver').count()

# Join the two calculated DataFrames
driver_stats = dnf_by_driver.join(driver_race_entered, 'driver')

# Calculate DNF ratio and driver confidence
driver_stats = driver_stats.withColumn('driver_dnf_ratio', driver_stats['sum(driver_dnf)'] / driver_stats['count'])
driver_stats = driver_stats.withColumn('driver_confidence', 1 - driver_stats['driver_dnf_ratio'])

# Select necessary columns and convert to a Pandas DataFrame for creating the dictionary
driver_confidence_dict = driver_stats.select('driver', 'driver_confidence').rdd.collectAsMap()


In [27]:
# Calculate DNF count by constructor
dnf_by_constructor = data.groupBy('constructor').agg({'constructor_dnf': 'sum'})

# Calculate race entered count by constructor
constructor_race_entered = data.groupBy('constructor').count()

# Join the two calculated DataFrames
constructor_stats = dnf_by_constructor.join(constructor_race_entered, 'constructor')

# Calculate DNF ratio and constructor reliability
constructor_stats = constructor_stats.withColumn('constructor_dnf_ratio', constructor_stats['sum(constructor_dnf)'] / constructor_stats['count'])
constructor_stats = constructor_stats.withColumn('constructor_reliability', 1 - constructor_stats['constructor_dnf_ratio'])

# Select necessary columns and convert to a Pandas DataFrame for creating the dictionary
constructor_reliability_dict = constructor_stats.select('constructor', 'constructor_reliability').rdd.collectAsMap()

In [28]:
# Create a DataFrame for driver confidence and constructor reliability dictionaries
driver_confidence_df = spark.createDataFrame(driver_confidence_dict.items(), ["driver", "driver_confidence"])
constructor_reliability_df = spark.createDataFrame(constructor_reliability_dict.items(), ["constructor", "constructor_reliability"])

# Adding 'driver_confidence' column
data = data.join(driver_confidence_df, on='driver', how='left')

# Adding 'constructor_reliability' column
data = data.join(constructor_reliability_df, on='constructor', how='left')

In [29]:
# Lists of active constructors and drivers
active_constructors = ['Renault', 'Williams', 'McLaren', 'Ferrari', 'Mercedes',
                       'AlphaTauri', 'Racing Point', 'Alfa Romeo', 'Red Bull',
                       'Haas F1 Team']
active_drivers = ['Daniel Ricciardo', 'Kevin Magnussen', 'Carlos Sainz',
                  'Valtteri Bottas', 'Lance Stroll', 'George Russell',
                  'Lando Norris', 'Sebastian Vettel', 'Kimi Räikkönen',
                  'Charles Leclerc', 'Lewis Hamilton', 'Daniil Kvyat',
                  'Max Verstappen', 'Pierre Gasly', 'Alexander Albon',
                  'Sergio Pérez', 'Esteban Ocon', 'Antonio Giovinazzi',
                  'Romain Grosjean', 'Nicholas Latifi']

# Adding 'active_driver' column
data = data.withColumn("active_driver", when(col("driver").isin(active_drivers), 1).otherwise(0))

# Adding 'active_constructor' column
data = data.withColumn("active_constructor", when(col("constructor").isin(active_constructors), 1).otherwise(0))

In [30]:
data.columns


['constructor',
 'driver',
 'year',
 'date',
 'quali_pos',
 'status_id',
 'position',
 'dob',
 'driver_nationality',
 'constructor_nationality',
 'GP_name',
 'country',
 'age_at_gp_in_days',
 'driver_home',
 'constructor_home',
 'driver_dnf',
 'constructor_dnf',
 'driver_confidence',
 'constructor_reliability',
 'active_driver',
 'active_constructor']

In [31]:

data.coalesce(1).write.option("header",True).csv("/content/drive/MyDrive/Formula1DataAnalytics-main/Formula1DataAnalytics-main/data/f1_data.csv")
