# F1 Analysis - Introduction to Data Sciences project

## Importing libraries

In [1]:
# Already added some that we most probably will use
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
import seaborn as sns
# Add more whenever needed
# ...
from datetime import datetime

## Reading in the data

The data consists of most of the freely available Formula 1 data from 1950 to 2018. <br>
Link: http://ergast.com/mrd/

In [2]:
def _load_headerless(csv_path):
    """Read one CSV file whose first line is data, not a header row."""
    return pd.read_csv(csv_path, header=None)


# One DataFrame per CSV file of the dump; columns are still numbered 0..n-1 here.
circuits = _load_headerless('./f1db_csv/circuits.csv')
constructor_results = _load_headerless('./f1db_csv/constructor_results.csv')
constructor_standings = _load_headerless('./f1db_csv/constructor_standings.csv')
constructors = _load_headerless('./f1db_csv/constructors.csv')
drivers = _load_headerless('./f1db_csv/driver.csv')
driver_standings = _load_headerless('./f1db_csv/driver_standings.csv')
lap_times = _load_headerless('./f1db_csv/lap_times.csv')
pit_stops = _load_headerless('./f1db_csv/pit_stops.csv')
qualifying = _load_headerless('./f1db_csv/qualifying.csv')
races = _load_headerless('./f1db_csv/races.csv')
results = _load_headerless('./f1db_csv/results.csv')
seasons = _load_headerless('./f1db_csv/seasons.csv')
status = _load_headerless('./f1db_csv/status.csv')

## Adding correct column names to the datasets

Column names are based on the column names from here: https://www.kaggle.com/cjgdev/formula-1-race-data-19502017. <br>
Column names in our project are in snake case (snake_case).

In [3]:
# snake_case column labels, one list per table, in the order the columns
# appear in the corresponding CSV file.
circuits_column_names = [
    'circuit_id', 'circuit_ref', 'name', 'location', 'country',
    'lat', 'lng', 'alt', 'url',
]
constructor_results_column_names = [
    'constructor_results_id', 'race_id', 'constructor_id', 'points', 'status',
]
constructor_standings_column_names = [
    'constructor_standings_id', 'race_id', 'constructor_id', 'points',
    'position', 'position_text', 'wins',
]
constructors_column_names = [
    'constructor_id', 'constructor_ref', 'name', 'nationality', 'url',
]
drivers_column_names = [
    'driver_id', 'driver_ref', 'number', 'code', 'forename', 'surname',
    'birth_date', 'nationality', 'url',
]
driver_standings_column_names = [
    'driver_standings_id', 'race_id', 'driver_id', 'points',
    'position', 'position_text', 'wins',
]
lap_times_column_names = [
    'race_id', 'driver_id', 'lap', 'position', 'time', 'milliseconds',
]
pit_stops_column_names = [
    'race_id', 'driver_id', 'stop', 'lap', 'time', 'duration', 'milliseconds',
]
qualifying_column_names = [
    'qualify_id', 'race_id', 'driver_id', 'constructor_id', 'number',
    'position', 'q1', 'q2', 'q3',
]
races_column_names = [
    'race_id', 'year', 'round', 'circuit_id', 'name', 'date', 'time', 'url',
]
results_column_names = [
    'result_id', 'race_id', 'driver_id', 'constructor_id', 'number', 'grid',
    'position', 'position_text', 'position_order', 'points', 'laps', 'time',
    'milliseconds', 'fastest_lap', 'rank', 'fastest_lap_time',
    'fastest_lap_speed', 'status_id',
]
seasons_column_names = ['year', 'url']
status_column_names = ['status_id', 'status']

In [4]:
# Attach the prepared column labels to each DataFrame, replacing the
# numeric labels the frames currently carry.
for frame, frame_column_names in (
    (circuits, circuits_column_names),
    (constructor_results, constructor_results_column_names),
    (constructor_standings, constructor_standings_column_names),
    (constructors, constructors_column_names),
    (drivers, drivers_column_names),
    (driver_standings, driver_standings_column_names),
    (lap_times, lap_times_column_names),
    (pit_stops, pit_stops_column_names),
    (qualifying, qualifying_column_names),
    (races, races_column_names),
    (results, results_column_names),
    (seasons, seasons_column_names),
    (status, status_column_names),
):
    frame.columns = frame_column_names

## Replacing the different representations of missing (and nonsensical) values with a single universal one

We are using np.nan as the universal non-existent value.

In [5]:
# Every table goes through the same clean-up.
datasets = [
    circuits,
    constructor_results,
    constructor_standings,
    constructors,
    drivers,
    driver_standings,
    lap_times,
    pit_stops,
    qualifying,
    races,
    results,
    seasons,
    status,
]

for frame in datasets:
    for column_name in frame.columns:
        # First the literal '\N' marker used in the files, then empty strings.
        without_marker = frame[column_name].replace('\\N', np.nan)
        frame[column_name] = without_marker.replace('', np.nan)

## Comparing the fastest lap times at frequent circuits during the 2004-2018 period

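For every race we take the single fastest lap set by any driver during that race, and compare these times year by year for each track. <br>
A circuit counts as "frequent" if it hosted more than 10 races in this period; there are 14 such circuits. <br>
Afterwards we shall make a 2x7 grid of the corresponding graphs.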

In [10]:
# Seasons covered by the comparison (2004-2018 inclusive).
years = list(range(2004, 2019))


def _lap_time_to_seconds(lap_time):
    """Convert a lap time written as 'M:SS.mmm' into seconds.

    Missing values are returned as np.nan so that they are skipped when the
    per-race minimum is computed.
    """
    # pd.isna instead of `is np.nan`: an identity test only recognises the
    # np.nan singleton and lets any other NaN object through to .split().
    if pd.isna(lap_time):
        return np.nan
    minutes, seconds = str(lap_time).split(':')
    return int(minutes) * 60 + float(seconds)


# [race_id, circuit_id, year] for every race held in one of the selected
# seasons, in the order the races appear in `races`.
suitable_races = [
    [race_id, circuit_id, int(year)]
    for race_id, circuit_id, year in zip(
        races['race_id'], races['circuit_id'], races['year']
    )
    if int(year) in years
]

# Fastest lap (in seconds) per race, computed in one pass over the relevant
# result rows instead of re-scanning `results` once per race.
suitable_race_ids = [race_id for race_id, _, _ in suitable_races]
season_results = results.loc[
    results['race_id'].isin(suitable_race_ids), ['race_id', 'fastest_lap_time']
]
lap_seconds = (
    season_results['fastest_lap_time'].map(_lap_time_to_seconds).astype(float)
)
fastest_by_race = lap_seconds.groupby(season_results['race_id']).min()

# circuit_id -> circuit name; assumes circuit_id is unique within `circuits`.
circuit_names = dict(zip(circuits['circuit_id'], circuits['name']))

# [circuit name, year, fastest lap in seconds] per race. A race without any
# recorded fastest lap gets NaN rather than an artificial sentinel, so it can
# never be mistaken for a real lap time in tables or plots.
circuit_year_lap = [
    [circuit_names[circuit_id], year, fastest_by_race.get(race_id, np.nan)]
    for race_id, circuit_id, year in suitable_races
]

# Passing the column names to the constructor also works when no race matched.
df_circuit_year_lap = pd.DataFrame(
    circuit_year_lap, columns=['circuit', 'year', 'fastest lap time (s)']
)

df_circuit_year_lap = df_circuit_year_lap.sort_values(['circuit', 'year'], ascending=True)

# Show the complete table without truncating rows, columns or cell contents.
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)
pd.set_option('display.width', None)
pd.set_option('display.max_colwidth', None)  # None means "no limit"; -1 is deprecated

# Keep only the circuits that hosted more than 10 races in the selected seasons.
df_circuit_year_lap = df_circuit_year_lap.groupby('circuit').filter(lambda x: len(x) > 10)
print(df_circuit_year_lap)

                            circuit  year  fastest lap time (s)
89   Albert Park Grand Prix Circuit  2004  84.125              
70   Albert Park Grand Prix Circuit  2005  85.683              
54   Albert Park Grand Prix Circuit  2006  86.045              
35   Albert Park Grand Prix Circuit  2007  85.235              
17   Albert Park Grand Prix Circuit  2008  87.418              
0    Albert Park Grand Prix Circuit  2009  87.706              
108  Albert Park Grand Prix Circuit  2010  88.358              
126  Albert Park Grand Prix Circuit  2011  88.947              
145  Albert Park Grand Prix Circuit  2012  89.187              
165  Albert Park Grand Prix Circuit  2013  89.274              
184  Albert Park Grand Prix Circuit  2014  92.478              
208  Albert Park Grand Prix Circuit  2015  90.945              
222  Albert Park Grand Prix Circuit  2016  88.997              
243  Albert Park Grand Prix Circuit  2017  86.538              
263  Albert Park Grand Prix Circuit  201