In [1]:

import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
from sklearn.model_selection import train_test_split
import warnings

from src.data import DataConnector
from datetime import datetime


# Load data
data = DataConnector()


# Merge dataframes into one
df = pd.merge(data.results, data.races, on='raceId')
df = pd.merge(df, data.drivers, on='driverId')
df = pd.merge(df, data.driver_standings, on='driverId')
df = pd.merge(df, data.constructors, on='constructorId')
df = pd.merge(df, data.status, on='statusId')


# Perform clean-up on columns
df['driverName'] = df['forename'] + ' ' + df['surname']

df = df.drop(['url', 'url_x', 'position_x', 'fastestLapTime', 'positionText_x', 'time_x', 'time_y', 'driverRef',
              'constructorRef', 'nationality_y', 'url_y', 'positionText_y', 'raceId_y', 'points_y', '?', '?.1',
              '?.2', '?.3', '?.4', '?.5', '?.6', '?.7', '?.8', '?.9', 'forename', 'surname'], axis=1)

new_names = {
    'number_x': 'number',
    'name_x': 'grandPrix',
    'number_y': 'driverNumber',
    'code': 'driverCode',
    'nationality_x': 'nationality',
    'name_y': 'company',
    'raceId_x': 'raceId',
    'points_x': 'points',
    'position_y': 'position'
}

df.rename(columns=new_names, inplace=True)


# Convert data types
pd.to_datetime(df.date)
df['dob'] = pd.to_datetime(df['dob'])
df['date'] = pd.to_datetime(df['date'])
df['age'] = round((datetime.today() - df['dob']).dt.days / 365)

nums = ['number', 'milliseconds', 'fastestLap', 'rank', 'fastestLapSpeed', 'driverNumber']
for i in nums:
    df[i] = pd.to_numeric(df[i], errors='coerce')
df.drop('driverNumber', axis=1, inplace=True)

print(df.head())


# Categorical and numerical values
categorical = []
numerical = []
for i in df.columns:
    print(f"{i} => {df[i].dtypes}")
    if df[i].dtypes == 'object':
        categorical.append(i)
    else:
        numerical.append(i)


# Filter data
df_finished = df[df['status'] == 'Finished']
print(df_finished.tail())

   raceId  driverId  constructorId  number  grid  positionOrder  points  laps  \
0      18         1              1    22.0     1              1    10.0    58   
1      18         1              1    22.0     1              1    10.0    58   
2      18         1              1    22.0     1              1    10.0    58   
3      18         1              1    22.0     1              1    10.0    58   
4      18         1              1    22.0     1              1    10.0    58   

   milliseconds  fastestLap  ...       date  driverCode        dob  \
0     5690616.0        39.0  ... 2008-03-16         HAM 1985-01-07   
1     5690616.0        39.0  ... 2008-03-16         HAM 1985-01-07   
2     5690616.0        39.0  ... 2008-03-16         HAM 1985-01-07   
3     5690616.0        39.0  ... 2008-03-16         HAM 1985-01-07   
4     5690616.0        39.0  ... 2008-03-16         HAM 1985-01-07   

   nationality  position  wins  company    status      driverName   age  
0      British    