In [2]:
import pandas as pd
from sqlalchemy import create_engine
!pip install psycopg2-binary
import psycopg2



In [46]:
### dados importantes para levantar a análise ###

# grid - posição de largada
# position - rank final

# results.csv -> raceId, driverId, grid, positionOrder
# pit_stops.csv -> raceId, driverId, stop, duration(seconds)
# drivers.csv -> driverId, forename, surname 
# races.csv -> raceId, circuitId, name(circuit name), date

# primeiro: Filtrar os dados apenas em 1 circuito (Interlagos -> circuitId = 18)
# com o pit_stops: MÉDIA DO TEMPO DE PITSTOP POR PILOTO (driverId) em cada corrida
# modelo de classificação/classificatorio

## Lendo e Limpando os dados

In [3]:
results = pd.read_csv('./results.csv')
drivers = pd.read_csv('./drivers.csv')
races = pd.read_csv('./races.csv')
driver_standings = pd.read_csv('./driver_standings.csv')
pit_stops = pd.read_csv('./pit_stops.csv')

In [4]:
for column in results.columns:
    if column not in ['raceId', 'driverId', 'grid', 'positionOrder']:
        results = results.drop(column, axis=1)
print('Results')
results.sample(3)

Results


Unnamed: 0,raceId,driverId,grid,positionOrder
13342,547,255,12,5
19003,789,620,24,23
13666,557,270,22,17


In [13]:
for column in pit_stops.columns:
    if column not in ['raceId', 'driverId', 'stop', 'duration']:
        pit_stops = pit_stops.drop(column, axis=1)
print('Pit Stops')
pit_stops.sample(3)

Pit Stops


Unnamed: 0,raceId,driverId,stop,duration
4183,936,154,4,17.064
4487,944,828,1,24.699
9600,1095,832,3,23.816


In [6]:
for column in drivers.columns:
    if column not in ['driverId', 'forename', 'surname']:
        drivers = drivers.drop(column, axis=1)
print('Drivers')
drivers.sample(3)

Drivers


Unnamed: 0,driverId,forename,surname
307,308,Richard,Robarts
678,678,Len,Duncan
154,155,Kamui,Kobayashi


In [7]:
for column in races.columns:
    if column not in ['raceId', 'circuitId', 'name', 'date']:
        races = races.drop(column, axis=1)
print('Races')
races.sample(3)

Races


Unnamed: 0,raceId,circuitId,name,date
465,466,30,South African Grand Prix,1983-10-15
1001,1014,4,Spanish Grand Prix,2019-05-12
474,475,39,Dutch Grand Prix,1982-07-03


## Criando conexão com o DB e salvando os DataFrames

In [8]:
'tipo_de_banco_de_dados://usuario:senha@endereco_do_servidor:porta/nome_do_banco_de_dados'
engine = create_engine('postgresql://root:root@localhost:5432/dbanalise')

# df_dimensoes.to_sql('nome_da_tabela_dimensoes', con=engine, if_exists='replace', index=False)
# df_medidas.to_sql('nome_da_tabela_fato', con=engine, if_exists='replace', index=False)

In [9]:
results.to_sql('fato_resultados', con=engine, if_exists='replace', index=False)
pit_stops.to_sql('fato_pit_stops', con=engine, if_exists='replace', index=False)
drivers.to_sql('dimensao_pilotos', con=engine, if_exists='replace', index=False)
races.to_sql('dimensao_corridas', con=engine, if_exists='replace', index=False)

102

## Query no banco para pegar o DataFrame base do projeto

In [15]:
# recuperar uma tabela através de SELECT JOIN... juntando as tabelas com seus id's referentes

# races + pit_stops -> 'raceId': result1 #
# result1 + driver -> 'driverId': result2
# result2 + results -> 'driverId ou raceId': finalResult
conn = psycopg2.connect(dbname="dbanalise", user="root", password="root")
cur = conn.cursor()

query = '''
SELECT *
FROM dimensao_corridas c
INNER JOIN fato_pit_stops ps ON ps."raceId" = c."raceId"
INNER JOIN dimensao_pilotos dp ON dp."driverId" = ps."driverId"
INNER JOIN fato_resultados r ON r."driverId" = dp."driverId"
WHERE c."circuitId" = 18
'''
cur.execute(query)

rows = cur.fetchall()

cur.close()
conn.close()

finalColumns = ['raceId', 'circuitId', 'name', 'date', 'raceId1', 'driverId', 'stop', 'duration', 'driverId1', 'forename', 'surname', 'raceId1', 'driverId1', 'pos. largada', 'positionOrder']

df_result = pd.DataFrame(rows, columns=finalColumns)
df_result

Unnamed: 0,raceId,circuitId,name,date,raceId1,driverId,stop,duration,driverId1,forename,surname,raceId1.1,driverId1.1,pos. largada,positionOrder
0,1095,18,Brazilian Grand Prix,2022-11-13,1095,1,2,23.415,1,Lewis,Hamilton,18,1,1,1
1,1095,18,Brazilian Grand Prix,2022-11-13,1095,1,1,24.081,1,Lewis,Hamilton,18,1,1,1
2,1071,18,São Paulo Grand Prix,2021-11-14,1071,1,4,22.664,1,Lewis,Hamilton,18,1,1,1
3,1071,18,São Paulo Grand Prix,2021-11-14,1071,1,3,22.690,1,Lewis,Hamilton,18,1,1,1
4,1071,18,São Paulo Grand Prix,2021-11-14,1071,1,2,18.660,1,Lewis,Hamilton,18,1,1,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
106858,879,18,Brazilian Grand Prix,2012-11-25,879,4,2,23.000,4,Fernando,Alonso,1096,4,10,20
106859,879,18,Brazilian Grand Prix,2012-11-25,879,4,1,22.472,4,Fernando,Alonso,1096,4,10,20
106860,859,18,Brazilian Grand Prix,2011-11-27,859,4,3,20.752,4,Fernando,Alonso,1096,4,10,20
106861,859,18,Brazilian Grand Prix,2011-11-27,859,4,2,20.664,4,Fernando,Alonso,1096,4,10,20


In [16]:
# tirando as colunas repetidas
df_result = df_result.drop('raceId1', axis=1)
df_result = df_result.drop('driverId1', axis=1)
df_result

Unnamed: 0,raceId,circuitId,name,date,driverId,stop,duration,forename,surname,pos. largada,positionOrder
0,1095,18,Brazilian Grand Prix,2022-11-13,1,2,23.415,Lewis,Hamilton,1,1
1,1095,18,Brazilian Grand Prix,2022-11-13,1,1,24.081,Lewis,Hamilton,1,1
2,1071,18,São Paulo Grand Prix,2021-11-14,1,4,22.664,Lewis,Hamilton,1,1
3,1071,18,São Paulo Grand Prix,2021-11-14,1,3,22.690,Lewis,Hamilton,1,1
4,1071,18,São Paulo Grand Prix,2021-11-14,1,2,18.660,Lewis,Hamilton,1,1
...,...,...,...,...,...,...,...,...,...,...,...
106858,879,18,Brazilian Grand Prix,2012-11-25,4,2,23.000,Fernando,Alonso,10,20
106859,879,18,Brazilian Grand Prix,2012-11-25,4,1,22.472,Fernando,Alonso,10,20
106860,859,18,Brazilian Grand Prix,2011-11-27,4,3,20.752,Fernando,Alonso,10,20
106861,859,18,Brazilian Grand Prix,2011-11-27,4,2,20.664,Fernando,Alonso,10,20


## Salva o DF obtido pela Query em um arquivo .csv, para usar posteriormente sem necessidade do DataBase

In [12]:
df_result.to_csv('Interlago_infos.csv', index=False)