In [None]:
# Importing the libraries
import pandas as pd
import numpy as np

from sklearn.preprocessing import OneHotEncoder
# Encoder instance used by the one-hot encoding cells further down
onehotencoder = OneHotEncoder()

# Loading the training and the test datasets from the trusted data folder
df_training, df_test = (
    pd.read_csv(csv_path)
    for csv_path in (
        '../data/trusted/trusted_file_training.csv',
        '../data/trusted/trusted_file_test.csv',
    )
)

In [None]:
# Displaying the training dataset as loaded from the CSV file
df_training

In [None]:
# Displaying the test dataset as loaded from the CSV file
df_test

In [None]:
# Selecting the variables to be used in training the models
# Training

# Columns kept, in the order they appear in the resulting dataframe:
#   fase_dia                - time of day
#   condicao_metereologica  - weather condition
#   tipo_acidente           - type of accident
#   causa_acidente          - cause of the accident
#   dia_semana              - day of the week
#   sentido_via             - direction of the road
#   tipo_pista              - type of road
#   tracado_via             - road layout
#   classificacao_acidente  - accident classification
#   uf                      - state (UF)
selected_columns = [
    'fase_dia',
    'condicao_metereologica',
    'tipo_acidente',
    'causa_acidente',
    'dia_semana',
    'sentido_via',
    'tipo_pista',
    'tracado_via',
    'classificacao_acidente',
    'uf',
]

# A single column selection replaces the per-column Series -> DataFrame -> concat steps;
# .copy() makes the result an independent frame so later assignments on it are safe
df_training = df_training[selected_columns].copy()

In [None]:
# Selecting the same variables used in training for the test dataset
# Test

# Columns kept, in the order they appear in the resulting dataframe:
#   fase_dia                - time of day
#   condicao_metereologica  - weather condition
#   tipo_acidente           - type of accident
#   causa_acidente          - cause of the accident
#   dia_semana              - day of the week
#   sentido_via             - direction of the road
#   tipo_pista              - type of road
#   tracado_via             - road layout
#   classificacao_acidente  - accident classification
#   uf                      - state (UF)
selected_columns = [
    'fase_dia',
    'condicao_metereologica',
    'tipo_acidente',
    'causa_acidente',
    'dia_semana',
    'sentido_via',
    'tipo_pista',
    'tracado_via',
    'classificacao_acidente',
    'uf',
]

# A single column selection replaces the per-column Series -> DataFrame -> concat steps;
# .copy() makes the result an independent frame so later assignments on it are safe
df_test = df_test[selected_columns].copy()

In [None]:
# States (UF) whose accidents are kept; rows from every other state are marked as null in
# "uf" so that the null-dropping step removes them
# NOTE(review): this looks like the list of Legal Amazon states - confirm
kept_ufs = ['AC', 'AP', 'AM', 'PA', 'RO', 'RR', 'TO', 'MA', 'MT']

# Training
# Setting "uf" to null wherever it is not one of the kept states. Only the "uf" column is
# touched, so values in the other columns can never be nulled by accident
df_training.loc[~df_training['uf'].isin(kept_ufs), 'uf'] = np.nan

# Test
# Setting "uf" to null wherever it is not one of the kept states
df_test.loc[~df_test['uf'].isin(kept_ufs), 'uf'] = np.nan

In [None]:
# Training and test
# Discarding every row that holds at least one null value and then
# renumbering the remaining rows from zero
df_training = df_training.dropna(axis = 0, how = 'any').reset_index(drop = True)
df_test = df_test.dropna(axis = 0, how = 'any').reset_index(drop = True)

In [None]:
# Test: removing the "uf" column
df_test = df_test.drop(columns = 'uf')

# Training: removing the "uf" column and then the duplicate rows
# (the index is renumbered from zero by ignore_index)
df_training = df_training.drop(columns = 'uf').drop_duplicates(ignore_index = True)

In [None]:
# Training
# Undersampling the "Não Grave" class: a random 95% of its rows is removed

# Fixed seed so that the same rows are removed on every run of the notebook
undersample_seed = 42

# Fraction of the "Não Grave" rows to remove
nao_grave_drop_fraction = 0.95

# Index labels of the rows classified as "Não Grave"
nao_grave_index = df_training.index[df_training['classificacao_acidente'] == 'Não Grave']

# Randomly selecting, without repetition, the rows to be excluded
rng = np.random.default_rng(undersample_seed)
drop_index = rng.choice(nao_grave_index,
                        size = int(len(nao_grave_index) * nao_grave_drop_fraction),
                        replace = False)

# Deleting the selected rows; drop() keeps the remaining rows in their original order,
# so only the index needs to be renumbered afterwards
df_training = df_training.drop(drop_index).reset_index(drop = True)

In [None]:
# Test

# As they are isolated databases, the training and test DataFrames can hold different
# numbers of distinct values. The number of test inputs has to be the same as the number
# of training inputs, so the least frequent "causa_acidente" values are discarded from
# df_test (2021 data, per the original note) until the totals of distinct values match.

# Total of distinct values, summed over all columns, that df_test has to reach
target_distinct_total = df_training.nunique().sum()

# Removing one "causa_acidente" value per iteration. The loop stops as soon as the total is
# no longer above the target: removing rows can only lower the total, so once it falls
# below the target it can never match and looping further would never end
while df_test.nunique().sum() > target_distinct_total:

    # Least frequent cause still present in df_test
    rarest_cause = df_test['causa_acidente'].value_counts().index[-1]

    # Deleting every row that has that cause
    df_test = df_test.drop(index = df_test.index[df_test['causa_acidente'] == rarest_cause])

# Failing explicitly when the totals cannot be matched by removing causes
test_distinct_total = df_test.nunique().sum()
if test_distinct_total != target_distinct_total:
    raise ValueError(
        'Could not match the number of distinct values of df_test '
        f'({test_distinct_total}) to df_training ({target_distinct_total}) '
        'by dropping "causa_acidente" categories'
    )

In [None]:
# Replacing the observations of "classificacao_acidente" with 0 or 1
# (values other than the two below are left unchanged)
label_codes = {'Não Grave': 0, 'Grave': 1}

# The result is assigned back to the column instead of calling replace(inplace = True) on
# the selected column: that chained in-place form acts on an intermediate object and is
# not guaranteed to update the dataframe

# Training
df_training['classificacao_acidente'] = df_training['classificacao_acidente'].replace(label_codes)

# Test
df_test['classificacao_acidente'] = df_test['classificacao_acidente'].replace(label_codes)

In [None]:
# Transforming the data, concatenating the corresponding columns, and storing it in a dataframe
# Training
# INPUTS

# Input features, in the order their one-hot columns appear in the result:
# time of day, weather condition, type of accident, cause of the accident,
# day of the week, direction of the road, type of road, road layout
feature_columns = [
    'fase_dia',
    'condicao_metereologica',
    'tipo_acidente',
    'causa_acidente',
    'dia_semana',
    'sentido_via',
    'tipo_pista',
    'tracado_via',
]

encoded_frames = []
for feature in feature_columns:
    # Fitting the encoder once per feature; after the fit, categories_[0] holds the
    # categories of this feature in the same order as the encoded columns
    encoded = onehotencoder.fit_transform(df_training[[feature]])

    # One column per category, named "<feature> - <category>"
    column_names = [f'{feature} - {category}' for category in onehotencoder.categories_[0]]

    encoded_frames.append(pd.DataFrame(encoded.toarray(), columns = column_names))

inputs_training_df = pd.concat(encoded_frames, axis = 1)

# OUTPUTS

# Accident classification
outputs_training_df = df_training[['classificacao_acidente']].copy()

In [None]:
# Transforming the data, concatenating the corresponding columns, and storing it in a dataframe
# Test
# INPUTS

# Input features, in the order their one-hot columns appear in the result:
# time of day, weather condition, type of accident, cause of the accident,
# day of the week, direction of the road, type of road, road layout
feature_columns = [
    'fase_dia',
    'condicao_metereologica',
    'tipo_acidente',
    'causa_acidente',
    'dia_semana',
    'sentido_via',
    'tipo_pista',
    'tracado_via',
]

encoded_frames_test = []
for feature in feature_columns:
    # The encoder is fitted on the TRAINING data and only applied to the test data, so the
    # test columns follow the categories found in df_training (same names, same order).
    # Fitting on df_test instead would produce columns that depend on the test categories
    # and may not line up with the training inputs.
    # handle_unknown = 'ignore': a test value that does not occur in df_training is encoded
    # as all zeros for this feature instead of raising an error
    test_encoder = OneHotEncoder(handle_unknown = 'ignore')
    test_encoder.fit(df_training[[feature]])
    encoded = test_encoder.transform(df_test[[feature]])

    # One column per category, named "<feature> - <category>"
    column_names = [f'{feature} - {category}' for category in test_encoder.categories_[0]]

    encoded_frames_test.append(pd.DataFrame(encoded.toarray(), columns = column_names))

inputs_test_df = pd.concat(encoded_frames_test, axis = 1)

# OUTPUTS

# Accident classification
outputs_test_df = df_test[['classificacao_acidente']].copy()

In [None]:
# Writing each refined dataframe to its CSV file (the row index is not written)
for frame, csv_path in (
    (inputs_training_df, '../data/refined/inputs_training_df.csv'),
    (outputs_training_df, '../data/refined/outputs_training_df.csv'),
    (inputs_test_df, '../data/refined/inputs_test_df.csv'),
    (outputs_test_df, '../data/refined/outputs_test_df.csv'),
):
    frame.to_csv(csv_path, index = False)