## Importing

In [None]:
import pandas as pd

# Load the raw dataset, using the 'PES_ID' column as the row index.
df = pd.read_csv(
    filepath_or_buffer='../data/dataset.csv',
    index_col='PES_ID',
)

## Making Adjustments

In [None]:
# Rename the 'EMP_ID_x' column to plain 'EMP_ID'; columns not named in the mapping are untouched.
# NOTE(review): the '_x' suffix looks like a leftover from an upstream merge -- confirm.
df.rename(mapper={'EMP_ID_x': 'EMP_ID'}, axis='columns', inplace=True)

In [None]:
# Parse the date and time-of-day columns into datetime64 so they can be used in
# date/time arithmetic later on.
# 'PES_DATAINI' and 'PES_DATAFIM' contain only date data and are parsed as-is.
# 'PES_HRINI' and 'PES_HRFIM' contain only time data: their date part is
# discarded and every value is re-anchored on 1900-01-01 (the date pandas uses
# when parsing with format='%H:%M:%S'), so the two columns are comparable.
for column in ('PES_DATAINI', 'PES_DATAFIM'):
    # Columns absent from the frame are skipped silently.
    if column in df.columns:
        df[column] = pd.to_datetime(df[column])

for column in ('PES_HRINI', 'PES_HRFIM'):
    if column not in df.columns:
        continue
    parsed = pd.to_datetime(df[column])
    # Subtracting midnight of each value's own day leaves only the time of day.
    # This is vectorised and keeps missing values as NaT, whereas a per-row
    # `.time()` call raises on NaT.
    time_of_day = parsed - parsed.dt.normalize()
    df[column] = time_of_day + pd.Timestamp('1900-01-01')

In [None]:
# Compute the duration between 'PES_HRINI' (start) and 'PES_HRFIM' (end), in minutes.
# Both columns were anchored on the same dummy date by the conversion step above,
# so an end time earlier than its start time is treated as ending on the following
# day and is pushed forward by 24 hours.
# Rows where either time is missing compare as False and are left untouched.
ends_next_day = df['PES_HRFIM'] < df['PES_HRINI']
df['PES_HRFIM'] = df['PES_HRFIM'].mask(ends_next_day, df['PES_HRFIM'] + pd.Timedelta(days=1))

# 'DURATION' is stored as a float number of minutes (total seconds / 60).
df['DURATION'] = (df['PES_HRFIM'] - df['PES_HRINI']).abs().dt.total_seconds() / 60

In [None]:
# Swap the three text labels below for integer codes wherever they occur in the frame.
# NOTE(review): the replacement is applied to every column, not to one named column -- confirm intended.
# NOTE(review): '-REA' may be a mis-encoded label; it has to match the CSV contents exactly -- verify.
label_codes = {'-REA': 0, 'CIRCUITO': 1, 'PONTUAL': 2}
df = df.replace(to_replace=label_codes)

## Removing Columns with Duplicate Data


In [None]:
# Columns to remove from the frame. 'PES_HRINI' and 'PES_HRFIM' are among them:
# they were only needed to derive 'DURATION'.
DROP_LIST = [
    'LOCDESCARREGO_DESC',
    'EMP_NOME',
    'PES_HRFIM',
    'PES_HRINI',
    'PES_PESOFIM',
    'PES_PESOINI',
    'COLETA_DESC',
    'ESPECCOLETA_DESC',
    'EMP_ID_y',
    'LOCAL_NOME',
    'ROTA_DESC',
]

# Raises KeyError if any listed column is missing (pandas' default errors='raise').
df.drop(columns=DROP_LIST, inplace=True)

## Removing Missing Values

In [None]:
# Discard, in place, every row that has at least one missing value in any column.
df.dropna(axis='index', how='any', inplace=True)

## Applying One-Hot Encoding to Categorical Columns

In [None]:
# One-hot encode 'TPVEICULO_DESC': the column is replaced by one integer 0/1
# indicator column per distinct value, each named 'TPVEICULO_DESC_<value>'.
categorical_columns = ['TPVEICULO_DESC']
df = pd.get_dummies(
    df,
    columns=categorical_columns,
    prefix=categorical_columns,
    dtype='int',
)

## Our Results

In [None]:
# Print a summary of the processed frame (column names, non-null counts, dtypes).
df.info()

In [None]:
# Preview the first rows of the processed frame.
df.head()

## Exporting

In [None]:
# Write the processed frame to disk, keeping the index as the first CSV column.
df.to_csv(
    path_or_buf='../data/preprocessed.csv',
    index=True,
)