In [None]:
# Importing the libraries
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns

# Configuring the style of the plots with Seaborn: the 'dark' preset is selected
# once here, before any figure is created in the cells below
sns.set_style('dark')

In [None]:
# Selecting which trusted dataset to load: 'training' or 'test'.
# Change this single value instead of commenting/uncommenting read_csv calls,
# so the notebook always states explicitly which file the results come from.
DATASET_SPLIT = 'training'

# Failing early on a typo rather than with a less obvious file-not-found error
if DATASET_SPLIT not in ('training', 'test'):
    raise ValueError(f"DATASET_SPLIT must be 'training' or 'test', got {DATASET_SPLIT!r}")

# Importing the selected dataset
df = pd.read_csv(f'../data/trusted/trusted_file_{DATASET_SPLIT}.csv')

In [None]:
# Keeping only the instances whose "uf" is one of the states belonging to the
# Amazônia region.
# The filter is a direct membership test on "uf". Marking the other states with
# a sentinel string and then replacing that string with NaN across the whole
# DataFrame would also null out (and therefore drop) any row that happens to
# hold the same string in a different column.
amazon_states = ['AC', 'AP', 'AM', 'PA', 'RO', 'RR', 'TO', 'MA', 'MT']
in_amazon_region = df['uf'].isin(amazon_states)

# Dropping all remaining instances with null values in any column and resetting
# the index numbering. Rows whose "uf" is null are already excluded by the mask,
# because isin() evaluates to False for NaN.
df = (
    df.loc[in_amazon_region]
      .dropna(how = 'any', axis = 0)
      .reset_index(drop = True)
)

In [None]:
# Color assigned to each state of the region
color_dict = {
    'PA': 'C0',
    'RO': 'C1',
    'TO': 'C2',
    'AC': 'C3',
    'RR': 'C4',
    'AP': 'C5',
    'AM': 'C6',
    'MT': 'C7',
    'MA': 'C8',
}

# Number of instances per state, and the overall total used for the percentages
state_counts = df['uf'].value_counts()
total_instances = state_counts.sum()

# One color per bar, in bar order; a state missing from the dictionary is drawn in gray
bar_colors = [color_dict.get(state, 'gray') for state in state_counts.index]

# Creating the figure (its width grows with the number of states) and drawing the bars
fig, ax = plt.subplots(figsize = (len(state_counts) * 1.5, 8))
ax.bar(state_counts.index, state_counts, color = bar_colors)

# Writing each state's share of the total just above its bar
for position, count in enumerate(state_counts):
    ax.text(position, count + 200, f"{count / total_instances * 100:.1f}%", ha = 'center', va = 'bottom', fontsize = 12.5)

# Setting the y-axis limits (leaving headroom for the percentage labels), the axis labels and the title
ax.set_ylim([0, state_counts.max() + 2000])
ax.set_xlabel('States', fontsize = 16, labelpad = 10, horizontalalignment = 'center')
ax.set_ylabel('Number of accidents that occurred (2007 - 2020)', fontsize = 16, labelpad = 17, verticalalignment = 'center')
ax.set_title('Amazônia Region', fontsize = 16, pad = 12, ha = 'center')

plt.show()