In [None]:
# Import libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.io as pio
import plotly.graph_objects as go
import plotly.express as px
import geopandas as gpd
%matplotlib inline

from scipy.stats import pearsonr
import statsmodels.api

# Pandas rendering: never truncate the columns or rows of a displayed frame
pd.options.display.max_columns = None
pd.options.display.max_rows = None

# Seaborn look used by the static plots
sns.set_theme(palette='pastel', style='whitegrid')
# Alternatives tried by the author:
# sns.set_theme(style="ticks", context="talk", palette="bright")
# sns.color_palette("Paired")

In [None]:
# Load the three source tables: one parquet file (commune level) and two
# semicolon-separated CSV files (département and région level)
commune = pd.read_parquet(
    'donnee-comm-data.gouv-parquet-2023-geographie2024-produit-le2024-07-05.parquet'
)
dep, reg = (
    pd.read_csv(csv_path, sep=';')
    for csv_path in (
        'donnee-dep-data.gouv-2023-geographie2024-produit-le2024-07-05.csv',
        'donnee-reg-data.gouv-2023-geographie2024-produit-le2024-07-05.csv',
    )
)

# Show the last 10 rows of the commune table
commune.tail(n=10)

# I) Data Cleaning in the Different DataFrames
## A) Checking Consistency and Uniformity of Data
### 1) Consistency Check
First, it is important to verify that each variable is of the correct type. The goal is to ensure consistency in the value types across the three DataFrames (commune, dep, and reg).
Note that dates will be treated separately as they require special handling.

In [None]:
## DataFrame commune
# 'classe' and 'unité.de.compte' are cast to str (stored as object columns)
for text_column in ('classe', 'unité.de.compte'):
    commune[text_column] = commune[text_column].astype('str')

# Summary of the columns, non-null counts and dtypes of 'commune'
commune.info()

In [None]:
## DataFrame dep
# 'tauxpourmille' is read as text because it uses a decimal comma.
# Swap the comma for a period, cast to float and round to 3 decimals.
dep['tauxpourmille'] = (
    dep['tauxpourmille']
    .str.replace(',', '.')
    .astype(float)
    .round(3)
)

# Summary of the columns, non-null counts and dtypes of 'dep'
dep.info()

In [None]:
## DataFrame reg
# Same treatment as for 'dep': decimal comma -> period, cast to float, round to 3 decimals
reg['tauxpourmille'] = (
    reg['tauxpourmille']
    .str.replace(',', '.')
    .astype(float)
    .round(3)
)

# Summary of the columns, non-null counts and dtypes of 'reg'
reg.info()

### 2) Checking Data Uniformity
It is important to check that the data is uniform, meaning it should always be written in the same way following a single rule.
#### a) Quantitative Data

In [None]:
# Peek at the first row of 'reg' to see its columns
reg.head(n=1)

In [None]:
# A notebook cell only renders its last expression, so every check is printed
# explicitly; otherwise all but the final `.unique()` call would be silently discarded.

## DataFrame commune
# Check if all years are written in the same format
print("commune['annee']:", commune['annee'].unique())

# Check if all classes are written in the same format and there are no duplicates written differently
print("commune['classe']:", commune['classe'].unique())

# 'victime' and 'victime entendue' correspond to the same unit of account. This needs to be corrected.
commune['unité.de.compte'] = commune['unité.de.compte'].replace('victime entendue', 'victime')

# Check if all published values are written in the same format
print("commune['valeur.publiée']:", commune['valeur.publiée'].unique())

## DataFrame dep
# Check if all years are written in the same format
print("dep['annee']:", dep['annee'].unique())

# Check if all classes are written in the same format
print("dep['classe']:", dep['classe'].unique())

# Check if all departments are written in the same format
# Note: The value "20" is expected to be missing as it corresponds to Corsica, whose code was replaced in 1976 by 2A and 2B
print("dep['Code.département']:", dep['Code.département'].unique())

# 'victime' and 'victime entendue' correspond to the same unit of account. This needs to be corrected.
dep['unité.de.compte'] = dep['unité.de.compte'].replace('victime entendue', 'victime')

## DataFrame reg
# Check if all years are written in the same format
print("reg['annee']:", reg['annee'].unique())

# Check if all classes are written in the same format
print("reg['classe']:", reg['classe'].unique())

# Check if all regions are present and written in the same format
print("reg['Code.région']:", reg['Code.région'].unique())

#### b) Dates
The study of formats has shown that the years (column "annee") are stored as two-digit integers (e.g. 16 for 2016). For the subsequent analysis, they are converted to full four-digit years (2016, 2017, …), which remain integers.

In [None]:
# Turn the two-digit years (16, 17, ...) into four-digit years (2016, 2017, ...).
# Plain integer arithmetic replaces the string -> datetime -> year round trip.
# Only values still in two-digit form are shifted, so re-running this cell is harmless.
for frame in (commune, dep, reg):
    frame['annee'] = frame['annee'].where(frame['annee'] >= 100, frame['annee'] + 2000)

In [None]:
# Re-inspect the column dtypes of 'commune' (in particular 'annee')
commune.info()

## B) Duplicate Check

In [None]:
# Count fully duplicated rows in each of the three tables
for frame_name, frame in (('commune', commune), ('dep', dep), ('reg', reg)):
    print('There are', frame.duplicated().sum(), 'duplicates in the df', frame_name)

## C) Missing Values Check
### 1) Global Analysis

In [None]:
# Number of columns of 'dep' that contain at least one missing value
dep_columns_with_na = dep.isna().any(axis=0).sum()
print('There are', dep_columns_with_na, 'columns with missing values in the df dep')

# The row-wise check (dep.isna().any(axis=1).sum()) is not necessary here, so it is omitted

In [None]:
# Number of columns of 'reg' that contain at least one missing value
reg_columns_with_na = reg.isna().any(axis=0).sum()
print('There are', reg_columns_with_na, 'columns with missing values in the df reg')

# The row-wise check (reg.isna().any(axis=1).sum()) is not necessary here, so it is omitted

In [None]:
# Flag, per column, whether 'commune' contains at least one missing value
commune_na_by_column = commune.isna().any(axis=0)
print('There are', commune_na_by_column.sum(), 'columns with missing values in the df commune')

# Count the rows of 'commune' that contain at least one missing value
print('There are', commune.isna().any(axis=1).sum(), 'rows with missing values out of', commune.shape[0], 'rows in the df.')

# List the affected columns from the data itself, so the message cannot drift
# from the real column names (the previous hard-coded list was misspelt)
print('The columns with missing values are:',
      ', '.join(map(str, commune_na_by_column[commune_na_by_column].index)))

### 2) Handling Missing Values
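The metadata file of the dataset indicates that if the value in the "valeur.publiée" column is "ndiff" (not disclosed), then the "faits" and "tauxpourmille" columns are not filled. Therefore, it has been decided to set "faits" and "tauxpourmille" to 0 for all rows where the "valeur.publiée" column has the value "ndiff", so that these rows are kept. Note that this treats undisclosed figures as zero and therefore underestimates the true counts.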

In [None]:
# Rows flagged "ndiff" have no 'faits' / 'tauxpourmille': fill both with 0.0
# (the rows are kept, not removed)
undisclosed = commune['valeur.publiée'] == 'ndiff'
commune.loc[undisclosed, 'faits'] = 0.0
commune.loc[undisclosed, 'tauxpourmille'] = 0.0

# Which columns still contain NaNs? (displayed explicitly: a cell only renders its last expression)
display(commune.isna().any(axis=0))

# There are still NaNs in 'tauxpourmille'. Show a sample of the affected rows to investigate them
display(commune.loc[commune['tauxpourmille'].isna()].head(10))

# The remaining NaNs can be recalculated because 'tauxpourmille' is the number of 'faits'
# divided by the population, multiplied by 1000.
# The divisor is 'POP', so that is the column to check for zeros
# ('millPOP' is presumably the reference year of the population figure - TODO confirm).
print('Rows with POP == 0:', (commune['POP'] == 0).sum())

# Mask zero populations before dividing: x / 0 would give inf, which fillna() does not catch
taux_recomputed = commune['faits'] / commune['POP'].replace(0, np.nan) * 1000
commune['tauxpourmille'] = commune['tauxpourmille'].fillna(taux_recomputed)

# Whatever is still NaN comes from communes with 0 population: the rate is set to 0
commune['tauxpourmille'] = commune['tauxpourmille'].fillna(0)

# If necessary, drop rows with NaN values in 'tauxpourmille' instead:
# commune = commune.dropna(axis=0, how='all', subset=['tauxpourmille'])

# Display the number of missing values in each column
display(pd.DataFrame(commune.isna().sum(), columns=["Number of NA"]))

# The columns "complementinfoval" and "complementinfotaux" still contain NaNs but are not useful for the analysis.


Some columns are not necessary for this study, so they should be removed. If it turns out that these columns are needed in the future, simply delete the following cell.

In [None]:
# Columns that are not needed for this study
housing_and_vintage_columns = ["LOG", "millLOG", "millPOP"]

## DataFrame commune
commune = commune.drop(
    columns=["complementinfoval", "complementinfotaux", *housing_and_vintage_columns, "valeur.publiée"]
)
## DataFrame reg
reg = reg.drop(columns=housing_and_vintage_columns)
## DataFrame dep
dep = dep.drop(columns=housing_and_vintage_columns)

## D) Other Preprocessing
Renaming columns for better readability.

In [None]:
# Shorter, lowercase column names. Keys that are absent from a frame (e.g. columns
# dropped earlier) are simply ignored by rename().

## DataFrame commune
commune = commune.rename(columns={
    'CODGEO_2024': 'codgeo',
    'unité.de.compte': 'cible',
    'valeur.publiée': 'publication',
    'POP': 'pop',
    'millPOP': 'millpop',
})

## DataFrame reg
reg = reg.rename(columns={
    'Code.région': 'region',
    'unité.de.compte': 'cible',
    'POP': 'pop',
    'millPOP': 'millpop',
})

## DataFrame dep
dep = dep.rename(columns={
    'Code.département': 'departement',
    'Code.région': 'region',
    'unité.de.compte': 'cible',
    'POP': 'pop',
    'millPOP': 'millpop',
})

The department and region codes are not very informative. To facilitate post-processing in future studies, it is advisable to replace the codes with their full names.

In [None]:
## DataFrame reg
# Build a code -> region-name lookup from the JSON file (source in the README file)
json_cles_reg = pd.read_json('anciennes-nouvelles-regions.json')
dico_reg = dict(zip(json_cles_reg['new_code'], json_cles_reg['region']))

# Translate the region codes into region names
reg['region'] = reg['region'].map(dico_reg)

# Show the last 10 rows of 'reg'
reg.tail(n=10)


In [None]:
## DataFrame dep
# Region codes -> region names, with the lookup already built for 'reg'
dep['region'] = dep['region'].map(dico_reg)

# Département codes -> names, from a second lookup built from a CSV file (source in the README file)
csv_cles_dep = pd.read_csv('georef-france-departement.csv', sep=';')
dico_dep = dict(zip(csv_cles_dep['Code Officiel Département'],
                    csv_cles_dep['Nom Officiel Département Majuscule']))
dep['departement'] = dep['departement'].map(dico_dep)

# Show the first rows of 'dep'
dep.head()


In [None]:
## DataFrame commune
# Read the commune code as text: without an explicit dtype pandas may parse purely
# numeric codes as integers (dropping leading zeros, e.g. '01001' -> 1001) and keep
# alphanumeric ones ('2A004') as strings, so the lookup keys would not match 'codgeo'.
csv_cles_com = pd.read_csv('v_commune_2024.csv', dtype={'COM': str})
dico_com = csv_cles_com.set_index('COM')['NCC'].to_dict()

# NOTE(review): names are presumably not unique across communes; after this mapping,
# grouping by name may merge distinct communes - confirm before relying on per-city totals.
commune['codgeo'] = commune['codgeo'].map(dico_com)

commune.head(1)

In [None]:
# Make sure the code -> name translations did not introduce NaNs (unmatched codes)
for frame_name, frame in (('dep', dep), ('reg', reg), ('commune', commune)):
    print('There are', frame.isna().any(axis=0).sum(), 'columns with missing values in the df', frame_name)

In [None]:
# 'codgeo' now holds names rather than codes, so the column is renamed to 'ville'
commune = commune.rename(columns={'codgeo': 'ville'})

# Show the first row of 'commune'
commune.head(n=1)

# II) Exploratory Statistics
## A) Descriptive Analysis of the Datasets
# One of the underlying goals is the study and handling of outliers.

### 1) Commune DataFrame

#### a) Quantitative Variables

In [None]:
## DataFrame commune
# Summary statistics of the numeric columns of 'commune'
commune.describe()

# We observe a very large disparity in the number of 'faits'

Variable faits

In [None]:
# Boxplot of the distribution of 'faits'
sns.boxplot(data=commune, x='faits')
# The author observed 8 outliers here. To choose the right treatment we need to know
# whether they are genuine extreme values or erroneous outliers.

# Rank the rows by number of 'faits', largest first, and look at the top 10
comm_sorted = commune.sort_values('faits', ascending=False)
comm_sorted.head(n=10)


It is observed that the number of incidents varies greatly and is very high in the Paris region. These values represent extreme values rather than outliers. Therefore, they are kept in the dataset.

Variable pop (population)

In [None]:
# Boxplot of the distribution of 'pop'
sns.boxplot(data=commune, x='pop')

# Rank the rows by population, largest first, and look at the top 10
comm_sorted = commune.sort_values('pop', ascending=False)
comm_sorted.head(n=10)

In [None]:
# Same ranking by population, leaving out Paris and Marseille
excluded_cities = ['PARIS', 'MARSEILLE']
is_excluded = commune['ville'].isin(excluded_cities)
comm_sorted_out = commune.loc[~is_excluded].sort_values(by='pop', ascending=False)

# Top 10 rows of the ranking
comm_sorted_out.head(n=10)

We observe extreme values for the population. Upon examining the populations, it is clear that these extreme values correspond to the populations of Paris (2.1 million in 2023), Marseille (873,000 inhabitants in 2023), and Lyon (522,000 inhabitants). Therefore, these values are not outliers but represent extreme values.

Variable tauxpourmille

In [None]:
# Boxplot of the distribution of 'tauxpourmille'
sns.boxplot(data=commune, x='tauxpourmille')
# The author observed many outliers here. To choose the right treatment we need to know
# whether they are genuine extreme values or erroneous outliers.

# Rank the rows by 'tauxpourmille', largest first, and look at the top 20
comm_sorted = commune.sort_values('tauxpourmille', ascending=False)
comm_sorted.head(n=20)


We observe extreme values. The highest values are located in areas where:

There are a lot of tourists (non-residents)
The population is small
Since the "taux pour mille" (rate per thousand) is calculated as the number of incidents divided by the population, multiplied by 1,000, it is completely normal for the variance of the rate per thousand to be so high.

#### b) Qualitative Variables

In [None]:
# Keep only the object-typed (categorical) columns of 'commune'
cat_commune = commune.select_dtypes(include='O')

# Print the number of rows per category, one column after the other.
# ('classe' is not really needed: each class is filled for each city.)
for position, column in enumerate(("ville", "classe", "cible")):
    if position > 0:
        print("------------------------------")
    print(cat_commune[column].value_counts())


In [None]:
# Relative frequency of each category for 'ville' and 'cible'
ville_frequency = cat_commune["ville"].value_counts(normalize=True)
cible_frequency = cat_commune["cible"].value_counts(normalize=True)
print(ville_frequency, "------------------------------", cible_frequency, sep="\n")

#### c) Analysis of Relationships
To examine relationships between variables in a dataset, we need to distinguish three levels of analysis:

Relationships between quantitative variables,
Relationships between qualitative variables,
Relationships between qualitative and quantitative variables.
For each level of analysis, we ask the question: is there dependence or independence between the variables? The goal of this part is to determine whether there is a dependence between variables in the dataset.

The pairs of variables to be studied will be as follows:

faits (quantitative variable) / pop (quantitative variable): Pearson or Spearman Test
faits (quantitative variable) / annee (qualitative variable): ANOVA Test

Pair facts / population: Pearson Correlation Test

Here, we are wondering if there is an influence between the size of the population and the number of crimes committed within a commune.

In [None]:
## Creating the DataFrame: total 'faits' per (city, population) for the year 2016
com_fait_pop = commune.loc[commune['annee'] == 2016].groupby(['ville', 'pop'])['faits'].sum().sort_values(ascending=False).reset_index()

## Hypotheses
# H0: The size of the population does not influence the number of crimes committed
# H1: The size of the population does influence the number of crimes committed: the variables are correlated

## Statistical test
# Run the test once and reuse the result (it was previously recomputed three times)
coefficient, p_value = pearsonr(x=com_fait_pop['pop'], y=com_fait_pop['faits'])

print("p-value: ", p_value)
print("coefficient: ", coefficient)

# Since the p-value is reported as 0 in the author's run, we reject H0 and conclude H1.

# Scatter plot with a second-order (quadratic) regression curve drawn in red
#sns.lmplot(x='pop', y='faits', data=com_fait_pop)
sns.lmplot(x="pop", y="faits", data=com_fait_pop, height=5, order=2, line_kws={'color': 'red'})

According to the Pearson statistical test, there is a linear correlation between the size of the population and the number of crimes committed within a commune.

Pair faits / annee: ANOVA test

Here, we are investigating whether there is an influence between the time evolution (year) and the number of crimes.

In [None]:
## Creating the DataFrame: total number of 'faits' per year (groupby returns the years in ascending order)
commune_annee_fait = commune.groupby(['annee'])['faits'].sum().reset_index()

## Hypotheses
# H0: There is no linear relationship between the year and the number of crimes committed
# H1: There is a linear relationship between the year and the number of crimes committed

## Statistical test
# The number of crimes is the response and the year the explanatory variable,
# hence 'faits ~ annee' (the formula was previously written the other way round).
result = statsmodels.formula.api.ols('faits ~ annee', data=commune_annee_fait).fit()
statsmodels.api.stats.anova_lm(result)

# Since the p-value is greater than 5% (author's run), we fail to reject H0:
# the data show no evidence of a linear relationship

In [None]:
## DataFrame reg
# This cell repeats the code -> region-name translation done earlier in the notebook.
# Mapping names that are already translated finds no key and would turn the whole
# column into NaN, so values without a match are kept as they are.
json_cles_reg = pd.read_json('anciennes-nouvelles-regions.json')
dico_reg = json_cles_reg.set_index('new_code')['region'].to_dict()

# Translate any remaining codes; leave already-translated names untouched
reg['region'] = reg['region'].map(dico_reg).fillna(reg['region'])

# Display the last 10 rows of the 'reg' DataFrame
reg.tail(10)


There is no linear relationship between the year and the total number of crimes committed.

However, it would be interesting to investigate whether a linear relationship exists between a specific type of crime (such as domestic violence injuries) and the year.

In [None]:
## Creating the DataFrame: yearly number of 'faits' for one class of offence
## (groupby returns the years in ascending order)
is_intrafamilial = commune['classe'] == 'Coups et blessures volontaires intrafamiliaux'
commune_annee_fait = commune.loc[is_intrafamilial].groupby(['annee'])['faits'].sum().reset_index()

## Hypotheses
# H0: There is no linear relationship between the year and the number of domestic voluntary assaults and injuries
# H1: There is a linear relationship between the year and the number of domestic voluntary assaults and injuries

## Statistical test
# The number of offences is the response and the year the explanatory variable,
# hence 'faits ~ annee' (the formula was previously written the other way round).
result = statsmodels.formula.api.ols('faits ~ annee', data=commune_annee_fait).fit()
statsmodels.api.stats.anova_lm(result)

# Since the p-value is less than 5% (author's run), we reject H0 and conclude H1

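This statistical test indicates a significant linear relationship between the year and the number of domestic violence injuries.
As the plot below shows, the number of domestic violence injuries rose sharply after the lockdown and continues to rise.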

In [None]:
# Yearly totals as a line with round markers (the trailing ';' hides the Axes repr)
sns.lineplot(data=commune_annee_fait, x='annee', y='faits', marker='o', label='Faits');

### 2) DataFrame dep
#### a) Quantitative variables

In [None]:
# Summary statistics of the numeric columns of 'dep'
dep.describe()

Variable faits

In [None]:
# Boxplot of the distribution of 'faits' at département level
sns.boxplot(data=dep, x='faits')
# The author observed 8 outliers here. To choose the right treatment we need to know
# whether they are genuine extreme values or erroneous outliers.

# Rank the rows by number of 'faits', largest first, and look at the top 10
dep_sorted = dep.sort_values('faits', ascending=False)
dep_sorted.head(n=10)

It is observed that the number of incidents varies significantly and is notably high in the Parisian region. These values represent extreme values rather than outliers. Therefore, they are kept in the dataset.

Variable pop (population)

In [None]:
# Boxplot of the population distribution at département level.
# The plot must use 'dep': this section studies the départements, and the
# commune-level frame was plotted here by mistake.
sns.boxplot(x='pop', data=dep)

# Rank the rows by population, largest first, and look at the top 10
dep_sorted = dep.sort_values(by='pop', ascending=False)
dep_sorted.head(10)

There are extreme values, but they are genuine values rather than erroneous outliers. Indeed, they correspond to the department of Nord (this department is the most populated in France with 2.6 million inhabitants).

Study of population densities

In [None]:
# Total 'faits' per (département, population) pair, largest first
faits_by_dep_pop = dep.groupby(['departement', 'pop'])['faits'].sum()
dep_fait_pop = faits_by_dep_pop.sort_values(ascending=False).reset_index()

# Kernel density estimate of the population values
kde_axes = sns.kdeplot(dep_fait_pop["pop"])
kde_axes.set_title('Population Distribution in French Departments')
plt.show()


#### b) Qualitative variables

In [None]:
# Keep only the object-typed (categorical) columns of 'dep'
cat_dep = dep.select_dtypes(include='O')

# Print the number of rows per category, one column after the other.
# ('classe' is not really needed: each class is provided in all cases.)
for position, column in enumerate(("departement", "classe", "cible")):
    if position > 0:
        print("------------------------------")
    print(cat_dep[column].value_counts())


In [None]:
# Relative frequency of each category for 'departement' and 'cible'
departement_frequency = cat_dep["departement"].value_counts(normalize=True)
cible_frequency = cat_dep["cible"].value_counts(normalize=True)
print(departement_frequency, "------------------------------", cible_frequency, sep="\n")

#### c) Analysis of relationships

Another way to check for linear relationships is by using the correlation matrix and the seaborn pairplot. 

In [None]:
# Total 'faits' per (year, class), then one column per class and one row per year
faits_by_year_and_class = dep.groupby(['annee', 'classe'])['faits'].sum()
dep_group = faits_by_year_and_class.unstack()
dep_group.head()

In [None]:
# Pairwise scatter plots of the per-class yearly totals, with KDE curves on the diagonal
sns.pairplot(data=dep_group, diag_kind='kde')

### 3) DataFrame reg
#### a) Quantitative variables

In [None]:
# Summary statistics of the numeric columns of 'reg'
reg.describe()

Variable faits

In [None]:
# Boxplot of the distribution of 'faits' at region level
sns.boxplot(data=reg, x='faits')
# Unsurprisingly, there are also extreme values in the Paris region.

# Rank the rows by number of 'faits', largest first, and look at the top 10
reg_sorted = reg.sort_values('faits', ascending=False)
reg_sorted.head(n=10)

Variable pop

In [None]:
# Boxplot of the distribution of 'pop' at region level
sns.boxplot(data=reg, x='pop')

# Rank the rows by population, largest first, and look at the top 10
reg_sorted = reg.sort_values('pop', ascending=False)
reg_sorted.head(n=10)

Study of population densities

In [None]:
# Total 'faits' per (region, population) pair, largest first
faits_by_reg_pop = reg.groupby(['region', 'pop'])['faits'].sum()
reg_fait_pop = faits_by_reg_pop.sort_values(ascending=False).reset_index()

# Kernel density estimate of the population values
kde_axes = sns.kdeplot(reg_fait_pop["pop"])
kde_axes.set_title('Population Distribution in French Regions')
plt.show()

We observe a distribution with a variance significantly smaller than in the previous DataFrames. Indeed, the Île-de-France region is the most densely populated, with approximately 1020 inhabitants per km².

#### b) Qualitative variables

In [None]:
# Keep only the object-typed (categorical) columns of 'reg'
cat_reg = reg.select_dtypes(include='O')

# Print the number of rows per category, one column after the other.
# ('classe' is unnecessary: each class is reported for each region.)
for position, column in enumerate(("region", "classe", "cible")):
    if position > 0:
        print("------------------------------")
    print(cat_reg[column].value_counts())

In [None]:
# Relative frequency of each category for 'region' and 'cible'
region_frequency = cat_reg["region"].value_counts(normalize=True)
cible_frequency = cat_reg["cible"].value_counts(normalize=True)
print(region_frequency, "------------------------------", cible_frequency, sep="\n")

# III) Statistical Studies

The most interesting aspect initially is to study which type of offense is most frequently committed at the level of communes, departments, and regions. 
Normally, the distributions should be identical (if all reports have been correctly made).

## A) Crimes and offenses committed at the communal level

In [None]:
# Total 'faits' per class for the year 2016, sorted from largest to smallest
# so that the bars appear in decreasing order
rows_2016 = commune.loc[commune['annee'] == 2016]
commune2016 = rows_2016.groupby('classe')['faits'].sum().sort_values(ascending=False)

# Horizontal bar plot (6x4 inches) with one bar per class
fig, ax = plt.subplots(figsize=(6, 4))
sns.barplot(x=commune2016, y=commune2016.index, ax=ax)
ax.set_title('Distribution of Crimes and Offenses in French Communes in 2016')
plt.show()

In [None]:
# Total 'faits' per class for the year 2016, sorted from largest to smallest
commune_part = commune.loc[commune['annee'] == 2016].groupby(['classe', 'annee'])['faits'].sum().sort_values(ascending=False).reset_index()

plt.figure(figsize=(7, 7))

# Plot the pie chart without labels on the wedges; percentages are drawn just outside the pie
plt.pie(x=commune_part.faits,
        labels=None,  # No labels on the chart
        autopct=lambda x: str(round(x, 2)) + '%',
        pctdistance=1.15,
        wedgeprops={'linewidth': 1, 'edgecolor': 'black'})

# Legend labels come from the same rows as the wedges, so they always follow the data order.
# (A hand-written label list silently mislabels wedges whenever the ranking or the number
# of classes changes.)
plt.legend(labels=commune_part['classe'].tolist(),
           bbox_to_anchor=(1.05, 0.5), loc='center left')
plt.title('Share of Crimes and Offenses in French Communes in 2016')
plt.show()

We observe that the most frequently committed offense is theft without violence against individuals.
We now aim to conduct a high-level study of the evolution of crimes and offenses over time in order to identify trends

In [None]:
# All years on one graph: one bar per (class, year), the year being encoded with 'hue'.
# reset_index() turns the grouped Series back into a DataFrame so that 'annee' can be
# used as a regular column; rows are finally ordered by year.
faits_by_class_and_year = commune.groupby(['classe', 'annee'])['faits'].sum()
communeTest = (
    faits_by_class_and_year
    .sort_values(ascending=False)
    .reset_index()
    .sort_values(by='annee')
)

sns.set_theme(style="ticks", context="talk", palette="bright")
fig, ax = plt.subplots(figsize=(10, 10))
sns.barplot(data=communeTest, x='faits', y='classe', hue='annee', ax=ax)
ax.set_title('Distribution and evolution over time of crimes and offenses in French communes by year')
plt.show()

In [None]:
# One row per year and one column per class, holding the total number of 'faits'
commune_group = commune.groupby(['annee', 'classe'])['faits'].sum().unstack()

# One line per class, with round markers
evolution_axes = commune_group.plot(figsize=(20, 8), style='o-')
evolution_axes.set_title('Evolution over time of crimes and offenses in French communes by year')

# Legend placed outside the plotting area so it does not hide the lines
evolution_axes.legend(bbox_to_anchor=(1.05, 1), loc='upper left')

# Avoid overlaps, then render
plt.tight_layout()
plt.show()

We observe an increase over the years in domestic violence (voluntary injuries within families). This increase was much more significant during and after the various lockdowns.

Similarly, we observe a decrease in crimes and offenses during the lockdown periods (2020 and 2021), before they increase again. This is due to the fact that people were confined, making it harder to commit offenses.

Additionally, we observe:

An increase in drug trafficking
A marked rise in drug use
An increase in voluntary injuries (assaults)
A very slight decrease in armed theft
A decrease in violent armed thefts
An increase in sexual violence

We now ask ourselves:

What are the top 5 cities where most offenses are committed?
Is this ranking the same across different years?

In [None]:
# Total number of 'faits' per city in 2016, keeping the 5 largest.
# (A bare `commune.head(20)` used to open this cell; as it was not the last
# expression it rendered nothing, so it has been removed.)
commune2016Top = commune.loc[commune['annee'] == 2016].groupby(['ville'])['faits'].sum().sort_values(ascending=False).head(5)

# Setting the theme and color palette for the plot
sns.set_theme(style='whitegrid', palette='pastel')

# Creating a figure and axis for the plot
fig, ax = plt.subplots(figsize=(10, 4))

# Plotting the barplot for the top 5 cities with the most crimes in 2016
sns.barplot(x=commune2016Top.index, y=commune2016Top, ax=ax)

# Adding a title to the plot
plt.title('Top 5 Cities with the Most Crimes and Offenses in 2016')

# Displaying the plot
plt.show()

In [None]:
# Total 'faits' per (year, city)
faits_by_year_city = commune.groupby(['annee', 'ville'])['faits'].sum().reset_index()

# Within each year, order the cities by decreasing 'faits' and keep the first five
faits_ranked = faits_by_year_city.sort_values(['annee', 'faits'], ascending=[True, False])
communeTop = faits_ranked.groupby('annee').head(5).reset_index(drop=True)

# Animated bar chart: one frame per year, one colored bar per city
fig = px.bar(
    communeTop,
    x='ville',
    y='faits',
    color='ville',
    animation_frame='annee',
)

# Title and figure size
fig.update_layout(
    title_text="Top 5 Cities with the Most Crimes and Offenses Over Time",
    width=800,
    height=600,
)

# Render with the 'notebook' renderer
fig.show('notebook')

We observe that between 2016 and 2020, the ranking remains unchanged. However, in 2021 and 2023, Lille is overtaken by Bordeaux. In 2022, the ranking returns to its initial order.
Furthermore, we observe a decrease in the total number of offenses in each city in 2020. This is due to the lockdown.

What about this ranking when we consider the number of inhabitants?
For reference, the rate per thousand is the number of offenses divided by the population, all multiplied by 1000.

In [None]:
# Sum of 'tauxpourmille' per (year, city)
rate_by_year_city = commune.groupby(['annee', 'ville'])['tauxpourmille'].sum().reset_index()

# Within each year, order the cities by decreasing rate and keep the first five
rate_ranked = rate_by_year_city.sort_values(['annee', 'tauxpourmille'], ascending=[True, False])
communeTop2 = rate_ranked.groupby('annee').head(5).reset_index(drop=True)

# Animated bar chart: one frame per year, one colored bar per city
fig = px.bar(
    communeTop2,
    x='ville',
    y='tauxpourmille',
    color='ville',
    animation_frame='annee',
)

# Title and figure size
fig.update_layout(
    title_text="Top 5 Cities with the Highest Crime Rate per Thousand People Over Time",
    width=1000,
    height=600,
)

# Render with the 'notebook' renderer
fig.show('notebook')

It is now suggested to study the differences between selecting the number of offenses or the rate per thousand inhabitants (i.e. the number of offenses weighted by the population).

In [None]:
# communeTop / communeTop2 are ordered by year first, so their first five rows are
# the top 5 of the earliest year - not a cumulative ranking. The selection is made
# explicit and the titles name that year instead of claiming "cumulative" totals.
ref_year = communeTop['annee'].min()
top_faits = communeTop.loc[communeTop['annee'] == ref_year].head(5)
top_rate = communeTop2.loc[communeTop2['annee'] == ref_year].head(5)

faits_title = "Top 5 Based on Number of Crimes (%d)" % ref_year
rate_title = "Top 5 Based on Crime Rate per Thousand (%d)" % ref_year

# Caption shown next to the buttons. A button that sets "annotations" replaces the
# whole annotation list, so this caption is included in both lists below.
selector_annotation = dict(text="Select the data source:",
                           showarrow=False,
                           x=0,
                           y=1.1,
                           yref="paper",
                           xref="paper")

# Mean of the displayed bars, written at the height of that mean (left of the plot area)
faits_annotations = [selector_annotation,
                     dict(x=-0.05,
                          y=top_faits['faits'].mean(),
                          xanchor="right",
                          yanchor="bottom",
                          xref="x domain",
                          yref="y",
                          text="Average Crimes: %.2f" % top_faits['faits'].mean(),
                          showarrow=False)]

rate_annotations = [selector_annotation,
                    dict(x=-0.05,
                         y=top_rate['tauxpourmille'].mean(),
                         xanchor="right",
                         yanchor="bottom",
                         xref="x domain",
                         yref="y",
                         text="Average Crime Rate: %.2f" % top_rate['tauxpourmille'].mean(),
                         showarrow=False)]

# Create the figure
fig = go.Figure()

# First trace: number of crimes (shown initially, matching the active button)
fig.add_trace(go.Bar(
    x=top_faits['ville'],
    y=top_faits['faits'],
    name='Crimes',
    marker_color='blue',
))

# Second trace: rate per thousand (hidden until its button is pressed)
fig.add_trace(go.Bar(
    x=top_rate['ville'],
    y=top_rate['tauxpourmille'],
    name='Crime Rate per Thousand',
    marker_color='orange',
    visible=False,
))

# Size, titles, initial annotations and the two toggle buttons
fig.update_layout(
    autosize=False,
    width=800,
    height=900,
    title=faits_title,
    xaxis_title='Cities',
    yaxis_title='Value',
    annotations=faits_annotations,
    updatemenus=[dict(
        type="buttons",
        direction="right",
        active=0,
        x=0.57,
        y=1.2,
        buttons=[
            dict(
                label="Crimes",
                method="update",
                # first dict: trace visibility; second dict: layout changes
                args=[{"visible": [True, False]},
                      {"title.text": faits_title, "annotations": faits_annotations}]
            ),
            dict(
                label="Crime Rate per Thousand",
                method="update",
                args=[{"visible": [False, True]},
                      {"title.text": rate_title, "annotations": rate_annotations}]
            ),
        ],
    )],
)

fig.show("notebook")


Conclusion: We observe that when analyzing either the number of offenses or the rate per thousand inhabitants, the results change.
The rate per thousand allows us to account for the population size of the city. Thus, the statistical study is less sensitive to the sample size, and we can compare a large city like Paris with a small village like Saint-Cast.

As hypothesized earlier, cities that attract tourists (a perfect example being Roissy with its international airport) rise in the ranking.

## B) Crimes and offenses committed at the departmental level

In [None]:
# Keep the 2016 rows, total the offences ('faits') per crime class, largest total first
dep2016 = (
    dep[dep['annee'] == 2016]
    .groupby('classe')['faits']
    .sum()
    .sort_values(ascending=False)
)

# Horizontal bar chart: crime classes on the y-axis, totals on the x-axis
fig, ax = plt.subplots(figsize=(6, 4))
sns.barplot(x=dep2016, y=dep2016.index, ax=ax)
ax.set_title('Distribution of Crimes and Offenses in French Departments in 2016')
plt.show()

We observe that the most frequently committed offense is theft without violence against individuals.
We now aim to conduct a high-level study of the evolution of crimes and offenses over time in order to identify trends.

In [None]:
# Total offences for every (class, year) pair, flattened back into a DataFrame so that
# 'annee' can be passed to seaborn's 'hue' argument and all years share one graph.
# The rows are ordered by year, then by descending total, in ONE multi-key sort:
# sorting by total and then separately by year relied on the default single-column
# sort, which is not stable, so the order inside a year (and therefore the order in
# which the classes are first encountered) was not guaranteed.
depTest = (
    dep.groupby(['classe', 'annee'])['faits'].sum()
    .reset_index()
    .sort_values(by=['annee', 'faits'], ascending=[True, False])
)

# Note: set_theme changes the global seaborn/matplotlib style, so it also applies to
# every figure drawn after this cell.
sns.set_theme(style="ticks", context="talk", palette="bright")
fig, ax = plt.subplots(figsize=(10, 10))
# One group of bars per crime class, one bar per year within the group
sns.barplot(y=depTest.classe, x=depTest.faits, ax=ax, hue=depTest.annee)
plt.title('Distribution and Evolution of Crimes and Offenses Over Time in French Departments by Year')
plt.show()

In [None]:
# One row per year, one column per crime class; each cell is the total of 'faits'
dep_group = (
    dep.groupby(['annee', 'classe'])['faits']
    .sum()
    .unstack()
)

# One line (with point markers) per crime class
ax = dep_group.plot(figsize=(20, 8), style='o-')
ax.set_title('Evolution of Crimes and Offenses Over Time in French Departments by Year')

# Legend anchored just outside the right edge of the axes
ax.legend(bbox_to_anchor=(1.05, 1), loc='upper left')

# Re-fit the layout so labels and legend are not cut off, then render
ax.figure.tight_layout()
plt.show()

The conclusions are identical to those from the study of data at the communal level.

We observe an increase over the years in domestic violence (voluntary injuries within families). This increase was much more significant during and after the various lockdowns.

Similarly, we observe a decrease in crimes and offenses during the lockdown periods (2020 and 2021), followed by a renewed increase. This is because people were confined, which made it harder to commit offenses.

Additionally, we observe:

- An increase in drug trafficking
- A marked rise in drug use
- An increase in voluntary injuries (assaults)
- A very slight decrease in armed theft
- A decrease in violent armed thefts (possibly at the expense of thefts without violence?)
- A sad increase in sexual violence

We now ask ourselves:

What are the top 5 departments where most offenses are committed?
Is this ranking the same across different years?

In [None]:
# Total offences for each (year, department) pair
depTop = (
    dep.groupby(['annee', 'departement'])['faits'].sum().reset_index()
    # Inside each year, largest totals first ...
    .sort_values(['annee', 'faits'], ascending=[True, False])
    # ... so head(5) keeps the five departments with the most offences of every year
    .groupby('annee').head(5)
    .reset_index(drop=True)
)

# Animated bar chart: one frame per year, one colour per department.
# The y-range is pinned to the largest value over all years (plus 5 % headroom), as
# Plotly's animation guide recommends, so that no frame is clipped by an axis range
# fitted to another year.
fig = px.bar(depTop,
             x='departement',
             y='faits',
             animation_frame='annee',
             color='departement',
             range_y=[0, depTop['faits'].max() * 1.05])

# Title and fixed figure size
fig.update_layout(title_text="Top 5 Departments Where Most Crimes Were Committed Over Time",
                  width=1000,
                  height=600)

# Render with the 'notebook' renderer
fig.show('notebook')

We observe that the ranking remains the same between 2016 and 2020.
The positions of the Nord and Bouches-du-Rhône departments switch starting from 2021, as do those of the Rhône and Seine-Saint-Denis.

What about this ranking when we consider the number of inhabitants?
For reference, the rate per thousand is the number of offenses divided by the population, all multiplied by 1000.

In [None]:
# Total rate per thousand ('tauxpourmille') for each (year, department) pair
depTop2 = (
    dep.groupby(['annee', 'departement'])['tauxpourmille'].sum().reset_index()
    # Inside each year, highest rates first ...
    .sort_values(['annee', 'tauxpourmille'], ascending=[True, False])
    # ... so head(5) keeps the five departments with the highest rate of every year
    .groupby('annee').head(5)
    .reset_index(drop=True)
)

# Animated bar chart: one frame per year, one colour per department.
# The y-range is pinned to the largest value over all years (plus 5 % headroom), as
# Plotly's animation guide recommends, so that no frame is clipped by an axis range
# fitted to another year.
fig = px.bar(depTop2,
             x='departement',
             y='tauxpourmille',
             animation_frame='annee',
             color='departement',
             range_y=[0, depTop2['tauxpourmille'].max() * 1.05])

# Title and fixed figure size
fig.update_layout(title_text="Top 5 Departments with the Highest Crime Rate Over Time",
                  width=1000,
                  height=600)

# Render with the 'notebook' renderer
fig.show('notebook')

We now compare the ranking obtained from the raw number of offenses with the one obtained from the rate per thousand inhabitants (i.e. the number of offenses normalized by the population).

In [None]:
# Two rankings of departments, switchable with buttons: raw number of offences vs.
# rate per thousand inhabitants.
# depTop and depTop2 are sorted by year, then by descending value, so their first five
# rows are the top 5 of the earliest year only -- not a total over the years.
top_counts = depTop.head(5)
top_rates = depTop2.head(5)

# The titles name the year actually displayed instead of claiming a cumulative total
title_counts = f"Top 5 based on Total Crimes ({top_counts['annee'].iloc[0]})"
title_rates = f"Top 5 based on Crime Rate per Thousand ({top_rates['annee'].iloc[0]})"

fig = go.Figure()

# First trace: number of offences
fig.add_trace(go.Bar(
    x=top_counts['departement'],
    y=top_counts['faits'],
    name='Crimes',
    marker_color='blue',
))

# Second trace: rate per thousand. Hidden at first, because the "Crimes" button is the
# active one (active=0) and both traces share a single y-axis.
fig.add_trace(go.Bar(
    x=top_rates['departement'],
    y=top_rates['tauxpourmille'],
    name='Crime Rate per Thousand',
    marker_color='orange',
    visible=False,
))

# Size, titles and the two buttons, set in one call so no setting is silently
# overwritten by a later update_layout
fig.update_layout(
    autosize=False,
    width=800,
    height=900,
    title=title_counts,
    xaxis_title='Department',
    yaxis_title='Value',
    updatemenus=[dict(
        type="buttons",
        direction="right",
        active=0,
        x=0.57,
        y=1.2,
        buttons=[
            # Each button toggles trace visibility and swaps the title
            dict(
                label="Crimes",
                method="update",
                args=[{"visible": [True, False]},
                      {"title": title_counts}]
            ),
            dict(
                label="Crime Rate per Thousand",
                method="update",
                args=[{"visible": [False, True]},
                      {"title": title_rates}]
            ),
        ]
    )]
)

# Caption in paper coordinates, next to the buttons
fig.add_annotation(
    text="Select Data Source:",
    showarrow=False,
    x=0,
    y=1.1,
    yref="paper",
    xref="paper"
)

# Display the plot in the notebook
fig.show("notebook")

Conclusion: We observe that when analyzing either the number of offenses or the rate per thousand inhabitants, the results change.
The rate per thousand allows us to account for the population size of the department. Thus, the statistical study is less sensitive to the sample size, and we can compare a department like Paris with a smaller one like Guyane.

Here, we observe that Seine-Saint-Denis, whose cities did not appear in the top 5 of the previous study, ranks second.

## C) Crimes and offenses committed at the regional level


We observe that between 2016 and 2020, the ranking remains unchanged. However, in 2021 and 2023, Lille is overtaken by Bordeaux. In 2024, the ranking returns to its initial values.
Additionally, we observe a decrease in the total number of offenses in each city in 2020. This is due to the lockdown.

In [None]:
# Keep the 2016 rows, total the offences ('faits') per crime class, and order the
# result from largest to smallest for a more readable bar chart
reg2016 = (
    reg[reg['annee'] == 2016]
    .groupby('classe')['faits']
    .sum()
    .sort_values(ascending=False)
)

# Horizontal bar chart: crime classes on the y-axis, totals on the x-axis
fig, ax = plt.subplots(figsize=(6, 4))
sns.barplot(x=reg2016, y=reg2016.index, ax=ax)
ax.set_title('Distribution of Crimes and Offenses in French Regions in 2016')
plt.show()

We observe that the most frequently committed offenses are theft without violence against individuals, as well as intentional destruction and damage.

We now aim to conduct a high-level study of the evolution of crimes and offenses over time in order to identify trends

In [None]:
# Total offences for every (class, year) pair, flattened back into a DataFrame so that
# 'annee' can be passed to seaborn's 'hue' argument and all years share one graph.
# The rows are ordered by year, then by descending total, in ONE multi-key sort:
# sorting by total and then separately by year relied on the default single-column
# sort, which is not stable, so the order inside a year (and therefore the order in
# which the classes are first encountered) was not guaranteed.
regTest = (
    reg.groupby(['classe', 'annee'])['faits'].sum()
    .reset_index()
    .sort_values(by=['annee', 'faits'], ascending=[True, False])
)

# Note: set_theme changes the global seaborn/matplotlib style, so it also applies to
# every figure drawn after this cell.
sns.set_theme(style="ticks", context="talk", palette="bright")
fig, ax = plt.subplots(figsize=(10, 10))
# One group of bars per crime class, one bar per year within the group
sns.barplot(y=regTest.classe, x=regTest.faits, ax=ax, hue=regTest.annee)
plt.title('Distribution and Evolution of Crimes and Offenses in French Regions Over Time by Year')
plt.show()

In [None]:
# One row per year, one column per crime class; each cell is the total of 'faits'
reg_group = (
    reg.groupby(['annee', 'classe'])['faits']
    .sum()
    .unstack()
)

# One line (with point markers) per crime class
ax = reg_group.plot(figsize=(20, 8), style='o-')
ax.set_title('Evolution of Crimes and Offenses in French Regions Over Time by Year')

# Legend anchored just outside the right edge of the axes
ax.legend(bbox_to_anchor=(1.05, 1), loc='upper left')

# Re-fit the layout so labels and legend are not cut off, then render
ax.figure.tight_layout()
plt.show()

The observations are the same as for the previous two datasets. We observe an increase over the years in domestic violence (voluntary injuries within families). This increase was much more significant during and after the various lockdowns.

Similarly, we observe a decrease in crimes and offenses during the lockdown periods (2020 and 2021), followed by a renewed increase.

Additionally, we observe:

- An increase in drug trafficking
- A marked rise in drug use
- An increase in voluntary injuries (assaults)
- A very slight decrease in armed theft
- A decrease in violent armed thefts (possibly at the expense of thefts without violence?)
- A sad increase in sexual violence

We now ask ourselves:

What are the top 5 regions where most offenses are committed?
Is this ranking the same across different years?

In [None]:
# Total offences for each (year, region) pair
regTop = (
    reg.groupby(['annee', 'region'])['faits'].sum().reset_index()
    # Inside each year, largest totals first ...
    .sort_values(['annee', 'faits'], ascending=[True, False])
    # ... so head(5) keeps the five regions with the most offences of every year
    .groupby('annee').head(5)
    .reset_index(drop=True)
)

# Animated bar chart: one frame per year, one colour per region.
# The y-range is pinned to the largest value over all years (plus 5 % headroom), as
# Plotly's animation guide recommends, so that no frame is clipped by an axis range
# fitted to another year.
fig = px.bar(regTop,
             x='region',
             y='faits',
             animation_frame='annee',
             color='region',
             range_y=[0, regTop['faits'].max() * 1.05])

# Title and fixed figure size
fig.update_layout(title_text="Top 5 Regions with the Most Crimes and Offenses Over Time",
                  width=1000,
                  height=600)

# Render with the 'notebook' renderer
fig.show('notebook')

We observe that the ranking remains the same regardless of the year.
Île-de-France is the region where the most crimes and offenses have been committed. However, this ranking is not surprising given the population density of the region.

What about this ranking when we consider the number of inhabitants?
For reference, the rate per thousand is the number of offenses divided by the population, all multiplied by 1000.

In [None]:
# Total rate per thousand ('tauxpourmille') for each (year, region) pair
regTop2 = (
    reg.groupby(['annee', 'region'])['tauxpourmille'].sum().reset_index()
    # Inside each year, highest rates first ...
    .sort_values(['annee', 'tauxpourmille'], ascending=[True, False])
    # ... so head(5) keeps the five regions with the highest rate of every year
    .groupby('annee').head(5)
    .reset_index(drop=True)
)

# Animated bar chart: one frame per year, one colour per region.
# The y-range is pinned to the largest value over all years (plus 5 % headroom), as
# Plotly's animation guide recommends, so that no frame is clipped by an axis range
# fitted to another year.
fig = px.bar(regTop2,
             x='region',
             y='tauxpourmille',
             animation_frame='annee',
             color='region',
             range_y=[0, regTop2['tauxpourmille'].max() * 1.05])

# Title and fixed figure size
fig.update_layout(title_text="Top 5 Regions with the Highest Crime Rate per Thousand Over Time",
                  width=1000,
                  height=600)

# Render with the 'notebook' renderer
fig.show('notebook')

We now compare the ranking obtained from the raw number of offenses with the one obtained from the rate per thousand inhabitants (i.e. the number of offenses normalized by the population).

In [None]:
# Two rankings of regions, switchable with buttons: raw number of offences vs. rate
# per thousand inhabitants.
# regTop and regTop2 are sorted by year, then by descending value, so their first five
# rows are the top 5 of the earliest year only -- not a total over the years.
top_counts = regTop.head(5)
top_rates = regTop2.head(5)

# The titles name the year actually displayed instead of claiming an aggregate
title_counts = f"Top 5 Based on Number of Crimes ({top_counts['annee'].iloc[0]})"
title_rates = f"Top 5 Based on Crime Rate per Thousand ({top_rates['annee'].iloc[0]})"

fig = go.Figure()

# First trace: number of offences
fig.add_trace(go.Bar(
    x=top_counts['region'],
    y=top_counts['faits'],
    name='Crimes',
    marker_color='blue',
))

# Second trace: rate per thousand. Hidden at first, because the "Crimes" button is the
# active one (active=0) and both traces share a single y-axis.
fig.add_trace(go.Bar(
    x=top_rates['region'],
    y=top_rates['tauxpourmille'],
    name='Crime Rate per Thousand',
    marker_color='orange',
    visible=False,
))

# Size, titles and the two buttons, set in one call so no setting is silently
# overwritten by a later update_layout
fig.update_layout(
    autosize=False,
    width=800,
    height=900,
    title=title_counts,
    xaxis_title='Region',
    yaxis_title='Value',
    updatemenus=[dict(
        type="buttons",
        direction="right",
        active=0,
        x=0.57,
        y=1.2,
        buttons=[
            # Each button toggles trace visibility and swaps the title
            dict(
                label="Crimes",
                method="update",
                args=[{"visible": [True, False]},
                      {"title": title_counts}]
            ),
            dict(
                label="Crime Rate per Thousand",
                method="update",
                args=[{"visible": [False, True]},
                      {"title": title_rates}]
            ),
        ]
    )]
)

# Caption in paper coordinates, next to the buttons
fig.add_annotation(
    text="Select Data Source:",
    showarrow=False,
    x=0,
    y=1.1,
    yref="paper",
    xref="paper"
)

fig.show("notebook")

Conclusion: We observe that when analyzing either the number of offenses or the rate per thousand, the results change.
The rate per thousand allows us to account for the population size of the region. Thus, the statistical study is less sensitive to the sample size, and we can compare a large region like Île-de-France with a smaller one.

We also observe that the regions with the most population movement (e.g., tourism) are at the top.
It would be interesting to correlate this hypothesis with a complementary statistical study on tourism in France.

# IV) Geographical representations

The objective of this section is to use the GeoPandas package to display a map.

GeoPandas requires a GeoDataFrame in which a 'geometry' column contains the information needed to create the map.
This study will focus on the data from the commune DataFrame.

## A) Departmental view

In [None]:
# geopandas (gpd) is already imported in the notebook's first cell, so it is not
# imported again here.

# Load the GeoJSON file
json_cles_geo = gpd.read_file('departements.geojson')

# Standardize the 'nom' column, which is later used as the lookup key against
# dep['departement']:
#   1. NFKD-normalize, then encode to ASCII ignoring errors -> accents are dropped
#   2. convert to uppercase (vectorized .str.upper(), which also tolerates missing names)
#   3. replace hyphens and apostrophes with a space
json_cles_geo['nom'] = (
    json_cles_geo['nom']
    .str.normalize('NFKD')
    .str.encode('ascii', errors='ignore')
    .str.decode('utf-8')
    .str.upper()
    .str.replace(r"[-']", " ", regex=True)
)

# Display the first few rows
json_cles_geo.head()

In [None]:
# For this study, we focus exclusively on mainland France.
# We remove all rows from the DOM-TOMs where the regions are: 'GUADELOUPE', 'MARTINIQUE', 'GUYANE', 'LA REUNION', 'MAYOTTE'
# .copy() makes the filtered result an independent DataFrame, so columns added to dep
# afterwards are set on its own data and do not raise SettingWithCopyWarning.
dep = dep[~dep['region'].isin(['GUADELOUPE', 'MARTINIQUE', 'GUYANE', 'LA REUNION', 'MAYOTTE'])].copy()

# Remaining regions, to check that the overseas ones are gone
dep.region.unique()

In [None]:
# We create a new column in dep to hold the geometry values.
# It is filled with the placeholder value 0 here; this cell only adds the column.
dep['geometry'] = 0
# Show the first row to confirm the new column is present
dep.head(1)

In [None]:
# Fill dep['geometry'] with the geometry of each row's department.
# A {name: geometry} dictionary is built from the 'nom' and 'geometry' columns of
# json_cles_geo, then applied to dep['departement']; names absent from the dictionary
# are left as NaN by map().

dico_geo = dict(zip(json_cles_geo['nom'], json_cles_geo['geometry']))
dep['geometry'] = dep['departement'].map(dico_geo)

dep.head()

In [None]:
# Count the missing values in each column of dep and show the counts as a table.
# The 'geometry' row reveals any department whose name found no match in dico_geo.
# Expected: 0 everywhere (no NaNs), in which case the analysis can continue confidently.
display(pd.DataFrame(dep.isna().sum(), columns=["Number of NaNs"]))

In [None]:
# Turn dep into a GeoDataFrame whose active geometry is the 'geometry' column and, in
# the same step, attach an initial CRS ('wgs84'). allow_override=True lets set_crs
# replace a CRS that is already set. This CRS is a starting point that is meant to be
# changed later to study its influence.
dep = gpd.GeoDataFrame(dep, geometry='geometry').set_crs('wgs84', allow_override=True)

# Show the CRS now attached to dep
print(dep.crs)

In [None]:
# Total number of incidents per department, cumulative across all years.
# 'geometry' is part of the grouping key only so that it is carried into the result.
depTotal = (
    dep.groupby(['departement', 'geometry'])['faits']
    .sum()
    .sort_values(ascending=False)
    .reset_index()
)

# Convert depTotal into a GeoDataFrame. The CRS of dep is passed explicitly so that it
# does not depend on whether it survives the groupby / reset_index round-trip.
depTotal = gpd.GeoDataFrame(depTotal, geometry='geometry', crs=dep.crs)

# Display the map using the 'explore' method, coloring based on the 'faits' column
depTotal.explore(column='faits', cmap='viridis', legend=True)