In [5]:
# Bronze step crime: ingest the raw crime CSV and persist it as Parquet.
import pandas as pd
import os

base_dir = "c:/Users/darka/Desktop/Projets/Elexxion"
input_csv = os.path.join(base_dir, "crime/raw/crime_2016_2024_departement.csv")
output_dir = os.path.join(base_dir, "crime/parquet")

os.makedirs(output_dir, exist_ok=True)

# Read only the header so the department-code column can be located whatever
# its exact casing/spacing is in the raw file (same normalization as the
# silver step: lower case, spaces replaced by underscores).
header = pd.read_csv(input_csv, sep=';', encoding='utf-8', nrows=0)
text_dtypes = {
    column: str
    for column in header.columns
    if column.lower().replace(' ', '_') == 'code_departement'
}

# Transform CSV in dataframe.
# The department code is an identifier (the gold step casts it to str), so it
# is forced to text here: left to type inference it could be parsed as an
# integer (losing any leading zero) or end up with mixed per-chunk types.
# Missing values are still read as NaN.
df = pd.read_csv(input_csv, sep=';', encoding='utf-8', dtype=text_dtypes or None)

# Save as Parquet with 'df_bronze_' prefix
output_parquet = os.path.join(output_dir, "df_bronze_crime_2016_2024_departement.parquet")
df.to_parquet(output_parquet, index=False)

print(f'File saved as Parquet: {output_parquet}')

File saved as Parquet: c:/Users/darka/Desktop/Projets/Elexxion\crime/parquet\df_bronze_crime_2016_2024_departement.parquet


In [6]:
# Silver step crime: clean the bronze table and keep only the columns used downstream.
import pandas as pd

# Load the bronze Parquet file
bronze_file_path = 'c:/Users/darka/Desktop/Projets/Elexxion/crime/parquet/df_bronze_crime_2016_2024_departement.parquet'
df = pd.read_parquet(bronze_file_path)

# Standardize column names: lower case, spaces replaced by underscores
# (regex=False makes the literal replacement explicit on every pandas version)
df.columns = df.columns.str.lower().str.replace(' ', '_', regex=False)

# Remove fully duplicated rows
df = df.drop_duplicates()

# Columns renaming, in a single pass. errors='raise' makes a missing source
# column fail here instead of silently skipping the rename.
df = df.rename(
    columns={
        'code_departement': 'departement',
        'code_region': 'region',
        'insee_pop': 'population',
        'indicateur': 'type',
        'taux_pour_mille': 'tpm',
    },
    errors='raise',
)

# Columns deleting: discard the fields that are not used afterwards
df = df.drop(columns=["unite_de_compte", "insee_log", "insee_pop_millesime", "insee_log_millesime"])

# Remove rows with a missing value in any retained column.
# This runs after the column drop on purpose: a null in a discarded column
# must not remove an otherwise complete row.
df = df.dropna()

# Save as Parquet with 'df_silver_' prefix
silver_file_path = 'c:/Users/darka/Desktop/Projets/Elexxion/crime/parquet/df_silver_crime_2016_2024_departement.parquet'
df.to_parquet(silver_file_path, index=False)


In [7]:
# Gold step crime: type the silver table and aggregate counts per department and year.
import pandas as pd

# Load the silver Parquet file
silver_file_path = 'c:/Users/darka/Desktop/Projets/Elexxion/crime/parquet/df_silver_crime_2016_2024_departement.parquet'
df = pd.read_parquet(silver_file_path)

# Force correct types in one pass.
# Integer columns use an explicit 'int64': plain `int` maps to a
# platform-dependent width (32-bit on Windows with NumPy < 2), which would make
# the Parquet schema vary between machines.
# NOTE(review): 'tpm' is kept as text exactly as before; it is not part of the
# aggregated output. Confirm whether it should become a float.
df = df.astype({
    'departement': str,
    'region': 'int64',
    'annee': 'int64',
    'type': str,
    'nombre': 'int64',
    'tpm': str,
    'population': 'int64',
})

# Aggregate data: sum 'nombre' per department and year. 'region' and
# 'population' are part of the key so they are carried into the result.
# NOTE(review): the sum spans every value of 'type' — confirm that adding
# counts across indicator types is intended.
group_keys = ['departement', 'annee', 'region', 'population']
gold_df = df.groupby(group_keys, as_index=False)['nombre'].sum()

# Save as Parquet with 'df_gold_' prefix
gold_file_path = 'c:/Users/darka/Desktop/Projets/Elexxion/crime/parquet/df_gold_crime_2016_2024_departement.parquet'
gold_df.to_parquet(gold_file_path, index=False)