## 1. Import / inspection

In [35]:
# Importation de bibliothèque
import pandas as pd
import numpy as np

# Load the three raw CSV files (customers, products, order lines) into
# DataFrames, in that order.
customers, products, orders = (
    pd.read_csv(csv_name)
    for csv_name in ("customers.csv", "products.csv", "order_lines.csv")
)

In [36]:
# Global inspection: print a banner and the (rows, columns) shape of each dataset.
for name, df in (("customers", customers), ("products", products), ("orders", orders)):
    print(
        f"\n===== {name.upper()} =====",
        f"Le dataset {name.capitalize()} est de dimension {df.shape}",
        sep="\n",
    )


===== CUSTOMERS =====
Le dataset Customers est de dimension (500, 6)

===== PRODUCTS =====
Le dataset Products est de dimension (60, 5)

===== ORDERS =====
Le dataset Orders est de dimension (2225, 18)


In [37]:
# Global inspection: column names, non-null counts and dtypes of each dataset.
for name, df in zip(["customers", "products", "orders"], [customers, products, orders]):
    print(f"\n===== {name.upper()} =====")
    # DataFrame.info() writes its report to stdout and returns None, so it must
    # not be wrapped in print(): doing so appended a stray "None" line after
    # every report.
    df.info()


===== CUSTOMERS =====
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 500 entries, 0 to 499
Data columns (total 6 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   customer_id  500 non-null    object
 1   age          500 non-null    object
 2   gender       490 non-null    object
 3   city         485 non-null    object
 4   segment      500 non-null    object
 5   signup_date  500 non-null    object
dtypes: object(6)
memory usage: 23.6+ KB
None

===== PRODUCTS =====
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 60 entries, 0 to 59
Data columns (total 5 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   product_id    60 non-null     object 
 1   category      60 non-null     object 
 2   brand         60 non-null     object 
 3   product_name  60 non-null     object 
 4   unit_price    60 non-null     float64
dtypes: float64(1), object(4)
memory usage: 2.5+ KB
None

===== ORDERS ===

In [41]:
# Global inspection: print the describe() summary of each dataset under a banner.
for name, df in (("customers", customers), ("products", products), ("orders", orders)):
    banner = f"\n===== {name.upper()} ====="
    print(banner)
    print(df.describe())


===== CUSTOMERS =====
       customer_id  age gender            city   segment signup_date
count          500  500    490             485       500         500
unique         500   36      3              16         4         386
top          C0136   16      F  Port-au-Prince  Étudiant  2023-11-27
freq             1   40    235             135       182           3

===== PRODUCTS =====
        unit_price
count    60.000000
mean    354.351000
std     514.712694
min      24.970000
25%      72.845000
50%     124.245000
75%     308.502500
max    1932.530000

===== ORDERS =====
          quantity   unit_price  discount_pct  gross_amount   net_amount  \
count  2225.000000  2225.000000   2225.000000   2225.000000  2225.000000   
mean      2.966292   354.163991      0.115097   1049.257438   926.902701   
std       1.426634   506.596786      0.068481   1711.115156  1510.243744   
min      -1.000000    24.970000      0.000000     24.970000    20.110000   
25%       2.000000    74.300000      0.

In [42]:
# Global inspection: preview the first rows of each dataset under a banner.
for name, df in {"customers": customers, "products": products, "orders": orders}.items():
    print(f"\n===== {name.upper()} =====", df.head(), sep="\n")


===== CUSTOMERS =====
  customer_id      age gender            city        segment signup_date
0       C0001       28      M        Gonaïves     Entreprise  2025-09-26
1       C0002       16      M        Gonaïves  Professionnel  2024-01-19
2       C0003  unknown      M      Saint-Marc    Indépendant  2024-05-05
3       C0004       46      F  Port-au-Prince  Professionnel  2024-11-16
4       C0005       19      M     Cap-Haïtien  Professionnel  2024-11-21

===== PRODUCTS =====
  product_id    category    brand      product_name  unit_price
0       P001      Laptop       HP      Ultrabook 14     1681.08
1       P002      Laptop     ASUS       Notebook 15     1058.14
2       P003       Livre  Manning  Data Engineering       40.16
3       P004  Accessoire    Anker       SSD externe       91.27
4       P005      Laptop       HP         Gaming 16     1123.49

===== ORDERS =====
  order_id customer_id product_id  order_date  quantity  unit_price  \
0   O00001       C0201       P031  2024-06

## 2. Nettoyage

In [46]:
# 1. Type corrections
# Parse the date columns; values that cannot be parsed become NaT instead of
# raising.
customers["signup_date"] = pd.to_datetime(customers["signup_date"], errors="coerce")
orders["order_date"] = pd.to_datetime(orders["order_date"], errors="coerce")

# "age" is loaded as text because it contains placeholders such as "unknown";
# coerce those to NaN so the column becomes numeric.
customers["age"] = pd.to_numeric(customers["age"], errors="coerce")

# Normalise discount_pct to a fraction (0.15 for 15 %).
# Only values written with a "%" sign are divided by 100. Dividing every value
# unconditionally made this cell non-idempotent: running it on a column that is
# already a numeric fraction (or running it twice) shrank every discount by a
# further factor of 100.
# NOTE(review): assumes values without a "%" sign are already fractions --
# confirm against the raw order_lines.csv.
discount_text = orders["discount_pct"].astype(str).str.strip()
is_percent = discount_text.str.endswith("%", na=False)
# to_numeric(errors="coerce") turns unparseable entries into NaN instead of
# crashing the way .astype(float) does.
discount_value = pd.to_numeric(discount_text.str.rstrip("%"), errors="coerce")
orders["discount_pct"] = discount_value.where(~is_percent, discount_value / 100)

In [47]:
# Missing-value handling.
# The results are assigned back to the columns instead of calling
# fillna(..., inplace=True) on a column selection: that chained in-place form
# raises a FutureWarning in pandas 2.x and leaves the DataFrame unchanged once
# Copy-on-Write is enabled.

# Categorical gaps get an explicit "Unknown" label.
customers["gender"] = customers["gender"].fillna("Unknown")
customers["city"] = customers["city"].fillna("Unknown")

# Numeric gaps: median for delivery_days, mean for review_score.
orders["delivery_days"] = orders["delivery_days"].fillna(orders["delivery_days"].median())
orders["review_score"] = orders["review_score"].fillna(orders["review_score"].mean())

In [48]:
# Keep the first occurrence of every fully identical order line and drop the
# later repeats.
orders = orders[~orders.duplicated(keep="first")]

## 1. Overview

## 2. Chargement et compréhension des données

In [53]:
# 2.1. Importation des bibliothèques
# Importation de bibliothèque
import pandas as pd
import numpy as np

In [54]:
# 2.2. Load the datasets: read the three raw CSV files into DataFrames
# (customers, products, order lines), in that order.
customers, products, orders = map(
    pd.read_csv,
    ["customers.csv", "products.csv", "order_lines.csv"],
)

In [None]:
# 2.3. Informations de base (head(), shape, info(), describe(include="all"))

In [None]:
# 2.4. Vérification des types, des valeurs aberrantes et des valeurs manquantes

### $Commentaire$


## 3. Nettoyage

In [None]:
# 3.1. Conversion de types

In [None]:
# 3.2. Traitement des Valeurs manquantes

In [None]:
# 3.3. Doublons

In [None]:
# 3.4. Valeurs aberrantes

### $Commentaire$
order_lines_clean.csv + 5 règles de nettoyage documentées

## 4. Indicateurs clés de performance

In [None]:
# 4.1. Chiffre d'affaires mensuel et total

In [None]:
# 4.2. Panier moyen

In [None]:
# 4.3. Taux de remise moyen (global et par catégorie)

In [None]:
# 4.4. Taux de retour

In [None]:
# 4.5. Score moyen (global, par catégorie, par délai de livraison)

## 5. Jointures

In [50]:
# 5.1. Vérification de clés

In [None]:
# 5.2. Jointures
# Jointures Ventes <==> Clients

# Jointures Résultat <==> Produits

In [None]:
# 5.3. Qualité de la jointure (shape, ligne sans match)

In [51]:
# 5.4. Calcul de colonne Business
# gross_amount_calc = unit_price * quantity
# discount_pct converti en numérique
# net_amount_calc = gross_amount_calc * (1- discount_pct)

In [52]:
### $Commentaire$
# Comparer avec gross_amount et net_amount existants

In [None]:
# Créer une colonne amount_diff = net_amount - net_amount_calc
# Identifier les lignes “suspectes” (ex : abs(amount_diff) > 0.01)

## 6. Analyse

In [None]:
# Produire un tableau (DataFrame) : CA net par segment client et par catégorie produit,
# trié par ordre décroissant.

### $Commentaire$
 5 lignes de commentaires : ce que vous avez observé 