In [1]:
import pandas as pd

# Relative path of the directory holding the e-commerce parquet files.
# The loading cell builds each file path with plain string concatenation
# (DATA_PATH + "<name>.parquet"), so the trailing slash must be kept.
DATA_PATH = "../donnees/ecommerce/"

In [2]:
# Load the five raw tables. Each file name is appended to DATA_PATH as-is and
# the frames are bound in the same order as the file names listed below.
orders, reviews, payments, customers, geo = (
    pd.read_parquet(DATA_PATH + parquet_name)
    for parquet_name in (
        "orders.parquet",
        "reviews.parquet",
        "payments.parquet",
        "customers.parquet",
        "geolocation.parquet",
    )
)

# --- Geolocation (one row per zip prefix) ---
# Each geolocation field is looked up under its long "geolocation_*" name when
# the table has it, and under the short name otherwise.
zip_col, lat_col, lng_col, city_col, state_col = (
    long_name if long_name in geo.columns else short_name
    for long_name, short_name in (
        ("geolocation_zip_code_prefix", "zip_code"),
        ("geolocation_lat", "lat"),
        ("geolocation_lng", "lng"),
        ("geolocation_city", "city"),
        ("geolocation_state", "state"),
    )
)

# Rows with a missing zip prefix are dropped, then the prefix is cast to the
# pandas "string" dtype before it is used as the grouping key.
geo_keyed = geo[[zip_col, lat_col, lng_col, city_col, state_col]].dropna(subset=[zip_col])
geo_keyed = geo_keyed.astype({zip_col: "string"})

# One row per prefix: coordinates are averaged, city/state use the "first" aggregation.
geo_by_zip = geo_keyed.groupby(zip_col, as_index=False).agg(
    {lat_col: "mean", lng_col: "mean", city_col: "first", state_col: "first"}
)

# Rename to customer-side names; the zip column takes the name of the key used
# when joining onto the customers table.
geo_cust = geo_by_zip.rename(
    columns={
        zip_col: "customer_zip_code_prefix",
        lat_col: "cust_lat",
        lng_col: "cust_lng",
        city_col: "cust_geo_city",
        state_col: "cust_geo_state",
    }
)

# --- Customers (id, zip, city/state) ---
# The geolocation table is accepted with either a long or a short ("zip_code")
# zip column name; accept the same short name here and expose it under the
# canonical "customer_zip_code_prefix" name. `customers` itself is not modified.
if "customer_zip_code_prefix" not in customers.columns and "zip_code" in customers.columns:
    customers_named = customers.rename(columns={"zip_code": "customer_zip_code_prefix"})
else:
    customers_named = customers

# Keep only the columns of interest that are actually present.
cust_cols = [
    c
    for c in ["customer_id", "customer_zip_code_prefix", "customer_city", "customer_state"]
    if c in customers_named
]
customers_slim = customers_named[cust_cols].copy()

# The zip prefix is the join key towards the geolocation lookup. Fail with an
# explicit message instead of a bare KeyError from the cast below.
if "customer_zip_code_prefix" not in customers_slim.columns:
    raise KeyError(
        "customers has no zip column ('customer_zip_code_prefix' or 'zip_code'); "
        f"available columns: {list(customers.columns)}"
    )

# Cast to the pandas "string" dtype, the same dtype given to the geolocation zip key.
customers_slim["customer_zip_code_prefix"] = (
    customers_slim["customer_zip_code_prefix"].astype("string")
)

# --- Orders (id, status, timestamp) ---
# Purchase-timestamp column: the long name is preferred, the short name is the
# fallback, and None means the table has neither.
ts_col = next(
    (
        name
        for name in ("order_purchase_timestamp", "purchase_timestamp")
        if name in orders.columns
    ),
    None,
)

# order_id is always selected; the remaining columns only when the table has them
# (a None ts_col is never found in the columns, so it is skipped).
orders_keep = ["order_id"] + [
    name for name in ("order_status", ts_col, "customer_id") if name in orders.columns
]
orders_slim = orders[orders_keep].copy()

# --- Payments (total value per order) ---
# Sum payment_value over all rows sharing an order_id, giving one row per order
# with the total stored as order_payment_value.
pay = payments.groupby("order_id", as_index=False).agg(
    order_payment_value=("payment_value", "sum")
)

# --- Reviews (one score per order; id/score/date columns kept when present) ---
rev_keep = [
    c
    for c in ["review_id", "order_id", "review_score", "review_creation_date", "review_answer_timestamp"]
    if c in reviews
]
reviews_slim = reviews[rev_keep].copy()

# This table is left-joined onto the orders by order_id. An order with several
# review rows would be repeated by that join, and its aggregated payment value
# would then be counted once per review. Keep a single review per order: the
# most recent one according to the available date columns, or the last row in
# table order when no date column is present. Nothing changes when order_id is
# already unique.
if "order_id" in reviews_slim.columns:
    rev_date_cols = [
        c for c in ["review_creation_date", "review_answer_timestamp"] if c in reviews_slim.columns
    ]
    if rev_date_cols:
        # Stable sort; missing dates go first so that a dated review wins over an undated one.
        reviews_slim = reviews_slim.sort_values(
            rev_date_cols, kind="mergesort", na_position="first"
        )
    # sort_index() puts the surviving rows back in their original order.
    reviews_slim = reviews_slim.drop_duplicates(subset="order_id", keep="last").sort_index()

# --- Consolidation ---
# Start from the orders and left-join each lookup table in turn, so every order
# row is kept whether or not the lookup has a matching key.
df1 = orders_slim
for lookup, join_key in (
    (pay, "order_id"),
    (reviews_slim, "order_id"),
    (customers_slim, "customer_id"),
    (geo_cust, "customer_zip_code_prefix"),
):
    df1 = df1.merge(lookup, on=join_key, how="left")

# --- Time ---
# Calendar fields are derived only when a purchase-timestamp column was found.
if ts_col:
    # errors="coerce" turns values that cannot be parsed into NaT instead of raising.
    purchase_ts = pd.to_datetime(df1[ts_col], errors="coerce")
    df1[ts_col] = purchase_ts
    df1["year"] = purchase_ts.dt.year
    df1["month"] = purchase_ts.dt.month

# --- Final columns (simple and explicit) ---
# Desired output order. Entries that df1 does not have (including a None ts_col)
# are skipped by the filter below.
wanted_cols = [
    "order_id", "customer_id",
    "review_id", "review_score", "review_creation_date", "review_answer_timestamp",
    "order_payment_value", "order_status",
    "customer_zip_code_prefix", "customer_city", "customer_state",
    "cust_lat", "cust_lng", "cust_geo_city", "cust_geo_state",
    ts_col, "year", "month",
]
cols_df1 = [name for name in wanted_cols if name in df1.columns]
df_reviews_sales = df1.loc[:, cols_df1].copy()

# Quick inspection: shape, first rows, and a per-column summary (one row per column).
print("DF1 shape:", df_reviews_sales.shape)
display(df_reviews_sales.head(5))
display(df_reviews_sales.describe(include="all").transpose())


Unnamed: 0,order_id,customer_id,zip_code,purchase_timestamp,year,month
0,00010242fe8c5a6d1ba2dd792cb16214,3ce436f183e68e07877b285a838db11a,28013,2017-09-13 08:59:02,2017,9
1,00018f77f2f0320c557190d7a144bdd3,f6dd3ec061db4e3987629fe6b26e5cce,15775,2017-04-26 10:53:06,2017,4
2,000229ec398224ef6ca0657da4fc703e,6489ae5e4333f3693df5ad4372dab6d3,35661,2018-01-14 14:33:31,2018,1
3,00024acbcdf0a6daa1e931b038114c75,d4eb9395c8c0431ee92fce09860c5a06,12952,2018-08-08 10:00:35,2018,8
4,00042b26cf59d7ce69dfabb4e55b4fd9,58dbd0b2d70206bf40e62cd34e84d795,13226,2017-02-04 13:57:51,2017,2


Unnamed: 0,count,unique,top,freq,mean,min,25%,50%,75%,max,std
order_id,98673.0,98673.0,00010242fe8c5a6d1ba2dd792cb16214,1.0,,,,,,,
customer_id,98673.0,98673.0,3ce436f183e68e07877b285a838db11a,1.0,,,,,,,
zip_code,98673.0,,,,35153.994791,1003.0,11340.0,24415.0,59020.0,99990.0,29821.024954
purchase_timestamp,98673.0,,,,2017-12-31 12:30:02.947635200,2016-09-04 21:15:19,2017-09-12 16:15:06,2018-01-19 08:49:47,2018-05-04 19:50:51,2018-10-17 17:30:18,
year,98673.0,,,,2017.54028,2016.0,2017.0,2018.0,2018.0,2018.0,0.504923
month,98673.0,,,,6.032106,1.0,3.0,6.0,8.0,12.0,3.231047
