In [62]:
import findspark
findspark.init()

In [63]:
from pyspark.sql import SparkSession
from pyspark.sql.types import *
import pyspark.sql.functions as F
import re


In [64]:
paths = ["dataset/2019-Oct.csv", "dataset/2019-Nov.csv"]

In [65]:
spark = SparkSession.builder \
    .appName("EcommerceAnalysis") \
    .config("spark.driver.memory", "8g") \
    .config("spark.executor.memory", "4g") \
    .config("spark.sql.shuffle.partitions", "200") \
    .getOrCreate()

In [66]:
#Définir le schéma strictement
schema = StructType([
    StructField("event_time", StringType(), True),
    StructField("event_type", StringType(), True),
    StructField("product_id", LongType(), True),
    StructField("category_id", LongType(), True),
    StructField("category_code", StringType(), True),
    StructField("brand", StringType(), True),
    StructField("price", DoubleType(), True),
    StructField("user_id", LongType(), True),
    StructField("user_session", StringType(), True)
])

In [67]:
df = spark.read.csv(paths, header=True, schema=schema)
print(f"Les fichiers sont chargés. Nombre de partitions : {df.rdd.getNumPartitions()}")

Les fichiers sont chargés. Nombre de partitions : 110


In [68]:
# Fusionner et sauvegarder en format optimisé sur votre disque
df.write.mode("overwrite").parquet("dataset/full_data.parquet")

                                                                                

In [69]:
df = spark.read.parquet("dataset/full_data.parquet")

In [70]:
print(f"Données chargées : {df.count()} lignes.")

Données chargées : 109950743 lignes.


In [71]:
df.shape


AttributeError: 'DataFrame' object has no attribute 'shape'

## NORMALISATION

In [None]:
def normalize_columns(df):
    current_cols = df.columns
    new_cols = []
    
    for col in current_cols:
        clean_name = col.lower()
        clean_name = re.sub(r'[.\s]+', '_', clean_name)
        clean_name = re.sub(r'[^\w]', '', clean_name)
        new_cols.append(clean_name)
    
    return df.toDF(*new_cols)

In [None]:
df = normalize_columns(df)
print("Nouveaux noms de colonnes :", df.columns)

df = df.withColumn("brand", F.lower(F.col("brand"))) \
       .withColumn("main_category", F.split(F.col("category_code"), "\.").getItem(0))

df.limit(5).select("brand", "category_code", "main_category").show()

Nouveaux noms de colonnes : ['event_time', 'event_type', 'product_id', 'category_id', 'category_code', 'brand', 'price', 'user_id', 'user_session', 'main_category']
+--------+--------------------+-------------+
|   brand|       category_code|main_category|
+--------+--------------------+-------------+
|elenberg|appliances.kitche...|   appliances|
|   intel|computers.compone...|    computers|
|  irobot|appliances.enviro...|   appliances|
| lucente|                null|         null|
| samsung|electronics.smart...|  electronics|
+--------+--------------------+-------------+



## Conversion des types (casting)

In [None]:
df.select("event_time").show(5)

+--------------------+
|          event_time|
+--------------------+
|2019-11-17 08:43:...|
|2019-11-17 08:43:...|
|2019-11-17 08:43:...|
|2019-11-17 08:43:...|
|2019-11-17 08:43:...|
+--------------------+
only showing top 5 rows



In [None]:
df = df.withColumn("event_time", F.to_timestamp("event_time", "yyyy-MM-dd HH:mm:ss 'UTC'"))

In [None]:
cols_to_clean = ["event_type", "category_code", "brand"]
for col_name in cols_to_clean:
    df = df.withColumn(col_name, F.lower(F.trim(F.col(col_name))))

In [None]:
df.printSchema()

root
 |-- event_time: timestamp (nullable = true)
 |-- event_type: string (nullable = true)
 |-- product_id: long (nullable = true)
 |-- category_id: long (nullable = true)
 |-- category_code: string (nullable = true)
 |-- brand: string (nullable = true)
 |-- price: double (nullable = true)
 |-- user_id: long (nullable = true)
 |-- user_session: string (nullable = true)
 |-- main_category: string (nullable = true)



## Gestion des valeurs manquantes (NULL)

In [None]:
string_columns = [c for c, t in df.dtypes if t == 'string']

# Afficher les statistiques (count, mean, stddev, min, max)
df.select(string_columns).describe().show()



+-------+----------+-------------------+--------+--------------------+-------------+
|summary|event_type|      category_code|   brand|        user_session|main_category|
+-------+----------+-------------------+--------+--------------------+-------------+
|  count| 109950743|           74536963|94619500|           109950731|     74536963|
|   mean|      null|               null|     NaN|                null|         null|
| stddev|      null|               null|     NaN|                null|         null|
|    min|      cart|    accessories.bag|  a-case|00000042-3e3f-42f...|  accessories|
|    max|      view|stationery.cartrige|   zyxel|fffffde2-4522-4b4...|   stationery|
+-------+----------+-------------------+--------+--------------------+-------------+



                                                                                

In [None]:
print("Audit des NULL avant nettoyage :")
df.select([F.count(F.when(F.col(c).isNull(), c)).alias(c) for c in df.columns]).show()

Audit des NULL avant nettoyage :




+----------+----------+----------+-----------+-------------+--------+-----+-------+------------+-------------+
|event_time|event_type|product_id|category_id|category_code|   brand|price|user_id|user_session|main_category|
+----------+----------+----------+-----------+-------------+--------+-----+-------+------------+-------------+
|         0|         0|         0|          0|     35413780|15331243|    0|      0|          12|     35413780|
+----------+----------+----------+-----------+-------------+--------+-----+-------+------------+-------------+



                                                                                

In [None]:
df = df.withColumn("category_code", F.coalesce(F.col("category_code"), F.lit("Unknown"))) \
       .withColumn("main_category", F.coalesce(F.col("main_category"), F.lit("Unknown"))) \
       .withColumn("brand", F.coalesce(F.col("brand"), F.lit("Unknown")))

In [None]:
df = df.filter(F.col("user_session").isNotNull())

In [None]:
print("Nouveau comptage après correction :")
df.select(F.count("event_type").alias("Total"), 
        F.count("category_code").alias("Cat_Code_Clean"),
        F.count("main_category").alias("Main_Cat_Clean")).show()

Nouveau comptage après correction :




+---------+--------------+--------------+
|    Total|Cat_Code_Clean|Main_Cat_Clean|
+---------+--------------+--------------+
|109950731|     109950731|     109950731|
+---------+--------------+--------------+



                                                                                

In [None]:
# --- Vérification finale ---
print("Audit des NULL après nettoyage :")
df.select([F.count(F.when(F.col(c).isNull(), c)).alias(c) for c in df.columns]).show()

Audit des NULL après nettoyage :




+----------+----------+----------+-----------+-------------+-----+-----+-------+------------+-------------+
|event_time|event_type|product_id|category_id|category_code|brand|price|user_id|user_session|main_category|
+----------+----------+----------+-----------+-------------+-----+-----+-------+------------+-------------+
|         0|         0|         0|          0|            0|    0|    0|      0|           0|            0|
+----------+----------+----------+-----------+-------------+-----+-----+-------+------------+-------------+



                                                                                

In [72]:
df = df.dropDuplicates()

In [73]:
string_columns = [c for c, t in df.dtypes if t == 'string']

# Afficher les statistiques (count, mean, stddev, min, max)
df.select(string_columns).describe().show()



+-------+--------------------+----------+-------------------+--------+--------------------+
|summary|          event_time|event_type|      category_code|   brand|        user_session|
+-------+--------------------+----------+-------------------+--------+--------------------+
|  count|           109820004| 109820004|           74436319|94498701|           109819992|
|   mean|                null|      null|               null|     NaN|                null|
| stddev|                null|      null|               null|     NaN|                null|
|    min|2019-10-01 00:00:...|      cart|    accessories.bag|  a-case|00000042-3e3f-42f...|
|    max|2019-11-30 23:59:...|      view|stationery.cartrige|   zyxel|fffffde2-4522-4b4...|
+-------+--------------------+----------+-------------------+--------+--------------------+



                                                                                

In [74]:
output_path = "data_clean/gold_ecommerce_cleaned"

In [75]:
df.write \
    .mode("overwrite") \
    .partitionBy("event_type") \
    .parquet(output_path)

print(f"✅ Stockage terminé avec succès dans : {output_path}")

                                                                                

✅ Stockage terminé avec succès dans : data_clean/gold_ecommerce_cleaned
