In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col
from pyspark.ml import Pipeline
from pyspark.ml.feature import StringIndexer, VectorAssembler
from pyspark.ml.classification import MultilayerPerceptronClassifier
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.sql.functions import rand

# 1. Initialisation de Spark
spark = SparkSession.builder \
    .appName("Amélioration MLPC avec classes équilibrées") \
    .config("spark.hadoop.hive.metastore.uris", "thrift://hive-metastore:9083") \
    .enableHiveSupport() \
    .getOrCreate()

# 2. Charger les données
df_train = spark.sql("SELECT * FROM concessionnaire.client_immat_with_catk9")

# 3. Indexer la colonne cible
label_indexer = StringIndexer(inputCol="categorie", outputCol="indexed_categorie", handleInvalid="keep")
df_train = label_indexer.fit(df_train).transform(df_train)

# 4. Rééchantillonnage
max_count = df_train.groupBy("indexed_categorie").count().agg({"count": "max"}).collect()[0][0]
balanced_df = df_train.groupBy("indexed_categorie").applyInPandas(
    lambda pdf: pdf.sample(n=max_count, replace=True) if len(pdf) < max_count else pdf,
    schema=df_train.schema
)

# 5. Indexer les colonnes catégoriques
categorical_cols = ["sexe", "situationfamilliale", "deuxiemevoiture"]
indexers = [StringIndexer(inputCol=col, outputCol=col + "_indexed", handleInvalid="keep") for col in categorical_cols]

# 6. Assembler les features
assembler = VectorAssembler(
    inputCols=["age", "taux", "nbenfantacharge"] + [col + "_indexed" for col in categorical_cols],
    outputCol="featuress"
)

# 7. Réseau de neurones (MLPC)
input_size = len(["age", "taux", "nbenfantacharge"] + [col + "_indexed" for col in categorical_cols])
num_classes = df_train.select("indexed_categorie").distinct().count()
mlpc = MultilayerPerceptronClassifier(
    labelCol="indexed_categorie",
    featuresCol="featuress",
    predictionCol="mlpc_prediction",
    layers=[input_size, 32, 16, num_classes],
    maxIter=200,
    seed=42
)

# 8. Pipeline
pipeline = Pipeline(stages=indexers + [assembler, mlpc])

# 9. Séparer les données
train_df, test_df = balanced_df.randomSplit([0.8, 0.2], seed=42)

# 10. Entraîner le modèle
model = pipeline.fit(train_df)

# 11. Prédictions et évaluation
predictions = model.transform(test_df)
evaluator = MulticlassClassificationEvaluator(
    labelCol="indexed_categorie", predictionCol="mlpc_prediction", metricName="accuracy"
)
accuracy = evaluator.evaluate(predictions)
print(f"Accuracy MLPC = {accuracy:.4f}")

evaluator_f1 = MulticlassClassificationEvaluator(
    labelCol="indexed_categorie", predictionCol="mlpc_prediction", metricName="f1"
)
f1_score = evaluator_f1.evaluate(predictions)
print(f"F1 Score MLPC = {f1_score:.4f}")

# 12. Afficher des prédictions
predictions.select("age", "sexe", "taux", "categorie", "mlpc_prediction").show(10, truncate=False)




Accuracy MLPC = 0.3183
F1 Score MLPC = 0.2262
+---+-----+----+---------------+---------------+
|age|sexe |taux|categorie      |mlpc_prediction|
+---+-----+----+---------------+---------------+
|82 |Homme|833 |Micro Citadines|2.0            |
|46 |Homme|837 |Micro Citadines|2.0            |
|56 |Homme|994 |Micro Citadines|2.0            |
|45 |Femme|570 |Micro Citadines|3.0            |
|59 |Homme|577 |Micro Citadines|3.0            |
|18 |Femme|1193|Micro Citadines|2.0            |
|37 |Homme|836 |Micro Citadines|2.0            |
|29 |Homme|1221|Micro Citadines|2.0            |
|24 |Femme|987 |Micro Citadines|2.0            |
|39 |Homme|857 |Micro Citadines|2.0            |
+---+-----+----+---------------+---------------+
only showing top 10 rows
