In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, when, isnull, expr
from pyspark.ml.feature import VectorAssembler, StandardScaler
from pyspark.ml.clustering import KMeans
from pyspark.sql.functions import when
from pyspark.ml.stat import Correlation
from pyspark.ml.linalg import Vectors
from pyspark.ml.feature import StringIndexer, VectorAssembler
from pyspark.ml import Pipeline
from pyspark.ml.classification import RandomForestClassifier
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.sql import functions as F


# Utilisation de la base de données Hive, et charge les données de la table

In [2]:
spark = SparkSession.builder \
    .appName("Categorie") \
    .config("spark.hadoop.hive.metastore.uris", "thrift://hive-metastore:9083") \
    .enableHiveSupport() \
    .getOrCreate()


In [3]:
spark.sql("USE concessionnaire")
client_immat_df = spark.sql("SELECT * FROM client_immatriculation_categorie")

# Afficher un aperçu des données

In [4]:
client_immat_df.show(20, truncate=False)

+---------------+---+-----+----+-------------------+---------------+---------------+-------+---------+--------+--------+--------+-------+--------+-------+------+-----------------------+-------------------------------------------------------------------+--------+
|immatriculation|age|sexe |taux|situationfamilliale|nbenfantacharge|deuxiemevoiture|marque |puissance|longueur|nbplaces|nbportes|couleur|occasion|prix   |modele|features               |scaled_features                                                    |category|
+---------------+---+-----+----+-------------------+---------------+---------------+-------+---------+--------+--------+--------+-------+--------+-------+------+-----------------------+-------------------------------------------------------------------+--------+
|0 OG 13        |56 |Homme|528 |Célibataire        |0              |false          |Audi   |75.0     |0.0     |5       |5.0     |blanc  |true    |12817.0|A2    |[75.0,0.0,5.0,12817.0] |[-0.9044003686745664,0.0,0

In [5]:
client_immat_df.describe().show()

+-------+---------------+------------------+-----+-----------------+-------------------+------------------+---------------+------+------------------+--------+--------+-------------------+-------+-----------------+-----------------+------------------+
|summary|immatriculation|               age| sexe|             taux|situationfamilliale|   nbenfantacharge|deuxiemevoiture|marque|         puissance|longueur|nbplaces|           nbportes|couleur|             prix|           modele|          category|
+-------+---------------+------------------+-----+-----------------+-------------------+------------------+---------------+------+------------------+--------+--------+-------------------+-------+-----------------+-----------------+------------------+
|  count|          98834|             98834|98834|            98834|              98834|             98834|          98834| 98834|             98834|   98834|   98834|              98834|  98834|            98834|            98834|             988

In [6]:
missing_counts = client_immat_df.select([isnull(col(c)).alias(c) for c in client_immat_df.columns]).groupby().sum().show()

++
||
++
||
++



# Préparation des données

In [7]:
df_reduit = client_immat_df.select("age", "sexe", "taux", "situationfamilliale", "nbenfantacharge", "deuxiemevoiture", "category")


# Encodage des colonnes catégoriques
indexers = [
    StringIndexer(inputCol="sexe", outputCol="sexe_index"),
    StringIndexer(inputCol="situationfamilliale", outputCol="situation_index"),
    StringIndexer(inputCol="deuxiemevoiture", outputCol="deuxiemevoiture_num"),
    StringIndexer(inputCol="category", outputCol="categorie_index")
]

pipeline = Pipeline(stages=indexers)
df_indexed = pipeline.fit(df_reduit).transform(df_reduit)


In [8]:
# Identification et Traitement des Valeurs Aberrantes
variables_a_analyser = ["age", "taux", "nbenfantacharge"]  # Variables continues pertinentes

for variable in variables_a_analyser:
    # Calculer Q1, Q3 et IQR pour chaque variable
    quantiles = df_indexed.approxQuantile(variable, [0.25, 0.75], 0.0)
    q1, q3 = quantiles[0], quantiles[1]
    iqr = q3 - q1

    # Définir les limites des valeurs aberrantes
    lower_bound = q1 - 1.5 * iqr
    upper_bound = q3 + 1.5 * iqr

    # Filtrer les valeurs aberrantes
    df_indexed = df_indexed.filter((F.col(variable) >= lower_bound) & (F.col(variable) <= upper_bound))

# Afficher un aperçu des données après nettoyage
df_indexed.show(5)
df_indexed.describe().show()


+---+-----+----+-------------------+---------------+---------------+--------+----------+---------------+-------------------+---------------+
|age| sexe|taux|situationfamilliale|nbenfantacharge|deuxiemevoiture|category|sexe_index|situation_index|deuxiemevoiture_num|categorie_index|
+---+-----+----+-------------------+---------------+---------------+--------+----------+---------------+-------------------+---------------+
| 56|Homme| 528|        Célibataire|              0|          false|       2|       0.0|            1.0|                0.0|            0.0|
| 32|Homme| 563|          En Couple|              2|          false|       6|       0.0|            0.0|                0.0|            1.0|
| 28|Homme| 230|          En Couple|              2|          false|       3|       0.0|            0.0|                0.0|            4.0|
| 33|Femme| 954|          En Couple|              1|          false|       7|       1.0|            0.0|                0.0|            5.0|
| 20|Homme|12

In [9]:
# Création de la colonne `features`
assembler = VectorAssembler(
    inputCols=["age", "sexe_index", "taux", "situation_index", "nbenfantacharge", "deuxiemevoiture_num"],
    outputCol="features"
)

df_final = assembler.transform(df_indexed)
df_final.printSchema()


root
 |-- age: integer (nullable = true)
 |-- sexe: string (nullable = true)
 |-- taux: integer (nullable = true)
 |-- situationfamilliale: string (nullable = true)
 |-- nbenfantacharge: integer (nullable = true)
 |-- deuxiemevoiture: string (nullable = true)
 |-- category: integer (nullable = true)
 |-- sexe_index: double (nullable = false)
 |-- situation_index: double (nullable = false)
 |-- deuxiemevoiture_num: double (nullable = false)
 |-- categorie_index: double (nullable = false)
 |-- features: vector (nullable = true)



In [10]:
df_final.describe().show()

+-------+------------------+-----+-----------------+-------------------+------------------+---------------+------------------+-------------------+------------------+-------------------+------------------+
|summary|               age| sexe|             taux|situationfamilliale|   nbenfantacharge|deuxiemevoiture|          category|         sexe_index|   situation_index|deuxiemevoiture_num|   categorie_index|
+-------+------------------+-----+-----------------+-------------------+------------------+---------------+------------------+-------------------+------------------+-------------------+------------------+
|  count|             98834|98834|            98834|              98834|             98834|          98834|             98834|              98834|             98834|              98834|             98834|
|   mean|  43.7169192787907| NULL|609.6571523969484|               NULL|1.2556407713944593|           NULL|3.9901855636724206| 0.2985713418459235| 0.373818726349232|0.1310480199121

In [11]:
scaler = StandardScaler(inputCol="features", outputCol="scaled_features", withStd=True, withMean=False)
scaler_model = scaler.fit(df_final)
df_final_scaled = scaler_model.transform(df_final)

# Utiliser `scaled_features` pour l'entraînement au lieu de `features`
train_data, test_data = df_final_scaled.randomSplit([0.8, 0.2], seed=42)


In [12]:
# Initialiser le modèle de Random Forest
rf = RandomForestClassifier(
    labelCol="categorie_index",  # La cible encodée
    featuresCol="features",      # Les vecteurs de caractéristiques
    numTrees=40,                 # Nombre d'arbres
    maxDepth=4,                  # Profondeur maximale
    seed=42
)

# Entraîner le modèle
rf_model = rf.fit(train_data)


# Identification et Traitement des Valeurs Aberrantes

In [13]:
# Prédiction sur l'ensemble de test
predictions = rf_model.transform(test_data)


In [14]:
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

# Évaluer la précision (accuracy) du modèle
evaluator = MulticlassClassificationEvaluator(
    labelCol="categorie_index",
    predictionCol="prediction",
    metricName="accuracy"
)

accuracy = evaluator.evaluate(predictions)
print(f"Test Accuracy: {accuracy}")


Test Accuracy: 0.6229508196721312


In [15]:
f1_evaluator = MulticlassClassificationEvaluator(
    labelCol="categorie_index",
    predictionCol="prediction",
    metricName="f1"
)

f1_score = f1_evaluator.evaluate(predictions)
print(f"Test F1 Score: {f1_score}")


Test F1 Score: 0.5981321631538411


In [16]:
predictions.select("age", "sexe_index", "taux", "situation_index", "nbenfantacharge", "deuxiemevoiture_num", "categorie_index", "prediction").show(10)

+---+----------+----+---------------+---------------+-------------------+---------------+----------+
|age|sexe_index|taux|situation_index|nbenfantacharge|deuxiemevoiture_num|categorie_index|prediction|
+---+----------+----+---------------+---------------+-------------------+---------------+----------+
| 18|       1.0| 159|            1.0|              0|                0.0|            0.0|       1.0|
| 18|       1.0| 172|            0.0|              3|                0.0|            4.0|       4.0|
| 18|       1.0| 183|            1.0|              0|                0.0|            0.0|       1.0|
| 18|       1.0| 195|            0.0|              4|                1.0|            4.0|       4.0|
| 18|       1.0| 220|            0.0|              4|                1.0|            4.0|       4.0|
| 18|       1.0| 229|            0.0|              4|                1.0|            4.0|       4.0|
| 18|       1.0| 400|            0.0|              2|                1.0|            6.0|  