In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, when, count
from pyspark.ml.feature import StringIndexer, OneHotEncoder, VectorAssembler
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.ml.classification import RandomForestClassifier
from pyspark.sql.types import DoubleType

In [2]:
# Build (or reuse) a Spark session with Hive support for this notebook.
spark = (
    SparkSession.builder
    .appName("Préparation de données")
    .enableHiveSupport()
    .getOrCreate()
)

# Turn Spark logging off, clear the catalog cache, and switch to the
# `concessionnaire` database used by every query below.
spark.sparkContext.setLogLevel("OFF")
spark.catalog.clearCache()
spark.sql("USE concessionnaire")

Using Spark's default log4j profile: org/apache/spark/log4j-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
24/11/18 15:35:43 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
24/11/18 15:35:44 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.
24/11/18 15:35:44 WARN Utils: Service 'SparkUI' could not bind on port 4041. Attempting port 4042.
24/11/18 15:35:44 WARN Utils: Service 'SparkUI' could not bind on port 4042. Attempting port 4043.
24/11/18 15:35:44 WARN Utils: Service 'SparkUI' could not bind on port 4043. Attempting port 4044.


DataFrame[]

In [3]:
# Read every column of the `clients_immatriculations` table into a DataFrame,
# then print its schema.
clients_immatriculations = spark.sql(
    "SELECT * FROM clients_immatriculations"
)
clients_immatriculations.printSchema()

root
 |-- immatriculation: string (nullable = true)
 |-- age: double (nullable = true)
 |-- sexe: string (nullable = true)
 |-- taux: double (nullable = true)
 |-- situationfamiliale: string (nullable = true)
 |-- nbenfantacharge: integer (nullable = true)
 |-- deuxiemevoiture: boolean (nullable = true)
 |-- taux_eligible: boolean (nullable = true)
 |-- marque: string (nullable = true)
 |-- modele: string (nullable = true)
 |-- puissance: integer (nullable = true)
 |-- longueur: string (nullable = true)
 |-- nbplaces: integer (nullable = true)
 |-- nbportes: integer (nullable = true)
 |-- couleur: string (nullable = true)
 |-- occasion: boolean (nullable = true)
 |-- prix: integer (nullable = true)
 |-- categorie: string (nullable = true)



### Vérification des doublons

In [4]:
# A row counts as a duplicate when all of its columns match another row:
# group on every column and keep only the groups seen more than once.
doublons = (
    clients_immatriculations
    .groupBy(*clients_immatriculations.columns)
    .agg(count("*").alias("count"))
    .filter("count > 1")
)
doublons.show(truncate=False)

                                                                                

+---------------+---+----+----+------------------+---------------+---------------+-------------+------+------+---------+--------+--------+--------+-------+--------+----+---------+-----+
|immatriculation|age|sexe|taux|situationfamiliale|nbenfantacharge|deuxiemevoiture|taux_eligible|marque|modele|puissance|longueur|nbplaces|nbportes|couleur|occasion|prix|categorie|count|
+---------------+---+----+----+------------------+---------------+---------------+-------------+------+------+---------+--------+--------+--------+-------+--------+----+---------+-----+
+---------------+---+----+----+------------------+---------------+---------------+-------------+------+------+---------+--------+--------+--------+-------+--------+----+---------+-----+



### Analyse des **null**

In [5]:
# Row count per value of `nbenfantacharge` (a null group would show up here too).
# `show()` only prints and returns None, so its result is intentionally not
# assigned: binding it would overwrite `doublons` from the duplicate check with None.
clients_immatriculations.groupBy("nbenfantacharge").count().show()

+---------------+-----+
|nbenfantacharge|count|
+---------------+-----+
|              1|16425|
|              3|11451|
|              4| 9960|
|              2|16592|
|              0|44973|
+---------------+-----+



### Supprimer les colonnes inutiles

In [6]:
# Remove the two columns that are not used in the rest of the notebook.
clients_immatriculations = clients_immatriculations.drop(
    "immatriculation",
    "couleur",
)

### OneHotEncoder

In [7]:
# Step 1: map each `marque` string to a numeric index (`marque_index`).
indexer_marque = StringIndexer(
    inputCol="marque",
    outputCol="marque_index",
).fit(clients_immatriculations)
clients_immatriculations = indexer_marque.transform(clients_immatriculations)

# Step 2: one-hot encode the index into the vector column `marque_encoded`.
encoder_marque = OneHotEncoder(
    inputCol="marque_index",
    outputCol="marque_encoded",
)
clients_immatriculations = (
    encoder_marque
    .fit(clients_immatriculations)
    .transform(clients_immatriculations)
)

                                                                                

In [8]:
# Step 1: map each `sexe` string to a numeric index (`sexe_index`).
indexer_sexe = StringIndexer(
    inputCol="sexe",
    outputCol="sexe_index",
).fit(clients_immatriculations)
clients_immatriculations = indexer_sexe.transform(clients_immatriculations)

# Step 2: one-hot encode the index into the vector column `sexe_encoded`.
encoder_sexe = OneHotEncoder(
    inputCol="sexe_index",
    outputCol="sexe_encoded",
)
clients_immatriculations = (
    encoder_sexe
    .fit(clients_immatriculations)
    .transform(clients_immatriculations)
)

In [9]:
# Step 1: map each `situationfamiliale` string to a numeric index
# (`situationfamiliale_index`).
indexer_situationfamiliale = StringIndexer(
    inputCol="situationfamiliale",
    outputCol="situationfamiliale_index",
).fit(clients_immatriculations)
clients_immatriculations = indexer_situationfamiliale.transform(clients_immatriculations)

# Step 2: one-hot encode the index into the vector column
# `situationfamiliale_encoded`.
encoder_situationfamiliale = OneHotEncoder(
    inputCol="situationfamiliale_index",
    outputCol="situationfamiliale_encoded",
)
clients_immatriculations = (
    encoder_situationfamiliale
    .fit(clients_immatriculations)
    .transform(clients_immatriculations)
)

In [10]:
# Drop the raw categorical columns and their intermediate indices (the one-hot
# encoded versions replace them), plus the vehicle attributes that are not kept.
# Then give the encoded `sexe` / `situationfamiliale` vectors their original names.
clients_immatriculations = (
    clients_immatriculations
    .drop(
        "marque", "marque_index",
        "sexe", "sexe_index",
        "situationfamiliale", "situationfamiliale_index",
        "puissance", "nbportes", "occasion", "prix",
        "nbplaces", "modele", "longueur",
    )
    .withColumnRenamed("sexe_encoded", "sexe")
    .withColumnRenamed("situationfamiliale_encoded", "situationfamiliale")
)

## Changer des Boolean en Int

In [11]:
# Convert the boolean flags to integers. Casting a boolean column to int maps
# False -> 0 and True -> 1 and leaves nulls as null, which is exactly what the
# former when(== False)/when(== True)/otherwise(cast) chain produced, so a
# plain cast applied to each column is enough.
for boolean_column in ("deuxiemevoiture", "taux_eligible"):
    clients_immatriculations = clients_immatriculations.withColumn(
        boolean_column,
        col(boolean_column).cast("int"),
    )

# Preview the first rows to check the converted columns.
clients_immatriculations.show(4)

+----+-----+---------------+---------------+-------------+-------------------+---------------+-------------+------------------+
| age| taux|nbenfantacharge|deuxiemevoiture|taux_eligible|          categorie| marque_encoded|         sexe|situationfamiliale|
+----+-----+---------------+---------------+-------------+-------------------+---------------+-------------+------------------+
|61.0|188.0|              0|              0|            0|citadine economique|(18,[13],[1.0])|(1,[0],[1.0])|     (3,[1],[1.0])|
|50.0|460.0|              3|              0|            0|      suv/crossover| (18,[2],[1.0])|    (1,[],[])|     (3,[0],[1.0])|
|54.0|403.0|              0|              0|            0|citadine economique| (18,[4],[1.0])|(1,[0],[1.0])|     (3,[1],[1.0])|
|74.0|531.0|              4|              1|            0|      suv/crossover| (18,[5],[1.0])|(1,[0],[1.0])|     (3,[0],[1.0])|
+----+-----+---------------+---------------+-------------+-------------------+---------------+----------

## Normalisation

In [12]:
# Number of rows per `categorie` value (the column later used as the target).
(
    clients_immatriculations
    .groupBy("categorie")
    .count()
    .show()
)

+-------------------+-----+
|          categorie|count|
+-------------------+-----+
|              autre| 5795|
|          familiale|30361|
|citadine economique|26778|
|      suv/crossover|36467|
+-------------------+-----+



In [13]:
# Final check of the prepared DataFrame: column types first, then a preview
# of the first 20 rows (the default page size of `show`, spelled out here).
clients_immatriculations.printSchema()
clients_immatriculations.show(n=20, truncate=True)

root
 |-- age: double (nullable = true)
 |-- taux: double (nullable = true)
 |-- nbenfantacharge: integer (nullable = true)
 |-- deuxiemevoiture: integer (nullable = true)
 |-- taux_eligible: integer (nullable = true)
 |-- categorie: string (nullable = true)
 |-- marque_encoded: vector (nullable = true)
 |-- sexe: vector (nullable = true)
 |-- situationfamiliale: vector (nullable = true)

+----+------+---------------+---------------+-------------+-------------------+---------------+-------------+------------------+
| age|  taux|nbenfantacharge|deuxiemevoiture|taux_eligible|          categorie| marque_encoded|         sexe|situationfamiliale|
+----+------+---------------+---------------+-------------+-------------------+---------------+-------------+------------------+
|61.0| 188.0|              0|              0|            0|citadine economique|(18,[13],[1.0])|(1,[0],[1.0])|     (3,[1],[1.0])|
|50.0| 460.0|              3|              0|            0|      suv/crossover| (18,[2],[1.0

## Classificateur

In [14]:
# Encode the string target `categorie` as a numeric `label` column.
indexer = StringIndexer(inputCol="categorie", outputCol="label")
data = indexer.fit(clients_immatriculations).transform(clients_immatriculations)

# Every column except the target and its numeric label is used as a feature.
# NOTE(review): this includes `marque_encoded`; confirm that the brand is meant
# to be a predictor and is available at prediction time.
feature_cols = [
    column_name
    for column_name in data.columns
    if column_name not in ("categorie", "label")
]

# Assemble the feature columns into a single `features` vector and keep only
# what the classifier needs.
assembler = VectorAssembler(inputCols=feature_cols, outputCol="features")
data = assembler.transform(data).select("features", "label")

In [15]:
# Split the data into 60% training, 20% validation and 20% test.
# The seed keeps the split identical from one run to the next.
(trainingData, validationData, testData) = data.randomSplit([0.6, 0.2, 0.2], seed=42)

# Each count() triggers a Spark job; the sizes are printed as a sanity check.
print(f"Taille de l'ensemble d'entraînement : {trainingData.count()}")
print(f"Taille de l'ensemble de validation : {validationData.count()}")
print(f"Taille de l'ensemble de test : {testData.count()}")

# Define and train the random forest classifier (100 trees, depth at most 10).
# An explicit seed is passed so that the forest's random sampling is
# reproducible, consistent with the seeded split above.
rf = RandomForestClassifier(
    labelCol="label",
    featuresCol="features",
    numTrees=100,
    maxDepth=10,
    seed=42,
)
rf_model = rf.fit(trainingData)

                                                                                

Taille de l'ensemble d'entraînement : 60060


                                                                                

Taille de l'ensemble de validation : 19739


                                                                                

Taille de l'ensemble de test : 19602


                                                                                

In [16]:
# One F1 evaluator, comparing `label` with `prediction`, serves both splits.
evaluator = MulticlassClassificationEvaluator(
    metricName="f1",
    labelCol="label",
    predictionCol="prediction",
)

# Score the model on the data it was trained on.
trainPrediction = rf_model.transform(trainingData)
f1_score_train = evaluator.evaluate(trainPrediction)
print(f"F1-score Train : {f1_score_train * 100:.2f}%")

# Score the model on the held-out test split.
testPredictions = rf_model.transform(testData)
f1_score_test = evaluator.evaluate(testPredictions)
print(f"F1-score Test : {f1_score_test * 100:.2f}%")

                                                                                

F1-score Train : 93.24%
F1-score Test : 93.47%


                                                                                