In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, when, count
from pyspark.ml.feature import StringIndexer, OneHotEncoder, VectorAssembler, IndexToString
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.ml.classification import RandomForestClassifier
from pyspark.sql.types import DoubleType

In [2]:
# Build (or reuse) a Hive-enabled Spark session for this notebook.
spark = (
    SparkSession.builder
    .appName("Classification RandomForest")
    .enableHiveSupport()
    .getOrCreate()
)

# Turn Spark logging off and empty Spark's in-memory table cache.
spark.sparkContext.setLogLevel("OFF")
spark.catalog.clearCache()

# Select the database that the unqualified table names in later queries resolve against.
spark.sql("USE concessionnaire")

Using Spark's default log4j profile: org/apache/spark/log4j-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
24/11/19 14:22:41 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


DataFrame[]

In [3]:
# Load the full clients/registrations table from the current Hive database
# and print its schema to see the available columns and their types.
clients_immatriculations = spark.sql("SELECT * FROM clients_immatriculations")
clients_immatriculations.printSchema()

root
 |-- immatriculation: string (nullable = true)
 |-- age: double (nullable = true)
 |-- sexe: string (nullable = true)
 |-- taux: double (nullable = true)
 |-- situationfamiliale: string (nullable = true)
 |-- nbenfantacharge: integer (nullable = true)
 |-- deuxiemevoiture: boolean (nullable = true)
 |-- taux_eligible: boolean (nullable = true)
 |-- marque: string (nullable = true)
 |-- modele: string (nullable = true)
 |-- puissance: integer (nullable = true)
 |-- longueur: string (nullable = true)
 |-- nbplaces: integer (nullable = true)
 |-- nbportes: integer (nullable = true)
 |-- couleur: string (nullable = true)
 |-- occasion: boolean (nullable = true)
 |-- prix: integer (nullable = true)
 |-- categorie: string (nullable = true)



### Vérification des doublons

In [4]:
# Group on every column at once: any group with more than one row is an exact duplicate.
doublons = (
    clients_immatriculations
    .groupBy(*clients_immatriculations.columns)
    .agg(count("*").alias("count"))
    .filter("count > 1")
)
doublons.show(truncate=False)

                                                                                

+---------------+---+----+----+------------------+---------------+---------------+-------------+------+------+---------+--------+--------+--------+-------+--------+----+---------+-----+
|immatriculation|age|sexe|taux|situationfamiliale|nbenfantacharge|deuxiemevoiture|taux_eligible|marque|modele|puissance|longueur|nbplaces|nbportes|couleur|occasion|prix|categorie|count|
+---------------+---+----+----+------------------+---------------+---------------+-------------+------+------+---------+--------+--------+--------+-------+--------+----+---------+-----+
+---------------+---+----+----+------------------+---------------+---------------+-------------+------+------+---------+--------+--------+--------+-------+--------+----+---------+-----+



### Analyse des **null**

In [5]:
# Distribution of the number of dependent children per client.
# `show()` only prints and returns None, so its result is intentionally not assigned:
# binding it to `doublons` would replace the duplicates DataFrame with None.
clients_immatriculations.groupBy('nbenfantacharge').count().show()

[Stage 7:===>                                                     (1 + 15) / 16]

+---------------+-----+
|nbenfantacharge|count|
+---------------+-----+
|              1|16425|
|              3|11451|
|              4| 9960|
|              2|16592|
|              0|44973|
+---------------+-----+



                                                                                

### Supprimer les colonnes inutiles

In [6]:
# Remove the registration number, colour and brand columns in a single call.
clients_immatriculations = clients_immatriculations.drop(
    'immatriculation', 'couleur', 'marque'
)

### OneHotEncoder

In [7]:
# Disabled: index + one-hot encoding of `marque`.
# The `marque` column is dropped earlier in the notebook, so these lines would
# fail if re-enabled without first keeping that column.
# indexer_marque = StringIndexer(inputCol="marque", outputCol="marque_index")
# indexer_marque = indexer_marque.fit(clients_immatriculations)
# clients_immatriculations = indexer_marque.transform(clients_immatriculations)

# encoder_marque = OneHotEncoder(inputCol="marque_index", outputCol="marque_encoded")
# clients_immatriculations = encoder_marque.fit(clients_immatriculations).transform(clients_immatriculations)

In [8]:
# Step 1: fit a string indexer on `sexe` and add the numeric `sexe_index` column.
indexer_sexe = StringIndexer(
    inputCol="sexe",
    outputCol="sexe_index",
).fit(clients_immatriculations)
clients_immatriculations = indexer_sexe.transform(clients_immatriculations)

# Step 2: fit a one-hot encoder on that index and add the `sexe_encoded` vector column.
# Both fitted models stay in scope so they can be re-applied to other data later.
encoder_sexe = OneHotEncoder(
    inputCol="sexe_index",
    outputCol="sexe_encoded",
).fit(clients_immatriculations)
clients_immatriculations = encoder_sexe.transform(clients_immatriculations)

                                                                                

In [9]:
# Step 1: fit a string indexer on `situationfamiliale` and add its numeric index column.
indexer_situationfamiliale = StringIndexer(
    inputCol="situationfamiliale",
    outputCol="situationfamiliale_index",
).fit(clients_immatriculations)
clients_immatriculations = indexer_situationfamiliale.transform(clients_immatriculations)

# Step 2: one-hot encode the index into the `situationfamiliale_encoded` vector column.
# Both fitted models stay in scope so they can be re-applied to other data later.
encoder_situationfamiliale = OneHotEncoder(
    inputCol="situationfamiliale_index",
    outputCol="situationfamiliale_encoded",
).fit(clients_immatriculations)
clients_immatriculations = encoder_situationfamiliale.transform(clients_immatriculations)

                                                                                

In [10]:
# Remove the remaining vehicle-description columns in one call.
# (`marque` is not listed here: it was already dropped in an earlier cell.)
clients_immatriculations = clients_immatriculations.drop(
    'puissance',
    'nbportes',
    'occasion',
    'prix',
    'nbplaces',
    'modele',
    'longueur',
)

## Changer des Boolean en Int

In [11]:
# Convert the two boolean flags to integers so they can be assembled as numeric features.
# A direct cast maps False -> 0, True -> 1 and leaves nulls as null, which is exactly
# what the previous when/when/otherwise(cast) chain produced, without the redundancy.
clients_immatriculations = (
    clients_immatriculations
    .withColumn("deuxiemevoiture", col("deuxiemevoiture").cast("int"))
    .withColumn("taux_eligible", col("taux_eligible").cast("int"))
)

# Preview a few rows to check the converted columns.
clients_immatriculations.show(4)

+----+----+-----+------------------+---------------+---------------+-------------+-------------------+----------+-------------+------------------------+--------------------------+
| age|sexe| taux|situationfamiliale|nbenfantacharge|deuxiemevoiture|taux_eligible|          categorie|sexe_index| sexe_encoded|situationfamiliale_index|situationfamiliale_encoded|
+----+----+-----+------------------+---------------+---------------+-------------+-------------------+----------+-------------+------------------------+--------------------------+
|61.0|   M|188.0|       celibataire|              0|              0|            0|citadine economique|       0.0|(1,[0],[1.0])|                     1.0|             (3,[1],[1.0])|
|50.0|   F|460.0|         en couple|              3|              0|            0|      suv/crossover|       1.0|    (1,[],[])|                     0.0|             (3,[0],[1.0])|
|54.0|   M|403.0|       celibataire|              0|              0|            0|citadine economiqu

## Normalisation

In [12]:
# Number of rows per target category.
(
    clients_immatriculations
    .groupBy("categorie")
    .count()
    .show()
)



+-------------------+-----+
|          categorie|count|
+-------------------+-----+
|              autre| 5795|
|          familiale|30361|
|citadine economique|26778|
|      suv/crossover|36467|
+-------------------+-----+



                                                                                

In [13]:
# Inspect the schema and a sample of rows after the encoding, column-dropping
# and boolean-conversion steps above.
clients_immatriculations.printSchema()
clients_immatriculations.show()

root
 |-- age: double (nullable = true)
 |-- sexe: string (nullable = true)
 |-- taux: double (nullable = true)
 |-- situationfamiliale: string (nullable = true)
 |-- nbenfantacharge: integer (nullable = true)
 |-- deuxiemevoiture: integer (nullable = true)
 |-- taux_eligible: integer (nullable = true)
 |-- categorie: string (nullable = true)
 |-- sexe_index: double (nullable = false)
 |-- sexe_encoded: vector (nullable = true)
 |-- situationfamiliale_index: double (nullable = false)
 |-- situationfamiliale_encoded: vector (nullable = true)

+----+----+------+------------------+---------------+---------------+-------------+-------------------+----------+-------------+------------------------+--------------------------+
| age|sexe|  taux|situationfamiliale|nbenfantacharge|deuxiemevoiture|taux_eligible|          categorie|sexe_index| sexe_encoded|situationfamiliale_index|situationfamiliale_encoded|
+----+----+------+------------------+---------------+---------------+-------------+-------

## Classificateur

In [14]:
# Learn the label mapping: each `categorie` string becomes a numeric `label`.
indexer_model = StringIndexer(
    inputCol="categorie",
    outputCol="label",
).fit(clients_immatriculations)
data = indexer_model.transform(clients_immatriculations)

# Every column except the target (raw and indexed) and the raw string columns is a feature.
# NOTE(review): this keeps both the `*_index` and the `*_encoded` columns for each
# categorical variable - confirm that feeding both to the model is intended.
feature_cols = [
    name
    for name in data.columns
    if name not in ("categorie", "label", "sexe", "situationfamiliale")
]

# Pack the feature columns into a single vector column and keep only what the model needs.
assembler = VectorAssembler(inputCols=feature_cols, outputCol="features")
data = assembler.transform(data).select("features", "label")

                                                                                

In [15]:
# Split the data into 60% training, 20% validation and 20% test (seeded split).
(trainingData, validationData, testData) = data.randomSplit([0.6, 0.2, 0.2], seed=42)

print(f"Taille de l'ensemble d'entraînement : {trainingData.count()}")
print(f"Taille de l'ensemble de validation : {validationData.count()}")
print(f"Taille de l'ensemble de test : {testData.count()}")

# Define and train the random forest classifier.
# An explicit seed is passed so the forest's random sampling is repeatable from
# one run of the notebook to the next, like the seeded split above.
rf = RandomForestClassifier(
    labelCol="label",
    featuresCol="features",
    numTrees=50,
    maxDepth=20,
    seed=42,
)
rf_model = rf.fit(trainingData)

                                                                                

Taille de l'ensemble d'entraînement : 60060


                                                                                

Taille de l'ensemble de validation : 19739


                                                                                

Taille de l'ensemble de test : 19602


                                                                                

In [16]:
# F1 evaluator comparing the `prediction` column against `label`.
evaluator = MulticlassClassificationEvaluator(
    labelCol="label",
    predictionCol="prediction",
    metricName="f1",
)

# Score on the training split.
trainPrediction = rf_model.transform(trainingData)
f1_score_train = evaluator.evaluate(trainPrediction)
print(f"F1-score Train : {f1_score_train * 100:.2f}%")

# Score on the test split.
testPredictions = rf_model.transform(testData)
f1_score_test = evaluator.evaluate(testPredictions)
print(f"F1-score Test : {f1_score_test * 100:.2f}%")

                                                                                

F1-score Train : 77.87%




F1-score Test : 77.13%


                                                                                

## Prédiction sur les données marketing

In [17]:
# Load the marketing table (the rows to predict a category for) and print its schema.
df_marketing = spark.sql("SELECT * FROM marketing")
df_marketing.printSchema()

root
 |-- age: integer (nullable = true)
 |-- sexe: string (nullable = true)
 |-- taux: float (nullable = true)
 |-- situationfamiliale: string (nullable = true)
 |-- nbenfantacharge: integer (nullable = true)
 |-- deuxiemevoiture: boolean (nullable = true)
 |-- taux_eligible: boolean (nullable = true)



In [18]:
# Re-apply the already-fitted `sexe` indexer and encoder (no refit on the marketing data).
df_marketing = encoder_sexe.transform(
    indexer_sexe.transform(df_marketing)
)
df_marketing.show()

+---+----+------+------------------+---------------+---------------+-------------+----------+-------------+
|age|sexe|  taux|situationfamiliale|nbenfantacharge|deuxiemevoiture|taux_eligible|sexe_index| sexe_encoded|
+---+----+------+------------------+---------------+---------------+-------------+----------+-------------+
| 21|   F|1396.0|       celibataire|              0|          false|        false|       1.0|    (1,[],[])|
| 35|   M| 223.0|       celibataire|              0|          false|         true|       0.0|(1,[0],[1.0])|
| 48|   M| 401.0|       celibataire|              0|          false|         true|       0.0|(1,[0],[1.0])|
| 26|   F| 420.0|         en couple|              3|           true|         true|       1.0|    (1,[],[])|
| 80|   M| 530.0|         en couple|              3|          false|         true|       0.0|(1,[0],[1.0])|
| 27|   F| 153.0|         en couple|              2|          false|         true|       1.0|    (1,[],[])|
| 59|   F| 572.0|         en

In [19]:
# Re-apply the already-fitted `situationfamiliale` indexer and encoder (no refit).
df_marketing = encoder_situationfamiliale.transform(
    indexer_situationfamiliale.transform(df_marketing)
)

In [20]:
# Convert the two boolean flags to integers so they can be assembled as numeric features.
# A direct cast maps False -> 0, True -> 1 and leaves nulls as null, which is exactly
# what the previous when/when/otherwise(cast) chain produced, without the redundancy.
df_marketing = (
    df_marketing
    .withColumn("deuxiemevoiture", col("deuxiemevoiture").cast("int"))
    .withColumn("taux_eligible", col("taux_eligible").cast("int"))
)

In [21]:
# Keep only the feature columns, in the order given by `feature_cols`.
df_marketing = df_marketing.select(*feature_cols)

# Sanity check: the two printed lists should be identical.
print(feature_cols)
print(df_marketing.columns)

# Assemble the feature columns into the single `features` vector column.
assembler = VectorAssembler(inputCols=feature_cols, outputCol="features")
prediction_data = assembler.transform(df_marketing).select("features")

# Run the trained model on the marketing rows.
marketingPrediction = rf_model.transform(prediction_data)

# Translate the numeric predictions back to category names using the label indexer's labels.
label_to_category = IndexToString(
    labels=indexer_model.labels,
    inputCol="prediction",
    outputCol="predicted_category",
)
result = label_to_category.transform(marketingPrediction)

# Display the predictions.
result.show()

['age', 'taux', 'nbenfantacharge', 'deuxiemevoiture', 'taux_eligible', 'sexe_index', 'sexe_encoded', 'situationfamiliale_index', 'situationfamiliale_encoded']
['age', 'taux', 'nbenfantacharge', 'deuxiemevoiture', 'taux_eligible', 'sexe_index', 'sexe_encoded', 'situationfamiliale_index', 'situationfamiliale_encoded']
+--------------------+--------------------+--------------------+----------+-------------------+
|            features|       rawPrediction|         probability|prediction| predicted_category|
+--------------------+--------------------+--------------------+----------+-------------------+
|(11,[0,1,5,7,9],[...|[0.77476877496734...|[0.01549537549934...|       2.0|citadine economique|
|(11,[0,1,4,6,7,9]...|[0.19197318460650...|[0.00383946369213...|       2.0|citadine economique|
|(11,[0,1,4,6,7,9]...|[0.23602322875861...|[0.00472046457517...|       2.0|citadine economique|
|[26.0,420.0,3.0,1...|[46.5807425071005...|[0.93161485014201...|       0.0|      suv/crossover|
|(11,[0,1,