In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, when, count
from pyspark.ml.feature import StringIndexer, OneHotEncoder, VectorAssembler
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.ml.classification import RandomForestClassifier
from pyspark.sql.types import DoubleType

In [2]:
spark = SparkSession.builder\
    .appName("Préparation de données")\
    .enableHiveSupport()\
    .getOrCreate()

spark.sparkContext.setLogLevel("OFF")
spark.catalog.clearCache()
spark.sql("USE concessionnaire")

Using Spark's default log4j profile: org/apache/spark/log4j-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
24/11/18 19:38:40 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


DataFrame[]

In [3]:
clients_immatriculations = spark.sql("SELECT * FROM clients_immatriculations")
clients_immatriculations.printSchema()

root
 |-- immatriculation: string (nullable = true)
 |-- age: double (nullable = true)
 |-- sexe: string (nullable = true)
 |-- taux: double (nullable = true)
 |-- situationfamiliale: string (nullable = true)
 |-- nbenfantacharge: integer (nullable = true)
 |-- deuxiemevoiture: boolean (nullable = true)
 |-- taux_eligible: boolean (nullable = true)
 |-- marque: string (nullable = true)
 |-- modele: string (nullable = true)
 |-- puissance: integer (nullable = true)
 |-- longueur: string (nullable = true)
 |-- nbplaces: integer (nullable = true)
 |-- nbportes: integer (nullable = true)
 |-- couleur: string (nullable = true)
 |-- occasion: boolean (nullable = true)
 |-- prix: integer (nullable = true)
 |-- categorie: string (nullable = true)



### Verification des doublons

In [4]:
doublons = clients_immatriculations.groupBy(*clients_immatriculations.columns).agg(count("*").alias("count")).filter("count > 1")
doublons.show(truncate=False)

                                                                                

+---------------+---+----+----+------------------+---------------+---------------+-------------+------+------+---------+--------+--------+--------+-------+--------+----+---------+-----+
|immatriculation|age|sexe|taux|situationfamiliale|nbenfantacharge|deuxiemevoiture|taux_eligible|marque|modele|puissance|longueur|nbplaces|nbportes|couleur|occasion|prix|categorie|count|
+---------------+---+----+----+------------------+---------------+---------------+-------------+------+------+---------+--------+--------+--------+-------+--------+----+---------+-----+
+---------------+---+----+----+------------------+---------------+---------------+-------------+------+------+---------+--------+--------+--------+-------+--------+----+---------+-----+



### Analyse des **null**

In [5]:
doublons = clients_immatriculations.groupBy('nbenfantacharge').count().show()

+---------------+-----+
|nbenfantacharge|count|
+---------------+-----+
|              1|16425|
|              3|11451|
|              4| 9960|
|              2|16592|
|              0|44973|
+---------------+-----+



### Supprimer les colones innutiles

In [6]:
clients_immatriculations = clients_immatriculations.drop('immatriculation')
clients_immatriculations = clients_immatriculations.drop('couleur')
clients_immatriculations = clients_immatriculations.drop('marque')

### OneHotEncoder

In [7]:
# indexer_marque = StringIndexer(inputCol="marque", outputCol="marque_index")
# indexer_marque = indexer_marque.fit(clients_immatriculations)
# clients_immatriculations = indexer_marque.transform(clients_immatriculations)

# encoder_marque = OneHotEncoder(inputCol="marque_index", outputCol="marque_encoded")
# clients_immatriculations = encoder_marque.fit(clients_immatriculations).transform(clients_immatriculations)

In [8]:
indexer_sexe = StringIndexer(inputCol="sexe", outputCol="sexe_index")
indexer_sexe_model = indexer_sexe.fit(clients_immatriculations)
clients_immatriculations = indexer_sexe_model.transform(clients_immatriculations)

encoder_sexe = OneHotEncoder(inputCol="sexe_index", outputCol="sexe_encoded")
encoder_sexe_model = encoder_sexe.fit(clients_immatriculations)
clients_immatriculations = encoder_sexe_model.transform(clients_immatriculations)

                                                                                

In [9]:
indexer_situationfamiliale = StringIndexer(inputCol="situationfamiliale", outputCol="situationfamiliale_index")
indexer_situationfamiliale_model = indexer_situationfamiliale.fit(clients_immatriculations)
clients_immatriculations = indexer_situationfamiliale_model.transform(clients_immatriculations)

encoder_situationfamiliale = OneHotEncoder(inputCol="situationfamiliale_index", outputCol="situationfamiliale_encoded")
encoder_situationfamiliale_model = encoder_situationfamiliale.fit(clients_immatriculations)
clients_immatriculations = encoder_situationfamiliale_model.transform(clients_immatriculations)

In [10]:
# clients_immatriculations = clients_immatriculations.drop('marque') 
# clients_immatriculations = clients_immatriculations.drop('marque_index')
clients_immatriculations = clients_immatriculations.drop('sexe') 
clients_immatriculations = clients_immatriculations.drop('sexe_index')
clients_immatriculations = clients_immatriculations.drop('situationfamiliale') 
clients_immatriculations = clients_immatriculations.drop('situationfamiliale_index')
clients_immatriculations = clients_immatriculations.drop('puissance')
clients_immatriculations = clients_immatriculations.drop('nbportes')
clients_immatriculations = clients_immatriculations.drop('occasion')
clients_immatriculations = clients_immatriculations.drop('prix')
clients_immatriculations = clients_immatriculations.drop('nbplaces') 
clients_immatriculations = clients_immatriculations.drop('modele') 
clients_immatriculations = clients_immatriculations.drop('longueur') 

clients_immatriculations = clients_immatriculations.withColumnRenamed('sexe_encoded', 'sexe')
clients_immatriculations = clients_immatriculations.withColumnRenamed('situationfamiliale_encoded', 'situationfamiliale')

## Changer des Boolean en Int

In [11]:
clients_immatriculations = clients_immatriculations.withColumn(
    "deuxiemevoiture",
    when(col("deuxiemevoiture") == False, 0)
    .when(col("deuxiemevoiture") == True, 1)
    .otherwise(col("deuxiemevoiture").cast("int"))
)

clients_immatriculations = clients_immatriculations.withColumn(
    "taux_eligible",
    when(col("taux_eligible") == False, 0)
    .when(col("taux_eligible") == True, 1)
    .otherwise(col("taux_eligible").cast("int"))
)

clients_immatriculations.show(4)

+----+-----+---------------+---------------+-------------+-------------------+-------------+------------------+
| age| taux|nbenfantacharge|deuxiemevoiture|taux_eligible|          categorie|         sexe|situationfamiliale|
+----+-----+---------------+---------------+-------------+-------------------+-------------+------------------+
|61.0|188.0|              0|              0|            0|citadine economique|(1,[0],[1.0])|     (3,[1],[1.0])|
|50.0|460.0|              3|              0|            0|      suv/crossover|    (1,[],[])|     (3,[0],[1.0])|
|54.0|403.0|              0|              0|            0|citadine economique|(1,[0],[1.0])|     (3,[1],[1.0])|
|74.0|531.0|              4|              1|            0|      suv/crossover|(1,[0],[1.0])|     (3,[0],[1.0])|
+----+-----+---------------+---------------+-------------+-------------------+-------------+------------------+
only showing top 4 rows



## Normalisation

In [12]:
clients_immatriculations.groupBy("categorie").count().show()

+-------------------+-----+
|          categorie|count|
+-------------------+-----+
|              autre| 5796|
|          familiale|30363|
|citadine economique|26772|
|      suv/crossover|36470|
+-------------------+-----+



In [13]:
clients_immatriculations.printSchema()
clients_immatriculations.show()

root
 |-- age: double (nullable = true)
 |-- taux: double (nullable = true)
 |-- nbenfantacharge: integer (nullable = true)
 |-- deuxiemevoiture: integer (nullable = true)
 |-- taux_eligible: integer (nullable = true)
 |-- categorie: string (nullable = true)
 |-- sexe: vector (nullable = true)
 |-- situationfamiliale: vector (nullable = true)

+----+------+---------------+---------------+-------------+-------------------+-------------+------------------+
| age|  taux|nbenfantacharge|deuxiemevoiture|taux_eligible|          categorie|         sexe|situationfamiliale|
+----+------+---------------+---------------+-------------+-------------------+-------------+------------------+
|61.0| 188.0|              0|              0|            0|citadine economique|(1,[0],[1.0])|     (3,[1],[1.0])|
|50.0| 460.0|              3|              0|            0|      suv/crossover|    (1,[],[])|     (3,[0],[1.0])|
|54.0| 403.0|              0|              0|            0|citadine economique|(1,[0],[1.

## Classificateur

In [14]:
# Encodage des colonnes catégoriques (si la cible est catégorique)
indexer = StringIndexer(inputCol="categorie", outputCol="label")
data = indexer.fit(clients_immatriculations).transform(clients_immatriculations)

# Assembler les colonnes de caractéristiques
feature_cols = [col for col in data.columns if col != "categorie" and col != "label"]
assembler = VectorAssembler(inputCols=feature_cols, outputCol="features")
data = assembler.transform(data).select("features", "label")

In [15]:
# Diviser les données en 60% entraînement, 20% validation et 20% test
(trainingData, validationData, testData) = data.randomSplit([0.6, 0.2, 0.2], seed=42)

print(f"Taille de l'ensemble d'entraînement : {trainingData.count()}")
print(f"Taille de l'ensemble de validation : {validationData.count()}")
print(f"Taille de l'ensemble de test : {testData.count()}")

# Définir le modèle de régression linéaire
rf = RandomForestClassifier(labelCol="label", featuresCol="features", numTrees=100, maxDepth=10)
rf_model = rf.fit(trainingData)

                                                                                

Taille de l'ensemble d'entraînement : 60059
Taille de l'ensemble de validation : 19739
Taille de l'ensemble de test : 19603


                                                                                

In [16]:
trainPrediction =  rf_model.transform(trainingData)
evaluator = MulticlassClassificationEvaluator(
    labelCol="label", predictionCol="prediction", metricName="f1"
)

f1_score_train = evaluator.evaluate(trainPrediction)
print(f"F1-score Train : {f1_score_train * 100:.2f}%")

# Faire des prédictions
testPredictions = rf_model.transform(testData)
f1_score_test = evaluator.evaluate(testPredictions)

print(f"F1-score Test : {f1_score_test * 100:.2f}%")

                                                                                

F1-score Train : 78.24%
F1-score Test : 77.73%


                                                                                

## Prediction de marketing

In [17]:
df_marketing = spark.sql("SELECT * FROM marketing")
df_marketing.printSchema()

root
 |-- age: integer (nullable = true)
 |-- sexe: string (nullable = true)
 |-- taux: float (nullable = true)
 |-- situationfamiliale: string (nullable = true)
 |-- nbenfantacharge: integer (nullable = true)
 |-- deuxiemevoiture: boolean (nullable = true)
 |-- taux_eligible: boolean (nullable = true)



In [18]:
df_marketing = indexer_sexe_model.transform(df_marketing)
df_marketing = encoder_sexe_model.transform(df_marketing)

df_marketing = indexer_situationfamiliale_model.transform(df_marketing)
df_marketing = encoder_situationfamiliale_model.transform(df_marketing)

In [19]:
df_marketing = df_marketing.drop('sexe') 
df_marketing = df_marketing.drop('situationfamiliale') 

df_marketing = df_marketing.withColumnRenamed('sexe_encoded', 'sexe')
df_marketing = df_marketing.withColumnRenamed('situationfamiliale_encoded', 'situationfamiliale')

In [20]:
df_marketing = df_marketing.withColumn(
    "deuxiemevoiture",
    when(col("deuxiemevoiture") == False, 0)
    .when(col("deuxiemevoiture") == True, 1)
    .otherwise(col("deuxiemevoiture").cast("int"))
)

df_marketing = df_marketing.withColumn(
    "taux_eligible",
    when(col("taux_eligible") == False, 0)
    .when(col("taux_eligible") == True, 1)
    .otherwise(col("taux_eligible").cast("int"))
)

df_marketing.show(4)
clients_immatriculations.show(4)

print("Training data types:")
trainingData.printSchema()  # DataFrame de l'entraînement
print("Prediction data types:")
df_marketing.printSchema()  # DataFrame de prédiction

+---+------+---------------+---------------+-------------+----------+-------------+------------------------+------------------+
|age|  taux|nbenfantacharge|deuxiemevoiture|taux_eligible|sexe_index|         sexe|situationfamiliale_index|situationfamiliale|
+---+------+---------------+---------------+-------------+----------+-------------+------------------------+------------------+
| 21|1396.0|              0|              0|            0|       1.0|    (1,[],[])|                     1.0|     (3,[1],[1.0])|
| 35| 223.0|              0|              0|            1|       0.0|(1,[0],[1.0])|                     1.0|     (3,[1],[1.0])|
| 48| 401.0|              0|              0|            1|       0.0|(1,[0],[1.0])|                     1.0|     (3,[1],[1.0])|
| 26| 420.0|              3|              1|            1|       1.0|    (1,[],[])|                     0.0|     (3,[0],[1.0])|
+---+------+---------------+---------------+-------------+----------+-------------+---------------------

In [21]:
# Assemble features using the same columns as in training
feature_cols = ['age', 'taux', 'nbenfantacharge', 'deuxiemevoiture', 'taux_eligible', 'sexe', 'situationfamiliale']
assembler = VectorAssembler(inputCols=feature_cols, outputCol="features")
prediction_data = assembler.transform(df_marketing).select("features")

# Generate predictions
marketingPrediction = rf_model.transform(prediction_data)

# Show results
marketingPrediction.select("features", "prediction").show()

AnalysisException: cannot resolve 'age' given input columns: [features, prediction, probability, rawPrediction];
'Project ['age, 'taux, 'nbenfantacharge, 'deuxiemevoiture, 'taux_eligible, 'sexe_value, 'situationfamiliale_value, 'predicted_category]
+- Project [features#1435, rawPrediction#1449, probability#1455, UDF(rawPrediction#1449) AS prediction#1465]
   +- Project [features#1435, rawPrediction#1449, UDF(rawPrediction#1449) AS probability#1455]
      +- Project [features#1435, UDF(features#1435) AS rawPrediction#1449]
         +- Project [features#1435]
            +- Project [age#1057, taux#1059, nbenfantacharge#1061, deuxiemevoiture#1184, taux_eligible#1194, sexe_index#1074, sexe#1164, situationfamiliale_index#1109, situationfamiliale#1174, UDF(struct(age_double_VectorAssembler_0d33ee4c48e5, cast(age#1057 as double), taux_double_VectorAssembler_0d33ee4c48e5, cast(taux#1059 as double), nbenfantacharge_double_VectorAssembler_0d33ee4c48e5, cast(nbenfantacharge#1061 as double), deuxiemevoiture_double_VectorAssembler_0d33ee4c48e5, cast(deuxiemevoiture#1184 as double), taux_eligible_double_VectorAssembler_0d33ee4c48e5, cast(taux_eligible#1194 as double), sexe, sexe#1164, situationfamiliale, situationfamiliale#1174)) AS features#1435]
               +- Project [age#1057, taux#1059, nbenfantacharge#1061, deuxiemevoiture#1184, CASE WHEN (taux_eligible#1063 = false) THEN 0 WHEN (taux_eligible#1063 = true) THEN 1 ELSE cast(taux_eligible#1063 as int) END AS taux_eligible#1194, sexe_index#1074, sexe#1164, situationfamiliale_index#1109, situationfamiliale#1174]
                  +- Project [age#1057, taux#1059, nbenfantacharge#1061, CASE WHEN (deuxiemevoiture#1062 = false) THEN 0 WHEN (deuxiemevoiture#1062 = true) THEN 1 ELSE cast(deuxiemevoiture#1062 as int) END AS deuxiemevoiture#1184, taux_eligible#1063, sexe_index#1074, sexe#1164, situationfamiliale_index#1109, situationfamiliale#1174]
                     +- Project [age#1057, taux#1059, nbenfantacharge#1061, deuxiemevoiture#1062, taux_eligible#1063, sexe_index#1074, sexe#1164, situationfamiliale_index#1109, situationfamiliale_encoded#1127 AS situationfamiliale#1174]
                        +- Project [age#1057, taux#1059, nbenfantacharge#1061, deuxiemevoiture#1062, taux_eligible#1063, sexe_index#1074, sexe_encoded#1090 AS sexe#1164, situationfamiliale_index#1109, situationfamiliale_encoded#1127]
                           +- Project [age#1057, taux#1059, nbenfantacharge#1061, deuxiemevoiture#1062, taux_eligible#1063, sexe_index#1074, sexe_encoded#1090, situationfamiliale_index#1109, situationfamiliale_encoded#1127]
                              +- Project [age#1057, taux#1059, situationfamiliale#1060, nbenfantacharge#1061, deuxiemevoiture#1062, taux_eligible#1063, sexe_index#1074, sexe_encoded#1090, situationfamiliale_index#1109, situationfamiliale_encoded#1127]
                                 +- Project [age#1057, sexe#1058, taux#1059, situationfamiliale#1060, nbenfantacharge#1061, deuxiemevoiture#1062, taux_eligible#1063, sexe_index#1074, sexe_encoded#1090, situationfamiliale_index#1109, UDF(cast(situationfamiliale_index#1109 as double), 0) AS situationfamiliale_encoded#1127]
                                    +- Project [age#1057, sexe#1058, taux#1059, situationfamiliale#1060, nbenfantacharge#1061, deuxiemevoiture#1062, taux_eligible#1063, sexe_index#1074, sexe_encoded#1090, UDF(cast(situationfamiliale#1060 as string)) AS situationfamiliale_index#1109]
                                       +- Project [age#1057, sexe#1058, taux#1059, situationfamiliale#1060, nbenfantacharge#1061, deuxiemevoiture#1062, taux_eligible#1063, sexe_index#1074, UDF(cast(sexe_index#1074 as double), 0) AS sexe_encoded#1090]
                                          +- Project [age#1057, sexe#1058, taux#1059, situationfamiliale#1060, nbenfantacharge#1061, deuxiemevoiture#1062, taux_eligible#1063, UDF(cast(sexe#1058 as string)) AS sexe_index#1074]
                                             +- Project [age#1057, sexe#1058, taux#1059, situationfamiliale#1060, nbenfantacharge#1061, deuxiemevoiture#1062, taux_eligible#1063]
                                                +- SubqueryAlias spark_catalog.concessionnaire.marketing
                                                   +- Relation concessionnaire.marketing[age#1057,sexe#1058,taux#1059,situationfamiliale#1060,nbenfantacharge#1061,deuxiemevoiture#1062,taux_eligible#1063] parquet
