In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, when, sum, count, avg, round, udf, corr
from pyspark.ml.feature import StringIndexer, OneHotEncoder, VectorAssembler, IndexToString, StandardScaler
from pyspark.ml.regression import LinearRegression
from pyspark.ml.evaluation import MulticlassClassificationEvaluator, RegressionEvaluator
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder
from pyspark.sql.types import DoubleType
from pyspark.ml.stat import Correlation
from pyspark.ml.linalg import VectorUDT

In [2]:
spark = SparkSession.builder\
    .appName("Préparation de données")\
    .enableHiveSupport()\
    .getOrCreate()

spark.sparkContext.setLogLevel("OFF")
spark.catalog.clearCache()
spark.sql("USE concessionnaire")

Using Spark's default log4j profile: org/apache/spark/log4j-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
24/11/18 14:29:21 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
24/11/18 14:29:22 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.
24/11/18 14:29:22 WARN Utils: Service 'SparkUI' could not bind on port 4041. Attempting port 4042.
24/11/18 14:29:22 WARN Utils: Service 'SparkUI' could not bind on port 4042. Attempting port 4043.


DataFrame[]

In [3]:
clients_immatriculations = spark.sql("SELECT * FROM clients_immatriculations")
clients_immatriculations.printSchema()

root
 |-- immatriculation: string (nullable = true)
 |-- age: double (nullable = true)
 |-- sexe: string (nullable = true)
 |-- taux: double (nullable = true)
 |-- situationfamiliale: string (nullable = true)
 |-- nbenfantacharge: integer (nullable = true)
 |-- deuxiemevoiture: boolean (nullable = true)
 |-- taux_eligible: boolean (nullable = true)
 |-- marque: string (nullable = true)
 |-- modele: string (nullable = true)
 |-- puissance: integer (nullable = true)
 |-- longueur: string (nullable = true)
 |-- nbplaces: integer (nullable = true)
 |-- nbportes: integer (nullable = true)
 |-- couleur: string (nullable = true)
 |-- occasion: boolean (nullable = true)
 |-- prix: integer (nullable = true)
 |-- categorie: string (nullable = true)



### Verification des doublons

In [4]:
doublons = clients_immatriculations.groupBy(*clients_immatriculations.columns).agg(count("*").alias("count")).filter("count > 1")
doublons.show(truncate=False)

                                                                                

+---------------+---+----+----+------------------+---------------+---------------+-------------+------+------+---------+--------+--------+--------+-------+--------+----+---------+-----+
|immatriculation|age|sexe|taux|situationfamiliale|nbenfantacharge|deuxiemevoiture|taux_eligible|marque|modele|puissance|longueur|nbplaces|nbportes|couleur|occasion|prix|categorie|count|
+---------------+---+----+----+------------------+---------------+---------------+-------------+------+------+---------+--------+--------+--------+-------+--------+----+---------+-----+
+---------------+---+----+----+------------------+---------------+---------------+-------------+------+------+---------+--------+--------+--------+-------+--------+----+---------+-----+



### Analyse des **null**

In [5]:
doublons = clients_immatriculations.groupBy('nbenfantacharge').count().show()



+---------------+-----+
|nbenfantacharge|count|
+---------------+-----+
|              1|16425|
|              3|11451|
|              4| 9960|
|              2|16592|
|              0|44973|
+---------------+-----+



                                                                                

### Supprimer les colones innutiles

In [6]:
clients_immatriculations = clients_immatriculations.drop('immatriculation')
clients_immatriculations = clients_immatriculations.drop('couleur')

### OneHotEncoder

In [7]:
indexer_marque = StringIndexer(inputCol="marque", outputCol="marque_index")
indexer_marque = indexer_marque.fit(clients_immatriculations)
clients_immatriculations = indexer_marque.transform(clients_immatriculations)

encoder_marque = OneHotEncoder(inputCol="marque_index", outputCol="marque_encoded")
clients_immatriculations = encoder_marque.fit(clients_immatriculations).transform(clients_immatriculations)

                                                                                

In [8]:
indexer_sexe = StringIndexer(inputCol="sexe", outputCol="sexe_index")
indexer_sexe = indexer_sexe.fit(clients_immatriculations)
clients_immatriculations = indexer_sexe.transform(clients_immatriculations)

encoder_sexe = OneHotEncoder(inputCol="sexe_index", outputCol="sexe_encoded")
clients_immatriculations = encoder_sexe.fit(clients_immatriculations).transform(clients_immatriculations)

                                                                                

In [9]:
indexer_situationfamiliale = StringIndexer(inputCol="situationfamiliale", outputCol="situationfamiliale_index")
indexer_situationfamiliale = indexer_situationfamiliale.fit(clients_immatriculations)
clients_immatriculations = indexer_situationfamiliale.transform(clients_immatriculations)

encoder_situationfamiliale = OneHotEncoder(inputCol="situationfamiliale_index", outputCol="situationfamiliale_encoded")
clients_immatriculations = encoder_situationfamiliale.fit(clients_immatriculations).transform(clients_immatriculations)

                                                                                

In [10]:
clients_immatriculations = clients_immatriculations.drop('marque') 
clients_immatriculations = clients_immatriculations.drop('marque_index')
clients_immatriculations = clients_immatriculations.drop('sexe') 
clients_immatriculations = clients_immatriculations.drop('sexe_index')
clients_immatriculations = clients_immatriculations.drop('situationfamiliale') 
clients_immatriculations = clients_immatriculations.drop('situationfamiliale_index')

clients_immatriculations = clients_immatriculations.withColumnRenamed('marque_encoded', 'marque')
clients_immatriculations = clients_immatriculations.withColumnRenamed('sexe_encoded', 'sexe')
clients_immatriculations = clients_immatriculations.withColumnRenamed('situationfamiliale_encoded', 'situationfamiliale')

## Changer des Boolean en Int

In [11]:
clients_immatriculations = clients_immatriculations.withColumn(
    "deuxiemevoiture",
    when(col("deuxiemevoiture") == False, 0)
    .when(col("deuxiemevoiture") == True, 1)
    .otherwise(col("deuxiemevoiture").cast("int"))
)

clients_immatriculations = clients_immatriculations.withColumn(
    "taux_eligible",
    when(col("taux_eligible") == False, 0)
    .when(col("taux_eligible") == True, 1)
    .otherwise(col("taux_eligible").cast("int"))
)

clients_immatriculations = clients_immatriculations.withColumn(
    "occasion",
    when(col("occasion") == False, 0)
    .when(col("occasion") == True, 1)
    .otherwise(col("occasion").cast("int"))
)

clients_immatriculations.show(4)

+----+-----+---------------+---------------+-------------+----------------+---------+-----------+--------+--------+--------+-----+-------------------+---------------+-------------+------------------+
| age| taux|nbenfantacharge|deuxiemevoiture|taux_eligible|          modele|puissance|   longueur|nbplaces|nbportes|occasion| prix|          categorie|         marque|         sexe|situationfamiliale|
+----+-----+---------------+---------------+-------------+----------------+---------+-----------+--------+--------+--------+-----+-------------------+---------------+-------------+------------------+
|61.0|188.0|              0|              0|            0|     picanto 1.1|       65|     courte|       5|       5|       0| 8990|citadine economique|(18,[13],[1.0])|(1,[0],[1.0])|     (3,[1],[1.0])|
|50.0|460.0|              3|              0|            0|vel satis 3.5 v6|      245|tres longue|       5|       5|       0|49200|      suv/crossover| (18,[2],[1.0])|    (1,[],[])|     (3,[0],[1.0])|


### Analyse de **Modele**

In [12]:
count_modele = clients_immatriculations.select("modele").distinct().count()
print(f"Il y a {count_modele} modeles.")



Il y a 28 modeles.


                                                                                

On travail uniquement sur les catégories, les modèles spécifiques n'ajoutent pas de valeur.
L'analyse est simplifier en se concentrant sur des regroupements plus larges.

In [13]:
clients_immatriculations = clients_immatriculations.drop('modele') 

clients_immatriculations.show(n=2)

+----+-----+---------------+---------------+-------------+---------+-----------+--------+--------+--------+-----+-------------------+---------------+-------------+------------------+
| age| taux|nbenfantacharge|deuxiemevoiture|taux_eligible|puissance|   longueur|nbplaces|nbportes|occasion| prix|          categorie|         marque|         sexe|situationfamiliale|
+----+-----+---------------+---------------+-------------+---------+-----------+--------+--------+--------+-----+-------------------+---------------+-------------+------------------+
|61.0|188.0|              0|              0|            0|       65|     courte|       5|       5|       0| 8990|citadine economique|(18,[13],[1.0])|(1,[0],[1.0])|     (3,[1],[1.0])|
|50.0|460.0|              3|              0|            0|      245|tres longue|       5|       5|       0|49200|      suv/crossover| (18,[2],[1.0])|    (1,[],[])|     (3,[0],[1.0])|
+----+-----+---------------+---------------+-------------+---------+-----------+-----

### Analyse de **longueur**

In [14]:
clients_immatriculations.select("longueur").distinct().show()



+-----------+
|   longueur|
+-----------+
|    moyenne|
|tres longue|
|     courte|
|     longue|
+-----------+



                                                                                

In [15]:
indexer_longueur = StringIndexer(inputCol="longueur", outputCol="longueur_indexed")
indexer_model_longueur = indexer_longueur.fit(clients_immatriculations)
clients_immatriculations = indexer_model_longueur.transform(clients_immatriculations)

clients_immatriculations = clients_immatriculations.drop('longueur')
clients_immatriculations = clients_immatriculations.withColumnRenamed('longueur_indexed', 'longueur')

clients_immatriculations.show(n=4)

                                                                                

+----+-----+---------------+---------------+-------------+---------+--------+--------+--------+-----+-------------------+---------------+-------------+------------------+--------+
| age| taux|nbenfantacharge|deuxiemevoiture|taux_eligible|puissance|nbplaces|nbportes|occasion| prix|          categorie|         marque|         sexe|situationfamiliale|longueur|
+----+-----+---------------+---------------+-------------+---------+--------+--------+--------+-----+-------------------+---------------+-------------+------------------+--------+
|61.0|188.0|              0|              0|            0|       65|       5|       5|       0| 8990|citadine economique|(18,[13],[1.0])|(1,[0],[1.0])|     (3,[1],[1.0])|     1.0|
|50.0|460.0|              3|              0|            0|      245|       5|       5|       0|49200|      suv/crossover| (18,[2],[1.0])|    (1,[],[])|     (3,[0],[1.0])|     0.0|
|54.0|403.0|              0|              0|            0|       55|       5|       3|       0|12200

### Nombre de place

In [16]:
clients_immatriculations.groupBy("nbplaces").count().show()

[Stage 34:===>                                                    (1 + 15) / 16]

+--------+-----+
|nbplaces|count|
+--------+-----+
|       5|99401|
+--------+-----+



                                                                                

Nombre de place ne dispose d'aucune variation, elle n'apporte rien au modele.

In [17]:
clients_immatriculations = clients_immatriculations.drop('nbplaces')

## Normalisation

In [18]:
clients_immatriculations.groupBy("categorie").count().show()

+-------------------+-----+
|          categorie|count|
+-------------------+-----+
|              autre| 5795|
|          familiale|30361|
|citadine economique|26778|
|      suv/crossover|36467|
+-------------------+-----+



                                                                                

In [19]:
clients_immatriculations.printSchema()
clients_immatriculations.show()

root
 |-- age: double (nullable = true)
 |-- taux: double (nullable = true)
 |-- nbenfantacharge: integer (nullable = true)
 |-- deuxiemevoiture: integer (nullable = true)
 |-- taux_eligible: integer (nullable = true)
 |-- puissance: integer (nullable = true)
 |-- nbportes: integer (nullable = true)
 |-- occasion: integer (nullable = true)
 |-- prix: integer (nullable = true)
 |-- categorie: string (nullable = true)
 |-- marque: vector (nullable = true)
 |-- sexe: vector (nullable = true)
 |-- situationfamiliale: vector (nullable = true)
 |-- longueur: double (nullable = false)

+----+------+---------------+---------------+-------------+---------+--------+--------+------+-------------------+---------------+-------------+------------------+--------+
| age|  taux|nbenfantacharge|deuxiemevoiture|taux_eligible|puissance|nbportes|occasion|  prix|          categorie|         marque|         sexe|situationfamiliale|longueur|
+----+------+---------------+---------------+-------------+---------

## Classificateur

In [20]:
# Encodage des colonnes catégoriques (si la cible est catégorique)
indexer = StringIndexer(inputCol="categorie", outputCol="label")
data = indexer.fit(clients_immatriculations).transform(clients_immatriculations)

# Assembler les colonnes de caractéristiques
feature_cols = [col for col in data.columns if col != "categorie" and col != "label"]
assembler = VectorAssembler(inputCols=feature_cols, outputCol="features")
data = assembler.transform(data).select("features", "label")

In [21]:
# Diviser les données en 60% entraînement, 20% validation et 20% test
(trainingData, validationData, testData) = data.randomSplit([0.6, 0.2, 0.2], seed=42)

print(f"Taille de l'ensemble d'entraînement : {trainingData.count()}")
print(f"Taille de l'ensemble de validation : {validationData.count()}")
print(f"Taille de l'ensemble de test : {testData.count()}")

# Définir le modèle de régression linéaire
lr = LinearRegression(featuresCol="features", labelCol="label")

# Entraîner le modèle
lr_model = lr.fit(trainingData)

# Afficher les coefficients du modèle
print("Coefficients : ", lr_model.coefficients)
print("Intercept : ", lr_model.intercept)

                                                                                

Taille de l'ensemble d'entraînement : 60060




Taille de l'ensemble de validation : 19739


                                                                                

Taille de l'ensemble de test : 19602


                                                                                

Coefficients :  [0.00047031268159792114,0.0001805321731593393,-0.003283797793364148,0.03922042821243353,0.026775825498363964,-0.002029972650726769,-0.007751789597702686,0.194009292473186,-1.947121006998374e-05,-8.732150585933771e-05,-0.06458842676474034,-0.2820462408852232,-0.7664752474585373,-0.05147172570894013,-0.45535548334247006,-1.1732420882695298,0.032891029089999094,-0.6171954068342261,-0.33475766724322903,-0.27581551926914083,0.5845944935950161,1.4741967942237546,0.029685774171510804,-0.002963666063063168,-0.3455737124975541,1.0474376366969986,-0.3397734340530555,-0.0022397554089655623,0.13418897535479635,0.19353993461303012,0.1486636696752328,-0.30926766744819817]
Intercept :  2.386038671270097


In [22]:
# Faire des prédictions
predictions = lr_model.transform(testData)

# Évaluer les prédictions
evaluator = RegressionEvaluator(labelCol="label", predictionCol="prediction", metricName="rmse")
rmse = evaluator.evaluate(predictions)
print(f"Root Mean Squared Error (RMSE) : {rmse * 100:.2f}%")

# Afficher les prédictions
predictions.select("features", "label", "prediction").show()

                                                                                

Root Mean Squared Error (RMSE) : 39.28%
+--------------------+-----+------------------+
|            features|label|        prediction|
+--------------------+-----+------------------+
|(32,[0,1,2,3,4,5,...|  2.0|2.1422969604064086|
|(32,[0,1,2,3,4,5,...|  2.0| 2.166870859162601|
|(32,[0,1,2,3,4,5,...|  2.0|2.1376089552183424|
|(32,[0,1,2,3,4,5,...|  2.0|2.1486362525878318|
|(32,[0,1,2,3,4,5,...|  0.0|0.4594176618777621|
|(32,[0,1,2,3,4,5,...|  0.0|0.4652048810687126|
|(32,[0,1,2,3,4,5,...|  0.0|0.4009656213469235|
|(32,[0,1,2,3,4,5,...|  2.0|1.9348433315584215|
|(32,[0,1,2,3,4,5,...|  2.0|1.9127138032546211|
|(32,[0,1,2,3,4,5,...|  2.0|1.9727967982401848|
|(32,[0,1,2,3,4,5,...|  2.0|1.9151357511644758|
|(32,[0,1,2,3,4,5,...|  2.0|1.8618416949210865|
|(32,[0,1,2,3,4,5,...|  2.0|1.9405537224185583|
|(32,[0,1,2,3,4,5,...|  2.0|1.9260056460514352|
|(32,[0,1,2,3,4,5,...|  2.0| 1.983848124636319|
|(32,[0,1,2,3,4,5,...|  2.0|1.9279961451131062|
|(32,[0,1,2,3,4,5,...|  2.0|1.9862034338652115|
