In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, when, sum, count, avg, round
from pyspark.ml.feature import StringIndexer, OneHotEncoder, VectorAssembler, IndexToString
from pyspark.ml.classification import RandomForestClassifier

In [2]:
spark = SparkSession.builder\
    .appName("Préparation de données")\
    .enableHiveSupport()\
    .getOrCreate()

spark.sparkContext.setLogLevel("OFF")
spark.catalog.clearCache()
spark.sql("USE concessionnaire")

Using Spark's default log4j profile: org/apache/spark/log4j-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
24/11/15 15:41:56 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


DataFrame[]

In [3]:
df_catalogue = spark.sql("SELECT * FROM catalogue;")
df_catalogue.show(n=10)

                                                                                

+----------+----------------+-----------+--------+--------+-------------------+-----------+----------+------------+
|    marque|          modele|   longueur|nbplaces|nbportes|          categorie|bonus_malus|rejets_co2|cout_energie|
+----------+----------------+-----------+--------+--------+-------------------+-----------+----------+------------+
|   renault|vel satis 3.5 v6|tres longue|       5|       5|      SUV/Crossover|    -6000.0|       0.0|       206.0|
|    lancia| ypsilon 1.4 16v|     courte|       5|       3|citadine economique|    -4865.0|      33.0|       158.0|
|volkswagen|     polo 1.2 6v|     courte|       5|       3|citadine economique|    -6000.0|      23.0|        96.0|
|  daihatsu|       cuore 1.0|     courte|       5|       3|citadine economique|    -4865.0|      33.0|       158.0|
|     smart|      toledo 1.6|     longue|       5|       5|          familiale|    -6000.0|       0.0|       191.0|
|   renault|vel satis 3.5 v6|tres longue|       5|       5|             

### Verification des doublons

In [4]:
doublons = df_catalogue.groupBy(*df_catalogue.columns).agg(count("*").alias("count")).filter("count > 1")
doublons.show(truncate=False)

[Stage 2:>                                                        (0 + 12) / 12]

+------+------+--------+--------+--------+---------+-----------+----------+------------+-----+
|marque|modele|longueur|nbplaces|nbportes|categorie|bonus_malus|rejets_co2|cout_energie|count|
+------+------+--------+--------+--------+---------+-----------+----------+------------+-----+
+------+------+--------+--------+--------+---------+-----------+----------+------------+-----+



                                                                                

### Analyse des **null**

In [5]:
df_catalogue.select(
    *[sum(when(col(c).isNull(), 1).otherwise(0)).alias(c) for c in df_catalogue.columns]
).show()

+------+------+--------+--------+--------+---------+-----------+----------+------------+
|marque|modele|longueur|nbplaces|nbportes|categorie|bonus_malus|rejets_co2|cout_energie|
+------+------+--------+--------+--------+---------+-----------+----------+------------+
|     0|     0|       0|       0|       0|        0|          0|         0|           0|
+------+------+--------+--------+--------+---------+-----------+----------+------------+



### Normalisation de la **marque**

In [6]:
indexer = StringIndexer(inputCol="marque", outputCol="marque_index")
df_catalogue = indexer.fit(df_catalogue).transform(df_catalogue)

encoder = OneHotEncoder(inputCol="marque_index", outputCol="marque_encoded")
df_catalogue = encoder.fit(df_catalogue).transform(df_catalogue)

df_catalogue = df_catalogue.drop('marque') 
df_catalogue = df_catalogue.drop('marque_index') 

df_catalogue = df_catalogue.withColumnRenamed('marque_encoded', 'marque')

df_catalogue.show(n=10)

                                                                                

+----------------+-----------+--------+--------+-------------------+-----------+----------+------------+---------------+
|          modele|   longueur|nbplaces|nbportes|          categorie|bonus_malus|rejets_co2|cout_energie|         marque|
+----------------+-----------+--------+--------+-------------------+-----------+----------+------------+---------------+
|vel satis 3.5 v6|tres longue|       5|       5|      SUV/Crossover|    -6000.0|       0.0|       206.0| (19,[0],[1.0])|
| ypsilon 1.4 16v|     courte|       5|       3|citadine economique|    -4865.0|      33.0|       158.0|(19,[14],[1.0])|
|     polo 1.2 6v|     courte|       5|       3|citadine economique|    -6000.0|      23.0|        96.0| (19,[1],[1.0])|
|       cuore 1.0|     courte|       5|       3|citadine economique|    -4865.0|      33.0|       158.0|(19,[10],[1.0])|
|      toledo 1.6|     longue|       5|       5|          familiale|    -6000.0|       0.0|       191.0|(19,[18],[1.0])|
|vel satis 3.5 v6|tres longue|  

### Analyse de **Modele**

In [7]:
count_modele = df_catalogue.select("modele").distinct().count()
print(f"Il y a {count_modele} modeles.")

Il y a 32 modeles.


On travail uniquement sur les catégories, les modèles spécifiques n'ajoutent pas de valeur.
L'analyse est simplifier en se concentrant sur des regroupements plus larges.

In [8]:
indexer = StringIndexer(inputCol="categorie", outputCol="categorie_indexed")
df_catalogue = indexer.fit(df_catalogue).transform(df_catalogue)

encoder = OneHotEncoder(inputCol="categorie_indexed", outputCol="categorie_encoded")
df_catalogue = encoder.fit(df_catalogue).transform(df_catalogue)

df_catalogue = df_catalogue.drop('modele') 
df_catalogue = df_catalogue.drop('categorie') 
df_catalogue = df_catalogue.drop('categorie_indexed') 

df_catalogue = df_catalogue.withColumnRenamed('categorie_encoded', 'categorie')

df_catalogue.show(n=10)

+-----------+--------+--------+-----------+----------+------------+---------------+-------------+
|   longueur|nbplaces|nbportes|bonus_malus|rejets_co2|cout_energie|         marque|    categorie|
+-----------+--------+--------+-----------+----------+------------+---------------+-------------+
|tres longue|       5|       5|    -6000.0|       0.0|       206.0| (19,[0],[1.0])|(3,[1],[1.0])|
|     courte|       5|       3|    -4865.0|      33.0|       158.0|(19,[14],[1.0])|(3,[2],[1.0])|
|     courte|       5|       3|    -6000.0|      23.0|        96.0| (19,[1],[1.0])|(3,[2],[1.0])|
|     courte|       5|       3|    -4865.0|      33.0|       158.0|(19,[10],[1.0])|(3,[2],[1.0])|
|     longue|       5|       5|    -6000.0|       0.0|       191.0|(19,[18],[1.0])|(3,[0],[1.0])|
|tres longue|       5|       5|    -6000.0|       0.0|       206.0| (19,[0],[1.0])|    (3,[],[])|
|     courte|       5|       5|    -6000.0|      22.0|       126.0|(19,[15],[1.0])|    (3,[],[])|
|     courte|       

### Analyse de **longueur**

In [9]:
df_catalogue.select("longueur").distinct().show()

+-----------+
|   longueur|
+-----------+
|tres longue|
|     courte|
|     longue|
|    moyenne|
+-----------+



In [10]:
indexer_longueur = StringIndexer(inputCol="longueur", outputCol="longueur_indexed")
df_catalogue = indexer_longueur.fit(df_catalogue).transform(df_catalogue)

df_catalogue = df_catalogue.drop('longueur') 

df_catalogue = df_catalogue.withColumnRenamed('longueur_indexed', 'longueur')

df_catalogue.show(n=4)

+--------+--------+-----------+----------+------------+---------------+-------------+--------+
|nbplaces|nbportes|bonus_malus|rejets_co2|cout_energie|         marque|    categorie|longueur|
+--------+--------+-----------+----------+------------+---------------+-------------+--------+
|       5|       5|    -6000.0|       0.0|       206.0| (19,[0],[1.0])|(3,[1],[1.0])|     3.0|
|       5|       3|    -4865.0|      33.0|       158.0|(19,[14],[1.0])|(3,[2],[1.0])|     2.0|
|       5|       3|    -6000.0|      23.0|        96.0| (19,[1],[1.0])|(3,[2],[1.0])|     2.0|
|       5|       3|    -4865.0|      33.0|       158.0|(19,[10],[1.0])|(3,[2],[1.0])|     2.0|
+--------+--------+-----------+----------+------------+---------------+-------------+--------+
only showing top 4 rows



## Classificateur

In [11]:
assembler = VectorAssembler(
    inputCols=["nbplaces", "nbportes", "bonus_malus", "rejets_co2", "cout_energie", "marque", "longueur"],
    outputCol="features")

df_final = assembler.transform(df_catalogue_with_moyennes)


IllegalArgumentException: categorie_indexed does not exist. Available: nbplaces, nbportes, bonus_malus, rejets_co2, cout_energie, marque, categorie, longueur, features