In [1]:
from pyspark.sql import SparkSession
from pyspark.sql import functions as F
from pyspark.sql.functions import avg, when, trim, split, regexp_replace, round, lower, col, encode, count

In [2]:
spark = SparkSession.builder\
    .appName("AggregateCatalogueCo2")\
    .enableHiveSupport()\
    .getOrCreate()


Using Spark's default log4j profile: org/apache/spark/log4j-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
24/11/13 11:11:19 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [3]:
spark.sparkContext.setLogLevel("OFF")
spark.catalog.clearCache()
spark.sql("USE concessionnaire")

df_catalogue = spark.sql("SELECT * FROM catalogue_ext")
df_co2 = spark.sql("SELECT * FROM crit_air_ext")

df_catalogue = df_catalogue.filter(df_catalogue['marque'] != 'marque') # A GERER DANS L'IMPORT DE HIVE ???

In [4]:
df_co2.show()
df_catalogue.show()

+--------------------+-----------+----------+------------+
|       marque_modele|bonus_malus|rejets_co2|cout_energie|
+--------------------+-----------+----------+------------+
|      CITROEN C-ZERO|  -6 000€ 1|       0.0|       491 €|
|MERCEDES SPRINTER...|    +8 753€|     200.0|       799 €|
|VOLKSWAGEN Passat...|          -|      31.0|        56 €|
|    SMART EQ FORFOUR|  -6 000€ 1|       0.0|       175 €|
|BENTLEY BENTAYGA ...|          -|      84.0|       102 €|
|SMART EQ FORTWO C...|  -6 000€ 1|       0.0|       175 €|
|SMART EQ FORFOUR ...|  -6 000€ 1|       0.0|       213 €|
|AUDI Q5 50 TFSI e...|          -|      49.0|       105 €|
|MERCEDES SPRINTER...|    +8 753€|     255.0|       988 €|
|MERCEDES SPRINTER...|    +8 753€|     200.0|       799 €|
|KIA SOUL Moteur Ã...|  -6 000€ 1|       0.0|       214 €|
|MERCEDES VITO Tou...|  -6 000€ 1|       0.0|       411 €|
|MERCEDES SPRINTER...|    +8 753€|     262.0|       999 €|
|SMART EQ FORFOUR ...|  -6 000€ 1|       0.0|       213 

### Remarque 001

- **co2** dispose de marque et modele dans la meme colone
- La colone **nom** du catalogue n'est pas nommée **modele** dans **co2**
- La colone **modele** dans les 2 tableaux n'ont pas la meme casse.
- La colone **marque** dans les 2 tableaux n'ont pas la meme casse.
- Le signe **€** est mentionné dans la colone **bonus_malus** de co2.
- Le signe **€** est mentionné dans la colone **cout_energie** de co2.
- Le chiffre 1 peut apparaitre après le signe **€** dans la colone **bonus_malus** de co2.

In [5]:
# co2 dispose de marque et modele dans la meme colone
df_co2 = df_co2.withColumn("marque", split(df_co2["marque_modele"], " ", 2).getItem(0))
df_co2 = df_co2.withColumn("modele", split(df_co2["marque_modele"], " ", 2).getItem(1))
df_co2 = df_co2.drop('marque_modele')

# La colone **nom** du catalogue n'est pas nommée **modele** dans dans **co2**
df_catalogue = df_catalogue.withColumnRenamed("nom", "modele")

# La colone modele dans les 2 tableaux n’ont pas la meme casse.
df_co2 = df_co2.withColumn("marque", lower(trim(col("marque"))))
df_catalogue = df_catalogue.withColumn("marque", lower(trim(col("marque"))))

# La colone modele dans les 2 tableaux n’ont pas la meme casse.
df_co2 = df_co2.withColumn("modele", lower(trim(col("modele"))))
df_catalogue = df_catalogue.withColumn("modele", lower(trim(col("modele"))))

# Le signe € est mentionné dans la colone bonus_malus de co2.
df_co2 = df_co2.withColumn("bonus_malus", split(trim(df_co2["bonus_malus"]), "€").getItem(0))
# Le signe € est mentionné dans la colone cout_energie de co2.
df_co2 = df_co2.withColumn("cout_energie", split(trim(df_co2["cout_energie"]), "€").getItem(0))

# Le chiffre 1 peut apparaitre après le signe € dans la colone bonus_malus de co2.
df_co2 = df_co2.withColumn("bonus_malus", regexp_replace(trim(df_co2["bonus_malus"]), "[^0-9-]", "").cast("float"))

In [6]:
df_co2.show()
df_catalogue.show()

+-----------+----------+------------+----------+--------------------+
|bonus_malus|rejets_co2|cout_energie|    marque|              modele|
+-----------+----------+------------+----------+--------------------+
|    -6000.0|       0.0|        491 |   citroen|              c-zero|
|     8753.0|     200.0|        799 |  mercedes|sprinter combi 31...|
|       null|      31.0|         56 |volkswagen|passat sw 1.4 tsi...|
|    -6000.0|       0.0|        175 |     smart|          eq forfour|
|       null|      84.0|        102 |   bentley|     bentayga hybrid|
|    -6000.0|       0.0|        175 |     smart|eq fortwo cabrio ...|
|    -6000.0|       0.0|        213 |     smart|eq forfour 7 kw m...|
|       null|      49.0|        105 |      audi|q5 50 tfsi e (299...|
|     8753.0|     255.0|        988 |  mercedes|sprinter combi 31...|
|     8753.0|     200.0|        799 |  mercedes|sprinter combi 31...|
|    -6000.0|       0.0|        214 |       kia|soul moteur ã©lec...|
|    -6000.0|       

In [7]:
marques_count_catalogue = df_catalogue.groupBy("marque").count()
marques_count_co2 = df_co2.groupBy("marque").count()

marques_count_catalogue = marques_count_catalogue.withColumnRenamed("count", "count_catalogue")
marques_count_co2 = marques_count_co2.withColumnRenamed("count", "count_co2")

marques_count_joined = marques_count_catalogue.join(
    marques_count_co2,
    on="marque",
    how="outer"  # Utilise "left", "right", ou "outer" si nécessaire
).select(
    "marque",
    F.col("count_catalogue"),
    F.col("count_co2")
)

marques_count_joined = marques_count_joined.fillna(0, subset=["count_catalogue", "count_co2"])


marques_count_joined.show(n=1000)

+----------+---------------+---------+
|    marque|count_catalogue|count_co2|
+----------+---------------+---------+
|      audi|             20|        8|
|   bentley|              0|        1|
|       bmw|             20|       12|
|   citroen|              0|        2|
|     dacia|              5|        0|
|  daihatsu|              5|        0|
|        ds|              0|        2|
|      fiat|             10|        0|
|      ford|             10|        0|
|     honda|              5|        0|
|   hyundai|              0|        3|
|   hyunda�|              5|        0|
|    jaguar|             10|        1|
|       kia|              5|        6|
|    lancia|             10|        0|
|      land|              0|        5|
|  mercedes|             20|       42|
|      mini|             10|        2|
|mitsubishi|              0|        2|
|    nissan|             15|        9|
|   peugeot|             10|        5|
|   porsche|              0|        4|
|   renault|             

In [8]:
modele_count_catalogue = df_catalogue.groupBy("modele").count()
modele_count_co2 = df_co2.groupBy("modele").count()

modele_count_catalogue = modele_count_catalogue.withColumnRenamed("count", "count_catalogue")
modele_count_co2 = modele_count_co2.withColumnRenamed("count", "count_co2")

modele_count_joined = modele_count_catalogue.join(
    modele_count_co2,
    on="modele",
    how="outer"  # Utilise "left", "right", ou "outer" si nécessaire
).select(
    "modele",
    F.col("count_catalogue"),
    F.col("count_co2")
)

modele_count_joined_match = modele_count_catalogue.join(
    modele_count_co2,
    on="modele",
    how="inner"  # Utilise "left", "right", ou "outer" si nécessaire
).select(
    "modele",
    F.col("count_catalogue"),
    F.col("count_co2")
)

modele_count_joined = modele_count_joined.fillna(0, subset=["count_catalogue", "count_co2"])


modele_count_joined.show(n=1000, truncate=False)
modele_count_joined_match.show(n=1000, truncate=False)

+---------------------------------------------------------------------------------------------------+---------------+---------+
|modele                                                                                             |count_catalogue|count_co2|
+---------------------------------------------------------------------------------------------------+---------------+---------+
|1007 1.4                                                                                           |10             |0        |
|120i                                                                                               |10             |0        |
|208 e- tense (136 ch)                                                                              |0              |1        |
|225xe active tourer                                                                                |0              |1        |
|330e berline                                                                                       |0  

In [12]:
df_normalized = marques_count_joined.withColumn("marque_normalisee", lower(regexp_replace("marque", "ï", "i")))
df_normalized.show(n=1000)

+----------+---------------+---------+-----------------+
|    marque|count_catalogue|count_co2|marque_normalisee|
+----------+---------------+---------+-----------------+
|      audi|             20|        8|             audi|
|   bentley|              0|        1|          bentley|
|       bmw|             20|       12|              bmw|
|   citroen|              0|        2|          citroen|
|     dacia|              5|        0|            dacia|
|  daihatsu|              5|        0|         daihatsu|
|        ds|              0|        2|               ds|
|      fiat|             10|        0|             fiat|
|      ford|             10|        0|             ford|
|     honda|              5|        0|            honda|
|   hyundai|              0|        3|          hyundai|
|   hyunda�|              5|        0|          hyunda�|
|    jaguar|             10|        1|           jaguar|
|       kia|              5|        6|              kia|
|    lancia|             10|   