In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, when, count, floor
from pyspark.ml.feature import StringIndexer, OneHotEncoder, VectorAssembler, IndexToString, StandardScaler, StringIndexerModel, OneHotEncoderModel
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.ml.classification import RandomForestClassificationModel
from pyspark.sql.types import DoubleType
from itertools import product

import seaborn as sns
import matplotlib.pyplot as plt

In [2]:
# Build (or reuse) a Hive-enabled Spark session for this notebook.
spark = (
    SparkSession.builder
    .appName("Classification RandomForest")
    .config("spark.executor.memory", "4g")
    .config("spark.driver.memory", "4g")
    .config("spark.sql.shuffle.partitions", "200")
    .enableHiveSupport()
    .getOrCreate()
)

# Turn Spark logging off and clear anything held in the in-memory table cache.
spark.sparkContext.setLogLevel("OFF")
spark.catalog.clearCache()

# Select the database that unqualified table names in later cells resolve against.
spark.sql("USE concessionnaire")

Using Spark's default log4j profile: org/apache/spark/log4j-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
24/11/26 12:19:42 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
24/11/26 12:19:43 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.


DataFrame[]

In [3]:
# Score the `marketing` table with the saved RandomForest model.
#
# The whole pipeline runs inside try/finally so that the Spark session is
# stopped even when a step raises (e.g. the table is missing); previously
# spark.stop() was only reached on success, leaving the session alive on error.
try:
    # Load the marketing table (resolved against the current database).
    df_marketing = spark.sql("SELECT * FROM marketing")
    df_marketing.printSchema()

    df_marketing.show()

    # Load the saved transformer models from HDFS.
    indexer_model = StringIndexerModel.load("hdfs://namenode:9000/user/model/indexer_model")
    indexer_sexe = StringIndexerModel.load("hdfs://namenode:9000/user/model/indexer_sexe")
    indexer_situationfamiliale = StringIndexerModel.load("hdfs://namenode:9000/user/model/indexer_situationfamiliale")
    encoder_sexe = OneHotEncoderModel.load("hdfs://namenode:9000/user/model/encoder_sexe")
    encoder_situationfamiliale = OneHotEncoderModel.load("hdfs://namenode:9000/user/model/encoder_situationfamiliale")

    # Apply the index -> one-hot transformations for each categorical column.
    # NOTE(review): these are expected to mirror the training pipeline — confirm
    # against the training notebook.
    df_marketing = indexer_sexe.transform(df_marketing)
    df_marketing = encoder_sexe.transform(df_marketing)
    df_marketing = indexer_situationfamiliale.transform(df_marketing)
    df_marketing = encoder_situationfamiliale.transform(df_marketing)

    # Convert the boolean columns to integers: False -> 0, True -> 1, and any
    # other value falls back to a plain cast to int.
    for bool_col in ("deuxiemevoiture", "taux_eligible"):
        df_marketing = df_marketing.withColumn(
            bool_col,
            when(col(bool_col) == False, 0)
            .when(col(bool_col) == True, 1)
            .otherwise(col(bool_col).cast("int"))
        )

    # Feature columns: every column except the target/label and the raw
    # categorical columns. The loop variable is deliberately not called `col`
    # so it does not shadow pyspark.sql.functions.col.
    # NOTE(review): the resulting column set and order must match what the
    # model was trained on — verify.
    feature_cols = [
        column_name
        for column_name in df_marketing.columns
        if column_name not in ["categorie", "label", 'sexe', 'situationfamiliale']
    ]
    df_marketing.show()

    # Assemble the feature columns into a single vector column.
    assembler = VectorAssembler(inputCols=feature_cols, outputCol="features")
    prediction_data = assembler.transform(df_marketing).select("features")

    # Load the trained classifier from HDFS.
    hdfs_path = "hdfs://namenode:9000/user/model/categorie"
    model = RandomForestClassificationModel.load(hdfs_path)

    # Run the predictions.
    marketingPrediction = model.transform(prediction_data)

    # Map the numeric predictions back to string labels using the labels
    # stored in the loaded `indexer_model`.
    label_to_category = IndexToString(
        inputCol="prediction",
        outputCol="predicted_category",
        labels=indexer_model.labels
    )
    result = label_to_category.transform(marketingPrediction)

    # Display the results.
    result.select("features", "prediction", "predicted_category").show()
finally:
    # Always stop the SparkSession, whether or not the steps above succeeded.
    spark.stop()

AnalysisException: Table or view not found: marketing; line 1 pos 14;
'Project [*]
+- 'UnresolvedRelation [marketing], [], false
