In [None]:
from itertools import product

from pyspark.ml.classification import MultilayerPerceptronClassifier, RandomForestClassifier
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.ml.feature import StringIndexer, OneHotEncoder, VectorAssembler, IndexToString, StandardScaler
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, when, count
from pyspark.sql.types import DoubleType

ImportError: cannot import name 'MultilayerPerceptron' from 'pyspark.ml.classification' (/usr/local/spark/python/pyspark/ml/classification.py)

In [None]:
# Build (or reuse) the Hive-enabled Spark session used by every cell below.
spark = (
    SparkSession.builder
    .appName("Classification RandomForest")
    .enableHiveSupport()
    .getOrCreate()
)

# Turn Spark's logging off and start from an empty cache.
spark.sparkContext.setLogLevel("OFF")
spark.catalog.clearCache()

# Select the database so the unqualified table names below resolve against it.
spark.sql("USE concessionnaire")

In [None]:
# Load every column of the `clients_immatriculations` table and print its schema.
clients_query = "SELECT * FROM clients_immatriculations"
clients_immatriculations = spark.sql(clients_query)
clients_immatriculations.printSchema()

### Vérification des doublons

In [None]:
# A row counts as a duplicate when all of its columns match another row:
# group on every column and keep only the groups seen more than once.
all_columns = clients_immatriculations.columns
doublons = (
    clients_immatriculations
    .groupBy(*all_columns)
    .agg(count("*").alias("count"))
    .filter("count > 1")
)
doublons.show(truncate=False)

### Analyse des **null**

In [None]:
# Distribution of `nbenfantacharge`, to spot null or unexpected values.
# `.show()` only prints and returns None, so its result is not assigned:
# assigning it would overwrite the `doublons` DataFrame with None.
clients_immatriculations.groupBy('nbenfantacharge').count().show()

### Supprimer les colonnes inutiles

In [None]:
# Drop, in a single call, the columns this notebook treats as useless.
unused_columns = ['immatriculation', 'couleur', 'marque']
clients_immatriculations = clients_immatriculations.drop(*unused_columns)

clients_immatriculations.show()

### OneHotEncoder

In [None]:
# Step 1: map each distinct `sexe` string to a numeric index.
sexe_index_stage = StringIndexer(inputCol="sexe", outputCol="sexe_index")
indexer_sexe = sexe_index_stage.fit(clients_immatriculations)
clients_immatriculations = indexer_sexe.transform(clients_immatriculations)

# Step 2: turn that index into a one-hot encoded vector column.
sexe_onehot_stage = OneHotEncoder(inputCol="sexe_index", outputCol="sexe_encoded")
encoder_sexe = sexe_onehot_stage.fit(clients_immatriculations)
clients_immatriculations = encoder_sexe.transform(clients_immatriculations)

In [None]:
# Step 1: map each distinct `situationfamiliale` string to a numeric index.
situation_index_stage = StringIndexer(
    inputCol="situationfamiliale",
    outputCol="situationfamiliale_index",
)
indexer_situationfamiliale = situation_index_stage.fit(clients_immatriculations)
clients_immatriculations = indexer_situationfamiliale.transform(clients_immatriculations)

# Step 2: turn that index into a one-hot encoded vector column.
situation_onehot_stage = OneHotEncoder(
    inputCol="situationfamiliale_index",
    outputCol="situationfamiliale_encoded",
)
encoder_situationfamiliale = situation_onehot_stage.fit(clients_immatriculations)
clients_immatriculations = encoder_situationfamiliale.transform(clients_immatriculations)

In [None]:
# Remove these columns in one call instead of one `drop` per column.
# NOTE(review): they look like vehicle attributes that would leak the target
# `categorie` - confirm that this is the reason for dropping them.
dropped_columns = [
    'puissance',
    'nbportes',
    'occasion',
    'prix',
    'nbplaces',
    'modele',
    'longueur',
]
clients_immatriculations = clients_immatriculations.drop(*dropped_columns)

## Changer des Boolean en Int

In [None]:
# Rewrite both flag columns as integers: False -> 0, True -> 1, and any other
# value (for example null) falls through to a plain integer cast.
# `== False` / `== True` are intentional: on a Column they build a Spark
# expression, which `is` / `not` cannot do.
for flag_name in ("deuxiemevoiture", "taux_eligible"):
    flag = col(flag_name)
    flag_as_int = (
        when(flag == False, 0)
        .when(flag == True, 1)
        .otherwise(flag.cast("int"))
    )
    clients_immatriculations = clients_immatriculations.withColumn(flag_name, flag_as_int)

clients_immatriculations.show(4)

## Normalisation

In [None]:
# Row count per `categorie` (the column later used as the label), to see how
# the classes are distributed.
category_counts = clients_immatriculations.groupBy("categorie").count()
category_counts.show()

In [None]:
# Last look at the prepared data before it is assembled into model features:
# the schema first, then a preview using the `show()` defaults spelled out
# (20 rows, long cell values truncated).
clients_immatriculations.printSchema()
clients_immatriculations.show(n=20, truncate=True)

## Classificateur

In [None]:
# Apprentissage des labels
indexer_model = StringIndexer(inputCol="categorie", outputCol="label").fit(clients_immatriculations)

data = indexer_model.transform(clients_immatriculations)

feature_cols = [col for col in data.columns if col not in ["categorie", "label", 'sexe', 'situationfamiliale']]
assembler = VectorAssembler(inputCols=feature_cols, outputCol="features")
data = assembler.transform(data).select("features", "label")

# # Ajout de la normalisation
# scaler = StandardScaler(inputCol="features", outputCol="scaled_features", withStd=True, withMean=False)
# scaler_model = scaler.fit(data)
# data = scaler_model.transform(data).select("scaled_features", "label")

# data.show()

In [None]:
# Cache the assembled dataset so that the three counts below and every model
# fit reuse it instead of recomputing the whole preparation pipeline.
data.cache()

# Split the data into 60% training, 20% validation and 20% test.
# The seed makes the split reproducible.
(trainingData, validationData, testData) = data.randomSplit([0.6, 0.2, 0.2], seed=42)

print(f"Taille de l'ensemble d'entraînement : {trainingData.count()}")
print(f"Taille de l'ensemble de validation : {validationData.count()}")
print(f"Taille de l'ensemble de test : {testData.count()}")

# Baseline random forest classifier (not a linear regression, as an earlier
# comment claimed). A fixed seed keeps the trained forest reproducible.
rf = RandomForestClassifier(
    labelCol="label",
    featuresCol="features",
    numTrees=50,
    maxDepth=20,
    seed=42,
)
rf_model = rf.fit(trainingData)

In [None]:
# Hyperparameter values to try.
param_grid = {
    'numTrees': [10, 20, 50],
    'maxDepth': [5, 10, 15],
    'maxBins': [32, 64]
}

# The evaluator does not depend on the hyperparameters, so it is built once.
evaluator = MulticlassClassificationEvaluator(labelCol="label", predictionCol="prediction", metricName="accuracy")

# One (numTrees, maxDepth, maxBins, validation accuracy) tuple per combination.
results = []
best_model = None
best_accuracy = float("-inf")

# Try every combination of hyperparameters.
for numTrees, maxDepth, maxBins in product(param_grid['numTrees'], param_grid['maxDepth'], param_grid['maxBins']):
    # Configure the model; a fixed seed keeps the runs comparable and reproducible.
    rf = RandomForestClassifier(
        numTrees=numTrees,
        maxDepth=maxDepth,
        maxBins=maxBins,
        labelCol="label",
        featuresCol="features",
        seed=42,
    )

    # Train on the training split only.
    model = rf.fit(trainingData)

    # Score on the VALIDATION split: selecting hyperparameters on the test
    # split would leak it into model selection and bias the final estimate.
    predictions = model.transform(validationData)
    accuracy = evaluator.evaluate(predictions)

    results.append((numTrees, maxDepth, maxBins, accuracy))

    # Strict `>` keeps the first best combination, matching `max()` below.
    if accuracy > best_accuracy:
        best_accuracy = accuracy
        best_model = model

# Best combination according to validation accuracy.
best_params = max(results, key=lambda x: x[3])
print(f"Meilleurs paramètres : numTrees={best_params[0]}, maxDepth={best_params[1]}, maxBins={best_params[2]} avec précision={best_params[3]}")

# The test split is used exactly once, on the selected model, for an unbiased estimate.
test_accuracy = evaluator.evaluate(best_model.transform(testData))
print(f"Précision sur l'ensemble de test : {test_accuracy}")
