In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, when, sum, count, avg, round, udf, corr
from pyspark.ml.feature import StringIndexer, OneHotEncoder, VectorAssembler, IndexToString, StandardScaler
from pyspark.ml.classification import RandomForestClassifier, DecisionTreeClassifier, GBTClassifier
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder
from pyspark.sql.types import DoubleType


In [2]:
# Create (or reuse) a Hive-enabled Spark session for this notebook.
spark = (
    SparkSession.builder
    .appName("Préparation de données")
    .enableHiveSupport()
    .getOrCreate()
)

# Silence Spark logging and drop any cached tables left in the session.
spark.sparkContext.setLogLevel("OFF")
spark.catalog.clearCache()

# Make `concessionnaire` the current database for the unqualified table names used below.
spark.sql("USE concessionnaire")

Using Spark's default log4j profile: org/apache/spark/log4j-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
24/11/18 08:01:35 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


DataFrame[]

In [3]:
# Load every column of the `clients_immatriculations` table from the current
# database and print its schema.
# NOTE(review): the output recorded with this cell is an AnalysisException
# ("Table or view not found") — confirm the table exists in `concessionnaire`
# before relying on any of the cells below.
clients_immatriculations = spark.sql("SELECT * FROM clients_immatriculations")
clients_immatriculations.printSchema()

AnalysisException: Table or view not found: clients_immatriculations; line 1 pos 14;
'Project [*]
+- 'UnresolvedRelation [clients_immatriculations], [], false


### Vérification des doublons

In [None]:
# Rows that are identical across every column: group on all columns and keep
# the groups that occur more than once.
doublons = (
    clients_immatriculations
    .groupBy(*clients_immatriculations.columns)
    .agg(count("*").alias("count"))
    .filter("count > 1")
)
doublons.show(truncate=False)

### Analyse des **null**

In [None]:
from pyspark.sql.functions import col, when, sum

# Count the null entries of every column in a single aggregation pass.
null_counts = clients_immatriculations.select(
    *[sum(when(col(c).isNull(), 1).otherwise(0)).alias(c) for c in clients_immatriculations.columns]
)

null_counts_dict = null_counts.collect()[0].asDict()

# Keep only the columns that contain at least one null.
# - The loop variable is not named `count`, so it cannot be confused with the
#   imported Spark `count` function.
# - A plain truthiness test is used because the aggregate is None (not 0) when
#   the table has no rows, and `None > 0` would raise a TypeError.
columns_with_nulls = [(col_name, n_nulls) for col_name, n_nulls in null_counts_dict.items() if n_nulls]

# An explicit schema is passed instead of bare column names: with names only,
# Spark has to infer the types from the data and fails when the list is empty,
# i.e. exactly when no column contains nulls.
nulls_df = spark.createDataFrame(columns_with_nulls, "Column string, `Null Count` bigint")

nulls_df.show()

In [None]:
# Replace nulls with 0 (`na.fill` is the alias of `fillna`).
# NOTE(review): per the PySpark docs a numeric fill value is only applied to
# numeric columns — nulls in string/boolean columns are presumably left as-is; confirm.
clients_immatriculations = clients_immatriculations.na.fill(0)

### Supprimer les colonnes inutiles

In [None]:
# Remove the `immatriculation` column; it is not used by any later cell.
clients_immatriculations = clients_immatriculations.drop('immatriculation')

### OneHotEncoder

In [None]:
# marque: map each label to a numeric index, then one-hot encode that index.
indexer_marque = StringIndexer(inputCol="marque", outputCol="marque_index").fit(clients_immatriculations)
marque_indexed = indexer_marque.transform(clients_immatriculations)

encoder_marque = OneHotEncoder(inputCol="marque_index", outputCol="marque_encoded")
clients_immatriculations = encoder_marque.fit(marque_indexed).transform(marque_indexed)

In [None]:
# sexe: map each label to a numeric index, then one-hot encode that index.
indexer_sexe = StringIndexer(inputCol="sexe", outputCol="sexe_index").fit(clients_immatriculations)
sexe_indexed = indexer_sexe.transform(clients_immatriculations)

encoder_sexe = OneHotEncoder(inputCol="sexe_index", outputCol="sexe_encoded")
clients_immatriculations = encoder_sexe.fit(sexe_indexed).transform(sexe_indexed)

In [None]:
# couleur: map each label to a numeric index, then one-hot encode that index.
indexer_couleur = StringIndexer(inputCol="couleur", outputCol="couleur_index").fit(clients_immatriculations)
couleur_indexed = indexer_couleur.transform(clients_immatriculations)

encoder_couleur = OneHotEncoder(inputCol="couleur_index", outputCol="couleur_encoded")
clients_immatriculations = encoder_couleur.fit(couleur_indexed).transform(couleur_indexed)

In [None]:
# situationfamiliale: map each label to a numeric index, then one-hot encode that index.
indexer_situationfamiliale = StringIndexer(
    inputCol="situationfamiliale",
    outputCol="situationfamiliale_index",
).fit(clients_immatriculations)
situationfamiliale_indexed = indexer_situationfamiliale.transform(clients_immatriculations)

encoder_situationfamiliale = OneHotEncoder(
    inputCol="situationfamiliale_index",
    outputCol="situationfamiliale_encoded",
)
clients_immatriculations = (
    encoder_situationfamiliale
    .fit(situationfamiliale_indexed)
    .transform(situationfamiliale_indexed)
)

In [None]:
# For each encoded attribute: drop the raw column and its intermediate index,
# then give the one-hot vector the original column name.
encoded_bases = ("marque", "sexe", "couleur", "situationfamiliale")

obsolete_cols = [name for base in encoded_bases for name in (base, f"{base}_index")]
clients_immatriculations = clients_immatriculations.drop(*obsolete_cols)

for base in encoded_bases:
    clients_immatriculations = clients_immatriculations.withColumnRenamed(f"{base}_encoded", base)

In [None]:
# Preview the DataFrame after the encoded vectors replaced the raw columns.
clients_immatriculations.show()

## Convertir les booléens en entiers

In [None]:
# Turn the flag columns into 0/1 integers: False -> 0, True -> 1, and any other
# value (including null) falls back to a plain cast to int.
for flag_name in ("deuxiemevoiture", "taux_eligible", "occasion"):
    flag = col(flag_name)
    clients_immatriculations = clients_immatriculations.withColumn(
        flag_name,
        when(flag == False, 0).when(flag == True, 1).otherwise(flag.cast("int")),
    )

clients_immatriculations.show(4)

### Analyse de **Modele**

In [None]:
# Number of distinct values in the `modele` column.
distinct_modeles = clients_immatriculations.select("modele").distinct()
count_modele = distinct_modeles.count()
print(f"Il y a {count_modele} modeles.")

On travaille uniquement sur les catégories : les modèles spécifiques n'ajoutent pas de valeur.
L'analyse est simplifiée en se concentrant sur des regroupements plus larges.

In [None]:
# Index the `categorie` labels. The fitted model is kept in `indexer_model`
# because its `labels` are needed later to map predictions back to names.
indexer = StringIndexer(inputCol="categorie", outputCol="categorie_indexed")
indexer_model = indexer.fit(clients_immatriculations)

# Swap the string column for its index and discard `modele` at the same time.
clients_immatriculations = (
    indexer_model.transform(clients_immatriculations)
    .drop("modele", "categorie")
    .withColumnRenamed("categorie_indexed", "categorie")
)

clients_immatriculations.show(n=10)

### Analyse de **longueur**

In [None]:
# List the distinct values taken by `longueur`.
(
    clients_immatriculations
    .select("longueur")
    .distinct()
    .show()
)

In [None]:
# Replace `longueur` by a numeric index.
# NOTE(review): StringIndexer's default ordering is by label frequency, so the
# index presumably does not reflect any natural order of the lengths — confirm
# this is acceptable for the scaling applied later.
indexer_longueur = StringIndexer(inputCol="longueur", outputCol="longueur_indexed")
indexer_model_longueur = indexer_longueur.fit(clients_immatriculations)

clients_immatriculations = (
    indexer_model_longueur.transform(clients_immatriculations)
    .drop("longueur")
    .withColumnRenamed("longueur_indexed", "longueur")
)

clients_immatriculations.show(n=4)

## Normalisation

In [None]:
# Number of rows per (indexed) category, to check class balance.
(
    clients_immatriculations
    .groupBy("categorie")
    .count()
    .show()
)

In [None]:
# Check the column types after the encoding / indexing steps above.
clients_immatriculations.printSchema()

In [None]:
# Step 1: numeric predictor columns.
# `categorie` is deliberately NOT listed here: it is the label the classifier is
# trained on (labelCol="categorie"), and putting it in the feature vector would
# leak the target into the inputs and make every metric meaningless.
numerical_cols = ['age', 'taux', 'nbenfantacharge', 'puissance', 'nbplaces', 'nbportes', 'prix', 'longueur']

# Step 2: pack the numeric columns into a single vector column.
numerical_assembler = VectorAssembler(
    inputCols=numerical_cols,
    outputCol="numerical_features"
)
df_numeric = numerical_assembler.transform(clients_immatriculations)

# Step 3: standardise the numeric vector (zero mean, unit variance).
# NOTE(review): the scaler is fitted on the full dataset, before the train/test
# split, so test rows contribute to the scaling statistics.
scaler = StandardScaler(
    inputCol="numerical_features",
    outputCol="scaled_numerical_features",
    withMean=True,
    withStd=True
)
scaler_model = scaler.fit(df_numeric)
df_scaled = scaler_model.transform(df_numeric)

# Step 4: concatenate the scaled numeric vector with the one-hot encoded columns.
categorical_vector_cols = ['marque', 'sexe', 'couleur', 'situationfamiliale']

final_assembler = VectorAssembler(
    inputCols=['scaled_numerical_features'] + categorical_vector_cols,
    outputCol="features"
)
df_final = final_assembler.transform(df_scaled)

# Show the resulting feature vectors.
df_final.select("features").show(truncate=False)

## Classificateur

Entraînement du modèle

In [None]:
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split repeatable.
(trainingData, testData) = df_final.randomSplit([0.8, 0.2], seed=42)

# Spark's GBTClassifier only supports binary labels and fails to fit when the
# label column holds more than two classes. Keep the gradient-boosted model for
# the binary case and fall back to a random forest (which handles multiclass
# labels) otherwise.
num_classes = df_final.select("categorie").distinct().count()
if num_classes > 2:
    classifier = RandomForestClassifier(labelCol="categorie", featuresCol="features", seed=42)
else:
    gbt = GBTClassifier(labelCol="categorie", featuresCol="features", maxIter=10)
    classifier = gbt
model = classifier.fit(trainingData)

Évaluation initiale

In [None]:
# Score the held-out rows with the trained model and preview the result.
predictions = model.transform(testData)
predictions.show()

# Accuracy: share of test rows whose predicted index equals the label index.
evaluator = MulticlassClassificationEvaluator(
    labelCol="categorie",
    predictionCol="prediction",
    metricName="accuracy",
)
accuracy = evaluator.evaluate(predictions)

print(f"Précision du modèle sur l'ensemble de test = {accuracy * 100:.2f}%")

F1-Score

In [None]:
# Same predictions, evaluated with the F1 metric instead of accuracy.
f1_evaluator = MulticlassClassificationEvaluator(
    labelCol="categorie",
    predictionCol="prediction",
    metricName="f1",
)

f1_score = f1_evaluator.evaluate(predictions)
print(f"F1-score global : {f1_score * 100:.2f}%")

Matrice de confusion

In [None]:
# Map the predicted index back to the original category label, using the labels
# learned by the `categorie` StringIndexer.
labelConverter = IndexToString(inputCol="prediction", outputCol="predictedCategorie", labels=indexer_model.labels)

# Drop any `predictedCategorie` left by a previous run of this cell before
# transforming: `predictions` is reassigned here, so without the drop a re-run
# fails because the output column already exists. Dropping a column that is
# absent is a no-op, so the first run is unaffected.
predictions = labelConverter.transform(predictions.drop("predictedCategorie"))

# Confusion table: row count for each (actual index, predicted label) pair,
# ordered so that the table reads the same way on every run.
(
    predictions
    .groupBy("categorie", "predictedCategorie")
    .count()
    .orderBy("categorie", "predictedCategorie")
    .show()
)

Validation croisée

In [None]:
# Cross-validation sketch, currently disabled (every line is commented out).
# NOTE(review): it refers to `rf`, which no cell of this notebook defines — an
# estimator must be bound to that name (or the references updated) before
# re-enabling this cell.
# paramGrid = (ParamGridBuilder()
#     .addGrid(rf.numTrees, [10, 20])
#     .addGrid(rf.maxDepth, [5, 10])
#     .build())

# crossval = CrossValidator(
#     estimator=rf,
#     estimatorParamMaps=paramGrid,
#     evaluator=evaluator,
#     numFolds=3
# )

# cvModel = crossval.fit(trainingData)
# cv_accuracy = evaluator.evaluate(cvModel.transform(testData))
# print(f"Précision moyenne après validation croisée : {cv_accuracy * 100:.2f}%")

Vérification des corrélations entre caractéristiques et catégorie

In [None]:
numerical_cols = ['age', 'taux', 'nbenfantacharge', 'puissance', 'nbplaces', 'nbportes', 'prix', 'categorie', 'longueur']

# Correlate each numeric column with the label. The label itself is skipped:
# its correlation with itself is always 1.0 and carries no information.
label_col = "categorie"
predictor_cols = [name for name in numerical_cols if name != label_col]

# All coefficients are computed in one aggregation instead of one Spark job per column.
# NOTE(review): `categorie` (and `longueur`) are StringIndexer indices, so a
# Pearson coefficient against them is only a rough indicator — confirm it is
# meaningful for this analysis.
correlation_row = clients_immatriculations.select(
    *[corr(name, label_col).alias(name) for name in predictor_cols]
).first()

print("Corrélations entre les colonnes numériques et la catégorie :")
for col_name in predictor_cols:
    correlation = correlation_row[col_name]
    print(f"{col_name}: {correlation}")

In [None]:
# Afficher les importances des caractéristiques
importances = model.featureImportances
print("Importances des caractéristiques :")
for i, col in enumerate(['scaled_numerical_features', 'marque', 'sexe', 'couleur', 'situationfamiliale']):
    print(f"{col}: {importances[i]:.4f}")

In [None]:
# Class distribution of the label.
(
    clients_immatriculations
    .groupBy('categorie')
    .count()
    .show()
)

# Size of the full dataset.
total_rows = df_final.count()
print(f"Taille du jeu de données : {total_rows}")

# Sizes of the training and test sets.
training_rows = trainingData.count()
print(f"Taille de l'ensemble d'entraînement : {training_rows}")
test_rows = testData.count()
print(f"Taille de l'ensemble de test : {test_rows}")