## Entrainement de differents modèles pour calculer les probabilités de retour à l'emploi

In [1]:
from pyspark.sql import SparkSession

### Rechargement du dataset Personnes

In [2]:
%%time
MASTER="yarn"           # Spark distant sur CDP  (Spark pushdown)
# MASTER="local[*]"     # Spark local sur CML

spark = SparkSession.builder.appName("1_entrainement.ipynb").master(MASTER).getOrCreate()


[0;31m7.1.9 and 7.2.17 are the last CDP runtime releases where Spark 2 is supported.
Please migrate your Spark 2 applications to Spark 3.

Updating Spark 2 applications for Spark 3:
https://docs.cloudera.com/runtime/7.2.16/running-spark-applications/topics/spark-update-spark2-spark3.html
[0m
CPU times: user 72.8 ms, sys: 43.1 ms, total: 116 ms
Wall time: 38.5 s


In [3]:
sdf=spark.sql("select * from olivier.personnes")

Hive Session ID = f5a71e7b-7c7e-4415-9602-4b3cd05df562


### Initialisation des experimentation MLFLOW pour l'Atelier 1

In [7]:
sdf.show(5)

+-----+---+-----+----------------+-------------+--------------+--------------------+---------+------------------+----------------+--------------------+--------------+-------------+
|   id|age| sexe|niveau_education|duree_chomage|experience_ant|          competence|formation|taux_chomage_local|secteur_activite|reseau_professionnel|support_social|retour_emploi|
+-----+---+-----+----------------+-------------+--------------+--------------------+---------+------------------+----------------+--------------------+--------------+-------------+
|95167| 42|homme|         LICENSE|            5|             8|    Public librarian|     true| 11.50570330550432|           Group|                  95|             1|            1|
|76849| 39|femme|         BTS/DUT|           17|             6| Associate Professor|     true| 2.208842935285328|             LLC|                  70|             7|            0|
|36869| 18|femme|          MASTER|           10|            13|              Lawyer|     true| 

In [8]:
# mlflow.set_experiment ("Experimentations Atelier 1")

## Lancement d'une entrainement Regression logistique de Spark ML Lib

In [9]:
sdf.printSchema()

root
 |-- id: long (nullable = true)
 |-- age: long (nullable = true)
 |-- sexe: string (nullable = true)
 |-- niveau_education: string (nullable = true)
 |-- duree_chomage: long (nullable = true)
 |-- experience_ant: long (nullable = true)
 |-- competence: string (nullable = true)
 |-- formation: boolean (nullable = true)
 |-- taux_chomage_local: double (nullable = true)
 |-- secteur_activite: string (nullable = true)
 |-- reseau_professionnel: long (nullable = true)
 |-- support_social: long (nullable = true)
 |-- retour_emploi: long (nullable = true)



In [11]:
sdf=sdf.drop('id')      # Suppression colonne id non requise pour l'étude
sdf.printSchema()

root
 |-- age: long (nullable = true)
 |-- sexe: string (nullable = true)
 |-- niveau_education: string (nullable = true)
 |-- duree_chomage: long (nullable = true)
 |-- experience_ant: long (nullable = true)
 |-- competence: string (nullable = true)
 |-- formation: boolean (nullable = true)
 |-- taux_chomage_local: double (nullable = true)
 |-- secteur_activite: string (nullable = true)
 |-- reseau_professionnel: long (nullable = true)
 |-- support_social: long (nullable = true)
 |-- retour_emploi: long (nullable = true)



#### Extraction des variables categorielles

In [None]:
from pyspark.ml import Pipeline
from pyspark.ml.feature import StringIndexer, OneHotEncoder

In [13]:
variables = sdf.columns[0:-1]   # Toutes les variables sauf la derniere qui est la variable à étudier

In [14]:
# Extract les colonnes categories (types string)
cat_cols=[]
feature_cols=[]
for colname, coltype in sdf.dtypes[1:-1]:
    if ( coltype == 'string'):
        cat_cols += [colname]
    else:
        feature_cols += [colname]


In [15]:
cat_cols

['sexe', 'niveau_education', 'competence', 'secteur_activite']

In [16]:
feature_cols

['duree_chomage',
 'experience_ant',
 'formation',
 'taux_chomage_local',
 'reseau_professionnel',
 'support_social']

#### Categorise (index) et encode les variables à catégoriser

In [17]:
stages = [] 

for colname in cat_cols:

   # Assigne un indice aux variable categorielle
   indexer = StringIndexer(inputCol=colname, outputCol=colname + "_index") 
        
   encoder = OneHotEncoder(inputCol=indexer.getOutputCol(), outputCol=colname + "_vec")  

   stages       += [indexer, encoder]             # A jout des étapes au pipeline de tranformation

#### Transforme toutes les variables en vecteur compréhensible pour la régression

In [18]:
from pyspark.ml.feature import VectorAssembler

assemblerInputs = feature_cols + [c + "_vec" for c in cat_cols]              # Réassemblage les variables

assembler = VectorAssembler(inputCols=assemblerInputs, outputCol="features") # Vectorise les variables 


stages += [assembler]      # Ajout du Vector assembleur eu pipeline de transformation

stages

[StringIndexer_2f6695ad9576,
 OneHotEncoder_7459139ab2ed,
 StringIndexer_94dde65321cd,
 OneHotEncoder_0a307c5d0f98,
 StringIndexer_946732c647aa,
 OneHotEncoder_5dac3c38571c,
 StringIndexer_fb478644cbd2,
 OneHotEncoder_99425b4954e4,
 VectorAssembler_f7c449935905]

In [19]:
pipeline = Pipeline(stages=stages)           # Inititialisation du pipeline de transformation  
sdf = pipeline.fit(sdf).transform(sdf)       # Transformation

                                                                                

In [20]:
sdf.printSchema()

root
 |-- age: long (nullable = true)
 |-- sexe: string (nullable = true)
 |-- niveau_education: string (nullable = true)
 |-- duree_chomage: long (nullable = true)
 |-- experience_ant: long (nullable = true)
 |-- competence: string (nullable = true)
 |-- formation: boolean (nullable = true)
 |-- taux_chomage_local: double (nullable = true)
 |-- secteur_activite: string (nullable = true)
 |-- reseau_professionnel: long (nullable = true)
 |-- support_social: long (nullable = true)
 |-- retour_emploi: long (nullable = true)
 |-- sexe_index: double (nullable = false)
 |-- sexe_vec: vector (nullable = true)
 |-- niveau_education_index: double (nullable = false)
 |-- niveau_education_vec: vector (nullable = true)
 |-- competence_index: double (nullable = false)
 |-- competence_vec: vector (nullable = true)
 |-- secteur_activite_index: double (nullable = false)
 |-- secteur_activite_vec: vector (nullable = true)
 |-- features: vector (nullable = true)



## Entrainement du modèle de régression logistiques SparkML

#### Echantillonage train/test

In [21]:
(train, test) = sdf.randomSplit([0.7, 0.3], seed=42)
print(train.count())
print(test.count())

                                                                                

13658


[Stage 14:>                                                         (0 + 2) / 2]

5842


                                                                                

#### Entrainement

In [25]:
from pyspark.ml.classification import LogisticRegression

model = LogisticRegression(labelCol= 'retour_emploi', featuresCol='features', maxIter= 10).fit(train)

                                                                                

#### Predictions

In [39]:
predictions = model.transform(test)

In [40]:
predictions.select("retour_emploi", "prediction", "probability").show(5)

+-------------+----------+--------------------+
|retour_emploi|prediction|         probability|
+-------------+----------+--------------------+
|            1|       0.0|[0.54623888241867...|
|            1|       1.0|[0.41307655884248...|
|            0|       1.0|[0.44489941942492...|
|            1|       0.0|[0.56437098680459...|
|            1|       1.0|[0.46867973677214...|
+-------------+----------+--------------------+
only showing top 5 rows



                                                                                

In [44]:
predictions.printSchema()

root
 |-- age: long (nullable = true)
 |-- sexe: string (nullable = true)
 |-- niveau_education: string (nullable = true)
 |-- duree_chomage: long (nullable = true)
 |-- experience_ant: long (nullable = true)
 |-- competence: string (nullable = true)
 |-- formation: boolean (nullable = true)
 |-- taux_chomage_local: double (nullable = true)
 |-- secteur_activite: string (nullable = true)
 |-- reseau_professionnel: long (nullable = true)
 |-- support_social: long (nullable = true)
 |-- retour_emploi: long (nullable = true)
 |-- sexe_index: double (nullable = false)
 |-- sexe_vec: vector (nullable = true)
 |-- niveau_education_index: double (nullable = false)
 |-- niveau_education_vec: vector (nullable = true)
 |-- competence_index: double (nullable = false)
 |-- competence_vec: vector (nullable = true)
 |-- secteur_activite_index: double (nullable = false)
 |-- secteur_activite_vec: vector (nullable = true)
 |-- features: vector (nullable = true)
 |-- rawPrediction: vector (nullable = t

#### Evaluation du modèle

In [49]:
from pyspark.ml.evaluation import BinaryClassificationEvaluator

evaluator = BinaryClassificationEvaluator(labelCol='retour_emploi')
evaluator.evaluate(predictions)

                                                                                

0.5155630131282045

In [51]:
# Confusion Matrix
matrix = predictions.crosstab('retour_emploi', 'prediction')

                                                                                

In [64]:
import pandas as pd

df = matrix.toPandas()


In [67]:
df

Unnamed: 0,retour_emploi_prediction,0.0,1.0
0,1,1475,1493
1,0,1482,1392


In [81]:
print(f'Precision   : {df.iloc[0,2] / (df.iloc[0,2] + df.iloc[1,2])}')
print(f'Recall      : {df.iloc[0,2] / (df.iloc[0,2] + df.iloc[0,1])}')
print(f'Specificity : {df.iloc[1,1] / (df.iloc[1,1] + df.iloc[1,2])}')

Precision   : 0.5175043327556326
Recall      : 0.503032345013477
Specificity : 0.5156576200417536


In [82]:
spark.stop()