#### Importation de données

In [None]:
# Importer les librairies 
from pyspark.sql import SparkSession
import pyspark.sql as sparksql
import pandas
import os

# Open (or reuse) a Spark SQL session named 'stroke'.
spark = SparkSession.builder.appName('stroke').getOrCreate()

# Location of the stroke CSV file. The original machine-specific path stays
# the default; set the STROKE_DATA_PATH environment variable to run the
# notebook on another machine without editing the code.
DATA_PATH = os.environ.get(
    'STROKE_DATA_PATH',
    '/Users/mac/Downloads/healthcare-dataset-stroke-data.csv',
)

# Load the dataset: the first line is used as header and the column types are
# inferred from the data.
# NOTE(review): bmi is inferred as a string column, presumably because it
# contains 'N/A' entries; cast it (or pass nullValue='N/A') before using it
# numerically — confirm.
df = spark.read.csv(DATA_PATH, inferSchema=True, header=True)

In [None]:
# printSchema() prints the DataFrame schema as an easy-to-read tree
df.printSchema()

root
 |-- id: integer (nullable = true)
 |-- gender: string (nullable = true)
 |-- age: double (nullable = true)
 |-- hypertension: integer (nullable = true)
 |-- heart_disease: integer (nullable = true)
 |-- ever_married: string (nullable = true)
 |-- work_type: string (nullable = true)
 |-- Residence_type: string (nullable = true)
 |-- avg_glucose_level: double (nullable = true)
 |-- bmi: string (nullable = true)
 |-- smoking_status: string (nullable = true)
 |-- stroke: integer (nullable = true)



In [None]:
# DataFrame.dtypes lists every column name together with its data type.
# df is a Spark DataFrame, so this is the PySpark attribute, not a pandas one.
df.dtypes

[('id', 'int'),
 ('gender', 'string'),
 ('age', 'double'),
 ('hypertension', 'int'),
 ('heart_disease', 'int'),
 ('ever_married', 'string'),
 ('work_type', 'string'),
 ('Residence_type', 'string'),
 ('avg_glucose_level', 'double'),
 ('bmi', 'string'),
 ('smoking_status', 'string'),
 ('stroke', 'int')]

#### Exploration de données

In [None]:
# head() without an argument returns the first row of the DataFrame
df.head()

Row(id=9046, gender='Male', age=67.0, hypertension=0, heart_disease=1, ever_married='Yes', work_type='Private', Residence_type='Urban', avg_glucose_level=228.69, bmi='36.6', smoking_status='formerly smoked', stroke=1)

In [None]:
# Group the dataset by smoking_status and count the people in each category
smoking_status_counts = df.groupBy('smoking_status').count()
smoking_status_counts.show()

+---------------+-----+
| smoking_status|count|
+---------------+-----+
|         smokes|  789|
|        Unknown| 1544|
|   never smoked| 1892|
|formerly smoked|  885|
+---------------+-----+



In [None]:
# Group by gender (Female, Male, Other) and count the people in each group
gender_counts = df.groupBy('gender').count()
gender_counts.show()

+------+-----+
|gender|count|
+------+-----+
|Female| 2994|
| Other|    1|
|  Male| 2115|
+------+-----+



In [None]:
# Group people by their average glucose level and count each distinct value
glucose_level_counts = df.groupBy('avg_glucose_level').count()
glucose_level_counts.show()

+-----------------+-----+
|avg_glucose_level|count|
+-----------------+-----+
|           242.52|    1|
|            60.98|    4|
|            76.46|    1|
|            98.09|    2|
|            73.73|    1|
|            78.75|    1|
|            75.29|    1|
|            79.89|    4|
|            77.19|    2|
|            206.4|    1|
|            61.78|    1|
|           151.25|    2|
|            95.57|    1|
|            65.78|    1|
|            58.51|    1|
|             64.2|    1|
|           116.04|    2|
|           106.01|    1|
|            85.86|    1|
|            70.07|    2|
+-----------------+-----+
only showing top 20 rows



In [None]:
# Group people by their body mass index (bmi) and count each distinct value
bmi_counts = df.groupBy('bmi').count()
bmi_counts.show()

+----+-----+
| bmi|count|
+----+-----+
|34.4|   18|
|47.5|    3|
|20.5|   18|
|38.3|    2|
|45.4|    4|
|38.5|    7|
|26.5|   30|
|  51|    1|
|48.1|    1|
|29.4|   30|
|16.6|    8|
|14.2|    4|
|12.8|    1|
|26.7|   37|
|17.1|   12|
|36.1|    7|
|30.1|   26|
|40.1|   10|
|  54|    1|
|  15|    2|
+----+-----+
only showing top 20 rows



#### Analyse de données

In [None]:
# Register the DataFrame as a temporary view named 'table' so that it can be
# queried by name through spark.sql
df.createOrReplaceTempView('table')

In [None]:
# Count the people in each work category, restricted to those who had a
# stroke, most frequent category first
work_type_query = """
          SELECT work_type, count(work_type) as work_type_count
          FROM table WHERE stroke == 1
          GROUP BY work_type
          ORDER BY work_type_count DESC"""
spark.sql(work_type_query).show()

+-------------+---------------+
|    work_type|work_type_count|
+-------------+---------------+
|      Private|            149|
|Self-employed|             65|
|     Govt_job|             33|
|     children|              2|
+-------------+---------------+



In [None]:
# Count the stroke patients per gender and give each gender's share (in
# percent) of all stroke patients. The WHERE clause restricts the rows to
# stroke == 1, as the other per-category stroke queries on this view do;
# without it the query describes the whole dataset instead of stroke patients.
spark.sql("""
          SELECT gender, count(gender) as count_gender, count(gender)*100/sum(count(gender)) over() as percent 
          FROM table WHERE stroke == 1 
          GROUP BY gender""").show()

+------+------------+--------------------+
|gender|count_gender|             percent|
+------+------------+--------------------+
|Female|        2994|  58.590998043052835|
| Other|           1|0.019569471624266144|
|  Male|        2115|    41.3894324853229|
+------+------------+--------------------+



In [None]:
# Count the stroke patients per age, most frequent age first
age_query = """
          SELECT age, count(age) as age_count 
          FROM table WHERE stroke == 1 
          GROUP BY age 
          ORDER BY age_count DESC"""
spark.sql(age_query).show()

+----+---------+
| age|age_count|
+----+---------+
|78.0|       21|
|79.0|       17|
|80.0|       17|
|81.0|       14|
|57.0|       11|
|76.0|       10|
|63.0|        9|
|68.0|        9|
|74.0|        9|
|82.0|        9|
|59.0|        8|
|77.0|        8|
|71.0|        7|
|58.0|        7|
|70.0|        6|
|75.0|        6|
|69.0|        6|
|72.0|        6|
|54.0|        6|
|61.0|        6|
+----+---------+
only showing top 20 rows



In [None]:
# Count the stroke patients per smoking status, most frequent status first
smoking_status_query = """
          SELECT smoking_status, count(smoking_status) as smoking_status_count 
          FROM table WHERE stroke == 1 
          GROUP BY smoking_status 
          ORDER BY smoking_status_count DESC"""
spark.sql(smoking_status_query).show()

+---------------+--------------------+
| smoking_status|smoking_status_count|
+---------------+--------------------+
|   never smoked|                  90|
|formerly smoked|                  70|
|        Unknown|                  47|
|         smokes|                  42|
+---------------+--------------------+



In [None]:
# Count the stroke patients per average glucose level, most frequent value
# first
glucose_level_query = """
          SELECT avg_glucose_level, count(avg_glucose_level) as avg_glucose_level_count 
          FROM table WHERE stroke == 1 
          GROUP BY avg_glucose_level 
          ORDER BY avg_glucose_level_count DESC"""
spark.sql(glucose_level_query).show()

+-----------------+-----------------------+
|avg_glucose_level|avg_glucose_level_count|
+-----------------+-----------------------+
|           101.45|                      2|
|           242.52|                      1|
|            76.46|                      1|
|            60.98|                      1|
|           199.86|                      1|
|           165.31|                      1|
|            60.67|                      1|
|           110.66|                      1|
|            68.53|                      1|
|           207.28|                      1|
|            76.15|                      1|
|            61.94|                      1|
|           186.21|                      1|
|           199.84|                      1|
|            91.92|                      1|
|            96.97|                      1|
|           110.85|                      1|
|            72.81|                      1|
|            86.94|                      1|
|           130.54|             

In [None]:
# Count the stroke patients with (1) and without (0) heart disease, largest
# group first
heart_disease_query = """
          SELECT heart_disease, count(heart_disease) as heart_disease_count 
          FROM table WHERE stroke == 1 
          GROUP BY heart_disease 
          ORDER BY heart_disease_count DESC"""
spark.sql(heart_disease_query).show()

+-------------+-------------------+
|heart_disease|heart_disease_count|
+-------------+-------------------+
|            0|                202|
|            1|                 47|
+-------------+-------------------+



#### Entrainement de données

In [None]:
from pyspark.ml.feature import StringIndexer
# Convert the string column work_type into a numeric column named 'work'
indexer = StringIndexer(inputCol='work_type', outputCol='work')
indexer_model = indexer.fit(df)
indexed = indexer_model.transform(df)
# The resulting DataFrame carries the extra 'work' feature; print the first
# five rows, each followed by blank lines
for item in indexed.head(5):
    print(item, '\n', sep='\n')

Row(id=9046, gender='Male', age=67.0, hypertension=0, heart_disease=1, ever_married='Yes', work_type='Private', Residence_type='Urban', avg_glucose_level=228.69, bmi='36.6', smoking_status='formerly smoked', stroke=1, work=0.0)


Row(id=51676, gender='Female', age=61.0, hypertension=0, heart_disease=0, ever_married='Yes', work_type='Self-employed', Residence_type='Rural', avg_glucose_level=202.21, bmi='N/A', smoking_status='never smoked', stroke=1, work=1.0)


Row(id=31112, gender='Male', age=80.0, hypertension=0, heart_disease=1, ever_married='Yes', work_type='Private', Residence_type='Rural', avg_glucose_level=105.92, bmi='32.5', smoking_status='never smoked', stroke=1, work=0.0)


Row(id=60182, gender='Female', age=49.0, hypertension=0, heart_disease=0, ever_married='Yes', work_type='Private', Residence_type='Urban', avg_glucose_level=171.23, bmi='34.4', smoking_status='smokes', stroke=1, work=0.0)


Row(id=1665, gender='Female', age=79.0, hypertension=1, heart_disease=0, ever_marri

In [None]:
from pyspark.ml.linalg import Vectors
from pyspark.ml.feature import VectorAssembler
# Build the 'features' vector from the predictors: age, hypertension,
# heart_disease, work (the indexed work_type) and avg_glucose_level.
# 'id' is intentionally not part of the vector: it is a row identifier, not a
# predictor, and feeding it to the model would only add noise.
feature_columns = [
    'age',
    'hypertension',
    'heart_disease',
    'work',
    'avg_glucose_level',
]
assembler = VectorAssembler(inputCols=feature_columns, outputCol='features')
output = assembler.transform(indexed)
# Preview the assembled vector next to the label
output.select('features', 'stroke').show(5)

+--------------------+------+
|            features|stroke|
+--------------------+------+
|[9046.0,67.0,0.0,...|     1|
|[51676.0,61.0,0.0...|     1|
|[31112.0,80.0,0.0...|     1|
|[60182.0,49.0,0.0...|     1|
|[1665.0,79.0,1.0,...|     1|
+--------------------+------+
only showing top 5 rows



In [None]:
# Keep only the feature vector and the label
final_data = output.select('features', 'stroke')
# Split the data into a training set (70%) and a test set (30%). The fixed
# seed keeps the split stable across reruns on the same data, so the reported
# statistics and scores can be reproduced.
train_data, test_data = final_data.randomSplit([0.7, 0.3], seed=42)
# Show summary statistics for the training set
train_data.describe().show()


+-------+-------------------+
|summary|             stroke|
+-------+-------------------+
|  count|               3591|
|   mean|0.05123920913394597|
| stddev| 0.2205159722481186|
|    min|                  0|
|    max|                  1|
+-------+-------------------+



In [None]:
# Show summary statistics for the test set
test_summary = test_data.describe()
test_summary.show()

+-------+--------------------+
|summary|              stroke|
+-------+--------------------+
|  count|                1519|
|   mean|0.042791310072416065|
| stddev| 0.20245294976310413|
|    min|                   0|
|    max|                   1|
+-------+--------------------+



In [None]:
# Importer LinearRegression library
from pyspark.ml.regression import LinearRegression
# LinearRegression takes the 'features' vector as input and 'stroke' as the
# label to predict.
# NOTE(review): stroke only takes the values 0 and 1, so a classifier such as
# LogisticRegression may be a better fit than a regression — confirm.
ship_lr = LinearRegression(featuresCol='features', labelCol='stroke')
trained_ship_model = ship_lr.fit(train_data)
# R-squared on the training data: how well the model fits the rows it has
# already seen
ship_results = trained_ship_model.evaluate(train_data)
print('Rsquared Error :', ship_results.r2)
# R-squared on the held-out test data: the training score alone cannot show
# how the model behaves on unseen rows
ship_test_results = trained_ship_model.evaluate(test_data)
print('Rsquared Error (test) :', ship_test_results.r2)

Rsquared Error : 0.08362734724414456


In [None]:
# Test the model: keep only the feature vectors of the test set (label
# dropped) and preview the first five rows
unlabeled_data = test_data.select('features')
unlabeled_data.show(n=5)

+--------------------+
|            features|
+--------------------+
|[77.0,13.0,0.0,0....|
|[91.0,42.0,0.0,0....|
|[187.0,20.0,0.0,0...|
|[242.0,4.0,0.0,0....|
|[298.0,41.0,0.0,0...|
+--------------------+
only showing top 5 rows



In [None]:
# Predict the stroke variable with the trained model on the unlabeled test
# features; transform() adds a 'prediction' column next to 'features'
predictions=trained_ship_model.transform(unlabeled_data)
predictions.show()

+--------------------+--------------------+
|            features|          prediction|
+--------------------+--------------------+
|[77.0,13.0,0.0,0....|-0.02834621020510...|
|[91.0,42.0,0.0,0....| 0.03033340575553635|
|[187.0,20.0,0.0,0...|-0.01639112510436988|
|[242.0,4.0,0.0,0....|-0.04141245025843...|
|[298.0,41.0,0.0,0...|0.021827357307744377|
|[338.0,43.0,0.0,0...| 0.03609010653588775|
|[364.0,58.0,0.0,0...| 0.06329610377449525|
|[448.0,49.0,0.0,0...| 0.04668382037454695|
|[452.0,48.0,1.0,0...|  0.1254033089680207|
|[458.0,37.0,0.0,0...| 0.01360899977734796|
|[464.0,46.0,0.0,0...|0.031545218245174506|
|[479.0,59.0,1.0,0...| 0.11579430916988002|
|[545.0,42.0,0.0,0...| 0.06656636122713884|
|[563.0,41.0,0.0,0...| 0.06666954560423778|
|[721.0,52.0,1.0,0...| 0.11450683056965404|
|[724.0,17.0,0.0,0...|-0.02279317708587...|
|[739.0,73.0,0.0,0...| 0.08405787050458034|
|[742.0,39.0,0.0,0...|0.022397746247636324|
|[917.0,32.0,0.0,0...|0.007018204531225827|
|[937.0,7.0,0.0,0....|-0.0390069