In [1]:
!pip install pyspark

Collecting pyspark
  Downloading pyspark-3.5.0.tar.gz (316.9 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m316.9/316.9 MB[0m [31m3.7 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: pyspark
  Building wheel for pyspark (setup.py) ... [?25l[?25hdone
  Created wheel for pyspark: filename=pyspark-3.5.0-py2.py3-none-any.whl size=317425344 sha256=2916a424a43e91499eed307c6374f718c6722f65257eee38269ad383d70f4524
  Stored in directory: /root/.cache/pip/wheels/41/4e/10/c2cf2467f71c678cfc8a6b9ac9241e5e44a01940da8fbb17fc
Successfully built pyspark
Installing collected packages: pyspark
Successfully installed pyspark-3.5.0


In [2]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.evaluation import BinaryClassificationEvaluator
from pyspark.ml.classification import LogisticRegressionModel
import pandas as ps


In [3]:
spark = SparkSession.builder.appName("spark").getOrCreate()


In [4]:
df = spark.read.csv('/content/diabetes.csv', header = True, inferSchema=True) # predit le type de colonnes
df.show(6)

+-----------+-------+-------------+-------------+-------+----+------------------------+---+-------+
|Pregnancies|Glucose|BloodPressure|SkinThickness|Insulin| BMI|DiabetesPedigreeFunction|Age|Outcome|
+-----------+-------+-------------+-------------+-------+----+------------------------+---+-------+
|          6|    148|           72|           35|      0|33.6|                   0.627| 50|      1|
|          1|     85|           66|           29|      0|26.6|                   0.351| 31|      0|
|          8|    183|           64|            0|      0|23.3|                   0.672| 32|      1|
|          1|     89|           66|           23|     94|28.1|                   0.167| 21|      0|
|          0|    137|           40|           35|    168|43.1|                   2.288| 33|      1|
|          5|    116|           74|            0|      0|25.6|                   0.201| 30|      0|
+-----------+-------+-------------+-------------+-------+----+------------------------+---+-------+


In [5]:
df.printSchema()

root
 |-- Pregnancies: integer (nullable = true)
 |-- Glucose: integer (nullable = true)
 |-- BloodPressure: integer (nullable = true)
 |-- SkinThickness: integer (nullable = true)
 |-- Insulin: integer (nullable = true)
 |-- BMI: double (nullable = true)
 |-- DiabetesPedigreeFunction: double (nullable = true)
 |-- Age: integer (nullable = true)
 |-- Outcome: integer (nullable = true)



In [6]:
print(df.count(), ':', len(df.columns))

768 : 9


In [7]:
df.groupBy("Outcome").count().show()

+-------+-----+
|Outcome|count|
+-------+-----+
|      1|  268|
|      0|  500|
+-------+-----+



In [8]:
df.describe().show()

+-------+------------------+-----------------+------------------+------------------+------------------+------------------+------------------------+------------------+------------------+
|summary|       Pregnancies|          Glucose|     BloodPressure|     SkinThickness|           Insulin|               BMI|DiabetesPedigreeFunction|               Age|           Outcome|
+-------+------------------+-----------------+------------------+------------------+------------------+------------------+------------------------+------------------+------------------+
|  count|               768|              768|               768|               768|               768|               768|                     768|               768|               768|
|   mean|3.8450520833333335|     120.89453125|       69.10546875|20.536458333333332| 79.79947916666667|31.992578124999977|      0.4718763020833327|33.240885416666664|0.3489583333333333|
| stddev|  3.36957806269887|31.97261819513622|19.355807170644777|15.95

In [9]:
for col in df.columns:
  print(col + ":", df[df[col].isNull()].count())

Pregnancies: 0
Glucose: 0
BloodPressure: 0
SkinThickness: 0
Insulin: 0
BMI: 0
DiabetesPedigreeFunction: 0
Age: 0
Outcome: 0


In [10]:
def count_zeros(df, columns):
  for col in columns:
    num_zeros = df.filter(df[col] == 0).count()
    total_rows = df.count()
    percentage = (num_zeros / total_rows) * 100
    print("{} : {} ({:.2f}%)".format(col, num_zeros, percentage))

In [11]:
liste_cols = ['Glucose', 'Bloodpressure', 'SkinThickness', 'Insulin', 'BMI']
count_zeros(df, liste_cols)

Glucose : 5 (0.65%)
Bloodpressure : 35 (4.56%)
SkinThickness : 227 (29.56%)
Insulin : 374 (48.70%)
BMI : 11 (1.43%)


In [12]:
for i in df.columns[1:6]:
  mean_val = df.agg({i:'mean'}).first()[0]
  print("la valeur moyenne de la colonne {} est : {}".format(i, int(mean_val)))
  # update the values : si la condition (val ==0) est vrai
  df = df.withColumn(i, when(df[i]==0, int(mean_val)).otherwise(df[i]))

df.show(10)

la valeur moyenne de la colonne Glucose est : 120
la valeur moyenne de la colonne BloodPressure est : 69
la valeur moyenne de la colonne SkinThickness est : 20
la valeur moyenne de la colonne Insulin est : 79
la valeur moyenne de la colonne BMI est : 31
+-----------+-------+-------------+-------------+-------+----+------------------------+---+-------+
|Pregnancies|Glucose|BloodPressure|SkinThickness|Insulin| BMI|DiabetesPedigreeFunction|Age|Outcome|
+-----------+-------+-------------+-------------+-------+----+------------------------+---+-------+
|          6|    148|           72|           35|     79|33.6|                   0.627| 50|      1|
|          1|     85|           66|           29|     79|26.6|                   0.351| 31|      0|
|          8|    183|           64|           20|     79|23.3|                   0.672| 32|      1|
|          1|     89|           66|           23|     94|28.1|                   0.167| 21|      0|
|          0|    137|           40|           

In [13]:

# calculer la corrélation entre la variable de réponse et les autres variables
for col in df.columns:
  print('La correlation de  {} avec la variable outcome est {}.'.format(col, df.stat.corr('Outcome', col)))
  #print(f'La correlation de  {col} avec la variable outcome est {df.stat.corr('Outcome', col)}.')

La correlation de  Pregnancies avec la variable outcome est 0.22189815303398638.
La correlation de  Glucose avec la variable outcome est 0.49288410274882094.
La correlation de  BloodPressure avec la variable outcome est 0.16287909949861834.
La correlation de  SkinThickness avec la variable outcome est 0.171856814176564.
La correlation de  Insulin avec la variable outcome est 0.17869558803050842.
La correlation de  BMI avec la variable outcome est 0.31289043493401536.
La correlation de  DiabetesPedigreeFunction avec la variable outcome est 0.17384406565296007.
La correlation de  Age avec la variable outcome est 0.23835598302719757.
La correlation de  Outcome avec la variable outcome est 1.0.


In [14]:
inputCols = ['Pregnancies' , 'Glucose' , 'BloodPressure', 'SkinThickness' , 'Insulin' , 'BMI' , 'DiabetesPedigreeFunction' ,'Age'  ]
assembler = VectorAssembler(inputCols= inputCols, outputCol='features')
output_data = assembler.transform(df)


In [15]:
output_data.printSchema()

root
 |-- Pregnancies: integer (nullable = true)
 |-- Glucose: integer (nullable = true)
 |-- BloodPressure: integer (nullable = true)
 |-- SkinThickness: integer (nullable = true)
 |-- Insulin: integer (nullable = true)
 |-- BMI: double (nullable = true)
 |-- DiabetesPedigreeFunction: double (nullable = true)
 |-- Age: integer (nullable = true)
 |-- Outcome: integer (nullable = true)
 |-- features: vector (nullable = true)



In [16]:
output_data.show(3)

+-----------+-------+-------------+-------------+-------+----+------------------------+---+-------+--------------------+
|Pregnancies|Glucose|BloodPressure|SkinThickness|Insulin| BMI|DiabetesPedigreeFunction|Age|Outcome|            features|
+-----------+-------+-------------+-------------+-------+----+------------------------+---+-------+--------------------+
|          6|    148|           72|           35|     79|33.6|                   0.627| 50|      1|[6.0,148.0,72.0,3...|
|          1|     85|           66|           29|     79|26.6|                   0.351| 31|      0|[1.0,85.0,66.0,29...|
|          8|    183|           64|           20|     79|23.3|                   0.672| 32|      1|[8.0,183.0,64.0,2...|
+-----------+-------+-------------+-------------+-------+----+------------------------+---+-------+--------------------+
only showing top 3 rows



In [17]:
final_df = output_data.select('features', 'Outcome')
final_df.show(2)

+--------------------+-------+
|            features|Outcome|
+--------------------+-------+
|[6.0,148.0,72.0,3...|      1|
|[1.0,85.0,66.0,29...|      0|
+--------------------+-------+
only showing top 2 rows



In [18]:
train, test = final_df.randomSplit([0.7, 0.3])

# créer le modèle
models = LogisticRegression(labelCol='Outcome')

# entrainer le modèle
model = models.fit(train)

In [19]:
summary = model.summary

In [20]:
summary.predictions.show()
summary.predictions.describe().show()

+--------------------+-------+--------------------+--------------------+----------+
|            features|Outcome|       rawPrediction|         probability|prediction|
+--------------------+-------+--------------------+--------------------+----------+
|[0.0,57.0,60.0,20...|    0.0|[4.11526163127696...|[0.98394044909218...|       0.0|
|[0.0,67.0,76.0,20...|    0.0|[2.30752514872193...|[0.90949835391248...|       0.0|
|[0.0,73.0,69.0,20...|    0.0|[4.57468490770051...|[0.98979565504996...|       0.0|
|[0.0,74.0,52.0,10...|    0.0|[3.55503751398875...|[0.97221383429274...|       0.0|
|[0.0,78.0,88.0,29...|    0.0|[3.03085421060549...|[0.95394871347242...|       0.0|
|[0.0,86.0,68.0,32...|    0.0|[2.57637203825264...|[0.92932535635686...|       0.0|
|[0.0,91.0,68.0,32...|    0.0|[2.06895430950332...|[0.88784888086778...|       0.0|
|[0.0,93.0,60.0,25...|    0.0|[2.76081510609953...|[0.94052124827437...|       0.0|
|[0.0,93.0,100.0,3...|    0.0|[1.47167022781243...|[0.81331111969979...|    

In [21]:
predictions = model.evaluate(test)

In [22]:
predictions.predictions.show(15)

+--------------------+-------+--------------------+--------------------+----------+
|            features|Outcome|       rawPrediction|         probability|prediction|
+--------------------+-------+--------------------+--------------------+----------+
|[0.0,84.0,64.0,22...|      0|[2.40754164126069...|[0.91740058648549...|       0.0|
|[0.0,84.0,82.0,31...|      0|[2.80615722602472...|[0.94300764351404...|       0.0|
|[0.0,91.0,80.0,20...|      0|[2.78267808586654...|[0.94173257124649...|       0.0|
|[0.0,93.0,60.0,20...|      0|[2.25761700357103...|[0.90530554013226...|       0.0|
|[0.0,94.0,69.0,20...|      0|[2.86946588978411...|[0.94631622068313...|       0.0|
|[0.0,95.0,85.0,25...|      1|[2.43899202717853...|[0.91975272319051...|       0.0|
|[0.0,98.0,82.0,15...|      0|[3.64978800222408...|[0.97466206181831...|       0.0|
|[0.0,100.0,88.0,6...|      0|[0.63633938864085...|[0.65392550630144...|       0.0|
|[0.0,101.0,76.0,2...|      0|[2.31924110720608...|[0.91045809184128...|    

In [23]:
from pyspark.ml import evaluation
# évaluer le modèle
evaluator = BinaryClassificationEvaluator(rawPredictionCol='rawPrediction', labelCol='Outcome')
evaluator.evaluate(model.transform(test))

0.8478459484152088

In [24]:
model.save("LogReg_model")

In [25]:
model = LogisticRegressionModel.load('LogReg_model')