<p> PONTIFICIA UNIVERSIDAD JAVERIANA </p>
<p> Autor: Santiago Ortiz </p> 
<p> Fecha: 29 de Julio 2024 </p>
<p> Tema: Procesamiento de datos 2430 </p>

In [0]:
## Importar bibliotecas
import pyspark
from pyspark import SparkFiles
from pyspark.context import SparkContext
from pyspark.sql.session import SparkSession
from pyspark.sql.functions import col
from pyspark.sql.types import IntegerType, FloatType, DoubleType, StringType

from pyspark.sql.functions import mean, col, split, when, isnan, count, udf

In [0]:
## Create (or reuse) the Spark session and its context.
## getOrCreate() returns the already-running session when the environment
## provides one, so this cell also works after a fresh kernel restart in
## environments that do not predefine `spark` and `sc`.
spark = SparkSession.builder.getOrCreate()
sc = spark.sparkContext

## URL of the stroke dataset; addFile fetches it so SparkFiles can locate it
urlStroke = "https://raw.githubusercontent.com/corredor-john/ExploratoryDataAnalisys/main/Varios/stroke_pyspark.csv"
sc.addFile(urlStroke)

## Resolve the local path of the fetched file and read it as a DataFrame,
## using the first row as header and letting Spark infer the column types
path = SparkFiles.get("stroke_pyspark.csv")
dfStroke = spark.read.csv("file://"+ path, inferSchema = True, header = True, sep = ",")

dfStroke.show()

+-----+------+----+------------+-------------+------------+-------------+--------------+-----------------+----+---------------+------+
|   id|gender| age|hypertension|heart_disease|ever_married|    work_type|Residence_type|avg_glucose_level| bmi| smoking_status|stroke|
+-----+------+----+------------+-------------+------------+-------------+--------------+-----------------+----+---------------+------+
| 9046|  Male|67.0|           0|            1|         Yes|      Private|         Urban|           228.69|36.6|formerly smoked|     1|
|51676|Female|61.0|           0|            0|         Yes|Self-employed|         Rural|           202.21| N/A|   never smoked|     1|
|31112|  Male|80.0|           0|            1|         Yes|      Private|         Rural|           105.92|32.5|   never smoked|     1|
|60182|Female|49.0|           0|            0|         Yes|      Private|         Urban|           171.23|34.4|         smokes|     1|
| 1665|Female|79.0|           1|            0|         

In [0]:
## Inspect the dataset: list the column names of the DataFrame read from the CSV
dfStroke.columns

['id',
 'gender',
 'age',
 'hypertension',
 'heart_disease',
 'ever_married',
 'work_type',
 'Residence_type',
 'avg_glucose_level',
 'bmi',
 'smoking_status',
 'stroke']

In [0]:
## New (Spanish) column names, listed in the same order as dfStroke.columns
nuevosNombres = [
    'Id',
    'Genero',
    'Edad',
    'Hipertension',
    'Enfermedad_Corazon',
    'Estado_civil',
    'Tipo_Trabajo',
    'Tipo_Residencia',
    'Nivel_Glucosa',
    'IMC',
    'Fumador',
    'P_Cardiaco',
]

## Build a renamed DataFrame; dfStroke itself is left untouched.
## toDF renames all columns positionally in a single step and raises an error
## when the number of names differs from the number of columns, whereas a
## zip-based loop of withColumnRenamed would silently leave columns unrenamed.
dfStroke01 = dfStroke.toDF(*nuevosNombres)

## Check the result
dfStroke01.columns

['Id',
 'Genero',
 'Edad',
 'Hipertension',
 'Enfermedad_Corazon',
 'Estado_civil',
 'Tipo_Trabajo',
 'Tipo_Residencia',
 'Nivel_Glucosa',
 'IMC',
 'Fumador',
 'P_Cardiaco']

In [0]:
## Convert the IMC column to double and print the resulting schema.
## NOTE(review): non-numeric entries such as the 'N/A' visible in the raw bmi
## column presumably become null after this cast - confirm with a null count.
dfStroke01 = dfStroke01.withColumn(
    "IMC",
    col("IMC").cast(DoubleType()),
)
dfStroke01.printSchema()

root
 |-- Id: integer (nullable = true)
 |-- Genero: string (nullable = true)
 |-- Edad: double (nullable = true)
 |-- Hipertension: integer (nullable = true)
 |-- Enfermedad_Corazon: integer (nullable = true)
 |-- Estado_civil: string (nullable = true)
 |-- Tipo_Trabajo: string (nullable = true)
 |-- Tipo_Residencia: string (nullable = true)
 |-- Nivel_Glucosa: double (nullable = true)
 |-- IMC: double (nullable = true)
 |-- Fumador: string (nullable = true)
 |-- P_Cardiaco: integer (nullable = true)



In [0]:
## Convert the Edad column to integer and print the resulting schema.
## NOTE(review): Edad was inferred as double, so any fractional ages lose
## their decimal part here - confirm this loss of precision is intended.
dfStroke01 = dfStroke01.withColumn(
    "Edad",
    col("Edad").cast(IntegerType()),
)
dfStroke01.printSchema()

root
 |-- Id: integer (nullable = true)
 |-- Genero: string (nullable = true)
 |-- Edad: integer (nullable = true)
 |-- Hipertension: integer (nullable = true)
 |-- Enfermedad_Corazon: integer (nullable = true)
 |-- Estado_civil: string (nullable = true)
 |-- Tipo_Trabajo: string (nullable = true)
 |-- Tipo_Residencia: string (nullable = true)
 |-- Nivel_Glucosa: double (nullable = true)
 |-- IMC: double (nullable = true)
 |-- Fumador: string (nullable = true)
 |-- P_Cardiaco: integer (nullable = true)



In [0]:
## Count the missing values (null or NaN) in every column.
## isnan() is only meaningful for float/double columns; applying it to string
## or integer columns relies on an implicit cast to double, which may fail
## under stricter (ANSI) SQL settings. For those columns only null is checked.
tiposDecimales = ("float", "double")
conteoFaltantes = [
    count(
        when(
            (isnan(col(nombre)) | col(nombre).isNull())
            if tipo in tiposDecimales
            else col(nombre).isNull(),
            nombre,
        )
    ).alias(nombre)
    for nombre, tipo in dfStroke01.dtypes
]
dfStroke01.select(conteoFaltantes).show()

+---+------+---+------------+-------------+------------+---------+--------------+-----------------+---+--------------+----------+
| id|gender|age|hypertension|heart_disease|ever_married|work_type|Residence_type|avg_glucose_level|bmi|smoking_status|P_Cardiaco|
+---+------+---+------------+-------------+------------+---------+--------------+-----------------+---+--------------+----------+
|  0|     0|  0|           0|            0|           0|        0|             0|                0|  0|             0|         0|
+---+------+---+------------+-------------+------------+---------+--------------+-----------------+---+--------------+----------+



In [0]:
## Show the number of rows per category for each categorical column
for columna in ('Genero', 'Tipo_Trabajo', 'Tipo_Residencia', 'Fumador', 'Estado_civil'):
    dfStroke01.groupBy([columna]).count().show()

+------+-----+
|Genero|count|
+------+-----+
|Female| 2994|
| Other|    1|
|  Male| 2115|
+------+-----+

+-------------+-----+
| Tipo_Trabajo|count|
+-------------+-----+
| Never_worked|   22|
|Self-employed|  819|
|      Private| 2925|
|     children|  687|
|     Govt_job|  657|
+-------------+-----+

+---------------+-----+
|Tipo_Residencia|count|
+---------------+-----+
|          Urban| 2596|
|          Rural| 2514|
+---------------+-----+

+---------------+-----+
|        Fumador|count|
+---------------+-----+
|         smokes|  789|
|        Unknown| 1544|
|   never smoked| 1892|
|formerly smoked|  885|
+---------------+-----+

+------------+-----+
|Estado_civil|count|
+------------+-----+
|          No| 1757|
|         Yes| 3353|
+------------+-----+



TRATAMIENTO DE COLUMNA GENERO

In [0]:
## Drop the rows whose Genero value is "Other", keeping the result in a new DataFrame
dfStroke02 = dfStroke01.filter("Genero <> 'Other'")

## Verify that only the remaining categories are present
dfStroke02.groupBy('Genero').count().show()

+------+-----+
|Genero|count|
+------+-----+
|Female| 2994|
|  Male| 2115|
+------+-----+



In [0]:
## Summary statistics (count, mean, stddev, min, max) of the Edad column
dfStroke02.select('Edad').describe().show()

+-------+-----------------+
|summary|             Edad|
+-------+-----------------+
|  count|             5109|
|   mean|43.21863378351928|
| stddev|22.63479930976257|
|    min|                0|
|    max|               82|
+-------+-----------------+

