In [27]:
from pyspark.sql import SparkSession
from pyspark.sql import functions as F
from pyspark.ml.feature import StringIndexer, Imputer, \
    VectorAssembler, ChiSqSelector, StandardScaler
from pyspark.ml.classification \
    import LogisticRegression, RandomForestClassifier
from pyspark.ml.regression import LinearRegression
from pyspark.ml.clustering import KMeans
from pyspark.ml.evaluation \
    import MulticlassClassificationEvaluator

In [2]:
# Crear una sesión de Spark
spark = SparkSession.builder.appName('ml').getOrCreate()

24/11/07 12:46:51 WARN Utils: Your hostname, andres-b460mds3h resolves to a loopback address: 127.0.1.1; using 192.168.1.78 instead (on interface enp3s0)
24/11/07 12:46:51 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
24/11/07 12:46:51 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [28]:
# Leer el archivo CSV
df = spark.read.csv(path='./content/Covid Data.csv',
                    header = True, inferSchema=True)

# Reglas de reemplazo
columnas_reemplazar = {
    'INTUBED': {97: 0},
    'PREGNANT': {97: 0},
    'ICU': {97: 0}
}

# Realizar reemplazo de valores
for col, replacements in columnas_reemplazar.items():
    for old_val, new_val in replacements.items():
        df = df.withColumn(col,
                           F.when(F.col(col) == old_val, new_val)
                           .otherwise(F.col(col)))

# Crear columna 'DIED' y renombrar columnas
df = df.withColumn('DIED',
                   F.when(F.col('DATE_DIED') != '9999-99-99', 1)
                   .otherwise(0)) \
        .drop('DATE_DIED') \
        .withColumnRenamed('SEX', 'WOMAN') \
        .withColumnRenamed('COPD', 'EPOC')

# Eliminar filas con valores nulos
df_cleaned = df.dropna()

# Mostrar esquema y primeros registros
df_cleaned.printSchema()
df_cleaned.show()

root
 |-- USMER: integer (nullable = true)
 |-- MEDICAL_UNIT: integer (nullable = true)
 |-- WOMAN: integer (nullable = true)
 |-- PATIENT_TYPE: integer (nullable = true)
 |-- INTUBED: integer (nullable = true)
 |-- PNEUMONIA: integer (nullable = true)
 |-- AGE: integer (nullable = true)
 |-- PREGNANT: integer (nullable = true)
 |-- DIABETES: integer (nullable = true)
 |-- EPOC: integer (nullable = true)
 |-- ASTHMA: integer (nullable = true)
 |-- INMSUPR: integer (nullable = true)
 |-- HIPERTENSION: integer (nullable = true)
 |-- OTHER_DISEASE: integer (nullable = true)
 |-- CARDIOVASCULAR: integer (nullable = true)
 |-- OBESITY: integer (nullable = true)
 |-- RENAL_CHRONIC: integer (nullable = true)
 |-- TOBACCO: integer (nullable = true)
 |-- CLASIFFICATION_FINAL: integer (nullable = true)
 |-- ICU: integer (nullable = true)
 |-- DIED: integer (nullable = false)

+-----+------------+-----+------------+-------+---------+---+--------+--------+----+------+-------+------------+---------

In [11]:
# Crear datos de ejemplo
datos_ejemplo = [
    ('Juan', 'Preparatoria', 15),
    ('María', 'Primaria', 8),
    ('Felipe', 'Secundaria', None),
    ('Nuria', 'Preparatoria', 15),
    ('Enrique', 'Universidad', 20),
    ('Juan', 'Preparatoria', None),
    ('Diana', 'Secundaria', 15),
]

# Crear DataFrame de ejemplo
df_ejemplo = spark.createDataFrame(
    datos_ejemplo,['nombre', 'escolaridad', 'edad'])
df_ejemplo.show()

+-------+------------+----+
| nombre| escolaridad|edad|
+-------+------------+----+
|   Juan|Preparatoria|  15|
|  María|    Primaria|   8|
| Felipe|  Secundaria|NULL|
|  Nuria|Preparatoria|  15|
|Enrique| Universidad|  20|
|   Juan|Preparatoria|NULL|
|  Diana|  Secundaria|  15|
+-------+------------+----+



In [None]:
# Imputar valores faltantes en la columna 'edad'
imputer = Imputer(strategy='mean', inputCols=["edad"],
                  outputCols=["edad_imputada"])
df_imputado = imputer.fit(df_ejemplo).transform(df_ejemplo)
df_imputado.show()

+-------+------------+----+-------------+
| nombre| escolaridad|edad|edad_imputada|
+-------+------------+----+-------------+
|   Juan|Preparatoria|  15|           15|
|  María|    Primaria|   8|            8|
| Felipe|  Secundaria|NULL|           14|
|  Nuria|Preparatoria|  15|           15|
|Enrique| Universidad|  20|           20|
|   Juan|Preparatoria|NULL|           14|
|  Diana|  Secundaria|  15|           15|
+-------+------------+----+-------------+



In [17]:
# Indexar la columna 'escolaridad'
indexer = StringIndexer(inputCol="escolaridad",
                        outputCol="escolaridad_indexada")
df_indexado = indexer.fit(df_imputado).transform(df_imputado)
df_indexado.show()

+-------+------------+----+-------------+--------------------+
| nombre| escolaridad|edad|edad_imputada|escolaridad_indexada|
+-------+------------+----+-------------+--------------------+
|   Juan|Preparatoria|  15|           15|                 0.0|
|  María|    Primaria|   8|            8|                 2.0|
| Felipe|  Secundaria|NULL|           14|                 1.0|
|  Nuria|Preparatoria|  15|           15|                 0.0|
|Enrique| Universidad|  20|           20|                 3.0|
|   Juan|Preparatoria|NULL|           14|                 0.0|
|  Diana|  Secundaria|  15|           15|                 1.0|
+-------+------------+----+-------------+--------------------+



In [29]:
# Obtener las columnas de entrada (todas menos
# 'CLASIFFICATION_FINAL')
columnas_entrada = [col for col in df.columns
                    if col != 'CLASIFFICATION_FINAL']

# Crear el vector de características
assembler = VectorAssembler(inputCols=columnas_entrada,
                            outputCol="features")
df_with_features = assembler.transform(df_cleaned)
df_with_features.show()

+-----+------------+-----+------------+-------+---------+---+--------+--------+----+------+-------+------------+-------------+--------------+-------+-------------+-------+--------------------+---+----+--------------------+
|USMER|MEDICAL_UNIT|WOMAN|PATIENT_TYPE|INTUBED|PNEUMONIA|AGE|PREGNANT|DIABETES|EPOC|ASTHMA|INMSUPR|HIPERTENSION|OTHER_DISEASE|CARDIOVASCULAR|OBESITY|RENAL_CHRONIC|TOBACCO|CLASIFFICATION_FINAL|ICU|DIED|            features|
+-----+------------+-----+------------+-------+---------+---+--------+--------+----+------+-------+------------+-------------+--------------+-------+-------------+-------+--------------------+---+----+--------------------+
|    2|           1|    1|           1|      0|        1| 65|       2|       2|   2|     2|      2|           1|            2|             2|      2|            2|      2|                   3|  0|   1|[2.0,1.0,1.0,1.0,...|
|    2|           1|    2|           1|      0|        1| 72|       0|       2|   2|     2|      2|         

In [24]:
# Selección de características usando Chi-Squared
selector = ChiSqSelector(numTopFeatures=3,
                         featuresCol="features",
                         labelCol="CLASIFFICATION_FINAL")
df_selected_features = selector.fit(df_with_features) \
    .transform(df_with_features)
df_selected_features.show()

[Stage 83:>                                                       (0 + 14) / 14]

+-----+------------+-----+------------+-------+---------+---+--------+--------+----+------+-------+------------+-------------+--------------+-------+-------------+-------+--------------------+---+----+--------------------+----------------------------------+
|USMER|MEDICAL_UNIT|WOMAN|PATIENT_TYPE|INTUBED|PNEUMONIA|AGE|PREGNANT|DIABETES|EPOC|ASTHMA|INMSUPR|HIPERTENSION|OTHER_DISEASE|CARDIOVASCULAR|OBESITY|RENAL_CHRONIC|TOBACCO|CLASIFFICATION_FINAL|ICU|DIED|            features|ChiSqSelector_a4564d8bb20a__output|
+-----+------------+-----+------------+-------+---------+---+--------+--------+----+------+-------+------------+-------------+--------------+-------+-------------+-------+--------------------+---+----+--------------------+----------------------------------+
|    2|           1|    1|           1|      0|        1| 65|       2|       2|   2|     2|      2|           1|            2|             2|      2|            2|      2|                   3|  0|   1|[2.0,1.0,1.0,1.0,...|    

                                                                                

In [26]:
# Escalar las características
scaler = StandardScaler(inputCol="features",
                        outputCol="features_scaled")
df_scaled = scaler.fit(df_selected_features) \
    .transform(df_selected_features)
df_scaled.show()

+-----+------------+-----+------------+-------+---------+---+--------+--------+----+------+-------+------------+-------------+--------------+-------+-------------+-------+--------------------+---+----+--------------------+----------------------------------+--------------------+
|USMER|MEDICAL_UNIT|WOMAN|PATIENT_TYPE|INTUBED|PNEUMONIA|AGE|PREGNANT|DIABETES|EPOC|ASTHMA|INMSUPR|HIPERTENSION|OTHER_DISEASE|CARDIOVASCULAR|OBESITY|RENAL_CHRONIC|TOBACCO|CLASIFFICATION_FINAL|ICU|DIED|            features|ChiSqSelector_a4564d8bb20a__output|     features_scaled|
+-----+------------+-----+------------+-------+---------+---+--------+--------+----+------+-------+------------+-------------+--------------+-------+-------------+-------+--------------------+---+----+--------------------+----------------------------------+--------------------+
|    2|           1|    1|           1|      0|        1| 65|       2|       2|   2|     2|      2|           1|            2|             2|      2|            2|

                                                                                

In [31]:
# Devisión de datos en entrenamiento y prueba
train, test = df_scaled.randomSplit([0.7, 0.3])
train.show()

+-----+------------+-----+------------+-------+---------+---+--------+--------+----+------+-------+------------+-------------+--------------+-------+-------------+-------+--------------------+---+----+--------------------+----------------------------------+--------------------+
|USMER|MEDICAL_UNIT|WOMAN|PATIENT_TYPE|INTUBED|PNEUMONIA|AGE|PREGNANT|DIABETES|EPOC|ASTHMA|INMSUPR|HIPERTENSION|OTHER_DISEASE|CARDIOVASCULAR|OBESITY|RENAL_CHRONIC|TOBACCO|CLASIFFICATION_FINAL|ICU|DIED|            features|ChiSqSelector_a4564d8bb20a__output|     features_scaled|
+-----+------------+-----+------------+-------+---------+---+--------+--------+----+------+-------+------------+-------------+--------------+-------+-------------+-------+--------------------+---+----+--------------------+----------------------------------+--------------------+
|    1|           2|    1|           1|      0|        2|  4|       2|       2|   2|     2|      2|           2|            2|             1|      2|            2|

                                                                                

In [32]:
# Modelos de clasificación y regresión

# Regresión Logística
lr = LogisticRegression(featuresCol="features_scaled",
                        labelCol="CLASIFFICATION_FINAL")
lr_model = lr.fit(train)
lr_predictions = lr_model.transform(test)
lr_evaluator = MulticlassClassificationEvaluator(
    labelCol="CLASIFFICATION_FINAL", predictionCol="prediction",
    metricName="accuracy")
lr_accuracy = lr_evaluator.evaluate(lr_predictions)

# Mostrar la exactitud de la regresión logística
print(f"Logistic Regression Accuracy: {lr_accuracy}")

[Stage 326:>                                                      (0 + 14) / 14]

Logistic Regression Accuracy: 0.5391016563509363


                                                                                

In [35]:
# Regresión Lineal
linearr = LinearRegression(featuresCol="features",
                           labelCol="CLASIFFICATION_FINAL", 
                           regParam=0.1)
linearr_model = linearr.fit(train)
linearr_predictions = linearr_model.transform(test)
linearr_predictions.select("prediction", "CLASIFFICATION_FINAL").show()

                                                                                

+------------------+--------------------+
|        prediction|CLASIFFICATION_FINAL|
+------------------+--------------------+
| 5.667647875096131|                   6|
| 5.643704576207443|                   6|
| 5.363414766441022|                   3|
| 5.125386416321824|                   7|
|  4.47877847888306|                   7|
|5.1777655752307155|                   3|
|  5.23366823572977|                   7|
| 5.163720695134667|                   3|
|6.7622205774587085|                   7|
| 5.975543851185798|                   7|
| 6.736867818404116|                   7|
|6.6622190014062435|                   7|
| 5.576384374040521|                   6|
| 5.567933454355657|                   6|
| 5.559482534670793|                   3|
| 5.474973337822151|                   6|
|5.4327187393978305|                   3|
| 6.389269324556091|                   6|
| 4.218393000826522|                   7|
|5.1201305528748335|                   7|
+------------------+--------------

In [37]:
rf = RandomForestClassifier(featuresCol="features",
                            labelCol="CLASIFFICATION_FINAL")
rf_model = rf.fit(train)
rf_predictions = rf_model.transform(test)
rf_predictions.select("prediction", "CLASIFFICATION_FINAL").show()

24/11/07 13:26:33 WARN MemoryStore: Not enough space to cache rdd_887_1 in memory! (computed 2.5 MiB so far)
24/11/07 13:26:33 WARN MemoryStore: Not enough space to cache rdd_887_7 in memory! (computed 3.8 MiB so far)
24/11/07 13:26:33 WARN MemoryStore: Not enough space to cache rdd_887_8 in memory! (computed 5.8 MiB so far)
24/11/07 13:26:33 WARN MemoryStore: Not enough space to cache rdd_887_9 in memory! (computed 5.8 MiB so far)
24/11/07 13:26:33 WARN MemoryStore: Not enough space to cache rdd_887_3 in memory! (computed 3.8 MiB so far)
24/11/07 13:26:33 WARN MemoryStore: Not enough space to cache rdd_887_11 in memory! (computed 5.8 MiB so far)
24/11/07 13:26:33 WARN MemoryStore: Not enough space to cache rdd_887_13 in memory! (computed 9.0 MiB so far)
24/11/07 13:26:33 WARN MemoryStore: Not enough space to cache rdd_887_6 in memory! (computed 5.8 MiB so far)
24/11/07 13:26:33 WARN MemoryStore: Not enough space to cache rdd_887_0 in memory! (computed 5.8 MiB so far)
24/11/07 13:26:33

+----------+--------------------+
|prediction|CLASIFFICATION_FINAL|
+----------+--------------------+
|       7.0|                   6|
|       7.0|                   6|
|       7.0|                   3|
|       7.0|                   7|
|       3.0|                   7|
|       7.0|                   3|
|       7.0|                   7|
|       7.0|                   3|
|       7.0|                   7|
|       7.0|                   7|
|       7.0|                   7|
|       7.0|                   7|
|       7.0|                   6|
|       7.0|                   6|
|       7.0|                   3|
|       7.0|                   6|
|       7.0|                   3|
|       6.0|                   6|
|       3.0|                   7|
|       7.0|                   7|
+----------+--------------------+
only showing top 20 rows



                                                                                

In [39]:
# KMeans (Clustering)
kmeans = KMeans(k=3, featuresCol="features", predictionCol="prediction")
kmeans_model = kmeans.fit(train)
kmeans_predictions = kmeans_model.transform(train)
kmeans_predictions.select("prediction").show()

24/11/07 13:28:21 WARN MemoryStore: Not enough space to cache rdd_938_5 in memory! (computed 5.3 MiB so far)
24/11/07 13:28:21 WARN BlockManager: Persisting block rdd_938_5 to disk instead.
24/11/07 13:28:21 WARN MemoryStore: Not enough space to cache rdd_938_9 in memory! (computed 1544.0 KiB so far)
24/11/07 13:28:21 WARN MemoryStore: Not enough space to cache rdd_938_1 in memory! (computed 3.5 MiB so far)
24/11/07 13:28:21 WARN MemoryStore: Not enough space to cache rdd_938_0 in memory! (computed 2.3 MiB so far)
24/11/07 13:28:21 WARN MemoryStore: Not enough space to cache rdd_938_3 in memory! (computed 5.3 MiB so far)
24/11/07 13:28:21 WARN MemoryStore: Not enough space to cache rdd_938_13 in memory! (computed 5.3 MiB so far)
24/11/07 13:28:21 WARN MemoryStore: Not enough space to cache rdd_938_7 in memory! (computed 5.3 MiB so far)
24/11/07 13:28:21 WARN BlockManager: Persisting block rdd_938_3 to disk instead.
24/11/07 13:28:21 WARN BlockManager: Persisting block rdd_938_13 to dis

+----------+
|prediction|
+----------+
|         1|
|         1|
|         1|
|         1|
|         1|
|         2|
|         1|
|         1|
|         1|
|         1|
|         1|
|         1|
|         1|
|         1|
|         1|
|         1|
|         1|
|         1|
|         1|
|         1|
+----------+
only showing top 20 rows

