In [None]:
from pyspark.sql import SparkSession
from pyspark.sql import functions as F

In [3]:
# Create the Spark session for this notebook (or reuse an active one),
# registered under the application name 'ml'.
spark = (
    SparkSession.builder
    .appName('ml')
    .getOrCreate()
)

PySparkRuntimeError: [JAVA_GATEWAY_EXITED] Java gateway process exited before sending its port number.

In [None]:
# Read the Covid CSV file: the first row is used as the header and column
# types are inferred from the data.
df = spark.read.csv(
    path='/content/Covid Data.csv',
    header=True,
    inferSchema=True,
)

In [None]:
# Show the schema and the first records of the freshly loaded DataFrame
df.printSchema()
df.show()

In [None]:
# Replacement rules: for each column, a mapping {old_value: new_value}.
# Every inner value MUST be a dict (note the colon): the loop below iterates
# it with .items(), which a set literal such as {97, 0} does not provide.
columnas_reemplazar = {
    'INTUBED': {97: 0},
    'PREGNANT': {97: 0},
    'ICU': {97: 0},
}

# Apply the replacements: in each listed column, rows equal to the old value
# receive the new value; every other row keeps its current value.
for column_name, replacements in columnas_reemplazar.items():
    for old_val, new_val in replacements.items():
        df = df.withColumn(
            column_name,
            F.when(F.col(column_name) == old_val, new_val)
            .otherwise(F.col(column_name)),
        )

In [None]:
# Build the binary 'DIED' column from 'DATE_DIED': 1 when the date differs
# from the '9999-99-99' placeholder, 0 otherwise. The source date column is
# dropped afterwards.
indicador_died = F.when(F.col('DATE_DIED') != '9999-99-99', 1).otherwise(0)
df = df.withColumn('DIED', indicador_died).drop('DATE_DIED')

In [None]:
# Rename columns, one (current name, new name) pair at a time:
# 'SEX' becomes 'WOMAN' and 'COPD' becomes 'EPOC'.
for nombre_actual, nombre_nuevo in (('SEX', 'WOMAN'), ('COPD', 'EPOC')):
    df = df.withColumnRenamed(nombre_actual, nombre_nuevo)

In [None]:
# Show the final result: schema and first records after the cleaning steps
df.printSchema()
df.show()

In [None]:
from pyspark.ml.feature import StringIndexer, Imputer

In [None]:
# Small in-memory example: (name, schooling level, age) tuples. Two rows have
# a missing age (None) so the imputation step has something to fill in.
datos_ejemplo = [
    ('Juan', 'Preparatoria', 15),
    ('María', 'Primaria', 8),
    ('Felipe', 'Secundaria', None),
    ('Nuria', 'Preparatoria', 15),
    ('Enrique', 'Universidad', 20),
    ('Juan', 'Preparatoria', None),
    ('Diana', 'Secundaria', 15),
]

# Distribute the rows as an RDD, then convert it to a DataFrame with named columns.
columnas_ejemplo = ['nombre', 'escolaridad', 'edad']
rdd_ejemplo = spark.sparkContext.parallelize(datos_ejemplo)
df_ejemplo = rdd_ejemplo.toDF(columnas_ejemplo)
df_ejemplo.show()

In [None]:
# Fill missing 'edad' values using the 'mean' strategy and write the result to
# a new column, 'edad_imputada'.
imputer = Imputer(
    inputCols=["edad"],
    outputCols=["edad_imputada"],
    strategy='mean',  # alternatives noted by the author: median, mode
)
modelo_imputer = imputer.fit(df_ejemplo)
df_imputado = modelo_imputer.transform(df_ejemplo)

In [None]:
# Display the example data together with the new 'edad_imputada' column
df_imputado.show()

In [None]:
# Encode the string column 'escolaridad' as numeric indices in the new column
# 'escolaridad_indexada'.
indexer = StringIndexer(
    inputCol="escolaridad",
    outputCol="escolaridad_indexada",
)
modelo_indexer = indexer.fit(df_imputado)
df_indexado = modelo_indexer.transform(df_imputado)

In [None]:
# Display the example data together with the new 'escolaridad_indexada' column
df_indexado.show()

In [None]:
# Print the schema of `df_schema`.
# NOTE(review): `df_schema` is not defined in any cell visible in this
# notebook — presumably it was created in a cell that is not shown or was
# deleted. Confirm its origin; otherwise this fails on Restart & Run All.
df_schema.printSchema()

In [None]:
# Remove rows with missing values from `df_schema`, using dropna()'s default
# settings, and keep the result under a new name.
df_dropna = df_schema.dropna()

In [None]:
# Row count before dropping rows with missing values
df_schema.count()

In [None]:
# Row count after dropping rows with missing values
df_dropna.count()

In [None]:
from pyspark.ml.feature import VectorAssembler, ChiSqSelector

In [None]:
# Inspect the schema of the DataFrame that feeds the feature assembler
df_dropna.printSchema()

In [None]:
# Columns to combine into a single feature vector.
# NOTE(review): these names must all exist in `df_dropna`, which derives from
# `df_schema` rather than from the `df` cleaned earlier — verify against its schema.
inputCols = [
    'USMER',
    'MEDICAL_UNIT',
    'WOMAN',
    'PATIENT_TYPE',
    'INTUBED',
    'PNEUMONIA',
    'AGE',
    'PREGNANT',
    'DIABETES',
    'PULMONARY_CHRONIC',
    'ASTHMA',
    'INMSUPR',
    'HIPERTENSION',
    'OTHER_DISEASE',
    'CARDIOVASCULAR',
    'OBESITY',
    'RENAL_CHRONIC',
    'TOBACCO',
    'ICU',
    'DIED',
]

# Assemble the listed columns into one vector column named 'features'.
assembler = VectorAssembler(inputCols=inputCols, outputCol="features")
df_with_features = assembler.transform(df_dropna)

# Show up to 100 assembled vectors without truncating their text.
df_with_features.select('features').show(n=100, truncate=False)

In [None]:
# Preview the first 3 rows, including the assembled 'features' column
df_with_features.show(3)

In [None]:
# Feature selection with Chi-Square: keep the top 3 entries of 'features',
# scored against the label column 'CLASIFFICATION_FINAL', and store them in
# 'selected_features'.
# NOTE(review): ChiSqSelector is documented as deprecated since Spark 3.1 in
# favor of UnivariateFeatureSelector — confirm against the Spark version in use.
selector = ChiSqSelector(
    numTopFeatures=3,
    featuresCol="features",
    labelCol="CLASIFFICATION_FINAL",
    outputCol="selected_features",
)
modelo_selector = selector.fit(df_with_features)
df_sel = modelo_selector.transform(df_with_features)
df_sel.show(5)

In [None]:
from pyspark.ml.feature import StandardScaler

In [None]:
# Scale the assembled 'features' vector with StandardScaler (default settings)
# and store the result in 'features_scaled'.
scaler = StandardScaler(
    inputCol="features",
    outputCol="features_scaled",
)
modelo_scaler = scaler.fit(df_with_features)
df_escalado = modelo_scaler.transform(df_with_features)

In [None]:
# Display the DataFrame together with the new 'features_scaled' column
df_escalado.show()