# Regresión Logística Titanic

In [1]:
import warnings
warnings.filterwarnings("ignore")

## Creación Spark

In [2]:
import os, subprocess

java8_home = "/Library/Java/JavaVirtualMachines/adoptopenjdk-8.jdk/Contents/Home"

os.environ["JAVA_HOME"] = java8_home
os.environ["PATH"] = os.path.join(java8_home, "bin") + os.pathsep + os.environ.get("PATH","")

os.environ["HADOOP_USER_NAME"] = os.environ.get("USER", "tomas")

print("JAVA_HOME fijado a:", os.environ["JAVA_HOME"])
try:
    print("which java (kernel):", subprocess.check_output(["which","java"]).decode().strip())
    print("java -version (kernel):")
    print(subprocess.check_output(["java","-version"], stderr=subprocess.STDOUT).decode())
except Exception as e:
    print("Error llamando a java desde kernel:", e)

JAVA_HOME fijado a: /Library/Java/JavaVirtualMachines/adoptopenjdk-8.jdk/Contents/Home
which java (kernel): /Library/Java/JavaVirtualMachines/adoptopenjdk-8.jdk/Contents/Home/bin/java
java -version (kernel):
openjdk version "1.8.0_292"
OpenJDK Runtime Environment (AdoptOpenJDK)(build 1.8.0_292-b10)
OpenJDK 64-Bit Server VM (AdoptOpenJDK)(build 25.292-b10, mixed mode)



In [3]:
from pyspark.sql import SparkSession
spark = SparkSession.builder.appName('myproj').getOrCreate()

25/09/13 21:25:09 WARN Utils: Your hostname, MacBook-Air-de-Tomas-3.local resolves to a loopback address: 127.0.0.1; using 192.168.1.4 instead (on interface en0)
25/09/13 21:25:09 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
25/09/13 21:25:10 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
Using Spark's default log4j profile: org/apache/spark/log4j-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
25/09/13 21:25:12 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.
25/09/13 21:25:12 WARN Utils: Service 'SparkUI' could not bind on port 4041. Attempting port 4042.


## Importación de datos

In [4]:
data = spark.read.csv('../PySparkCourse/MLData/titanic.csv',inferSchema=True,header=True)

                                                                                

In [5]:
data.printSchema()

root
 |-- PassengerId: integer (nullable = true)
 |-- Survived: integer (nullable = true)
 |-- Pclass: integer (nullable = true)
 |-- Name: string (nullable = true)
 |-- Sex: string (nullable = true)
 |-- Age: double (nullable = true)
 |-- SibSp: integer (nullable = true)
 |-- Parch: integer (nullable = true)
 |-- Ticket: string (nullable = true)
 |-- Fare: double (nullable = true)
 |-- Cabin: string (nullable = true)
 |-- Embarked: string (nullable = true)



In [6]:
data.columns

['PassengerId',
 'Survived',
 'Pclass',
 'Name',
 'Sex',
 'Age',
 'SibSp',
 'Parch',
 'Ticket',
 'Fare',
 'Cabin',
 'Embarked']

In [7]:
selected_cols = data.select(['Survived', 'Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare', 'Embarked'])

In [8]:
clean_data = selected_cols.na.drop()

## Transformación de Datos

- **VectorAssembler:** Concatena columnas numéricas (Vectores en una sola columna vectorial para features)
- **VectorIndexer:** Indexar la categoría (String -> Integer index)
- **OneHotEncoder:** Conversión de índice a binarios
<br>

- **StringIndexer:** Convierte strings a índices numéricos

In [None]:
from pyspark.ml.feature import VectorAssembler, VectorIndexer, OneHotEncoder, StringIndexer

In [10]:
gender_indexer = StringIndexer(inputCol='Sex',outputCol='SexIndex')
gender_encoder = OneHotEncoder(inputCol='SexIndex',outputCol='SexVec')
embark_indexer = StringIndexer(inputCol='Embarked',outputCol='EmbarkIndex')
embark_encoder = OneHotEncoder(inputCol='EmbarkIndex',outputCol='EmbarkVec')

In [11]:
assembler = VectorAssembler(inputCols=['Pclass', 'SexVec', 'Age', 'SibSp', 'Parch', 'Fare', 'EmbarkVec'], outputCol='features')

## Modelo

- **Pipeline:** Encadena transformaciones

In [12]:
from pyspark.ml import Pipeline
from pyspark.ml.classification import LogisticRegression

In [13]:
log_reg_titanic = LogisticRegression(featuresCol='features',labelCol='Survived')

In [18]:
pipeline = Pipeline(stages=[gender_indexer,embark_indexer,
                           gender_encoder,embark_encoder,
                           assembler,log_reg_titanic])

In [21]:
train_titanic_data, test_titanic_data = clean_data.randomSplit([0.7,.3])
fit_model = pipeline.fit(train_titanic_data)
results = fit_model.transform(test_titanic_data)

## Evaluación del Modelo

In [33]:
from pyspark.ml.evaluation import BinaryClassificationEvaluator
from pyspark.sql.functions import col

In [24]:
results.select('Survived','features','prediction').show(10)

+--------+--------------------+----------+
|Survived|            features|prediction|
+--------+--------------------+----------+
|       0|[1.0,1.0,29.0,1.0...|       0.0|
|       0|[1.0,1.0,36.0,0.0...|       1.0|
|       0|[1.0,1.0,36.0,1.0...|       0.0|
|       0|[1.0,1.0,37.0,0.0...|       1.0|
|       0|[1.0,1.0,38.0,0.0...|       0.0|
|       0|(8,[0,1,2,6],[1.0...|       0.0|
|       0|[1.0,1.0,45.0,0.0...|       0.0|
|       0|[1.0,1.0,50.0,1.0...|       0.0|
|       0|[1.0,1.0,51.0,0.0...|       0.0|
|       0|[1.0,1.0,54.0,0.0...|       0.0|
+--------+--------------------+----------+
only showing top 10 rows



In [30]:
evaluator = BinaryClassificationEvaluator(
    rawPredictionCol='rawPrediction', 
    labelCol='Survived', 
    metricName='areaUnderROC'
)
auc_roc = evaluator.evaluate(results)
print("AUC (ROC):", auc_roc)

# AUC (PR) si quieres precisión-recall area
auc_pr = evaluator.setMetricName("areaUnderPR").evaluate(results)
print("AUC (PR):", auc_pr)

AUC (ROC): 0.8539886039886044
AUC (PR): 0.8312582236915125


**Análisis**

- **AUC (ROC):** área bajo la curva ROC. Mide la capacidad del modelo para ordenar ejemplos positivos por encima de negativos sin depender de un umbral (1 = perfecto, 0.5 = aleatorio). **Indica que el modelo ordena bien las instancias con un 85% de que un positivo tenga score mayor que un negativo**
<br>

- **AUC PR:** área bajo la curva Precisión-Recall. Confirma que sobre la clase positiva, el rendimiento en la región de interés también es bueno

In [34]:
tp = results.filter((col("Survived") == 1.0) & (col("prediction") == 1.0)).count()
tn = results.filter((col("Survived") == 0.0) & (col("prediction") == 0.0)).count()
fp = results.filter((col("Survived") == 0.0) & (col("prediction") == 1.0)).count()
fn = results.filter((col("Survived") == 1.0) & (col("prediction") == 0.0)).count()
total = results.count()

accuracy = (tp + tn) / total if total > 0 else 0
precision_pos = tp / (tp + fp) if (tp + fp) > 0 else 0
recall_pos = tp / (tp + fn) if (tp + fn) > 0 else 0
f1_pos = 2 * precision_pos * recall_pos / (precision_pos + recall_pos) if (precision_pos + recall_pos) > 0 else 0

In [36]:
print("\nConfusion Matrix and Metrics:")
print(f"TP (1,1): {tp}")
print(f"TN (0,0): {tn}")
print(f"FP (0,1): {fp}")
print(f"FN (1,0): {fn}")
print(f"Total: {total}")
print(f"Accuracy: {accuracy:.4f}")
print(f"Precision (pos): {precision_pos:.4f}")
print(f"Recall (pos): {recall_pos:.4f}")
print(f"F1 (pos): {f1_pos:.4f}")

results.groupBy("Survived", "prediction").count().orderBy("Survived", "prediction").show()


Confusion Matrix and Metrics:
TP (1,1): 55
TN (0,0): 110
FP (0,1): 16
FN (1,0): 23
Total: 204
Accuracy: 0.8088
Precision (pos): 0.7746
Recall (pos): 0.7051
F1 (pos): 0.7383


                                                                                

+--------+----------+-----+
|Survived|prediction|count|
+--------+----------+-----+
|       0|       0.0|  110|
|       0|       1.0|   16|
|       1|       0.0|   23|
|       1|       1.0|   55|
+--------+----------+-----+



**Análisis**

- **Accuracy ≈ 80.9%:** buen nivel global, pero en clasificación binaria la accuracy puede ocultar errores si hay costos diferentes para FP y FN.

- **Precision ≈ 77.5%:** cuando el modelo predice “sobrevivió”, ~77.5% de esos casos son correctos — es una tasa razonable.

- **Recall ≈ 70.5%:** detecta ~70% de los sobrevivientes reales; se pierden ~29.5% de positivos (FN).

- **F1 ≈ 73.8%:** balance decente entre precision y recall.

- **FP = 16 (12.7% FPR):** hay falsos positivos, pero relativamente pocos comparado con los negativos totales.

- **FN = 23:** se perdieron 23 sobrevivientes reales — si el negocio considera costoso no detectar a un sobreviviente, esto puede ser importante.