In [None]:
# Optional: clear the interactive namespace (all variables and imported names) without
# restarting the kernel. Left disabled; uncomment the magic below to use it.
#%reset -f

*Configuración global del entorno*

In [42]:
# ============================================================
# (Importaciones, semilla, estilo)
# ============================================================

# --- Librerías base ---
import os, sys, random, re
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# --- Scikit-learn (modelos, métricas, CV, etc.) ---
from sklearn.model_selection import (
    train_test_split, StratifiedKFold, KFold, GridSearchCV, cross_val_score
)
from sklearn.metrics import (
    f1_score, classification_report, confusion_matrix,
    ConfusionMatrixDisplay, mean_squared_error, accuracy_score, zero_one_loss
)
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import BernoulliNB
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeRegressor, plot_tree
from sklearn.datasets import make_classification, make_blobs
from sklearn.preprocessing import StandardScaler

# --- Librerías NLP ---
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

# --- Librerías de ISLP (para Carseats) ---
from ISLP import load_data

# --- Global configuration and reproducibility ---
SEED = 42
random.seed(SEED)       # Python's built-in RNG
np.random.seed(SEED)    # NumPy's legacy global RNG
# NOTE: hash randomization is fixed when the interpreter starts, so this value is only
# picked up by child processes launched afterwards, not by the running kernel.
os.environ["PYTHONHASHSEED"] = str(SEED)

# --- Default look of matplotlib figures ---
plt.rcParams.update({
    "figure.figsize": (8, 5),
    "axes.grid": True,
    "figure.dpi": 110,
    "font.size": 11,
})

# --- NLTK downloads (only needed the first time) ---
for nltk_resource in ('punkt', 'stopwords', 'wordnet', 'omw-1.4'):
    nltk.download(nltk_resource, quiet=True)

# --- NLP resources shared across the notebook ---
lem = WordNetLemmatizer()
stop_en = set(stopwords.words('english'))

print("Entorno cargado correctamente - listo para Part A, B y C")


Entorno cargado correctamente - listo para Part A, B y C


In [None]:
# Stand-alone fallback for fetching the NLTK resources this notebook uses.
# The environment-setup cell already requests the same four resources, so this cell is
# only useful when run in isolation. quiet=True matches that cell and keeps download
# logs (and a stray `True` result) out of the notebook output.
import nltk

for nltk_resource in ('punkt', 'stopwords', 'wordnet', 'omw-1.4'):
    nltk.download(nltk_resource, quiet=True)


# Part A: Binary Classification on Text Data

## PASO 1 — Carga del dataset y split 70/30 estratificado

In [43]:

# 1) Locate the CSV whether the notebook is run from the project root or from /src
paths = ["data/disaster_tweets.csv", "../data/disaster_tweets.csv"]
csv_path = None
for candidate in paths:
    # Keep the first candidate that exists on disk
    if os.path.exists(candidate):
        csv_path = candidate
        break
assert csv_path, "No se encontró 'data/disaster_tweets.csv'. Verifica la ruta."

In [44]:
# 2) Read the CSV and make sure the two columns used downstream are present
df = pd.read_csv(csv_path)
required_cols = {"text", "target"}
assert required_cols.issubset(df.columns), "Faltan columnas requeridas: 'text' y/o 'target'"


In [48]:

# 3) Class balance: features as strings, labels as integers
X_text = df["text"].astype(str)
y = df["target"].astype(int)

# The mean of the integer labels, scaled to a percentage, is the share of target=1
pct1 = 100 * y.mean()
print(f"Porcentaje target=1 (desastre): {pct1:.2f}% | target=0: {100-pct1:.2f}%")

Porcentaje target=1 (desastre): 42.97% | target=0: 57.03%


In [49]:
# 4) Stratified 70/30 split (stratifying on y, seeded with SEED)
split_options = dict(test_size=0.30, stratify=y, random_state=SEED)
X_tr, X_te, y_tr, y_te = train_test_split(X_text, y, **split_options)

print(f"Train: {len(X_tr)} | Test: {len(X_te)}")

Train: 5329 | Test: 2284


## PASO 2 — Preprocesamiento (limpieza de texto)

In [55]:
# ==========================
# STEP 2: TEXT PREPROCESSING
# ==========================
# Pre-compiled patterns for the tweet artefacts removed before tokenising
pat_url = re.compile(r'https?://\S+|www\.\S+')
pat_at  = re.compile(r'@\w+')

# English stop-word set and WordNet lemmatizer used by clean_tweet
lem = WordNetLemmatizer()
stop_en = set(stopwords.words('english'))

def clean_tweet(s: str) -> str:
    """Normalise a raw tweet into a space-separated string of lemmatised tokens.

    Steps, in order: lowercase the text; blank out URLs and @mentions; blank out
    '#' (the hashtag word itself is kept); blank out every character that is not
    a-z, 0-9 or whitespace; split on whitespace; drop tokens found in ``stop_en``;
    lemmatise the remaining tokens with ``lem``.

    Args:
        s: Raw tweet text.

    Returns:
        The surviving lemmatised tokens joined by single spaces (an empty string
        when no token survives).
    """
    text = pat_at.sub(' ', pat_url.sub(' ', s.lower()))
    text = re.sub(r'#', ' ', text)
    text = re.sub(r'[^a-z0-9\s]', ' ', text)
    # Stop words are filtered before lemmatising, so only kept tokens are lemmatised
    return " ".join(lem.lemmatize(tok) for tok in text.split() if tok not in stop_en)

# Clean the train and test texts with the same function
X_tr_clean, X_te_clean = (part.apply(clean_tweet) for part in (X_tr, X_te))

# Side-by-side preview of the first three training tweets, raw vs. cleaned
preview_cols = {"raw": X_tr.head(3).values, "clean": X_tr_clean.head(3).values}
pd.DataFrame(preview_cols)

Unnamed: 0,raw,clean
0,Las Vegas in top 5 cities for red-light runnin...,la vega top 5 city red light running fatality
1,Do you feel like you are sinking in unhappines...,feel like sinking unhappiness take quiz
2,The Architect Behind Kanye WestÛªs Volcano ht...,architect behind kanye west volcano


## PASO 3 — Bag of Words binario (solo fit en train)

In [56]:
# =====================================
# STEP 3: BINARY BoW (CountVectorizer)
# =====================================
M = 5  # minimum document frequency; 3–10 is reasonable. Cuts noise from ultra-rare terms.
vec_uni = CountVectorizer(min_df=M, binary=True)

# The vocabulary is learned from the training texts only; the test texts are
# merely projected onto it.
Xtr_uni = vec_uni.fit_transform(X_tr_clean)
Xte_uni = vec_uni.transform(X_te_clean)

vocab_size = len(vec_uni.vocabulary_)
print("Tamaño del vocabulario (unigramas):", vocab_size)

Tamaño del vocabulario (unigramas): 1952


## PASO 4 — Regresión Logística (none, L1, L2) con F1

In [57]:
# (a) No regularisation. penalty=None is accepted from scikit-learn 1.2 onwards
#     (it replaced the deprecated string 'none').
lr_none = LogisticRegression(penalty=None, solver='lbfgs', max_iter=5000).fit(Xtr_uni, y_tr)

# F1 on the data the model was fitted on vs. the held-out test split
pred_tr_none = lr_none.predict(Xtr_uni)
pred_te_none = lr_none.predict(Xte_uni)
f1_tr_none = f1_score(y_tr, pred_tr_none)
f1_te_none = f1_score(y_te, pred_te_none)
print(f"[LR none] F1 train={f1_tr_none:.3f} | test={f1_te_none:.3f}")

[LR none] F1 train=0.967 | test=0.705


In [58]:
# (b) L1 penalty, with C selected by 5-fold stratified cross-validation on F1
C_grid = np.logspace(-2, 2, 11)  # 0.01 ... 100
cv5 = StratifiedKFold(n_splits=5, shuffle=True, random_state=SEED)
# The liblinear solver uses randomness internally. Without an explicit random_state
# the fits fall back to the global NumPy RNG, and with n_jobs=-1 they run in worker
# processes, so the search is not pinned to SEED. Passing random_state=SEED makes
# this search repeatable.
lr_l1 = LogisticRegression(
    penalty='l1', solver='liblinear', max_iter=5000, random_state=SEED
)
gs_l1 = GridSearchCV(lr_l1, {"C": C_grid}, scoring="f1", cv=cv5, n_jobs=-1)
gs_l1.fit(Xtr_uni, y_tr)

# best_estimator_ is the winning configuration refitted on the full training matrix
best_l1 = gs_l1.best_estimator_
f1_tr_l1 = f1_score(y_tr, best_l1.predict(Xtr_uni))
f1_te_l1 = f1_score(y_te, best_l1.predict(Xte_uni))
print(f"[LR L1] best={gs_l1.best_params_} | F1 train={f1_tr_l1:.3f} | test={f1_te_l1:.3f}")


[LR L1] best={'C': np.float64(2.5118864315095824)} | F1 train=0.891 | test=0.737


In [59]:
# (c) L2 penalty, with C selected by cross-validation on F1 (uses C_grid and cv5)
l2_param_grid = {"C": C_grid}
lr_l2 = LogisticRegression(penalty='l2', solver='lbfgs', max_iter=5000)
gs_l2 = GridSearchCV(lr_l2, l2_param_grid, scoring="f1", cv=cv5, n_jobs=-1).fit(Xtr_uni, y_tr)

# Score the refitted best model on the training and test matrices
best_l2 = gs_l2.best_estimator_
pred_tr_l2 = best_l2.predict(Xtr_uni)
pred_te_l2 = best_l2.predict(Xte_uni)
f1_tr_l2 = f1_score(y_tr, pred_tr_l2)
f1_te_l2 = f1_score(y_te, pred_te_l2)
print(f"[LR L2] best={gs_l2.best_params_} | F1 train={f1_tr_l2:.3f} | test={f1_te_l2:.3f}")


[LR L2] best={'C': np.float64(0.3981071705534973)} | F1 train=0.838 | test=0.757


In [60]:

# Train/test F1 of the three logistic-regression variants, one row per model
summary_rows = [
    ("LR none", f1_tr_none, f1_te_none),
    ("LR L1", f1_tr_l1, f1_te_l1),
    ("LR L2", f1_tr_l2, f1_te_l2),
]
pd.DataFrame(summary_rows, columns=["Modelo", "F1_train", "F1_test"])

Unnamed: 0,Modelo,F1_train,F1_test
0,LR none,0.967387,0.704534
1,LR L1,0.891462,0.737008
2,LR L2,0.838208,0.757143


## A.5 Interpretación del modelo L1 (palabras clave)