# FERNANDO LEON FRANCO

In [1]:
import os
import re
import xml.etree.ElementTree as ET

import matplotlib.pyplot as plt
import nltk
import numpy as np
import polars as pl
from wordcloud import WordCloud
from bs4 import BeautifulSoup
from colorstreak import Logger
from nltk.corpus import stopwords
from nltk.tokenize import TweetTokenizer
from sklearn import metrics, preprocessing, svm
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.metrics import confusion_matrix, precision_recall_fscore_support
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.preprocessing import normalize

In [2]:
def print_bar(i, cantidad_registros, contexto="PROGRESO"):
    """Draw a 50-cell emoji progress bar on the current output line.

    Args:
        i: Zero-based index of the item that was just processed.
        cantidad_registros: Total number of items expected.
        contexto: Label printed in front of the bar.

    The number of completed items is clamped to ``[0, cantidad_registros]`` so
    the bar never reports more than 100 %, and a non-positive total is drawn
    as a finished bar instead of raising ``ZeroDivisionError``.
    """
    ancho = 50
    if cantidad_registros > 0:
        hechos = min(max(i + 1, 0), cantidad_registros)
        porcentaje = hechos / cantidad_registros * 100
        llenas = int(ancho * hechos / cantidad_registros)
    else:
        # Nothing to process: show the bar as complete.
        porcentaje = 100.0
        llenas = ancho

    barra = llenas * "🟩"
    espacio = (ancho - llenas) * "⬛️"

    # "\r" + end="" rewrites the same line on every call.
    print(f"\r{contexto}: |{barra}{espacio}| {porcentaje:6.2f}%", end="", flush=True)

In [3]:

# ======================= Text cleaning =======================
_STOPWORDS_ES_CACHE = None


def _stopwords_es():
    """Return the Spanish NLTK stop words as a set, reading the corpus only once."""
    global _STOPWORDS_ES_CACHE
    if _STOPWORDS_ES_CACHE is None:
        _STOPWORDS_ES_CACHE = frozenset(stopwords.words("spanish"))
    return _STOPWORDS_ES_CACHE


def limpiar_texto(texto):
    """Normalise one raw text for bag-of-words processing.

    Steps: strip HTML markup, lowercase, drop URLs, drop ``@mentions``, remove
    ``#`` characters (the hashtag word itself is kept) and remove Spanish stop
    words from the whitespace-separated tokens.

    Args:
        texto: Raw text; ``None`` or an empty string is accepted.

    Returns:
        The cleaned text with tokens joined by single spaces; ``""`` when the
        input is ``None`` or empty.
    """
    # Guard first: BeautifulSoup cannot parse None (e.g. an XML element without text).
    if not texto:
        return ""

    texto = BeautifulSoup(texto, "html.parser").get_text()
    texto = texto.lower()
    texto = re.sub(r"http\S+|www\S+|https\S+", "", texto)
    texto = re.sub(r"@\w+", "", texto)
    texto = re.sub(r"#+", "", texto)

    stop_words = _stopwords_es()
    return " ".join(word for word in texto.split() if word not in stop_words)


# ======================= Data loading =======================
def get_texts_from_folder(path_folder):
    """Load one cleaned text and one label per author XML file in a folder.

    ``truth.txt`` inside ``path_folder`` is read as ``:::``-separated lines
    whose first field is the file id and whose second field is the label used
    here (gender). To classify by the third field instead, use ``parts[2]``.

    Each ``*.xml`` file contributes a single text: every ``<document>``
    element is cleaned with ``limpiar_texto`` and the results are joined with
    spaces.

    Args:
        path_folder: Directory holding the ``*.xml`` files and ``truth.txt``.

    Returns:
        ``(tr_txt, tr_y)``: two lists of equal length, aligned by position.
        XML files without an entry in ``truth.txt`` are skipped so texts and
        labels can never get out of step.
    """
    truth_file = os.path.join(path_folder, "truth.txt")

    file_to_label = {}
    with open(truth_file, "r", encoding="utf-8") as f:
        for line in f:
            parts = line.strip().split(":::")
            if len(parts) < 2:
                # Blank or malformed line: nothing to map.
                continue
            file_to_label[parts[0]] = parts[1]

    # Sorted so the document order does not depend on the filesystem.
    xml_files = sorted(name for name in os.listdir(path_folder) if name.endswith(".xml"))

    tr_txt = []  # one concatenated document per author
    tr_y = []  # label for the document at the same position
    for pos, file in enumerate(xml_files):
        file_id = file.split(".")[0]
        if file_id in file_to_label:
            root = ET.parse(os.path.join(path_folder, file)).getroot()
            # Element.text is None for empty elements; treat those as "".
            docs = [limpiar_texto(doc.text or "") for doc in root.iter("document")]
            tr_txt.append(" ".join(docs))
            tr_y.append(file_to_label[file_id])

        print_bar(pos, len(xml_files), contexto="CARGA DE ETIQUETAS")

    return tr_txt, tr_y


In [4]:
# ======================= Data loading =======================
# Base directory of the author-profiling corpus. Override it with the
# AUTHOR_PROFILING_DIR environment variable to run on another machine; the
# default keeps the original location.
DATA_DIR = os.environ.get(
    "AUTHOR_PROFILING_DIR",
    '/Users/ferleon/Github/semestre_v/procesamiento_lenguaje/data/author_profiling',
)
path_test = os.path.join(DATA_DIR, 'es_test')
path_train = os.path.join(DATA_DIR, 'es_train')
tr_txt_train, tr_y_train = get_texts_from_folder(path_train)
#tr_txt_test, tr_y_test = get_texts_from_folder(path_test)

print(f"\nTextos train: {len(tr_txt_train)}, Etiquetas train: {len(tr_y_train)}")
#print(f"Textos test: {len(tr_txt_test)}, Etiquetas test: {len(tr_y_test)}")


# Map every distinct label to an integer id (sorted so ids are stable).
# NOTE(review): the names say "paises" but the loader in this file returns the
# second truth.txt field (named `genero` there) - confirm which label is meant.
paises = sorted(list(set(tr_y_train)))
paises_numericas = {pais: idx for idx, pais in enumerate(paises)}




y_train = [paises_numericas[pais] for pais in tr_y_train]
#y_test = [paises_numericas[pais] for pais in tr_y_test]


If you meant to use Beautiful Soup to parse the web page found at a certain URL, then something has gone wrong. You should use an Python package like 'requests' to fetch the content behind the URL. Once you have the content as a string, you can feed that string into Beautiful Soup.



    
  texto = BeautifulSoup(texto, "html.parser").get_text()


CARGA DE ETIQUETAS: |🟩🟩🟩🟩🟩🟩🟩🟩🟩🟩🟩🟩🟩🟩🟩🟩🟩🟩🟩🟩🟩🟩🟩🟩🟩🟩🟩🟩🟩🟩🟩🟩🟩🟩🟩🟩🟩🟩🟩🟩🟩🟩🟩🟩🟩🟩🟩🟩🟩🟩| 100.02%
Textos train: 4200, Etiquetas train: 4200


In [None]:
tokenizer = TweetTokenizer()
# ======================= Parameters =======================
TOP_PALABRAS = 1_000
MAX_ITERACIONES = 1_000

# ======================= Token list per document =======================

tokens_por_documento = []
for doc in tr_txt_train:
    toks = tokenizer.tokenize(doc.lower())
    tokens_por_documento.append(toks)

print(f"Tamaño del tokens por documento: {len(tokens_por_documento)}")
# Show only the first 10 tokens of the first 3 documents: each document is a
# whole author timeline, so printing them in full floods the output.
print(f"Primeros 3 documentos: {[toks[:10] for toks in tokens_por_documento[:3]]}")

# Document frequency: in how many documents each token appears at least once.
vocabulario_por_documento = {}
for doc in tokens_por_documento:
    palabras_unicas = set(doc)
    for token in palabras_unicas:
        vocabulario_por_documento[token] = vocabulario_por_documento.get(token, 0) + 1
Logger.debug(f"Total de palabras únicas en vocabulario: {len(vocabulario_por_documento)}")
Logger.debug(f"Vocabulario por documento: {list(vocabulario_por_documento.items())[:10]}")

# ======================= Word corpus =======================

# Reuse the tokens computed above instead of tokenising the corpus a second
# time, keeping only purely alphabetic (Spanish letters) tokens.
corpus_de_palabras = [
    token
    for toks in tokens_por_documento
    for token in toks
    if re.match(r"^[a-zA-ZáéíóúÁÉÍÓÚñÑ]+$", token)
]

stop_words = stopwords.words('spanish')
# Set membership is O(1); scanning the stop-word list for every corpus token is not.
stop_words_set = set(stop_words)
corpus_de_palabras_clean = [token for token in corpus_de_palabras if token not in stop_words_set]

print(f"Tamaño del corpus de palabras: {len(corpus_de_palabras)}")
print(f"Primeros 10 tokens del corpus: {corpus_de_palabras[:10]}")

In [None]:
# Frequency distributions over the full corpus and over the stop-word-free corpus.
distribucion_frecuencias = nltk.FreqDist(corpus_de_palabras)
distribucion_frecuencias_clean = nltk.FreqDist(corpus_de_palabras_clean)

# Vocabulary = the TOP_PALABRAS most frequent words of each distribution.
vocabulario = [word for word, _ in distribucion_frecuencias.most_common(TOP_PALABRAS)]
vocabulario_clean = [word for word, _ in distribucion_frecuencias_clean.most_common(TOP_PALABRAS)]
print(f"Tamaño del vocabulario: {len(vocabulario)} | Primeras 10 palabras: {vocabulario[:10]}")
print(f"Tipo de vocabulario: {type(vocabulario)}")

# word -> column index in the bag-of-words matrices.
dict_indices = {word: i for i, word in enumerate(vocabulario)}
dict_indices_clean = {word: i for i, word in enumerate(vocabulario_clean)}
print(f"Tipo de dict_indices: {type(dict_indices)}")
# Print only the first 10 entries, as the message says, not the whole mapping.
print(f"Diccionario de índices (primeras 10 entradas): {list(dict_indices.items())[:10]}")

In [None]:
def built_bow_tr_binario(tr_txt, vocabulario, dict_indices):
    """Build a binary (presence/absence) bag-of-words matrix.

    Args:
        tr_txt: Sequence of documents; entries that are empty or not ``str``
            are left as all-zero rows.
        vocabulario: Vocabulary; only its length is used (number of columns).
        dict_indices: Mapping ``word -> column index``.

    Returns:
        ``np.int8`` array of shape ``(len(tr_txt), len(vocabulario))`` with a 1
        wherever the document contains the word.
    """
    matriz = np.zeros((len(tr_txt), len(vocabulario)), dtype=np.int8)
    for fila, texto in enumerate(tr_txt):
        if texto and isinstance(texto, str):
            # Uses the notebook-level TweetTokenizer instance.
            presentes = {
                dict_indices[tok]
                for tok in tokenizer.tokenize(texto.lower())
                if tok in dict_indices
            }
            for columna in presentes:
                matriz[fila, columna] = 1
    return matriz

In [None]:
# Binary bag-of-words for the training texts over the full (non-clean) vocabulary.
bow_tr_binario = built_bow_tr_binario(tr_txt_train, vocabulario, dict_indices)
bow_tr_binario

# Ejercicio 1 | Clasificación de autores usando Bag of Words Binario

In [None]:
# 80/20 stratified hold-out split on the binary bag-of-words features.
x_train_80, x_val_20, y_train_80, y_val_20 = train_test_split(bow_tr_binario, y_train, test_size=0.2, stratify=y_train, random_state=42)

# Grid-search the SVM regularisation strength with 5-fold CV, optimising macro F1.
parametros = {'C': [0.01, 0.05, 0.1, 0.5, 1, 2, 5, 10]}
svr = svm.LinearSVC(class_weight='balanced', max_iter=MAX_ITERACIONES)

grid = GridSearchCV(estimator=svr, param_grid=parametros, n_jobs=4, scoring='f1_macro', cv=5)
grid.fit(x_train_80, y_train_80)
y_pred = grid.predict(x_val_20)

# Macro averaging, like every other experiment, so the comparison table
# compares like with like (this cell previously used 'weighted').
prec_bow_binario, rec_bow_binario, f1_bow_binario, _ = precision_recall_fscore_support(y_val_20, y_pred, average='macro')

print(confusion_matrix(y_val_20, y_pred))
print(metrics.classification_report(y_val_20, y_pred))


# Ejercicio 2 | Clasificación de autores usando Bag of Words Frecuencia

In [None]:
def built_bow_tr_frecuencia(tr_txt, vocabulario, dict_indices):
    """Build a term-frequency (raw count) bag-of-words matrix.

    Args:
        tr_txt: Sequence of documents; entries that are empty or not ``str``
            are left as all-zero rows.
        vocabulario: Vocabulary; only its length is used (number of columns).
        dict_indices: Mapping ``word -> column index``.

    Returns:
        Integer array of shape ``(len(tr_txt), len(vocabulario))`` holding how
        many times each vocabulary word occurs in each document.
    """
    matriz = np.zeros((len(tr_txt), len(vocabulario)), dtype=int)
    for fila, texto in enumerate(tr_txt):
        if texto and isinstance(texto, str):
            # Uses the notebook-level TweetTokenizer instance.
            conteos = nltk.FreqDist(tokenizer.tokenize(texto.lower()))
            for palabra, veces in conteos.items():
                columna = dict_indices.get(palabra)
                if columna is not None:
                    matriz[fila, columna] = veces
    return matriz

In [None]:
# 1) TF: raw term counts from built_bow_tr_frecuencia, cast to float
bow_train_frecuencia = built_bow_tr_frecuencia(tr_txt_train, vocabulario, dict_indices).astype(float)
print(f"BoW de frecuencia (shape): {bow_train_frecuencia.shape}")

# 2) DF: number of documents containing each vocabulary term, computed from
#    the already tokenised documents in tokens_por_documento
N = len(tokens_por_documento)  # number of documents
df = np.zeros(len(dict_indices), dtype=int)

for doc in tokens_por_documento:
    # Collect each vocabulary column at most once per document.
    presentes = set()
    for tok in doc:
        j = dict_indices.get(tok)
        if j is not None:
            presentes.add(j)
    for j in presentes:
        df[j] += 1

# 3) IDF: smoothed formula log((1 + N) / (1 + df)) + 1
idf = np.log((1.0 + N) / (1.0 + df)) + 1.0

# 4) TF-IDF: scale every column by its IDF
bow_tfidf = bow_train_frecuencia * idf[None, :]  # broadcasts along the column axis

# 5) L2 normalisation per document; the 1e-12 avoids dividing by zero on
#    all-zero rows
normas = np.linalg.norm(bow_tfidf, axis=1, keepdims=True) + 1e-12
bow_tfidf = bow_tfidf / normas

print(f"TF-IDF listo (shape): {bow_tfidf.shape}")
print(f"Primeras filas TF-IDF:\n{bow_tfidf[:3, :10]}")

In [None]:
# Same protocol as the other experiments, on raw term-frequency features:
# 80/20 stratified split, LinearSVC with C chosen by 5-fold CV on macro F1.
x_train_80, x_val_20, y_train_80, y_val_20 = train_test_split(bow_train_frecuencia, y_train, test_size=0.2, stratify=y_train, random_state=42)

parametros = {'C': [0.01, 0.05, 0.1, 0.5, 1, 2, 5, 10]}
svr = svm.LinearSVC(class_weight='balanced', max_iter=MAX_ITERACIONES)
grid = GridSearchCV(estimator=svr, param_grid=parametros, n_jobs=4, scoring='f1_macro', cv=5)
grid.fit(x_train_80, y_train_80)
y_pred = grid.predict(x_val_20)

# Macro-averaged precision / recall / F1 on the held-out 20 %.
prec_bow_frecuencia, rec_bow_frecuencia, f1_bow_frecuencia, _ = precision_recall_fscore_support(y_val_20, y_pred, average='macro')

print(confusion_matrix(y_val_20, y_pred))
print(metrics.classification_report(y_val_20, y_pred))


In [None]:
# Same protocol on the L2-normalised TF-IDF features:
# 80/20 stratified split, LinearSVC with C chosen by 5-fold CV on macro F1.
x_train_80, x_val_20, y_train_80, y_val_20 = train_test_split(bow_tfidf, y_train, test_size=0.2, stratify=y_train, random_state=42)

parametros = {'C': [0.01, 0.05, 0.1, 0.5, 1, 2, 5, 10]}
svr = svm.LinearSVC(class_weight='balanced', max_iter=MAX_ITERACIONES)
grid = GridSearchCV(estimator=svr, param_grid=parametros, n_jobs=4, scoring='f1_macro', cv=5)
grid.fit(x_train_80, y_train_80)
y_pred = grid.predict(x_val_20)

# Macro-averaged precision / recall / F1 on the held-out 20 %.
prec_bow_tfidf, rec_bow_tfidf, f1_bow_tfidf, _ = precision_recall_fscore_support(y_val_20, y_pred, average='macro')

print(confusion_matrix(y_val_20, y_pred))
print(metrics.classification_report(y_val_20, y_pred))

# Ejercicio 3 | Clasificación de autores usando Bag of Words Binario y normalización L2

In [None]:

# Row-wise L2 normalisation of the binary bag-of-words.
bow_train_L2 = normalize(bow_tr_binario, norm='l2')

# random_state=42 like the other experiments: without it this split changes on
# every run and the result is neither reproducible nor comparable.
x_train_80, x_val_20, y_train_80, y_val_20 = train_test_split(bow_train_L2, y_train, test_size=0.2, stratify=y_train, random_state=42)

parametros = {'C': [0.01, 0.05, 0.1, 0.5, 1, 2, 5, 10]}
svr = svm.LinearSVC(class_weight='balanced', max_iter=MAX_ITERACIONES)

grid = GridSearchCV(estimator=svr, param_grid=parametros, n_jobs=4, scoring='f1_macro', cv=5)
grid.fit(x_train_80, y_train_80)
y_pred = grid.predict(x_val_20)

# Macro-averaged precision / recall / F1 on the held-out 20 %.
prec_bow_binario_l2, rec_bow_binario_l2, f1_bow_binario_l2, _ = precision_recall_fscore_support(y_val_20, y_pred, average='macro')

print(confusion_matrix(y_val_20, y_pred))
print(metrics.classification_report(y_val_20, y_pred))

# Ejercicio 4 | Clasificación de autores usando Bag of Words Frecuencia y normalización L2

In [None]:
# Row-wise L2 normalisation of the term-frequency bag-of-words.
bow_train_frecuencia_L2 = normalize(bow_train_frecuencia, norm='l2')

# Same protocol as the other experiments: 80/20 stratified split, LinearSVC
# with C chosen by 5-fold CV on macro F1.
x_train_80, x_val_20, y_train_80, y_val_20 = train_test_split(bow_train_frecuencia_L2, y_train, test_size=0.2, stratify=y_train, random_state=42)

parametros = {'C': [0.01, 0.05, 0.1, 0.5, 1, 2, 5, 10]}
svr = svm.LinearSVC(class_weight='balanced', max_iter=MAX_ITERACIONES)
grid = GridSearchCV(estimator=svr, param_grid=parametros, n_jobs=4, scoring='f1_macro', cv=5)
grid.fit(x_train_80, y_train_80)
y_pred = grid.predict(x_val_20)

# Macro-averaged precision / recall / F1 on the held-out 20 %.
prec_bow_frecuencia_L2, rec_bow_frecuencia_L2, f1_bow_frecuencia_L2, _ = precision_recall_fscore_support(y_val_20, y_pred, average='macro')

print(confusion_matrix(y_val_20, y_pred))
print(metrics.classification_report(y_val_20, y_pred))

In [None]:
def ganador_metricas(etiquetas, precisiones, f1_scores):
    """Pick the experiment holding the single highest score.

    The best precision and the best F1 are compared; whichever is larger
    decides the winner (precision wins ties).

    Args:
        etiquetas: Experiment names, aligned with the two score lists.
        precisiones: Precision per experiment.
        f1_scores: F1 per experiment.

    Returns:
        ``(label, score)`` of the first experiment reaching the winning score.
    """
    mejor_prec = max(precisiones)
    mejor_f1 = max(f1_scores)

    # Choose which metric list the winner is looked up in.
    if mejor_prec >= mejor_f1:
        valores, mejor = precisiones, mejor_prec
    else:
        valores, mejor = f1_scores, mejor_f1

    return etiquetas[valores.index(mejor)], mejor


etiquetas = ['BOW Binario', 'BOW Frecuencia', 'BOW TF-IDF', 'BOW Binario + L2', 'BOW Frecuencia + L2' ]



# ======================= Comparison table =======================
# Only metrics that were actually computed are reported. The previous
# 'Accuracy' column held precision values and 'F1-Score (Weighted)' repeated
# the macro F1, so both were mislabelled; the recall that each experiment
# already computed is shown instead.
tabla_comparativa = {
    'Experimento': etiquetas,
    'Precisión (Macro)': [prec_bow_binario, prec_bow_frecuencia, prec_bow_tfidf, prec_bow_binario_l2, prec_bow_frecuencia_L2,],
    'Recall (Macro)': [rec_bow_binario, rec_bow_frecuencia, rec_bow_tfidf, rec_bow_binario_l2, rec_bow_frecuencia_L2,],
    'F1-Score (Macro)': [f1_bow_binario, f1_bow_frecuencia, f1_bow_tfidf, f1_bow_binario_l2, f1_bow_frecuencia_L2,],
}

df_comparativo = pl.DataFrame(tabla_comparativa)

print("TABLA COMPARATIVA DE LOS 4 EJERCICIOS")
print("=" * 50)
print(df_comparativo)
print("=" * 50)

# Winner = experiment with the highest single score across precision and F1.
ganador = ganador_metricas(etiquetas, tabla_comparativa['Precisión (Macro)'], tabla_comparativa['F1-Score (Macro)'])
print(f"Gano el modelo basado en: {ganador[0]} con un valor de: {ganador[1]:.4f}")

# VECTORES DE PALABRAS MÁS IMPORTANTES

In [None]:
# Term-frequency bag-of-words over the stop-word-free vocabulary; its columns
# follow dict_indices_clean.
bow_train_frecuencia_clean = built_bow_tr_frecuencia(tr_txt_train,vocabulario_clean,dict_indices_clean)
print(bow_train_frecuencia_clean)
bow_train_frecuencia_clean.shape

In [None]:
def compute_dor_profe(TR):
    """Compute the DOR (document occurrence representation) term vectors.

    For every document ``i`` and every term ``t`` with a non-zero count,
    ``DTR[t, i] = TR[i, t] * log(|V| / n_i)`` where ``|V|`` is the number of
    columns of ``TR`` and ``n_i`` the number of distinct terms in document
    ``i``. Documents without terms contribute an all-zero column.

    Args:
        TR: 2-D array of shape ``(documents, terms)`` with term counts.

    Returns:
        Float array of shape ``(terms, documents)``: one row vector per term.
    """
    n_docs, n_terminos = TR.shape
    DTR = np.zeros((n_terminos, n_docs), dtype=float)

    for columna, fila in enumerate(TR):
        activos = np.nonzero(fila)[0]  # term positions present in this document
        if activos.size == 0:
            continue
        # The weight is shared by every term of the document.
        peso = np.log(n_terminos / activos.size)
        DTR[activos, columna] = fila[activos] * peso
    return DTR

In [None]:
# DOR term vectors (terms x documents) from the clean term-frequency matrix.
dor = compute_dor_profe(bow_train_frecuencia_clean)

# L2-normalise each term vector (row).
dor_normalizado_clean = preprocessing.normalize(dor, norm='l2')

print(dor_normalizado_clean.shape)

In [None]:

# chi2 feature selection: ranks every vocabulary column by its dependence on
# the class label and keeps the k best.
# Fit on the CLEAN term-frequency matrix: the selected indices are later
# decoded with the clean vocabulary and used to index the DOR matrix built from
# it, so they must refer to the same columns. k is capped at the number of
# available columns so a smaller vocabulary does not raise.
feature_selector = SelectKBest(chi2, k=min(1000, bow_train_frecuencia_clean.shape[1]))
feature_selector.fit(bow_train_frecuencia_clean, y_train)
best = feature_selector.get_support(indices=True)

In [None]:
# Column indices kept by the chi2 selector.
best

In [None]:
# Inverse mapping: column index -> word, for the clean vocabulary.
dict_indices_invertido_clean = {valor: key for key, valor in dict_indices_clean.items()}
dict_indices_invertido_clean

In [None]:
# Words behind the selected column indices.
# NOTE(review): `best` is decoded with the clean vocabulary here, so it must
# come from a selector fitted on a matrix with those same columns - confirm.
t_words = [dict_indices_invertido_clean[index] for index in best]
# dict_indices_clean has the form {"word": column_in_bow, ...}
# One normalised DOR row vector per selected word.
matris_objetivo_clean = np.array([dor_normalizado_clean[dict_indices_clean[word]] for word in t_words])
matris_objetivo_clean

In [None]:
# (selected words, documents)
matris_objetivo_clean.shape

In [None]:
# (vocabulary terms, documents)
dor_normalizado_clean.shape

In [None]:
from sklearn.manifold import TSNE


# Project the term vectors to 2-D. t-SNE is stochastic, so the seed is fixed
# to make the embedding (and the plots built from it) reproducible.
reduce_matrix = TSNE(n_components=2, random_state=42).fit_transform(matris_objetivo_clean)

# Bounding box of the embedding, used as axis limits when plotting.
max_x , max_y = np.max(reduce_matrix, axis=0)
min_x , min_y = np.min(reduce_matrix, axis=0)


In [None]:
from sklearn.manifold import TSNE


# Display the embedding computed in the previous cell. It is deliberately NOT
# recomputed here: a second t-SNE run would replace reduce_matrix with a
# different layout and the axis limits (min_x/max_x/min_y/max_y) computed from
# the first run would no longer match the plotted points.
reduce_matrix

In [None]:
from matplotlib import patheffects

# Scatter plot of the 2-D embedding with every word written next to its point.
x, y = reduce_matrix[:, 0], reduce_matrix[:, 1]

plt.figure(figsize=(50, 50), dpi=120)
plt.xlim(min_x, max_x)
plt.ylim(min_y, max_y)
plt.scatter(x, y, s=30, color='black')






stop_words = stopwords.words('spanish')


# Colour code: stop words in red, a hand-picked list of words in blue,
# everything else in black. The white stroke keeps labels readable on overlap.
for i, word in enumerate(t_words):
    if word in stop_words:
        plt.annotate(
            word, 
            (x[i], y[i]), 
            fontsize=18, 
            color='red', 
            fontweight='bold',
            path_effects=[patheffects.withStroke(linewidth=3, foreground="white")]
        )
    # NOTE(review): the vocabulary is built from lowercased, letters-only
    # tokens, so 'PRI' and '@usuario' presumably never match - confirm.
    elif word in ['politicos', 'corrupción', 'PRI', 'feliz', 
              'hermosa', 'chica', 'tu', 'hdp','madre','madres',
              '@usuario' ,'hijos', 'pendeja', 'pendejo','mierda', 
              'loca', 'hijo', 'hija', 'mamá', 'tía']:
        plt.annotate(
            word, 
            (x[i], y[i]), 
            fontsize=22, 
            color='blue', 
            fontweight='bold',
            path_effects=[patheffects.withStroke(linewidth=3, foreground="white")]
        )
    else:
        plt.annotate(
            word, 
            (x[i], y[i]), 
            fontsize=16, 
            color='black',
            path_effects=[patheffects.withStroke(linewidth=3, foreground="white")]
        )

plt.gca().set_facecolor('whitesmoke')
plt.show()


In [None]:
# Number of selected words.
len(t_words)

In [None]:
subsetword = ['politicos', 'corrupción', 'PRI', 'feliz', 
              'hermosa', 'chica', 'tu', 'hdp','madre','madres',
              '@usuario' ,'hijos', 'pendeja', 'pendejo','mierda', 
              'loca', 'hijo', 'hija', 'mamá', 'tía','padre','papá']

# t_words: the words kept by the chi2 feature selector
# reduce_matrix: the 2-D coordinates of those words, in the same order

subreduce_matriz = []
ploted_subsetwords = []


# Collect the coordinates of the words of interest that were actually selected.
for idx, word in enumerate(t_words):
    if word in subsetword:
        subreduce_matriz.append(reduce_matrix[idx])
        ploted_subsetwords.append(word)
    print_bar(idx, len(t_words), contexto="PROGRESO")

print()
Logger.debug(f"Palabras a graficar: {ploted_subsetwords}")
Logger.debug(f"Coordenadas: {subreduce_matriz}")

# Convert to a (n, 2) array; the reshape keeps it 2-D even when no word
# matched, so the column slices below do not fail on an empty selection.
subreduce_matriz = np.array(subreduce_matriz).reshape(-1, 2)


# Arrow plot: one arrow from the origin to each selected word
fig , ax = plt.subplots(figsize=(15, 15))

# Words and coordinates were appended together, so zip pairs them directly.
# (Looking the label up by coordinate value was quadratic and picked the wrong
# label whenever two words shared the same coordinates.)
for etiqueta, (px, py) in zip(ploted_subsetwords, subreduce_matriz):
    ax.arrow(0, 0, px, py, head_width=0.8, head_length=0.8, fc='red', ec='red', width=0.1e-2)
    ax.annotate(
        etiqueta, 
        (px, py), 
        fontsize=12, 
        color='blue', 
        fontweight='bold',
        path_effects=[patheffects.withStroke(linewidth=3, foreground="white")]
    )

ax.scatter(subreduce_matriz[:,0], subreduce_matriz[:,1])

# NUBE DE PALABRAS

In [None]:

# Chi2 feature selection on the clean term-frequency matrix
feature_selector = SelectKBest(chi2, k=1000)
# NOTE(review): the original comment said y_train holds countries, but the
# loader in this file takes the truth.txt field it names `genero` - confirm.
feature_selector.fit(bow_train_frecuencia_clean, y_train)


# Inverse mapping: column index -> word, for the clean vocabulary
dict_indices_invertido_clean = {valor: key for key, valor in dict_indices_clean.items()}
Logger.debug(f" Diccionario invertido de índices : {dict_indices_invertido_clean}")

# Extract the selected words
palabras_mejores_indices = feature_selector.get_support(indices=True)
palabras = [dict_indices_invertido_clean[indice] for indice in palabras_mejores_indices]


# Extract the chi2 scores of the selected words
chi2_scores = feature_selector.scores_
best_scores = chi2_scores[palabras_mejores_indices] # type: ignore
Logger.debug(f" Puntajes Chi2 de las mejores palabras: {best_scores}")


# word -> chi2 score for the selected words, then the highest-scoring ones.
# NOTE(review): the variable is called mejores_50 but the slice keeps 500
# entries - confirm which size is intended.
distribucion_k_mejores_tweets = {palabra: score for palabra, score in zip(palabras, best_scores)} # type: ignore
mejores_50 = dict(sorted(distribucion_k_mejores_tweets.items(), key=lambda item: item[1], reverse=True)[:500])
Logger.debug(f"Frecuencia de palabras en tweets: {mejores_50}")


# Build the word cloud; the chi2 scores are passed as the word weights
wordcloud = WordCloud(width=800, height=400, background_color='black').generate_from_frequencies(mejores_50)

# Show the word cloud
plt.figure(figsize=(10, 5))
plt.imshow(wordcloud, interpolation='bilinear')
plt.axis('off')
plt.show()


In [None]:
# ======================= TCOR (Lavelli) con misma interfaz de uso =======================
from collections import Counter
from scipy import sparse
import numpy as np
from sklearn.preprocessing import normalize as sk_normalize

def compute_tcor_profe(tokens_por_documento, dict_indices, min_pair_count=1):
    """
    Build the TCOR (term co-occurrence) matrix following Lavelli:
      w_{k,j} = (1 + log #(k,j)) * log(|T| / T_k)

    Co-occurrence is counted per DOCUMENT: a pair of terms counts once for
    every document that contains both, no matter how often they repeat in it.

    Args:
        tokens_por_documento: Iterable of token lists, one per document.
        dict_indices: Mapping ``word -> index``; indices are expected to be
            ``0 .. len(dict_indices) - 1``. Tokens not in the mapping are ignored.
        min_pair_count: Pairs co-occurring in fewer documents than this are
            dropped from the matrix (they still count towards ``T_k``).

    Returns:
        CSR matrix of shape ``(|V|, |V|)`` with an empty diagonal.

    Raises:
        ValueError: If no two vocabulary terms ever share a document.
    """
    V = len(dict_indices)

    # 1) Binary document x term incidence matrix B (one entry per distinct
    #    vocabulary term of each document).
    doc_ids, term_ids = [], []
    for d, toks in enumerate(tokens_por_documento):
        presentes = {dict_indices[t] for t in toks if t in dict_indices}
        doc_ids.extend([d] * len(presentes))
        term_ids.extend(presentes)

    if not term_ids:
        raise ValueError("No se encontraron co-ocurrencias con el vocabulario dado.")

    n_docs = doc_ids[-1] + 1
    B = sparse.csr_matrix(
        (
            np.ones(len(term_ids), dtype=np.int64),
            (np.asarray(doc_ids, dtype=np.int64), np.asarray(term_ids, dtype=np.int64)),
        ),
        shape=(n_docs, V),
    )

    # 2) (B^T B)[i, j] = number of documents containing both i and j. This
    #    replaces a pure-Python loop over every term pair of every document.
    #    The diagonal (a term with itself) is not a co-occurrence: drop it.
    producto = (B.T @ B).tocoo()
    fuera_diagonal = producto.row != producto.col
    C = sparse.csr_matrix(
        (
            producto.data[fuera_diagonal],
            (producto.row[fuera_diagonal], producto.col[fuera_diagonal]),
        ),
        shape=(V, V),
    )

    if C.nnz == 0:
        raise ValueError("No se encontraron co-ocurrencias con el vocabulario dado.")

    # T_k = number of distinct neighbours of term k, counted BEFORE applying
    # min_pair_count; at least 1 to keep the logarithm finite.
    Tk = np.maximum(1, C.getnnz(axis=1)).astype(float)

    # Drop pairs below the requested minimum count.
    C.data[C.data < min_pair_count] = 0
    C.eliminate_zeros()

    # 3) tff(i,j) = 1 + log(count)
    C = C.astype(float)
    C.data = 1.0 + np.log(C.data)

    # 4) Scale each row k by log(|T| / T_k), with |T| = V
    s_k = np.log(V / Tk)
    TCOR = (sparse.diags(s_k, format="csr") @ C).tocsr()

    # 5) Remove any explicit zeros left by the scaling
    TCOR.eliminate_zeros()
    return TCOR

In [None]:
# Previously (DOR representation):
# dor = compute_dor_profe(bow_train_frecuencia_clean)
# dor_normalizado_clean = preprocessing.normalize(dor, norm='l2')

# Now (TCOR representation); tokens outside dict_indices_clean are ignored:
tcor = compute_tcor_profe(tokens_por_documento, dict_indices_clean, min_pair_count=1)
tcor_normalizado_clean = preprocessing.normalize(tcor, norm='l2')  # term x term, L2 per row

print(tcor_normalizado_clean.shape)

In [None]:
# Fit the chi2 selector on the CLEAN term-frequency matrix. Its columns follow
# dict_indices_clean, which is the mapping used right below to decode the
# selected indices; fitting on bow_tfidf / bow_train_frecuencia (built from the
# non-clean vocabulary) would label the selected columns with the wrong words.
# k is capped at the number of available columns so a smaller vocabulary does
# not raise.
feature_selector = SelectKBest(chi2, k=min(1000, bow_train_frecuencia_clean.shape[1]))
feature_selector.fit(bow_train_frecuencia_clean, y_train)
best = feature_selector.get_support(indices=True)

dict_indices_invertido_clean = {v:k for k,v in dict_indices_clean.items()}
t_words = [dict_indices_invertido_clean[idx] for idx in best]

In [None]:
# Indices of the selected words in the clean vocabulary
idx_words = [dict_indices_clean[w] for w in t_words]

# Same idea as with DOR: take the rows of those words
# (option A - rows against the WHOLE vocabulary; can be large if |V| is large)
# matris_objetivo_clean = tcor_normalizado_clean[idx_words, :].toarray()

# Option used: restrict both rows and columns to the selected words, giving a
# dense square matrix (1000x1000 when 1000 words are selected)
tcor_1k = tcor_normalizado_clean[idx_words][:, idx_words].toarray()
matris_objetivo_clean = tcor_1k

In [None]:
from sklearn.manifold import TSNE

# 2-D t-SNE projection of the TCOR term vectors, seeded for reproducibility.
reduce_matrix = TSNE(n_components=2, init="pca", random_state=42, perplexity=30, learning_rate="auto", max_iter=1000)\
                .fit_transform(matris_objetivo_clean)

# Bounding box of the embedding, used as axis limits.
max_x, max_y = np.max(reduce_matrix, axis=0)
min_x, min_y = np.min(reduce_matrix, axis=0)

# Same annotation routine as the DOR plot
from matplotlib import patheffects
x, y = reduce_matrix[:,0], reduce_matrix[:,1]

plt.figure(figsize=(50,50), dpi=120)
plt.xlim(min_x, max_x); plt.ylim(min_y, max_y)
plt.scatter(x, y, s=30, color='black')

# Colour code: stop words in red, a hand-picked list of words in blue,
# everything else in black.
stop_words_es = stopwords.words('spanish')
for i, word in enumerate(t_words):
    if word in stop_words_es:
        plt.annotate(word, (x[i],y[i]), fontsize=18, color='red', fontweight='bold',
                     path_effects=[patheffects.withStroke(linewidth=3, foreground="white")])
    elif word in ['politicos','corrupción','PRI','feliz','hermosa','chica','tu','hdp','madre','madres',
                  '@usuario','hijos','pendeja','pendejo','mierda','loca','hijo','hija','mamá','tía']:
        plt.annotate(word, (x[i],y[i]), fontsize=22, color='blue', fontweight='bold',
                     path_effects=[patheffects.withStroke(linewidth=3, foreground="white")])
    else:
        plt.annotate(word, (x[i],y[i]), fontsize=16, color='black',
                     path_effects=[patheffects.withStroke(linewidth=3, foreground="white")])

plt.gca().set_facecolor('whitesmoke')
plt.show()