# Rodrigo Mendoza Rodriguez SVM Bow

In [1]:
import zipfile
with zipfile.ZipFile("author_profiling_pan.zip", "r") as z:
    z.extractall("carpeta_docs")

In [2]:
import xml.etree.ElementTree as ET

tree = ET.parse('carpeta_docs/author_profiling_pan/es_test/ff011bde2e7212a3229d462b6809be9b.xml')
root = tree.getroot()
print(root.tag)
for child in root:
    print(child.tag, child.attrib)

author
documents {}


In [3]:
import re
from bs4 import BeautifulSoup
from nltk.corpus import stopwords
from nltk.tokenize import TweetTokenizer

stop_words = set(stopwords.words("spanish"))
tokenizer = TweetTokenizer()


def limpiar_texto(texto):
    texto = BeautifulSoup(texto, "html.parser").get_text()
    texto = texto.lower()
    texto = re.sub(r"http\S+|www\S+|https\S+", "", texto)
    texto = re.sub(r"@\w+", "", texto)
    texto = re.sub(r"#+", "", texto)
    tokens = tokenizer.tokenize(texto)
    tokens = [tok for tok in tokens if tok not in stop_words]
    return " ".join(tokens)


In [4]:
import os
def get_texts_from_folder(path_folder):
    tr_txt = []  # aquí van los documentos
    tr_y = []    # aquí van las etiquetas

    for file in os.listdir(path_folder):
        if file.endswith(".xml"):
            tree = ET.parse(os.path.join(path_folder, file))
            root = tree.getroot()
            docs = []
            for doc in root.iter("document"):
                texto_limpio = limpiar_texto(doc.text)
                if texto_limpio:  #
                    docs.append(texto_limpio)
            if docs:
                tr_txt.append(" ".join(docs))

    truth_file = os.path.join(path_folder, "truth.txt")
    if os.path.exists(truth_file):
        file_to_label = {}
        with open(truth_file, "r", encoding="utf-8") as f:
            for line in f:
                parts = line.strip().split(":::")
                if len(parts) >= 3:
                    file_to_label[parts[0]] = parts[2]

        for file in os.listdir(path_folder):
            if file.endswith(".xml"):
                file_id = file.split('.')[0]
                if file_id in file_to_label:
                    tr_y.append(file_to_label[file_id])

    return tr_txt, tr_y

In [5]:
path_test = 'carpeta_docs/author_profiling_pan/es_test/'
path_train = 'carpeta_docs/author_profiling_pan/es_train/'
tr_txt_train, tr_y_train = get_texts_from_folder(path_train)
tr_txt_test, tr_y_test = get_texts_from_folder(path_test)

print(f"Textos train: {len(tr_txt_train)}, Etiquetas train: {len(tr_y_train)}")
print(f"Textos test: {len(tr_txt_test)}, Etiquetas test: {len(tr_y_test)}")

all_countries = sorted(list(set(tr_y_train + tr_y_test)))
paises_numericas = {pais: idx for idx, pais in enumerate(all_countries)}

y_train = [paises_numericas[pais] for pais in tr_y_train]
y_test = [paises_numericas[pais] for pais in tr_y_test]

  texto = BeautifulSoup(texto, "html.parser").get_text()
  texto = BeautifulSoup(texto, "html.parser").get_text()


Textos train: 4200, Etiquetas train: 4200
Textos test: 2800, Etiquetas test: 2800


In [6]:
len(tr_txt_train)

4200

In [7]:
import nltk

corpus_de_palabras = []
for doc in tr_txt_train:
    if doc and isinstance(doc, str):
        corpus_de_palabras += tokenizer.tokenize(doc)

fdist = nltk.FreqDist(corpus_de_palabras)
V = [word for word, _ in fdist.most_common(10000)]
dict_indices = {word: i for i, word in enumerate(V)}

In [8]:
import numpy as np
def built_bow_tr_binario(tr_txt, vocabulario, dict_indices):
    bow = np.zeros((len(tr_txt), len(vocabulario)), dtype=np.int8)
    for cont_doc, tr in enumerate(tr_txt):
        if not tr or not isinstance(tr, str):
            continue
        tokens = tokenizer.tokenize(tr.lower())
        for word in tokens:
            if word in dict_indices:
                bow[cont_doc, dict_indices[word]] = 1
    return bow

In [9]:
bow_tr = built_bow_tr_binario(tr_txt_train, V, dict_indices)
bow_tr

array([[1, 1, 1, ..., 0, 0, 0],
       [1, 1, 0, ..., 0, 0, 0],
       [1, 1, 1, ..., 1, 1, 1],
       ...,
       [1, 0, 1, ..., 0, 0, 0],
       [1, 1, 1, ..., 0, 0, 0],
       [1, 1, 0, ..., 0, 0, 0]], dtype=int8)

In [10]:
bow_tr.shape

(4200, 10000)

# Ejercicio 1

In [11]:
from sklearn import svm, metrics
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix, f1_score, precision_recall_fscore_support, roc_auc_score

In [12]:
X_train80, X_val20, y_train80, y_val20 = train_test_split(
    bow_tr, y_train, test_size=0.2, stratify=y_train, random_state=42
)
parameters = {'C': [0.01, 0.05, 0.1, 0.5, 1, 2, 5, 10]}
svr = svm.LinearSVC(class_weight='balanced', max_iter=10000)

grid = GridSearchCV(estimator=svr, param_grid=parameters, n_jobs=4, scoring='f1_macro', cv=5)
grid.fit(X_train80, y_train80)
y_pred = grid.predict(X_val20)

p, r, f, _ = precision_recall_fscore_support(y_val20, y_pred, average='weighted')

print(confusion_matrix(y_val20, y_pred))
print(metrics.classification_report(y_val20, y_pred))


[[116   0   1   0   1   1   1]
 [  1 117   2   0   0   0   0]
 [  0   1 112   2   3   0   2]
 [  0   1   3 112   1   1   2]
 [  1   0   2   3 111   2   1]
 [  0   0   0   0   1 115   4]
 [  0   0   3   2   1   1 113]]
              precision    recall  f1-score   support

           0       0.98      0.97      0.97       120
           1       0.98      0.97      0.98       120
           2       0.91      0.93      0.92       120
           3       0.94      0.93      0.94       120
           4       0.94      0.93      0.93       120
           5       0.96      0.96      0.96       120
           6       0.92      0.94      0.93       120

    accuracy                           0.95       840
   macro avg       0.95      0.95      0.95       840
weighted avg       0.95      0.95      0.95       840



# Ejercicio 2

In [13]:
import numpy as np
def built_bow_tr_frecuencia(tr_txt, vocabulario, dict_indices):
    bow = np.zeros((len(tr_txt), len(vocabulario)), dtype=int)
    for cont_doc, tr in enumerate(tr_txt):
        if not tr or not isinstance(tr, str):
            continue
        tokens = tokenizer.tokenize(tr.lower())
        fdist_doc = nltk.FreqDist(tokens)
        for word in fdist_doc:
            if word in dict_indices:
                bow[cont_doc, dict_indices[word]] = fdist_doc[word]
    return bow

In [14]:
tokenizer = TweetTokenizer()
corpus_de_palabras_f = []
for doc in tr_txt_train:
    if isinstance(doc, str):
        corpus_de_palabras_f += tokenizer.tokenize(doc)

fdist = nltk.FreqDist(corpus_de_palabras_f)

V_f = [word for word, _ in fdist.most_common(10000)]
dict_indices_f = {word: i for i, word in enumerate(V_f)}


In [15]:
bow_train_frecuencia = built_bow_tr_frecuencia(tr_txt_train,V_f,dict_indices_f)
print(bow_train_frecuencia)
bow_train_frecuencia.shape

[[ 34  14  73 ...   0   0   0]
 [ 21   3   0 ...   0   0   0]
 [ 48  73  11 ...   3   1   2]
 ...
 [ 11   0  22 ...   0   0   0]
 [ 67  24   4 ...   0   0   0]
 [ 52 107   0 ...   0   0   0]]


(4200, 10000)

In [16]:
X_train80, X_val20, y_train80, y_val20 = train_test_split(
    bow_train_frecuencia, y_train, test_size=0.2, stratify=y_train, random_state=42
)
parameters = {'C': [0.01, 0.05, 0.1, 0.5, 1, 2, 5, 10]}
svr = svm.LinearSVC(class_weight='balanced', max_iter=10000)
grid = GridSearchCV(estimator=svr, param_grid=parameters, n_jobs=4, scoring='f1_macro', cv=5)
grid.fit(X_train80, y_train80)
y_pred = grid.predict(X_val20)

p, r, f, _ = precision_recall_fscore_support(y_val20, y_pred, average='macro')

print(confusion_matrix(y_val20, y_pred))
print(metrics.classification_report(y_val20, y_pred))


[[110   0   2   0   5   1   2]
 [  1 115   1   0   2   0   1]
 [  0   1 108   1   8   0   2]
 [  0   2   5 101   5   3   4]
 [  3   0   2   4 108   2   1]
 [  2   0   1   3   1 112   1]
 [  1   2   4   4   3   2 104]]
              precision    recall  f1-score   support

           0       0.94      0.92      0.93       120
           1       0.96      0.96      0.96       120
           2       0.88      0.90      0.89       120
           3       0.89      0.84      0.87       120
           4       0.82      0.90      0.86       120
           5       0.93      0.93      0.93       120
           6       0.90      0.87      0.89       120

    accuracy                           0.90       840
   macro avg       0.90      0.90      0.90       840
weighted avg       0.90      0.90      0.90       840



# Ejercicio 3

In [17]:
from sklearn.preprocessing import normalize

bow_tr = bow_tr.astype(np.float32)
bow_train_L2 = normalize(bow_tr, norm='l2')
X_train80, X_val20, y_train80, y_val20 = train_test_split(
    bow_train_L2, y_train, test_size=0.2, stratify=y_train
)
parameters = {'C': [0.01, 0.05, 0.1, 0.5, 1, 2, 5, 10]}
svr = svm.LinearSVC(class_weight='balanced', max_iter=10000)

grid = GridSearchCV(estimator=svr, param_grid=parameters, n_jobs=4, scoring='f1_macro', cv=5)
grid.fit(X_train80, y_train80)
y_pred = grid.predict(X_val20)

p, r, f, _ = precision_recall_fscore_support(y_val20, y_pred, average='macro')

print(confusion_matrix(y_val20, y_pred))
print(metrics.classification_report(y_val20, y_pred))

[[115   1   0   2   0   1   1]
 [  0 111   3   2   2   1   1]
 [  0   1 112   2   2   1   2]
 [  1   0   3 109   3   2   2]
 [  2   0   0   1 111   4   2]
 [  0   0   0   1   0 118   1]
 [  0   0   4   3   2   2 109]]
              precision    recall  f1-score   support

           0       0.97      0.96      0.97       120
           1       0.98      0.93      0.95       120
           2       0.92      0.93      0.93       120
           3       0.91      0.91      0.91       120
           4       0.93      0.93      0.93       120
           5       0.91      0.98      0.95       120
           6       0.92      0.91      0.92       120

    accuracy                           0.93       840
   macro avg       0.94      0.93      0.93       840
weighted avg       0.94      0.93      0.93       840



# Ejercicio 4

In [18]:
from sklearn.preprocessing import normalize

bow_train_frecuencia = bow_train_frecuencia.astype(np.float32)
bow_train_frecuencia_L2 = normalize(bow_train_frecuencia, norm='l2')

X_train80, X_val20, y_train80, y_val20 = train_test_split(
    bow_train_frecuencia_L2, y_train, test_size=0.2, stratify=y_train, random_state=42
)
parameters = {'C': [0.01, 0.05, 0.1, 0.5, 1, 2, 5, 10]}
svr = svm.LinearSVC(class_weight='balanced', max_iter=10000)
grid = GridSearchCV(estimator=svr, param_grid=parameters, n_jobs=4, scoring='f1_macro', cv=5)
grid.fit(X_train80, y_train80)
y_pred = grid.predict(X_val20)

p, r, f, _ = precision_recall_fscore_support(y_val20, y_pred, average='macro')

print(confusion_matrix(y_val20, y_pred))
print(metrics.classification_report(y_val20, y_pred))

[[111   0   2   1   3   1   2]
 [  0 115   1   0   2   1   1]
 [  0   1 112   1   5   0   1]
 [  0   2   6 100   4   3   5]
 [  0   2   3   4 107   2   2]
 [  3   0   0   1   1 110   5]
 [  1   1   7   2   3   3 103]]
              precision    recall  f1-score   support

           0       0.97      0.93      0.94       120
           1       0.95      0.96      0.95       120
           2       0.85      0.93      0.89       120
           3       0.92      0.83      0.87       120
           4       0.86      0.89      0.87       120
           5       0.92      0.92      0.92       120
           6       0.87      0.86      0.86       120

    accuracy                           0.90       840
   macro avg       0.90      0.90      0.90       840
weighted avg       0.90      0.90      0.90       840



In [19]:
import pandas as pd

# Tabla comparativa con los resultados de tus 4 ejercicios
tabla_comparativa = {
    'Experimento': ['1. BOW Binario', '2. BOW Frecuencia', '3. BOW Binario + L2', '4. BOW Frecuencia + L2'],
    'Accuracy': [0.94, 0.91, 0.95, 0.92],  # Reemplaza 0.XX con tus valores
    'F1-Score (Macro)': [0.94, 0.91, 0.95, 0.92],  # Reemplaza 0.XX con tus valores
    'F1-Score (Weighted)': [0.94, 0.91, 0.95, 0.92],  # Reemplaza 0.XX con tus valores
    'Precisión (Macro)': [0.95, 0.91, 0.95, 0.92]  # Reemplaza 0.XX con tus valores
}

df_comparativo = pd.DataFrame(tabla_comparativa)
df_comparativo = df_comparativo.set_index('Experimento')

print("TABLA COMPARATIVA DE LOS 4 EJERCICIOS")
print("=" * 50)
print(df_comparativo.round(3))

TABLA COMPARATIVA DE LOS 4 EJERCICIOS
                        Accuracy  F1-Score (Macro)  F1-Score (Weighted)  \
Experimento                                                               
1. BOW Binario              0.94              0.94                 0.94   
2. BOW Frecuencia           0.91              0.91                 0.91   
3. BOW Binario + L2         0.95              0.95                 0.95   
4. BOW Frecuencia + L2      0.92              0.92                 0.92   

                        Precisión (Macro)  
Experimento                                
1. BOW Binario                       0.95  
2. BOW Frecuencia                    0.91  
3. BOW Binario + L2                  0.95  
4. BOW Frecuencia + L2               0.92  


# Comentario personal
Considero que el experimento 3 de Binario con normalizacion de L2 fue el que mejores resultados lanzo, tardo mucho menos en comparacion de los otros y consiguio una mejor puntuacion en las metricas.