# Rodrigo Mendoza Rodriguez SVM Bow

In [147]:
import zipfile
with zipfile.ZipFile("author_profiling_pan.zip", "r") as z:
    z.extractall("carpeta_docs")

In [148]:
import xml.etree.ElementTree as ET

tree = ET.parse('carpeta_docs/author_profiling_pan/es_test/ff011bde2e7212a3229d462b6809be9b.xml')
root = tree.getroot()
print(root.tag)
for child in root:
    print(child.tag, child.attrib)

author
documents {}


In [None]:
import re
from bs4 import BeautifulSoup
from nltk.corpus import stopwords
from nltk.tokenize import TweetTokenizer

stop_words = set(stopwords.words("spanish"))
tokenizer = TweetTokenizer()


def limpiar_texto(texto):
    texto = BeautifulSoup(texto, "html.parser").get_text()
    texto = texto.lower()
    texto = re.sub(r"http\S+|www\S+|https\S+", "", texto)
    tokens = tokenizer.tokenize(texto)
    tokens = [tok for tok in tokens if tok not in stop_words]
    return " ".join(tokens)   


In [173]:
import os

def get_texts_from_folder(path_folder):
    tr_txt = []  # aquí van los documentos
    tr_y = []    # aquí van las etiquetas
    
    for file in os.listdir(path_folder):
        if file.endswith(".xml"):
            tree = ET.parse(os.path.join(path_folder, file))
            root = tree.getroot()
            docs = []
            for doc in root.iter("document"): 
                texto_limpio = limpiar_texto(doc.text)   
                if texto_limpio:  
                    docs.append(texto_limpio)
            tr_txt.append(" ".join(docs))

    truth_file = os.path.join(path_folder, "truth.txt")
    if os.path.exists(truth_file):
        with open(truth_file, "r", encoding="utf-8") as f:
            for line in f:
                tr_y.append(line.strip())
    
    return tr_txt, tr_y


In [174]:
path_test = 'carpeta_docs/author_profiling_pan/es_test/'
path_train = 'carpeta_docs/author_profiling_pan/es_train/'
tr_txt_test, tr_y_test = get_texts_from_folder(path_test)
tr_txt_train, tr_y_train = get_texts_from_folder(path_train)

  texto = BeautifulSoup(texto, "html.parser").get_text()
  texto = BeautifulSoup(texto, "html.parser").get_text()


In [175]:
len(tr_txt_train)

4200

In [180]:
import nltk

# Construir corpus de tokens
corpus_de_palabras = []
for doc in tr_txt_train:
    corpus_de_palabras += doc.split()

fdist = nltk.FreqDist(corpus_de_palabras)

def sortFreqDist(freqDist):
    aux = [(freqDist[key], key) for key in freqDist]
    aux.sort()
    aux.reverse()
    return aux

V = sortFreqDist(fdist)
V = [word for word, _ in fdist.most_common(10000)]
dict_indices = {word: i for i, word in enumerate(V)}


In [181]:
import numpy as np
def built_bow_tr_binario(tr_txt, vocabulario, dict_indices):
    # Objetivo: Construir la matriz de bow
    bow = np.zeros((len(tr_txt), len(vocabulario)), dtype=np.int8)
    cont_doc = 0
    # Rellenar la matriz, cada renglon es un twitt y cada renglon una palabra
    for tr in tr_txt:
        if not tr:  # ignora vacíos
            continue
        fdist_doc = nltk.FreqDist(tokenizer.tokenize(str(tr).lower()))
        for word in fdist_doc:
            if word in dict_indices:
                bow[cont_doc, dict_indices[word]] = 1
        cont_doc += 1
    return bow

In [182]:
bow_tr = built_bow_tr_binario(tr_txt_train, V, dict_indices)
bow_tr

array([[1, 1, 1, ..., 0, 0, 0],
       [1, 1, 0, ..., 0, 0, 0],
       [1, 1, 1, ..., 0, 0, 0],
       ...,
       [1, 0, 1, ..., 0, 0, 0],
       [1, 1, 1, ..., 0, 0, 0],
       [1, 1, 0, ..., 0, 0, 0]], dtype=int8)

In [183]:
bow_tr.shape

(4200, 10000)

In [184]:
paises = []
for index, renglon in enumerate(tr_y_test):
    labels = tr_y_test[index].split(':::')
    if labels[2] not in paises:
        paises.append(labels[2])
paises_numericas_test = {}
for index, pais in enumerate(paises):
    paises_numericas_test[pais] = index
paises_numericas_test

{'argentina': 0,
 'chile': 1,
 'colombia': 2,
 'mexico': 3,
 'peru': 4,
 'spain': 5,
 'venezuela': 6}

In [195]:
# Crear un set con todos los países (train + test)
paises = set()

with open(path_train + "/truth.txt", encoding="utf-8") as f:
    for line in f:
        labels = line.strip().split(":::")
        paises.add(labels[2])

with open(path_test + "/truth.txt", encoding="utf-8") as f:
    for line in f:
        labels = line.strip().split(":::")
        paises.add(labels[2])

# Ahora asignamos un número fijo a cada país
paises_numericas = {pais: idx for idx, pais in enumerate(sorted(paises))}

# Usar SIEMPRE este mismo diccionario para codificar
y_train = []
with open(path_train + "/truth.txt", encoding="utf-8") as f:
    for line in f:
        labels = line.strip().split(":::")
        y_train.append(paises_numericas[labels[2]])

y_test = []
with open(path_test + "/truth.txt", encoding="utf-8") as f:
    for line in f:
        labels = line.strip().split(":::")
        y_test.append(paises_numericas[labels[2]])


# Ejercicio 1

In [192]:
from sklearn import svm, metrics
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix, f1_score, precision_recall_fscore_support, roc_auc_score

In [196]:
X_train80, X_val20, y_train80, y_val20 = train_test_split(
    bow_tr, y_train, test_size=0.2, stratify=y_train, random_state=42
)
parameters = {'C': [.05, .12, .25, .5, 1, 2, 4]}

svr = svm.LinearSVC(class_weight='balanced')
grid = GridSearchCV(estimator=svr, param_grid=parameters, n_jobs=4, scoring='f1_macro', cv=5)
grid.fit(X_train80, y_train80)
y_pred = grid.predict(X_val20)

p, r, f, _ = precision_recall_fscore_support(y_val20, y_pred, average='macro')

print(confusion_matrix(y_val20, y_pred))
print(metrics.classification_report(y_val20, y_pred))


[[17 14 18 22 16 19 14]
 [21 20 15 13 23 15 13]
 [20 24 17 12 20 10 17]
 [17 26 21 19 11 17  9]
 [16 11 21 20 17 15 20]
 [13 19 23  5 23 17 20]
 [18 15 19 22 17 13 16]]
              precision    recall  f1-score   support

           0       0.14      0.14      0.14       120
           1       0.16      0.17      0.16       120
           2       0.13      0.14      0.13       120
           3       0.17      0.16      0.16       120
           4       0.13      0.14      0.14       120
           5       0.16      0.14      0.15       120
           6       0.15      0.13      0.14       120

    accuracy                           0.15       840
   macro avg       0.15      0.15      0.15       840
weighted avg       0.15      0.15      0.15       840





# Ejercicio 2

In [168]:
import numpy as np
def built_bow_tr_frecuencia(tr_txt, vocabulario, dict_indices):
    # Objetivo: Construir la matriz de bow
    bow = np.zeros((len(tr_txt), len(vocabulario)), dtype=int)
    cont_doc = 0
    # Rellenar la matriz, cada renglon es un twitt y cada renglon una palabra
    for tr in tr_txt:
        fdist_doc = nltk.FreqDist(tokenizer.tokenize(tr.lower()))
        for word in fdist_doc:
            if word in dict_indices:
                bow[cont_doc, dict_indices[word]] = fdist_doc[word]
        cont_doc +=1
    return bow

In [169]:
tokenizer = TweetTokenizer()
corpus_de_palabras_f = []
for doc in tr_txt_train:
    if isinstance(doc, str):              
        corpus_de_palabras_f += tokenizer.tokenize(doc)
    
fdist = nltk.FreqDist(corpus_de_palabras)
def sortFreqDist(freqDist):
    aux = [(freqDist[key], key) for key in freqDist]
    aux.sort()
    aux.reverse()
    return aux
V_f = sortFreqDist(fdist)
V_f = V[:10000]
dict_indices_f = dict()

cont = 0
for weight, word in V_f:
    dict_indices_f[word] = cont
    cont += 1

In [170]:
bow_train_frecuencia = built_bow_tr_frecuencia(tr_txt_train,V_f,dict_indices_f)
print(bow_train_frecuencia)
bow_train_frecuencia.shape

[[ 0  1 34 ...  0  0  0]
 [ 0  1 26 ...  0  0  0]
 [ 0  2 31 ...  0  0  0]
 ...
 [ 0  1 21 ...  0  0  0]
 [ 0  0 24 ...  0  0  0]
 [ 0  0 23 ...  0  0  0]]


(4200, 1805)

In [171]:
X_train80, X_val20, y_train80, y_val20 = train_test_split(
    bow_train_frecuencia, y_train, test_size=0.2, stratify=y_train, random_state=42
)
parameters = {'C': [.05, .12, .25, .5, 1, 2, 4]}

svr = svm.LinearSVC(class_weight='balanced')
grid = GridSearchCV(estimator=svr, param_grid=parameters, n_jobs=4, scoring='f1_macro', cv=5)
grid.fit(X_train80, y_train80)
y_pred = grid.predict(X_val20)

p, r, f, _ = precision_recall_fscore_support(y_val20, y_pred, average='macro')

print(confusion_matrix(y_val20, y_pred))
print(metrics.classification_report(y_val20, y_pred))


KeyboardInterrupt: 

# Ejercicio 3

In [None]:
from sklearn.preprocessing import normalize

bow_tr = bow_tr.astype(np.float32)
bow_train_L2 = normalize(bow_tr, norm='l2')
X_train80, X_val20, y_train80, y_val20 = train_test_split(
    bow_train_L2, y_train, test_size=0.2, stratify=y_train, random_state=42
)
parameters = {'C': [.05, .12, .25, .5, 1, 2, 4]}

svr = svm.LinearSVC(class_weight='balanced')
grid = GridSearchCV(estimator=svr, param_grid=parameters, n_jobs=4, scoring='f1_macro', cv=5)
grid.fit(X_train80, y_train80)
y_pred = grid.predict(X_val20)

p, r, f, _ = precision_recall_fscore_support(y_val20, y_pred, average='macro')

print(confusion_matrix(y_val20, y_pred))
print(metrics.classification_report(y_val20, y_pred))

19 fits failed out of a total of 35.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
19 fits failed with the following error:
Traceback (most recent call last):
  File "c:\Users\Personal Computer\instalar_anaconda\Lib\site-packages\sklearn\model_selection\_validation.py", line 866, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
    ~~~~~~~~~~~~~^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\Personal Computer\instalar_anaconda\Lib\site-packages\sklearn\base.py", line 1389, in wrapper
    return fit_method(estimator, *args, **kwargs)
  File "c:\Users\Personal Computer\instalar_anaconda\Lib\site-packages\sklearn\svm\_classes.py", line 305, in fit
    X, y = validate_data(
           ~~~~~~~~~~~~~^
        self,
     

[[20 13 15 18 26 10 18]
 [21 14 17 18 20 15 15]
 [21 12 15 19 17 15 21]
 [24  6 27 13 14 16 20]
 [16 10 20 19 18 16 21]
 [21 13 20 13 23 16 14]
 [17 16 16 16 20 16 19]]
              precision    recall  f1-score   support

           0       0.14      0.17      0.15       120
           1       0.17      0.12      0.14       120
           2       0.12      0.12      0.12       120
           3       0.11      0.11      0.11       120
           4       0.13      0.15      0.14       120
           5       0.15      0.13      0.14       120
           6       0.15      0.16      0.15       120

    accuracy                           0.14       840
   macro avg       0.14      0.14      0.14       840
weighted avg       0.14      0.14      0.14       840



# Ejercicio 4

In [None]:
from sklearn.preprocessing import normalize

bow_tr = bow_tr.astype(np.float32)
bow_train_frecuencia_L2 = normalize(bow_train_frecuencia, norm='l2')
X_train80, X_val20, y_train80, y_val20 = train_test_split(
    bow_train_frecuencia_L2, y_train, test_size=0.2, stratify=y_train, random_state=42
)
parameters = {'C': [.05, .12, .25, .5, 1, 2, 4]}

svr = svm.LinearSVC(class_weight='balanced')
grid = GridSearchCV(estimator=svr, param_grid=parameters, n_jobs=4, scoring='f1_macro', cv=5)
grid.fit(X_train80, y_train80)
y_pred = grid.predict(X_val20)

p, r, f, _ = precision_recall_fscore_support(y_val20, y_pred, average='macro')

print(confusion_matrix(y_val20, y_pred))
print(metrics.classification_report(y_val20, y_pred))

[[13 12 17 24 29 10 15]
 [20  7 17 33 22 11 10]
 [11 12 23 25 15 11 23]
 [29  6 20 21 17 12 15]
 [22  7 20 23 22 16 10]
 [23  4 25 26 19 10 13]
 [23 12 17 21 18  9 20]]
              precision    recall  f1-score   support

           0       0.09      0.11      0.10       120
           1       0.12      0.06      0.08       120
           2       0.17      0.19      0.18       120
           3       0.12      0.17      0.14       120
           4       0.15      0.18      0.17       120
           5       0.13      0.08      0.10       120
           6       0.19      0.17      0.18       120

    accuracy                           0.14       840
   macro avg       0.14      0.14      0.13       840
weighted avg       0.14      0.14      0.13       840

