# Tarea #2: Representación vectorial de palabras con Word2Vec y FastText usando Gensim

Realizado por:
- Jose Luis Hincapie Bucheli (2125340)
- Sebastián Idrobo Avirama (2122637)
- Paul Rodrigo Rojas Guerrero (2127891)
---

# Creación de listas de sentencias

In [None]:
!pip install datasets spacy

In [None]:
from datasets import load_dataset

In [None]:
# Open the 'train' split of the 'DGT' configuration of large_spanish_corpus
# with streaming enabled.
raw_text = load_dataset(
    'large_spanish_corpus',
    name='DGT',
    split='train',
    streaming=True,
)

## Tokenización

In [None]:
import spacy
from tqdm import tqdm
import time

!python -m spacy download es_core_news_sm

In [None]:
# Run this cell before the cleaning step: it loads the 'es_core_news_sm'
# spaCy pipeline that the cleaning loop uses to tokenize each text.
nlp = spacy.load('es_core_news_sm')

## Depuración

In [None]:
# If you already have an up-to-date output2.txt you can skip this step.

import re
import string

# Characters deleted from every token: ASCII digits, ASCII punctuation and the
# Spanish/typographic symbols that string.punctuation does not include.
# The table is built once, outside the loops, instead of compiling regular
# expressions again for every token.
_STRIP_TABLE = str.maketrans(
    '', '', string.digits + string.punctuation + '¿¡«»—'
)

# Writes one line per input text: the cleaned tokens, each followed by ', '.
with open('output2.txt', 'w', encoding="utf-8") as output_file:
    for record in tqdm(raw_text):
        doc = nlp(record['text'])
        for token in doc:
            w = token.text
            # Skip tokens that are purely numeric and stop-words.
            if w.isdigit() or token.is_stop:
                continue
            # Drop digits/punctuation/symbols, then keep lowercase only.
            w = w.translate(_STRIP_TABLE).lower()
            # Tokens that became empty (or whitespace only) are not written.
            if w.strip():
                output_file.write(f'{w}, ')
        output_file.write('\n')

In [None]:
# Rebuild the list of tokenized sentences from output2.txt.
# Each line holds the tokens of one text, every token followed by ', '.
sentences = []
with open('output2.txt', 'r', encoding="utf-8") as file:
    # Iterate the file lazily instead of loading every line with readlines().
    for line in file:
        # Dropping the newline first and filtering empty fields removes the
        # trailing separator, and also copes with a final line that has no
        # newline terminator.
        words = [w for w in line.rstrip('\n').split(', ') if w]
        sentences.append(words)

print(sentences[1:50])

# Representación Vectorial

## Construcción de modelos de FastText y Word2Vec

In [None]:
!pip install gensim

In [None]:
from gensim.models import FastText
from gensim.test.utils import common_texts
from gensim.models.word2vec import Word2Vec

In [None]:
# Rearrange some sentences so that the words "madrid" and "bogotá" are part
# of the same 300-sentence group.

# 1-based line numbers (sentences[line - 1]) of sentences containing each word
madrid_lines = [10330, 10331, 10332, 10333]  # lines that contain madrid
bogota_lines = [844192, 1493819, 2841959, 3070992]  # lines that contain bogotá

# Start from the FIRST 292 sentences; slicing already returns a new list,
# so no extra copy is needed.
sentences_300 = sentences[:292]

# Prepend the eight selected sentences (madrid first, then bogotá), printing
# each one. Inserting at index 0 keeps the same final order as before.
for line_no in madrid_lines + bogota_lines:
    sent = sentences[line_no - 1]
    print(sent)
    sentences_300.insert(0, sent)

len(sentences_300)

### Word2Vec

In [None]:
# Word2Vec model built from the 300-sentence subset.
word2vec_300 = Word2Vec(
    sentences=sentences_300,
    vector_size=300,
    window=5,
    min_count=1,
    workers=4,
)

In [None]:
# Word2Vec model built from the first 1,000,000 sentences.
word2vec_1m = Word2Vec(
    sentences=sentences[:1_000_000],
    vector_size=300,
    window=5,
    min_count=1,
    workers=4,
)

In [None]:
# Word2Vec model built from the whole corpus (approx. 3,000,000 sentences).
word2vec_full = Word2Vec(
    sentences=sentences,
    vector_size=300,
    window=5,
    min_count=1,
    workers=4,
)

### FastText

In [None]:
# FastText model for the 300-sentence subset.
# The corpus is passed to the constructor so that the vocabulary is built AND
# the model is trained; calling only build_vocab() never ran any training.
fasttext_300 = FastText(
    sentences=sentences_300,
    vector_size=300,
    window=5,
    min_count=1,
    workers=4,
)

In [None]:
# FastText model for the first 1,000,000 sentences.
# The corpus is passed to the constructor so that the vocabulary is built AND
# the model is trained; calling only build_vocab() never ran any training.
fasttext_1m = FastText(
    sentences=sentences[:1_000_000],
    vector_size=300,
    window=5,
    min_count=1,
    workers=4,
)

In [None]:
# FastText model for the whole corpus (approx. 3,000,000 sentences).
# The corpus is passed to the constructor so that the vocabulary is built AND
# the model is trained; calling only build_vocab() never ran any training.
fasttext_full = FastText(
    sentences=sentences,
    vector_size=300,
    window=5,
    min_count=1,
    workers=4,
)

### Ejemplos

Resumido: https://docs.google.com/spreadsheets/d/1vhS3BcYz8Ocxebu1RmwyGj8Ac9xiK_ctfhrQVVkHFAI/edit?usp=sharing

In [None]:
# Print the nearest neighbours of each target word for every Word2Vec model.
target_words = ['importación', 'europea', 'permita', 'madrid', 'bogotá']

# (header, model) pairs, in the order in which the results are reported.
word2vec_runs = [
    ("Usando 300 oraciones:", word2vec_300),
    ("Usando 1000000 oraciones:", word2vec_1m),
    ("Usando 3168368 oraciones:", word2vec_full),
]

for w in target_words:
    print("*******************************")
    print(f"Para {w}")
    print()
    for header, model in word2vec_runs:
        print(header)
        print()
        for neighbour, score in model.wv.most_similar(w):
            print(f"{neighbour}: {score}")
        print("\n")
    



In [None]:
# FastText
# Print the nearest neighbours of each target word for every FastText model.

target_words = ['importación', 'europea', 'permita', 'madrid', 'bogotá']

# (header, model) pairs, in the order in which the results are reported.
fasttext_runs = [
    ("Usando 300 oraciones:", fasttext_300),
    ("Usando 1000000 oraciones:", fasttext_1m),
    ("Usando 3168368 oraciones:", fasttext_full),
]

for w in target_words:
    print("*******************************")
    print(f"Para {w}")
    print()
    for header, model in fasttext_runs:
        print(header)
        print()
        for neighbour, score in model.wv.most_similar(w):
            print(f"{neighbour}: {score}")
        print("\n")
    



## Construcción de vectores para palabras únicas

In [None]:
# Inserte código aquí

## Visualización de vectores en plano cartesiano

In [None]:
from sklearn.manifold import TSNE
import matplotlib.pyplot as plt
import numpy as np

In [None]:
# Funciones para entrenar modelos
def entrenar_modelo_w2v(dim, numSentences):
    """Train a Word2Vec model on the first ``numSentences`` sentences.

    Args:
        dim: Dimensionality of the word vectors (passed as ``vector_size``).
        numSentences: How many leading entries of the notebook-level
            ``sentences`` list are used as the training corpus.

    Returns:
        The resulting ``Word2Vec`` model.
    """
    # Honour the requested dimensionality: vector_size used to be hard-coded
    # to 300, so the ``dim`` argument was silently ignored.
    return Word2Vec(
        sentences=sentences[:numSentences],
        vector_size=dim,
        window=5,
        min_count=1,
        workers=4,
    )

def entrenar_modelo_ft(dim, numSentences):
    """Train a FastText model on the first ``numSentences`` sentences.

    Args:
        dim: Dimensionality of the word vectors (passed as ``vector_size``).
        numSentences: How many leading entries of the notebook-level
            ``sentences`` list are used as the training corpus.

    Returns:
        The resulting ``FastText`` model.
    """
    # Two fixes with respect to the previous version:
    #  * vector_size follows ``dim`` instead of being hard-coded to 300;
    #  * the corpus is given to the constructor, which builds the vocabulary
    #    and trains the model (before, only build_vocab() was called and no
    #    training ever happened).
    return FastText(
        sentences=sentences[:numSentences],
        vector_size=dim,
        window=5,
        min_count=1,
        workers=4,
    )


In [None]:
# Visualization on the Cartesian plane
def visualizar_tsne(modelo, dim, perplexity=30, num_sentences=None):
    """Plot every word vector of ``modelo`` in 2D using t-SNE.

    Args:
        modelo: Model exposing ``wv.vectors`` and ``wv.index_to_key``
            (the Word2Vec / FastText models built in this notebook).
        dim: Vector dimensionality, only shown in the plot title.
        perplexity: Perplexity handed to ``TSNE``.
        num_sentences: Number of training sentences shown in the plot title.
            When omitted, ``modelo.corpus_count`` is used, so the function no
            longer depends on a global ``numSentences`` variable being set.
    """
    if num_sentences is None:
        num_sentences = modelo.corpus_count

    # Word vectors and their words, in matching order
    vectores = modelo.wv.vectors
    palabras = modelo.wv.index_to_key

    # Reduce the vectors to 2 dimensions with t-SNE (fixed seed)
    tsne = TSNE(n_components=2, random_state=42, perplexity=perplexity)
    vectores_2d = tsne.fit_transform(vectores)

    # Draw every word as a labelled point
    plt.figure(figsize=(10, 10))
    for palabra, (x, y) in zip(palabras, vectores_2d):
        plt.scatter(x, y)
        plt.text(x, y, palabra, fontsize=8)
    plt.title(f'Visualización t-SNE - Dimensión {dim} - Sentencias: {num_sentences}')
    plt.show()

### Word2vec

In [None]:
# Build a Word2Vec model from the first 20 sentences, passing 100 as the
# dim argument, and plot it with a t-SNE perplexity of 5.
# NOTE(review): the "20" in the variable name matches the sentence count,
# not the dim argument (100) - consider renaming.
numSentences = 20
modelo_dim_20 = entrenar_modelo_w2v(100, numSentences)
visualizar_tsne(modelo_dim_20, 100, perplexity=5)

In [None]:
# Build a Word2Vec model from the first 100 sentences, passing 300 as the
# dim argument, and plot it with a t-SNE perplexity of 5.
numSentences = 100
modelo_dim_300 = entrenar_modelo_w2v(300, numSentences)
visualizar_tsne(modelo_dim_300, 300, perplexity=5)

### fastText

In [None]:
# Build a FastText model from the first 150 sentences, passing 100 as the
# dim argument, and plot it with a t-SNE perplexity of 5.
# NOTE(review): modelo_dim_20 matches neither argument (dim 100, 150
# sentences) and rebinds the name used for the Word2Vec model above.
numSentences = 150
modelo_dim_20 = entrenar_modelo_ft(100, numSentences)
visualizar_tsne(modelo_dim_20, 100, perplexity=5)

In [None]:
# Build a FastText model from the first 50 sentences, passing 300 as the
# dim argument, and plot it with a t-SNE perplexity of 5.
# NOTE(review): this rebinds modelo_dim_300, the name used for the Word2Vec
# model above.
numSentences = 50
modelo_dim_300 = entrenar_modelo_ft(300, numSentences)
visualizar_tsne(modelo_dim_300, 300, perplexity=5)