# Tarea #2: Representación vectorial de palabras con Word2Vec y FastText usando Gensim

Realizado por:
- Jose Luis Hincapie Bucheli (2125340)
- Sebastián Idrobo Avirama (2122637)
- Paul Rodrigo Rojas Guerrero (2127891)
---

# Creación de listas de sentencias

In [None]:
!pip install datasets spacy

In [None]:
from datasets import load_dataset

In [None]:
# Open the 'train' split of the 'DGT' configuration of the
# 'large_spanish_corpus' dataset. streaming=True is requested, so records are
# presumably fetched lazily while iterating rather than downloaded up front
# (confirm against the `datasets` streaming documentation).
raw_text = load_dataset('large_spanish_corpus', name='DGT', split='train', streaming=True)

## Tokenización

In [None]:
import spacy
from tqdm import tqdm
import time

!python -m spacy download es_core_news_sm

In [None]:
# Run this before the cleaning step: it loads the Spanish spaCy pipeline
# 'es_core_news_sm' and binds it to `nlp`, which the cleaning loop calls.
nlp = spacy.load('es_core_news_sm')

## Depuración

In [None]:
# If you already have an up-to-date output2.txt file, you can skip this step.

import re
import string

# Every character that is stripped from a token, removed in a single pass:
# ASCII digits, ASCII punctuation, and the Spanish/typographic marks that
# string.punctuation does not contain. Compiled once, outside the loops, and
# written as a raw string so no invalid escape sequences are involved.
STRIP_CHARS_RE = re.compile(r'[0-9%s¿¡«»—]' % re.escape(string.punctuation))


def clean_token(text):
    """Return *text* lower-cased, without digits, punctuation or symbols.

    The result may be empty or whitespace-only; the caller decides whether
    to keep it.
    """
    return STRIP_CHARS_RE.sub('', text).lower()


# Write one line per corpus record: the surviving words, each followed by
# ', ', then a newline (the record's line is written even when it is empty).
with open('output2.txt', 'w', encoding="utf-8") as output_file:
    for record in tqdm(raw_text):
        doc = nlp(record['text'])
        words = []
        for token in doc:
            # Skip purely numeric tokens and stop-words.
            if token.text.isdigit() or token.is_stop:
                continue
            w = clean_token(token.text)
            # Drop tokens that became empty or whitespace-only after cleaning.
            if w.strip():
                words.append(w)
        output_file.write(''.join(f'{w}, ' for w in words))
        output_file.write('\n')

In [None]:
def parse_sentence_line(line):
    """Split one line of output2.txt into its list of words.

    Each line holds words that are each followed by ', ' and ends with a
    newline. The newline is stripped before splitting so it can never stay
    attached to the last word, and the empty string left behind by the final
    separator is dropped. A line with no words yields an empty list.
    """
    words = line.rstrip('\n').split(', ')
    # str.split always returns at least one element, so words[-1] is safe.
    if words[-1] == '':
        words.pop()
    return words


# Iterate over the file directly instead of calling readlines(), so the raw
# lines are not all held in memory alongside the parsed sentences.
with open('output2.txt', 'r', encoding="utf-8") as file:
    sentences = [parse_sentence_line(line) for line in file]

print(sentences[1:50])

# Representación Vectorial

## Construcción de modelos de FastText y Word2Vec

In [None]:
!pip install gensim

In [None]:
from gensim.models import FastText
from gensim.test.utils import common_texts
from gensim.models.word2vec import Word2Vec

### Word2Vec

In [None]:
# Word2Vec model built from the first 300 sentences: 300-dimensional vectors,
# context window of 5, and min_count=1 so no word is discarded for being rare.
word2vec_300 = Word2Vec(sentences=sentences[:300], vector_size=300, window=5, min_count=1, workers=4) # 300 sentences

In [None]:
# Word2Vec model built from the first 1,000,000 sentences, with the same
# hyper-parameters (300 dimensions, window 5, min_count=1, 4 workers).
word2vec_1m = Word2Vec(sentences=sentences[:1_000_000], vector_size=300, window=5, min_count=1, workers=4) #1000000 sentences

In [None]:
# Word2Vec model built from every sentence in `sentences`, with the same
# hyper-parameters (300 dimensions, window 5, min_count=1, 4 workers).
word2vec_full = Word2Vec(sentences=sentences, vector_size=300, window=5, min_count=1, workers=4) # approx. 3,000,000 sentences

In [None]:
# Most similar words to 'director' according to the 300-sentence Word2Vec model.
word2vec_300.wv.most_similar('director')

In [None]:
# Most similar words to 'director' according to the 1,000,000-sentence Word2Vec model.
word2vec_1m.wv.most_similar('director')

In [None]:
# Most similar words to 'director' according to the full-corpus Word2Vec model.
word2vec_full.wv.most_similar('director')

### FastText

In [None]:
# FastText model over the first 300 sentences (300 dimensions, window 5,
# min_count=1, 4 workers). The slice is taken once and reused.
corpus_300 = sentences[:300]
fasttext_300 = FastText(vector_size=300, window=5, min_count=1, workers=4)
fasttext_300.build_vocab(corpus_iterable=corpus_300)
# build_vocab only registers the vocabulary; the vectors are not learned
# until train() runs, so the model must be trained before it is queried.
fasttext_300.train(
    corpus_iterable=corpus_300,
    total_examples=fasttext_300.corpus_count,
    epochs=fasttext_300.epochs,
)

In [None]:
# FastText model over the first 1,000,000 sentences (300 dimensions, window 5,
# min_count=1, 4 workers). The slice is taken once and reused, so the
# million-element list copy is not made twice.
corpus_1m = sentences[:1_000_000]
fasttext_1m = FastText(vector_size=300, window=5, min_count=1, workers=4)
fasttext_1m.build_vocab(corpus_iterable=corpus_1m)
# build_vocab only registers the vocabulary; the vectors are not learned
# until train() runs, so the model must be trained before it is queried.
fasttext_1m.train(
    corpus_iterable=corpus_1m,
    total_examples=fasttext_1m.corpus_count,
    epochs=fasttext_1m.epochs,
)

In [None]:
# FastText model over every sentence in `sentences` (approx. 3,000,000).
# workers=4 is passed explicitly so this model uses the same settings as the
# other models in the notebook.
fasttext_full = FastText(vector_size=300, window=5, min_count=1, workers=4)
fasttext_full.build_vocab(corpus_iterable=sentences)
# build_vocab only registers the vocabulary; the vectors are not learned
# until train() runs, so the model must be trained before it is queried.
fasttext_full.train(
    corpus_iterable=sentences,
    total_examples=fasttext_full.corpus_count,
    epochs=fasttext_full.epochs,
)

In [None]:
# Most similar words to 'director' according to the 300-sentence FastText model.
fasttext_300.wv.most_similar('director')

In [None]:
# Most similar words to 'director' according to the 1,000,000-sentence FastText model.
fasttext_1m.wv.most_similar('director')

In [None]:
# Most similar words to 'director' according to the full-corpus FastText model.
fasttext_full.wv.most_similar('director')

## Construcción de vectores para palabras únicas

In [None]:
# Inserte código aquí

## Visualización de vectores en plano cartesiano

In [None]:
# Inserte código aquí