# Word embeddings

We will train a couple of word embedding models to generate numerical features to apply on other models.
Specifically we will user word2vec and fasttext on gensim library

In [1]:
%cd ..

/home/lois/pojetos/ds4a/notebook


In [2]:
#%conda install gensim -y


In [3]:
from src.loading import load_dataset
from src.cleaning import build_df_from_RA
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import gensim
import unicodedata


df = build_df_from_RA(load_dataset("dataset"))
print(f"We have a total of {df.shape[0]} reviews!")

We have a total of 51655 reviews!


In [8]:
import string
import re

def normalize_text(text):
    """
    Strip accents and lower text string
    :param text: (str) text to be cleaned
    :return: (str) cleaned text
    """
    text = strip_accents(text)
    text = text.lower().strip()
    text = text.translate(str.maketrans("", "", string.punctuation))
    return text


def strip_accents(s):
    return ''.join(c for c in unicodedata.normalize('NFD', s) if unicodedata.category(c) != 'Mn')


def remove_numbers(text):
    return re.sub(r'\b[0-9]+\b', '', text)


texts = df.apply(lambda row: f"{remove_numbers(normalize_text(row['title']))} {remove_numbers(normalize_text(row['description']))}", axis=1)

In [9]:
texts.head().values

array(['peas danificada na montagem comprei um guardaroupa na empresa citada acima e na montagem foi danificada a lateral e tambem duas portas estao com defeitofoi feita uma reclamacao pessoalmente ao vendedor e ele se comprometei em fazer as trocas das mesmas so que ja chegaram as peas em minha residencia a mais de tres meses e eu ja fui varias vezes na loja pra eles providenciar a troca das mesmas e nao estou tendo exito',
       'fabrispuma jundiai faz o cliente de palhaco fiz uma compra de moveis pra cozinha na loja fabrispuma de jundiai no centro rua do rosario a vendedora e otima patricia fui atendida super bem tambem me atendeu nesta venda a gerente sabrina se nao me equivoco expliquei meu caso que precisava que nao demorassem na entrega e principalmente que nao demorassem com a montagem eu tinha preco melhor pela internete e a gerente mesmo me convenceu a comprar com a fabrispuma por que oferecem a montagem gratis eu fui clara que nao podia esperar muito pela montagem pois nao 

In [10]:
def tokenize(data, sep=None):
    if sep is not None:
        return data.split(sep)
    return data.split()

tokens_sq = [tokenize(i) for i in texts.values]
len(tokens_sq)

51655

In [None]:
from pprint import pprint as print
from gensim.models.fasttext import FastText as FT_gensim
from gensim.test.utils import datapath

seed = 2020
epochs = 100
ft_params = {"sg": 0,  # Training algorithm: skip-gram if sg=1, otherwise CBOW.
             "hs": 0,  #  If 1, hierarchical softmax will be used for model training. If set to 0, and negative is non-zero, negative sampling will be used.
             "size": 100,  # Dimensionality of the word vectors.
             "alpha": 0.025,  # The initial learning rate.
             "window": 5, # The maximum distance between the current and predicted word within a sentence.
             "min_count": 5,  # The model ignores all words with total frequency lower than this.
             "max_vocab_size": None, 
             "word_ngrams": 1, 
             "sample": 0.001,  # the threshold for configuring which higher-frequency words are randomly downsampled, useful range is (0, 1e-5).
             "seed": seed,
             "workers": 3, # Use these many worker threads to train the model (=faster training with multicore machines).
             "min_alpha": 0.0001, 
             "negative": 5,  #  If > 0, negative sampling will be used, the int for negative specifies how many “noise words” should be drawn (usually between 5-20). If set to 0, no negative sampling is used.
             "ns_exponent": 0.75, 
             "cbow_mean": 1,  # If 0, use the sum of the context word vectors. If 1, use the mean, only applies when cbow is used.
             "iter": 5, 
             "min_n": 3, # Minimum length of char n-grams to be used for training word representations.
             "max_n": 6, #  Max length of char ngrams to be used for training word representations. Set max_n to be lesser than min_n to avoid char ngrams being used.
             "sorted_vocab": 1}


model = FT_gensim(**ft_params)

# build the vocabulary
model.build_vocab(corpus_file=tokens_sq)

# train the model
model.train(
    sentences=tokens_sq, epochs=epochs,
    total_examples=len(tokens_sq)
)


In [None]:
# saving a model trained via Gensim's fastText implementation
import tempfile
import os
with tempfile.NamedTemporaryFile(prefix='saved_model_gensim-', delete=False) as tmp:
    model.save(tmp.name, separately=[])

loaded_model = FT_gensim.load(tmp.name)
print(loaded_model)

os.unlink(tmp.name)

In [None]:
print(model.most_similar("nights"))
