# word2vec on Twitter data

Limitations of the results obtained:
- low cosine similarity between words that are expected to be close (e.g. "prix" & "inflation": ≈ 0.13, see the Results section)
- high computational time to retrieve embeddings for all tweets.

## Setup

In [31]:
import pickle as pkl
from itertools import chain, islice

import numpy as np

In [4]:
from typing import List, Union
from gensim.models import Word2Vec
from gensim.utils import simple_preprocess

In [5]:
# Load the pre-processed tweets from a local pickle file.
# NOTE(review): pickle.load can execute arbitrary code while deserializing,
# so this file must come from a trusted source.
path = "../backup/data/tweets_preprocessed.pkl"
with open(path, "rb") as f: 
    tweets_preprocessed = pkl.load(f)

## Create corpus 

In [6]:
class Corpus:
    """Description. Re-iterable collection of sentences, each yielded as a list of tokens.

    Every call to ``iter()`` starts a fresh pass over ``sentences``, so a single
    instance can be traversed as many times as needed.

    Attributes:
        - sentences: the input sentences, each one either a list of tokens or a plain text
        - tokenized: True when each sentence is a list of tokens rather than a plain text"""

    def __init__(self, sentences: List[Union[List, str]], tokenized: bool):
        self.tokenized = tokenized
        self.sentences = sentences

    def __iter__(self):
        """Description. Yield every sentence after running it through simple_preprocess."""

        for sentence in self.sentences:
            # simple_preprocess takes one string, so token lists are first
            # re-joined with single spaces.
            raw_text = " ".join(sentence) if self.tokenized else sentence
            yield simple_preprocess(raw_text)

In [18]:
# tokenized=False: each "cleaned" entry is handed to the tokenizer as-is,
# without being re-joined from a token list first.
corpus = Corpus(
    tokenized=False,
    sentences=tweets_preprocessed["cleaned"],
)

In [22]:
# Distinct tokens over the whole corpus, i.e. before Word2Vec applies its
# min_count filter.
# chain.from_iterable walks the corpus one sentence at a time; chain(*corpus)
# would first unpack every tokenized sentence into one argument tuple.
unique_words = set(chain.from_iterable(corpus))
n_words = len(unique_words)

In [23]:
# Peek at the first tokenized sentence to eyeball the preprocessing output.
# next(iter(...)) fails loudly (StopIteration) on an empty corpus rather than
# silently printing a stale `text` left over from an earlier run.
text = next(iter(corpus))

print(text)

['le', 'tout', 'vendu', 'des', 'prix', 'défiant', 'toute', 'concurrence', 'ceci', 'pour', 'nous', 'annoncer', 'prochainement', 'que', 'ayant', 'un', 'trou', 'dans', 'notre', 'économie', 'circulaire', 'nous', 'allons', 'participer', 'au', 'renflouement']


## Run model

In [24]:
# Word2Vec hyper-parameters.
n_components = 200  # passed as vector_size: length of each word vector
window = 5          # context window size
min_count = 5       # words seen fewer times than this are dropped from the vocabulary

# `corpus` is iterated several times during training, which works because
# Corpus starts a fresh pass on every iter() call.
model = Word2Vec(
    sentences=corpus,
    vector_size=n_components,
    window=window,
    min_count=min_count,
)

In [25]:
# Words retained by the trained model, as a list.
# NOTE(review): the printed sample in the Results section suggests the list is
# ordered by decreasing frequency — confirm against the gensim documentation.
vocab = model.wv.index_to_key

## Results

In [26]:
# `vocab` is already bound to model.wv.index_to_key right after training, so it
# is not rebuilt here; only its size is needed for the report below.
vocab_len = len(vocab)

In [27]:
# Sanity check: prints True when the model vocabulary holds no duplicate word.
print(len(set(vocab)) == vocab_len)

# Size of the model vocabulary versus the number of distinct tokens in the corpus.
print(
    f"w2v vocab: {vocab[:10]}...",
    f"number of words in vocab={vocab_len}",
    f"number of words in total={n_words}",
    sep="\n",
)

True
w2v vocab: ['de', 'la', 'le', 'les', 'et', 'des', 'en', 'inflation', 'est', 'pour']...
number of words in vocab=29850
number of words in total=113833


In [28]:
# The 20 words whose vectors are closest to the vector of "prix", with their similarity scores.
model.wv.most_similar("prix", topn=20)

[('tarifs', 0.5407853722572327),
 ('litre', 0.5063979625701904),
 ('tarif', 0.48051708936691284),
 ('loyers', 0.4737620949745178),
 ('électricité', 0.4498542249202728),
 ('coût', 0.4493483603000641),
 ('coûts', 0.44890135526657104),
 ('pâtes', 0.4442346692085266),
 ('gaz', 0.4326724708080292),
 ('panier', 0.4324531555175781),
 ('pain', 0.41085925698280334),
 ('sucre', 0.40907543897628784),
 ('blé', 0.4089013338088989),
 ('carburant', 0.40677961707115173),
 ('tabac', 0.40032240748405457),
 ('centimes', 0.3997615873813629),
 ('essence', 0.39639410376548767),
 ('palme', 0.3885374963283539),
 ('produits', 0.3879724442958832),
 ('passnavigo', 0.3873341381549835)]

In [29]:
# Similarity between the vectors of "inflation" and "prix".
# NOTE(review): "inflation" is the 8th most frequent token in the vocabulary sample
# printed above, presumably because it was the query keyword used to collect the
# tweets; a word that co-occurs with almost everything tends to get a non-specific
# vector, which may explain the low score — to be confirmed.
model.wv.similarity(w1="inflation", w2="prix")

0.12892343

## Embeddings

In [32]:
def text_to_vec(text: List, vocab: Union[List, set, dict], model: "Word2Vec") -> np.ndarray: 
    """Description. Convert text into embedding using word2vec.

    Tokens absent from `vocab` are skipped. The remaining tokens are looked up in
    `model.wv` in their original order; a repeated token gives a repeated row.

    Arguments:
        - text: tokens of one sentence
        - vocab: words to keep. A list is accepted as before; passing a set, or the
          dict `model.wv.key_to_index`, avoids rebuilding a lookup set on every call
        - model: trained word2vec model whose `wv` maps a list of words to a matrix

    Returns:
        - array of shape (number of kept tokens, model.wv.vector_size). When no
          token is kept the result has shape (0, model.wv.vector_size) instead of
          raising."""

    # Hash-based membership test: O(1) per token, instead of converting the whole
    # vocabulary into a NumPy string array on each call.
    known_words = vocab if isinstance(vocab, (set, frozenset, dict)) else set(vocab)
    kept_tokens = [token for token in text if token in known_words]

    if not kept_tokens:
        # Looking up an empty list of keys would fail inside gensim (it stacks one
        # vector per key), so return an explicit empty matrix. float32 matches the
        # dtype gensim uses to store word vectors.
        return np.empty((0, model.wv.vector_size), dtype=np.float32)

    return model.wv[kept_tokens]

In [34]:
# Only the first 10 tokenized tweets are needed for this demo. islice stops after
# 10 items instead of tokenizing the entire corpus just to slice the result.
sentences = list(islice(corpus, 10))

# One matrix per tweet, with one row per token found in the vocabulary.
embeddings = [text_to_vec(sentence, vocab, model) for sentence in sentences]

In [39]:
# Show the first tweet next to its embedding matrix.
# Direct indexing raises IndexError when there is nothing to show, rather than
# silently printing stale `s` / `em` values left over from an earlier run.
s, em = sentences[0], embeddings[0]

print(f"Tweet: {' '.join(s)}")
print("-"*100)
print(f"Embedding={em}")

Tweet: le tout vendu des prix défiant toute concurrence ceci pour nous annoncer prochainement que ayant un trou dans notre économie circulaire nous allons participer au renflouement
----------------------------------------------------------------------------------------------------
Embedding=[[ 0.85389745  2.358581    0.53904206 ...  0.9196409   1.3401673
  -0.4548108 ]
 [-0.75303626 -0.09547216 -0.63800025 ...  0.68792623  1.9537251
  -3.3618095 ]
 [-0.08319736 -0.5879957  -0.02148688 ... -0.0793516  -0.32891768
   0.11505453]
 ...
 [ 0.38658124  0.02952377  0.44992316 ...  0.58287007  0.21661347
   0.12841561]
 [ 0.05996609 -0.1715854  -0.7128676  ...  0.36932278  0.27788818
  -0.669887  ]
 [ 1.3771836   1.6620768  -0.69492906 ...  1.5061085   0.98723197
   0.4262609 ]]
