# Word Embeddings em um Dataset Aberto

## Terceiro Exercício Prático


Neste TP você deve explorar e criar os embeddings utilizando uma base de dados de sua preferência. Uma sugestão para encontrar bases interessantes seria a partir das competições do [Kaggle](https://www.kaggle.com/).

Para essa prática, solicito que o aluno prepare a base de dados e gere os embeddings utilizando obrigatoriamente o algoritmo Word2Vec. Dependendo do contexto da base, você pode utilizar o Doc2Vec em vez do Word2Vec ou ambos. 

As 3 etapas descritas abaixo devem ser seguidas obrigatoriamente:

1. Preparação da base de dados assim como visto na prática anterior.
2. Execução do Modelo Word2Vec usando o Gensim, ou outra implementação similar.
3. Teste do seu embedding assim como foi realizado na [demo](https://github.com/gesteves91/nlp/blob/master/notebooks/06-word2vec.ipynb).


Para o trabalho foi capturado os tweets sobre o ex-presidente Lula no dia em que recebeu autorização para deixar a cadeia. Especificamos o dia 08/11/2019 (data em que ele saiu), restringindo a localização na região de Belo Horizonte, num raio de 10.000 km.

Documentação:<br> 
https://tweepy.readthedocs.io/en/latest/index.html

In [1]:
# bibliotecas
import numpy as np
import pandas as pd
import tweepy
import nltk
import demoji
import re
from nltk.tokenize import sent_tokenize
from nltk.tokenize import word_tokenize
from string import punctuation
from googletrans import Translator

In [None]:
# leitura das chaves 
with open('twitter-tokens.txt', 'r') as tfile:
    consumer_key = tfile.readline().strip('\n')
    consumer_secret = tfile.readline().strip('\n')
    access_token = tfile.readline().strip('\n')
    access_token_secret = tfile.readline().strip('\n')

In [None]:
# variaveis para fazer o login
auth = tweepy.AppAuthHandler(consumer_key, consumer_secret)
api = tweepy.API(auth)

In [None]:
# Query passando a busca e excluindo os retweets.
query_search = 'Lula' + '-filter:retweets'

#Gerando o cursor de busca.
cursor_tweets = tweepy.Cursor(api.search, q=query_search, tweet_mode='extended',lang="pt", 
                              since= "2019-11-07", until= "2019-11-09",
                              geocode='-19.9026615,-44.1041363,10000km').items(10000)

In [None]:
tw = []
for tweet in cursor_tweets:
    tw.append([tweet.created_at, tweet.full_text])

In [None]:
df = pd.DataFrame(tw, columns=['Data','Tweet'])

In [None]:
# Convertendo a data
df['Data'] = pd.to_datetime(df['Data'])
df['Data'] = df['Data'].dt.strftime('%d-%m-%Y')

In [None]:
# Salvando em um arquivo csv.
df.to_csv('Trump.csv')

In [None]:
df.shape

# Preparando a base de dados

In [2]:
df_bh = pd.read_csv('bh3.csv')

In [3]:
df_bh.shape

(6750, 3)

In [4]:
df_bh.head()

Unnamed: 0.1,Unnamed: 0,Data,Tweet
0,0,08-11-2019,Lula virou comediante na cadeia e saiu fazendo...
1,1,08-11-2019,Hoj o Lula transa com o super pênis dele https...
2,2,08-11-2019,@ana_claudiinha até o Lula tá beijando e você ...
3,3,08-11-2019,@ggreenwald Bravo! Bravo! Você tem grande impo...
4,4,08-11-2019,"Vou me ausentar desse site, quando pararem de ..."


In [5]:
df = df_bh[['Data', 'Tweet']].copy()

# Removendo stopwords

In [6]:
stopwords = set(nltk.corpus.stopwords.words('portuguese'))

In [7]:
def remov_stopwords(text):
    text = text.lower()
    palavras = [i for i in text.split() if not i in stopwords]
    return (" ".join(palavras))

In [8]:
df['Tweet'] = df.apply(lambda row: remov_stopwords(row['Tweet']), axis=1)

# Removendo links

In [9]:
df['Tweet'] = df.apply(lambda x: re.sub(r"http\S+", "", x['Tweet']), axis=1)

# Substituindo emojis

In [10]:
def remove_emoji(emoji):
    rep = demoji.findall(emoji)
    re = demoji.replace(emoji)
    if any(rep) == False:
        return re
    else:
        for x in rep:
            text = re + rep[x]
            return text.replace("  ", " ")

In [11]:
df['Tweet'] = df.apply(lambda x: remove_emoji(x['Tweet']), axis=1)

# Tokenização

In [12]:
# Tokenizando as frases.
df['Tokens'] = df.apply(lambda x: word_tokenize(x['Tweet'], language='portuguese'), axis=1)

In [13]:
# Removendo pontuação
pontos = list(punctuation)

def remove_pont(tweets):
    return(x for x in tweets if not x in pontos)

In [14]:
df['Tokens'] = df.apply(lambda x: remove_pont(x['Tokens']), axis=1)

# Lematização

In [15]:
lemmatizer = nltk.stem.WordNetLemmatizer()

In [16]:
# Lemmatizando os tweets.
def lemmatize_func(mylist):
    return [lemmatizer.lemmatize(w) for w in mylist]

df['Tokens'] = df.apply(lambda row: lemmatize_func(row['Tokens']), axis=1)

# Stemming

In [17]:
stemming = nltk.stem.RSLPStemmer()

In [18]:
def stemming_func(mylist):
    return [stemming.stem(w) for w in mylist]

df['Tokens'] = df.apply(lambda row: stemming_func(row['Tokens']), axis=1)

# Word2Vec com Gensim

In [19]:
# imports das bibliotecas
import gzip
import gensim 
import logging

logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

In [20]:
documents = df['Tokens']

In [21]:
documents.value_counts()

[lul, livr]                                                                                                                                                                       52
[lul, sai, sext]                                                                                                                                                                  24
[chor, livr, lul]                                                                                                                                                                 20
[lul, tá, livr, babac]                                                                                                                                                            18
[lul, tá, solt, babac]                                                                                                                                                            13
[lul, sai, sext, nad]                                                                          

In [22]:
# Treinando o modelo
model = gensim.models.Word2Vec(documents, size=150, window=10, min_count=2, workers=10)
model.train(documents,total_examples=len(documents),epochs=10)

2019-11-12 21:51:39,065 : INFO : collecting all words and their counts
2019-11-12 21:51:39,066 : INFO : PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
2019-11-12 21:51:39,111 : INFO : collected 8231 word types from a corpus of 73054 raw words and 6750 sentences
2019-11-12 21:51:39,112 : INFO : Loading a fresh vocabulary
2019-11-12 21:51:39,135 : INFO : effective_min_count=2 retains 3621 unique words (43% of original 8231, drops 4610)
2019-11-12 21:51:39,136 : INFO : effective_min_count=2 leaves 68444 word corpus (93% of original 73054, drops 4610)
2019-11-12 21:51:39,164 : INFO : deleting the raw counts dictionary of 8231 items
2019-11-12 21:51:39,165 : INFO : sample=0.001 downsamples 50 most-common words
2019-11-12 21:51:39,166 : INFO : downsampling leaves estimated 55238 word corpus (80.7% of prior 68444)
2019-11-12 21:51:39,192 : INFO : estimated required memory for 3621 words and 150 dimensions: 6155700 bytes
2019-11-12 21:51:39,193 : INFO : resetting layer weigh

2019-11-12 21:51:40,557 : INFO : worker thread finished; awaiting finish of 7 more threads
2019-11-12 21:51:40,593 : INFO : worker thread finished; awaiting finish of 6 more threads
2019-11-12 21:51:40,607 : INFO : worker thread finished; awaiting finish of 5 more threads
2019-11-12 21:51:40,633 : INFO : worker thread finished; awaiting finish of 4 more threads
2019-11-12 21:51:40,637 : INFO : worker thread finished; awaiting finish of 3 more threads
2019-11-12 21:51:40,641 : INFO : worker thread finished; awaiting finish of 2 more threads
2019-11-12 21:51:40,650 : INFO : worker thread finished; awaiting finish of 1 more threads
2019-11-12 21:51:40,654 : INFO : worker thread finished; awaiting finish of 0 more threads
2019-11-12 21:51:40,655 : INFO : EPOCH - 2 : training on 73054 raw words (55255 effective words) took 0.2s, 338168 effective words/s
2019-11-12 21:51:40,724 : INFO : worker thread finished; awaiting finish of 9 more threads
2019-11-12 21:51:40,741 : INFO : worker thread f

2019-11-12 21:51:41,974 : INFO : worker thread finished; awaiting finish of 8 more threads
2019-11-12 21:51:41,975 : INFO : worker thread finished; awaiting finish of 7 more threads
2019-11-12 21:51:42,011 : INFO : worker thread finished; awaiting finish of 6 more threads
2019-11-12 21:51:42,032 : INFO : worker thread finished; awaiting finish of 5 more threads
2019-11-12 21:51:42,064 : INFO : worker thread finished; awaiting finish of 4 more threads
2019-11-12 21:51:42,073 : INFO : worker thread finished; awaiting finish of 3 more threads
2019-11-12 21:51:42,091 : INFO : worker thread finished; awaiting finish of 2 more threads
2019-11-12 21:51:42,096 : INFO : worker thread finished; awaiting finish of 1 more threads
2019-11-12 21:51:42,110 : INFO : worker thread finished; awaiting finish of 0 more threads
2019-11-12 21:51:42,111 : INFO : EPOCH - 10 : training on 73054 raw words (55252 effective words) took 0.2s, 327087 effective words/s
2019-11-12 21:51:42,112 : INFO : training on a 

(552444, 730540)

In [23]:
model.wv['lul']

array([ 0.04974766,  0.24969278,  0.04253185, -0.04329717,  0.14836477,
        0.00620568, -0.35718933, -0.26831663,  0.04809745,  0.31747815,
       -0.00792868, -0.10631234, -0.1219658 ,  0.10713394, -0.2922856 ,
        0.01771216,  0.09189988,  0.10964385,  0.2998734 , -0.1050623 ,
        0.27233043, -0.26826772, -0.1325751 , -0.11193422,  0.11425116,
        0.01278124, -0.0343105 , -0.25998688,  0.05937512, -0.16419005,
        0.27513707,  0.2636102 , -0.04357855, -0.13926475,  0.39524028,
       -0.07676274,  0.1592483 , -0.5904779 ,  0.0744931 , -0.61838585,
       -0.1925213 , -0.00187638,  0.09848844, -0.11872341,  0.17471866,
        0.16083068,  0.04452417,  0.25293246, -0.14473403,  0.2292935 ,
        0.20342669,  0.4256172 ,  0.14984702,  0.5006885 ,  0.06774528,
        0.10693362, -0.08390249,  0.30299395,  0.45557714,  0.12453377,
        0.37542662,  0.33063924,  0.09326585,  0.09413619,  0.01439169,
        0.33567327,  0.08657842, -0.04485174,  0.16878368, -0.08

In [24]:
# Procurando palavras semelhantes
w1 = ['livr']
model.wv.most_similar(positive=w1)

2019-11-12 21:51:42,243 : INFO : precomputing L2-norms of word weight vectors


[('tbm', 0.9306883215904236),
 ('abrahamweint', 0.9271968603134155),
 ('bolsominiom', 0.926626443862915),
 ('coolll', 0.9170774817466736),
 ('barretorec', 0.9136011004447937),
 ('livree', 0.9126657247543335),
 ('skxnsfuckxt', 0.9107998013496399),
 ('choramaisbolsominion', 0.9104517698287964),
 ('carlux', 0.9084633588790894),
 ('coç', 0.9083201885223389)]

In [25]:
# vamos ver as 5 palavras mais similares a 'lul'
w1 = ["lul"]
model.wv.most_similar(positive=w1, topn=5)

[('️st', 0.7633270025253296),
 ('djan', 0.751136064529419),
 ('simmm', 0.745428740978241),
 ('villela206', 0.743656575679779),
 ('porraa', 0.7434346675872803)]

In [26]:
# vamos ver as 5 palavras mais similares a 'sext'
w1 = ["sext"]
model.wv.most_similar (positive=w1,topn=6)

[('bebemor', 0.9621407985687256),
 ('noiv', 0.9401246309280396),
 ('beij', 0.9299951791763306),
 ('burocrac', 0.9271803498268127),
 ('feir', 0.9265385866165161),
 ('boc', 0.9255303144454956)]

In [27]:
# palavras mais relacionadas 
w1 = ['lul', 'sai', 'sext']
model.wv.most_similar(positive=w1,topn=10)

[('colab', 0.9602494239807129),
 ('bebemor', 0.9600928425788879),
 ('noil', 0.9574834704399109),
 ('sexta-f', 0.9559686779975891),
 ('monograf', 0.9529237747192383),
 ('feir', 0.9523804187774658),
 ('fas', 0.9523721933364868),
 ('sel', 0.9523522853851318),
 ('23', 0.9498199224472046),
 ('crush', 0.9489154815673828)]

# Similaridade entre palavras

In [28]:
# similaridade de duas palavras diferentes
model.wv.similarity(w1="lul", w2="liv")

0.6762105

In [29]:
# similaridades de duas palavras idênticas
model.wv.similarity(w1="lul", w2="lul")

1.0

In [30]:
# similaridade de duas palavras opostas
model.wv.similarity(w1="lul", w2="bolsonar")

0.44598645

In [54]:
# itens que nao se relacionam
model.wv.doesnt_match(["lul","bolsonar","liv"])

'bolsonar'

In [31]:
for i, word in enumerate(documents):
    if i == 10:
        print(word)

['``', 'agor', 'quer', 'ver', 'bolsonar', 'ter', 'cu', 'quer', 'ver', 'ter', 'cu', 'pro', 'lul', "''", 'fras', 'solt', 'mãe']


In [33]:
# Recuperando o vocabulario
for i, word in enumerate(model.wv.vocab):
    if i == 10:
        break
    print(word)

lul
vir
cade
saiu
faz
ótim
pi
hoj
trans
sup


# Análise de sentimentos

In [35]:
from textblob import TextBlob as tb

In [36]:
# variavel para armazenar polaridade
analysis = None
tweets = []

In [37]:
public_tweets = df['Tweet']

In [38]:
for tweet in public_tweets:
    analysis = tb(tweet)
    polarity = analysis.sentiment.polarity
    tweets.append(polarity)
    print(polarity)

0.0
0.3333333333333333
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.4
0.3
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.4
0.0
0.4166666666666667
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.5859375
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.15
0.8
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.2
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.5
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.2
0.0
0.0
0.0
0.0
0.0
0.0
-0.2
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.03333333333333333
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.8
0.2
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.

0.0
-0.2
0.0
0.0
0.8
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
-0.13333333333333333
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.25
0.5
0.0
0.0
0.0
0.0
0.0
0.0
-0.2
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
-0.5
0.0
0.0
0.0
0.4
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.5
0.0
0.0
0.0
0.5
0.0
0.0
0.0
0.03333333333333333
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
-0.2
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
-0.4
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.8
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.5
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.4
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.2
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
-

0.0
0.0
0.0
0.0
0.0
0.0
0.8
0.0
0.8
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.4
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.2
0.0
0.0
0.0
0.0
0.16666666666666666
0.0
0.0
-0.4
0.5
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
-0.5
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.13636363636363635
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
-0.1
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.5
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
-0.0546875
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.

0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.4
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.234375
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.13636363636363635
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
-0.25
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
-0.25
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.5
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.05208333333333333
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.25
0.0
0.0
0.0
0.0
0.0
0.6
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.2
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.5
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.6
0.0
0.0
0.0
0.4
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.

In [39]:
print("Média de sentimento:", str(np.mean(tweets)))

Média de sentimento: 0.01627982086072364


# Visualizando a incorporação das palavras

In [49]:
from sklearn.decomposition import IncrementalPCA    # inital reduction
from sklearn.manifold import TSNE                   # final reduction
from plotly.offline import init_notebook_mode, iplot, plot
import plotly.graph_objs as go
import matplotlib.pyplot as plt
import random
%matplotlib inline

In [52]:
def reduce_dimensions(model):
    num_dimensions = 2  # final num dimensions (2D, 3D, etc)

    vectors = [] # positions in vector space
    labels = [] # keep track of words to label our data again later
    for i, word in enumerate(model.wv.vocab):
        if i == 500:
            break
        vectors.append(model.wv[word])
        labels.append(word)

    # convert both lists into numpy vectors for reduction
    vectors = np.asarray(vectors)
    labels = np.asarray(labels)

    # reduce using t-SNE
    vectors = np.asarray(vectors)
    tsne = TSNE(n_components=num_dimensions, random_state=0)
    vectors = tsne.fit_transform(vectors)

    x_vals = [v[0] for v in vectors]
    y_vals = [v[1] for v in vectors]
    return x_vals, y_vals, labels


x_vals, y_vals, labels = reduce_dimensions(model)

def plot_with_plotly(x_vals, y_vals, labels, plot_in_notebook=True):
    trace = go.Scatter(x=x_vals, y=y_vals, mode='text', text=labels)
    data = [trace]

    if plot_in_notebook:
        init_notebook_mode(connected=True)
        iplot(data, filename='word-embedding-plot')
    else:
        plot(data, filename='word-embedding-plot.html')


def plot_with_matplotlib(x_vals, y_vals, labels):    

    random.seed(0)

    plt.figure(figsize=(12, 12))
    plt.scatter(x_vals, y_vals)

    #
    # Label randomly subsampled 25 data points
    #
    indices = list(range(len(labels)))
    selected_indices = random.sample(indices, 25)
    for i in selected_indices:
        plt.annotate(labels[i], (x_vals[i], y_vals[i]))

try:
    get_ipython()
except Exception:
    plot_function = plot_with_matplotlib
else:
    plot_function = plot_with_plotly

plot_function(x_vals, y_vals, labels)