# Doing things with text 7

## Word embeddings with Word2vec

### Import packages

In [None]:
import os
import unicodedata
import re
from nltk.tokenize import sent_tokenize
from nltk.tokenize import word_tokenize  # needs to be installed first via nltk.download()
import matplotlib.pyplot as plt
import matplotlib.cm as cm
%matplotlib notebook
import seaborn as sns
import numpy as np
import warnings
warnings.filterwarnings('ignore')
import re  # For preprocessing
from time import time  # To time our operations
from gensim.models import Word2Vec
import logging  # Setting up the loggings to monitor gensim
logging.basicConfig(format="%(levelname)s - %(asctime)s: %(message)s", datefmt= '%H:%M:%S', level=logging.INFO)
from sklearn.manifold import TSNE
from numpy import dot
from numpy.linalg import norm

#### Define in- and out-directories

In [None]:
# Label for the dataset; it is used as prefix in the names of the output files
corpus = 'dataset' # here the name of your actual dataset for output files

# Input and output folders. Keep the trailing slash: file names are appended
# to these strings by plain concatenation further down.
indir = r'/path_to_indir/'
outdir = r'/path_to_outdir/'

# Create the output folder if it is not there yet
os.makedirs(os.path.dirname(outdir), exist_ok=True)

## Importing the data

### Option 1: with preprocessing (for raw data)

Input is multiple .txt files

In [None]:
# Build 'data' as a list of tokenised sentences (one list of words per
# sentence) from every non-hidden file in indir.
data = []
file_count = 0  # number of files that were read successfully

# list all files in a given directory
files = os.listdir(indir)

for infile in files:
    # avoid opening files such as .DS_Store
    if infile.startswith('.'):
        continue
    # open the file and read it, close when done; a file that cannot be
    # opened or decoded is reported and skipped, so that the text of a
    # previous file is never processed a second time
    try:
        with open(indir + infile, "r") as f:
            text = f.read()
    except (OSError, UnicodeDecodeError) as err:
        print('Skipping %s: %s' % (infile, err))
        continue
    file_count += 1

    # iterate through each sentence in the file
    for sentence in sent_tokenize(text):
        infile_list = []
        for word in word_tokenize(sentence):
            if len(word) > 3: # removing words of 3 letters and shorter
                new_word = re.sub(r'[^\w\s]', '', word) # strip punctuation
                if new_word != '':
                    infile_list.append(new_word.lower())
        # only keep sentences in which at least one word survived the filtering
        if infile_list:
            data.append(infile_list)

### Option 2: without preprocessing (for preprocessed data - much quicker)

Input is multiple .txt files

In [None]:
# Build 'data' as one list of words per .txt file in indir.
data = []
file_count = 0  # number of .txt files that were read successfully

# list all files in a given directory
files = os.listdir(indir)

for infile in files:
    # avoid opening files such as .DS_Store, and anything that is not a .txt file
    if infile.startswith('.') or not infile.endswith('.txt'):
        continue
    # open the file and read it, close when done; a file that cannot be
    # opened or decoded is reported and skipped, so that the words of a
    # previous file are never added a second time
    try:
        with open(indir + infile, "r") as f:
            text = f.read()
    except (OSError, UnicodeDecodeError) as err:
        print('Skipping %s: %s' % (infile, err))
        continue
    file_count += 1
    # the text is split on single spaces only, i.e. the input is expected to
    # be space-separated; words of 3 letters and shorter are removed
    infile_list = [x for x in text.split(' ') if len(x) > 3]
    print('%s has %s words' %(infile, len(infile_list)))
    # append inside the loop body for .txt files only: exactly one list per file read
    data.append(infile_list)

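Check that list 'data' contains as many lists as there are files. This only applies to Option 2, which stores one list per file; Option 1 stores one list per sentence, so there 'data' will normally be much longer than the number of files.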

In [None]:
# Report both counts and say explicitly whether they agree, instead of
# claiming equality no matter what the numbers are.
print("List 'data' is %s lists long, which %s the %s files in indir"
      % (len(data), 'equals' if len(data) == file_count else 'does NOT equal', file_count))

## Analysis

### Word2vec

from: https://towardsdatascience.com/a-beginners-guide-to-word-embedding-with-gensim-word2vec-model-5970fa56cc92

In [None]:
# Train a Word2vec model on the token lists collected in 'data'
model = Word2Vec(
    data,
    vector_size=128,  # dimensionality of the word vectors
    window=5,         # context window size
    min_count=5,      # minimum frequency a word needs to be kept
    workers=3,        # number of worker threads
)

### Save model to outdir

In [None]:
#model.save(outdir + corpus + "_w2v.model")

### Search most similar terms

In [None]:
# Number of most similar words to retrieve per search term
n = 30
# Search terms: can be one or more words as 'word', 'word', 'word'
keys = []

In [None]:
# For every search term, look up its n nearest neighbours in the model
# and print them one per line, followed by a blank separator
for key in keys:
    neighbours = model.wv.most_similar(positive=[key], topn=n)
    print("Words most similar to '%s':" % key)
    for neighbour in neighbours:
        print(neighbour)
    print('\n')
    

In [None]:
# Query with positive and negative terms: both lists can be filled with
# one or more words as 'word', 'word', 'word'; the top 10 results are returned
model.wv.most_similar(
    positive=[],
    negative=[],
    topn=10,
)

### Visualize most similar words as clusters

In [None]:
def to_string(list):
    """Return the items of `list` glued together with underscores."""
    return '_'.join(list)

In [None]:
def for_title(list):
    """Return the items of `list` joined by ' and ', for use in a plot title."""
    return ' and '.join(list)

In [None]:
# For every search term collect its 30 most similar words together with
# their vectors; both lists get one entry (cluster) per search term
embedding_clusters = []
word_clusters = []

for key in keys:
    neighbours = [similar for similar, _ in model.wv.most_similar(key, topn=30)]
    word_clusters.append(neighbours)
    embedding_clusters.append([model.wv[similar] for similar in neighbours])

In [None]:
# Project all cluster vectors to 2D with t-SNE and restore the
# (cluster, word, coordinate) layout afterwards.
embedding_clusters = np.array(embedding_clusters)
# Fail with a clear message when there is nothing to project ('keys' empty)
# or when the clusters do not all contain the same number of words.
if embedding_clusters.ndim != 3:
    raise ValueError("expected equally sized clusters of word vectors; "
                     "make sure 'keys' is not empty and every key yields the same number of similar words")
# Descriptive names on purpose: unpacking into 'n' would overwrite the
# number of similar words ('n') used by the search cell above.
n_clusters, n_words, n_dims = embedding_clusters.shape
# NOTE(review): recent scikit-learn releases rename 'n_iter' to 'max_iter' -
# confirm against the installed version if this call raises a TypeError.
tsne_model_en_2d = TSNE(perplexity=15, n_components=2, init='pca', n_iter=3500, random_state=32)
# t-SNE takes a flat 2D matrix, so the clusters are stacked before fitting
# and split up again afterwards
embeddings_en_2d = np.array(
    tsne_model_en_2d.fit_transform(embedding_clusters.reshape(n_clusters * n_words, n_dims))
).reshape(n_clusters, n_words, 2)

In [None]:
def tsne_plot_similar_words(title, labels, embedding_clusters, word_clusters, a, filename=None):
    """Scatter-plot clusters of 2D word embeddings and annotate every point with its word.

    Parameters
    ----------
    title : plot title.
    labels : one legend label per cluster.
    embedding_clusters : one 2D array per cluster; column 0 is used as x
        and column 1 as y.
    word_clusters : one sequence of words per cluster, in the same order as
        the rows of the matching embedding array.
    a : alpha (opacity) of the scatter points.
    filename : optional path; when given, the figure is also saved there as PNG.
    """
    plt.figure(figsize=(6, 6))
    # one colour per cluster, spread evenly over the rainbow colormap
    colors = cm.rainbow(np.linspace(0, 1, len(labels)))
    for label, embeddings, words, color in zip(labels, embedding_clusters, word_clusters, colors):
        x = embeddings[:, 0]
        y = embeddings[:, 1]
        # Pass the colour as a one-row 2D array: a bare RGBA sequence is
        # ambiguous for scatter and would be read as four data values when a
        # cluster happens to contain exactly four points.
        plt.scatter(x, y, c=color.reshape(1, -1), alpha=a, label=label)
        for i, word in enumerate(words):
            plt.annotate(word, alpha=0.5, xy=(x[i], y[i]), xytext=(5, 2),
                         textcoords='offset points', ha='right', va='bottom', size=8)
    plt.legend(loc=4)
    plt.title(title)
    plt.grid(True)
    if filename:
        plt.savefig(filename, format='png', dpi=150, bbox_inches='tight')
    plt.show()

In [None]:
# Plot the clusters of all search terms and save the figure to outdir;
# the search terms are part of both the title and the file name
tsne_plot_similar_words(
    title='Word clusters for %s' % for_title(keys),
    labels=keys,
    embedding_clusters=embeddings_en_2d,
    word_clusters=word_clusters,
    a=0.7,
    filename=outdir + corpus + '_%s_tsne.png' % to_string(keys),
)