# Doing things with text 6

## Word2vec

### Import packages

In [None]:
import os
from bs4 import BeautifulSoup
import unicodedata
import re
from nltk.tokenize import sent_tokenize
from nltk.tokenize import word_tokenize  # needs to be installed first via nltk.download()
import matplotlib.pyplot as plt
import matplotlib.cm as cm
%matplotlib notebook
import seaborn as sns
from collections import Counter
from collections import defaultdict  # For word frequency
import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings('ignore')
import re  # For preprocessing
from time import time  # To time our operations
import spacy  # For preprocessing
from gensim.models import Word2Vec
import logging  # Setting up the loggings to monitor gensim
logging.basicConfig(format="%(levelname)s - %(asctime)s: %(message)s", datefmt= '%H:%M:%S', level=logging.INFO)
from sklearn.manifold import TSNE
from numpy import dot
from numpy.linalg import norm

#### Define in- and out-directories

`indir` is a folder on your computer that contains multiple text files. `outdir` is a folder (to be created) in which cleaned versions of the files are stored.

In [None]:
# Folder whose files are read and tokenised in the preprocessing step below;
# adjust this path for your own machine.
indir = r'/Users/huijn001/data/archaeology/data/totaal_per_jaar_schoon/'

## Preprocessing

In [None]:
def remove_html(text):
    """Return the plain text of *text* with all HTML markup stripped.

    Parsing is delegated to BeautifulSoup (bs4) using the "lxml" parser.
    """
    return BeautifulSoup(text, "lxml").get_text()

In [None]:
# Tokenised corpus: one list of lower-cased word tokens per sentence.
data = []

# Compiled once up front; matches every character that is neither a word
# character nor whitespace (i.e. punctuation and symbols).
non_word_chars = re.compile(r'[^\w\s]')

# list all files in a given directory
files = os.listdir(indir)

for infile in files:
    # avoid opening hidden files such as .DS_Store
    if infile.startswith('.'):
        continue
    path = os.path.join(indir, infile)
    # sub-directories cannot be read as text; skip them instead of crashing
    if not os.path.isfile(path):
        continue
    # open the file and read it, close when done
    # (no encoding is passed, so the platform default encoding is used)
    with open(path, "r") as f:
        try:
            text = f.read()
        except (UnicodeError, OSError) as err:
            # Report the actual error and skip this file. Without the skip,
            # `text` would be undefined (first file) or would still hold the
            # previous file's contents, which would then be processed twice.
            print("Could not read {}: {}".format(infile, err))
            continue
    # remove html
    clean_text = remove_html(text)

    # iterate through each sentence in the file
    for sentence in sent_tokenize(clean_text):
        temp = []
        for word in word_tokenize(sentence):
            # only tokens longer than three characters are kept
            if len(word) > 3:
                new_word = non_word_chars.sub('', word)
                if new_word:
                    temp.append(new_word.lower())
        # `temp` is a list, so test its truthiness (comparing it to '' is
        # always unequal) to leave out sentences with no remaining tokens
        if temp:
            data.append(temp)

In [None]:
# Sanity check: show the first entry of the tokenised corpus.
print(data[:1])

## Analysis

### Word2vec

from: https://towardsdatascience.com/a-beginners-guide-to-word-embedding-with-gensim-word2vec-model-5970fa56cc92

In [None]:
# Train a word2vec model on the tokenised sentences in `data`.
# NOTE(review): `size` is the gensim 3.x parameter name; gensim 4.0 renamed it
# to `vector_size`, so this call is expected to fail with a TypeError there —
# confirm against the installed gensim version.
model = Word2Vec(data, min_count=5, size=128, workers=3, window=5)

In [None]:
#model.save("popularscience_w2v.model")

In [None]:
# Show the 30 words whose vectors are closest to 'klokbekercultuur'.
# The query goes through `model.wv` (the KeyedVectors): calling most_similar
# on the model object itself is deprecated in gensim 3.x and removed in 4.0.
model.wv.most_similar(positive=['klokbekercultuur'], topn=30)

In [None]:
# Analogy-style query: words close to 'sugar' + 'healthy' - 'unhealthy'.
# The query goes through `model.wv` (the KeyedVectors): calling most_similar
# on the model object itself is deprecated in gensim 3.x and removed in 4.0.
model.wv.most_similar(positive=['sugar', 'healthy'], negative=['unhealthy'], topn=10)

In [None]:
## Print the nearest neighbours of several search terms at once

keys = ['passief', 'actief']
# Not called `n`: the t-SNE cell further down unpacks an array shape into
# `n, m, k`, which would silently change this setting when the cell is re-run.
top_n = 30

for key in keys:
    # Query via `model.wv`; most_similar on the model itself is deprecated in
    # gensim 3.x and removed in 4.0.
    print(model.wv.most_similar(positive=[key], topn=top_n))
    

Visualise multiple search terms (the ones listed in `keys`) in a scatter plot

In [None]:
# For every search term in `keys`, collect its 30 most similar words and the
# embedding vector of each of those words.
embedding_clusters = []  # one list of vectors per search term
word_clusters = []  # one list of similar words per search term
for word in keys:
    embeddings = []
    words = []
    # Look-ups go through `model.wv` (the KeyedVectors): `model.most_similar`
    # and `model[word]` are deprecated in gensim 3.x and removed in 4.0.
    for similar_word, _ in model.wv.most_similar(word, topn=30):
        words.append(similar_word)
        embeddings.append(model.wv[similar_word])
    embedding_clusters.append(embeddings)
    word_clusters.append(words)

In [None]:
# Turn the nested lists into a single 3-D array and read off its dimensions.
embedding_clusters = np.array(embedding_clusters)
n, m, k = embedding_clusters.shape

# Project all vectors to two dimensions with t-SNE (fixed seed).
# NOTE(review): newer scikit-learn releases rename `n_iter` to `max_iter` —
# confirm against the installed version.
tsne_model_en_2d = TSNE(
    n_components=2,
    perplexity=15,
    init='pca',
    n_iter=3500,
    random_state=32,
)

# t-SNE takes a 2-D input, so the first two axes are flattened for fitting and
# the 2-D result is folded back into one group of points per cluster.
embeddings_en_2d = np.array(
    tsne_model_en_2d.fit_transform(embedding_clusters.reshape(n * m, k))
).reshape(n, m, 2)

In [None]:
def tsne_plot_similar_words(title, labels, embedding_clusters, word_clusters, a, filename=None):
    """Draw a 2-D scatter plot with one coloured, annotated cluster per label.

    Parameters
    ----------
    title : str
        Title shown above the plot.
    labels : sequence
        One legend label per cluster; its length also sets the number of colours.
    embedding_clusters : sequence of 2-D arrays
        Point coordinates per cluster; column 0 is plotted as x, column 1 as y.
    word_clusters : sequence of sequences of str
        For each cluster, the word written next to each point.
    a : float
        Alpha (opacity) of the scatter markers.
    filename : str, optional
        If given, the figure is also saved to this path as a 150-dpi PNG.
    """
    plt.figure(figsize=(6, 6))
    # One distinct colour per label, spread evenly over the rainbow colormap.
    colors = cm.rainbow(np.linspace(0, 1, len(labels)))
    for label, embeddings, words, color in zip(labels, embedding_clusters, word_clusters, colors):
        x = embeddings[:, 0]
        y = embeddings[:, 1]
        # `color=` applies one RGBA colour to the whole cluster. Passing the
        # RGBA row as `c=` is ambiguous: matplotlib warns about it and maps the
        # four numbers onto the points when a cluster has exactly four points.
        plt.scatter(x, y, color=color, alpha=a, label=label)
        for i, word in enumerate(words):
            plt.annotate(word, alpha=0.5, xy=(x[i], y[i]), xytext=(5, 2),
                         textcoords='offset points', ha='right', va='bottom', size=8)
    plt.legend(loc=4)
    plt.title(title)
    plt.grid(True)
    if filename:
        plt.savefig(filename, format='png', dpi=150, bbox_inches='tight')
    plt.show()

In [None]:
# Plot the 2-D projection of every search term's neighbours and save a PNG copy.
tsne_plot_similar_words(
    'Similar word clusters',
    keys,
    embeddings_en_2d,
    word_clusters,
    a=0.7,
    filename='/Users/huijn001/Desktop/test_ps.png',
)