# Doing things with text 7: word2vec

This notebook provides various functionalities for word embeddings with word2vec.

The code assumes that the input is a series of txt files.

### Step 0 (only if needed): install required packages

In [None]:
!pip install networkx

In [None]:
import nltk
nltk.download('punkt_tab')

### Step 1: Importing required packages

* `pathlib.Path`: Provides an object-oriented interface for filesystem paths
* `unicodedata`: Works with Unicode characters and normalization
* `re`: Provides regular expression tools for text pattern matching
* `nltk.tokenize`: Splits text into sentences (`sent_tokenize`) and words (`word_tokenize`).
* `matplotlib.pyplot`: Creates static, interactive, and animated visualizations
* `matplotlib.cm`: Manages colormap settings for visualizations
* `seaborn`: Simplifies complex data visualization based on Matplotlib
* `numpy`: Performs numerical computations and array manipulations. `dot` and `norm` from `numpy.linalg` calculate dot products and vector norms
* `warnings`: Manages and suppresses warning messages in Python
* `time`: Tracks time intervals and performance measurements
* `gensim.models.Word2Vec`: Builds and trains Word2Vec word embedding models
* `logging`: Provides customizable logging for debugging and diagnostics
* `sklearn.manifold.TSNE`: Reduces dimensions for high-dimensional data visualization
* `scipy.cluster.hierarchy`: Performs hierarchical clustering and dendrogram visualization
* `networkx`: Creates and analyzes complex networks and graph data structures

In [None]:
from pathlib import Path
import unicodedata
import re
from nltk.tokenize import sent_tokenize
from nltk.tokenize import word_tokenize  # needs to be installed first via nltk.download()
import matplotlib.pyplot as plt
import matplotlib.cm as cm
%matplotlib inline
import seaborn as sns
import numpy as np
from numpy import dot
from numpy.linalg import norm
import warnings
warnings.filterwarnings('ignore')
from time import time
from gensim.models import Word2Vec
import logging
logging.basicConfig(format="%(levelname)s - %(asctime)s: %(message)s", datefmt= '%H:%M:%S', level=logging.INFO)
from sklearn.manifold import TSNE
from scipy.cluster.hierarchy import dendrogram, linkage
import networkx as nx

### Step 2: Define input and output paths

Define where your text files are located (`indir`) and where you want to save your output (`outdir`).

In [None]:
# Define input and output paths (replace both placeholders with your own folders)
indir = Path('/Path/to/indir/')
outdir = Path('/Path/to/outdir/')
outdir.mkdir(parents=True, exist_ok=True)  # Create the output directory if it doesn't exist

# All .txt files located directly in indir (subfolders are not searched), in sorted order
allfiles = sorted(indir.glob("*.txt"))

dataset = 'dataset' # name of your actual dataset; used in output file names and plot titles

In [None]:
def save_corpus(corpus):
    """Return `corpus` lower-cased, with every space replaced by an underscore."""
    lowered = corpus.lower()
    return "_".join(lowered.split(" "))

def to_string(list):
    """Glue the given strings together with underscores (e.g. for use in file names)."""
    separator = '_'
    return separator.join(list)

def to_title(words):
    """Format a sequence of words as a quoted enumeration for plot titles.

    Each word is wrapped in single quotes; words are separated by commas,
    except the last two, which are joined by "and". An empty (or falsy)
    input gives an empty string.
    """
    if not words:
        return ''
    last = f"'{words[-1]}'"
    if len(words) == 1:
        return last
    leading = ', '.join(f"'{word}'" for word in words[:-1])
    return f"{leading} and {last}"

### Step 3: Importing the data

#### Option 1: with preprocessing (for raw data)

Input is multiple .txt files

In [None]:
# Build 'data' as a list of sentences, each sentence being a list of cleaned, lower-cased tokens.
data = []
file_count = 0

for infile in allfiles:
    file_count += 1  # count every file we attempt to read
    # Read the whole file with an explicit encoding so the result does not depend on the OS default
    with open(infile, "r", encoding="utf-8") as f:
        try:
            text = f.read()
        except (UnicodeDecodeError, OSError) as err:
            # Report the actual error and skip the file; falling through would
            # re-tokenise the text of the previously read file (or fail on the first file)
            print('Skipping %s: %s' % (infile, err))
            continue

    # iterate through each sentence in the file
    for sentence in sent_tokenize(text):
        infile_list = []
        for word in word_tokenize(sentence):
            new_word = re.sub(r'[^\w\s]', '', word)  # strip punctuation
            # apply the length filter to the cleaned token, so that e.g. 'end.' -> 'end' is
            # also treated as a word of 3 letters or shorter (this also drops empty tokens)
            if len(new_word) > 3:
                infile_list.append(new_word.lower())
        # only keep sentences that still contain at least one token
        if infile_list:
            data.append(infile_list)

#### Option 2: without preprocessing (for preprocessed data - much quicker)

Input is multiple .txt files

In [None]:
# Build 'data' as one list of tokens per input file (texts are assumed to be preprocessed already).
data = []
file_count = 0

for infile in allfiles:
    file_count += 1  # count every file we attempt to read
    # Read the whole file with an explicit encoding so the result does not depend on the OS default
    with open(infile, "r", encoding="utf-8") as f:
        try:
            text = f.read()
        except (UnicodeDecodeError, OSError) as err:
            # Report the actual error and skip the file; falling through would
            # reuse the text of the previously read file (or fail on the first file).
            # A skipped file shows up as a mismatch in the length check of the next cell.
            print('Skipping %s: %s' % (infile, err))
            continue
    # split() without an argument splits on any run of whitespace, so words separated by
    # newlines or tabs are not glued together into a single token
    infile_list = [x for x in text.split() if len(x) > 3] # removing words of 3 letters and shorter
    print('%s has %s words' %(infile, len(infile_list)))
    data.append(infile_list)

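Check that list 'data' contains as many lists as there are files. (This check only applies to Option 2; with Option 1, 'data' contains one list per sentence rather than one per file.)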

In [None]:
# Sanity check for Option 2, which appends one list per file. Option 1 appends one list
# per sentence, so after Option 1 the two numbers are not expected to match.
print('List \'data\' is %s lists long, which equals the %s files in indir' %(len(data), file_count))

### Step 4: Create a word2vec model

from: https://towardsdatascience.com/a-beginners-guide-to-word-embedding-with-gensim-word2vec-model-5970fa56cc92

In [None]:
# Train a Word2Vec model on the token lists in 'data'. Parameter meanings per the gensim documentation:
#   min_count=5      ignore words that occur fewer than 5 times in total
#   vector_size=128  dimensionality of the word vectors
#   window=5         maximum distance between the current and the predicted word within a sentence
#   workers=3        number of worker threads used for training
model = Word2Vec(data, min_count=5, vector_size=128, workers=3, window=5)

#### (Optional) Step 4a: Save model to outdir

In [None]:
# Uncomment to save the model. outdir is a Path, so the file name is joined with '/'
# (outdir + dataset would raise a TypeError because Path does not support '+').
#model.save(str(outdir / (dataset + "_w2v.model")))

### Step 5: Search most similar terms

In [None]:
keys = ['word'] # can be one or more words as 'word', 'word', 'word'
n = 30 # number of most similar words to list for each key in the next cell

In [None]:
# For every search word, list its n nearest neighbours as (word, similarity) tuples
for query in keys:
    neighbours = model.wv.most_similar(positive=[query], topn=n)
    print("Words most similar to '%s':" % query)
    for neighbour in neighbours:
        print(neighbour)
    print('\n')
    

### Step 6: Visualize most similar words as clusters

In [None]:
# For each key, collect its 30 most similar words and their vectors.
# word_clusters[i] and embedding_clusters[i] both belong to keys[i].
embedding_clusters = []
word_clusters = []

for key in keys:
    neighbours = [neighbour for neighbour, _score in model.wv.most_similar(key, topn=30)]
    word_clusters.append(neighbours)
    embedding_clusters.append([model.wv[neighbour] for neighbour in neighbours])

In [None]:
# Stack the clusters into one array of shape (number of keys, words per key, vector size)
embedding_clusters = np.array(embedding_clusters)
# Use dedicated names for the shape: unpacking into `n` would overwrite the `n`
# (number of similar words) defined in Step 5 and change the result of re-running that cell.
n_keys, n_neighbours, n_dims = embedding_clusters.shape

# scikit-learn 1.5 renamed TSNE's `n_iter` to `max_iter` and removed the old name in 1.7;
# pass the iteration count under whichever name the installed version accepts.
tsne_iter_arg = 'max_iter' if 'max_iter' in TSNE().get_params() else 'n_iter'
tsne_model_en_2d = TSNE(perplexity=15, n_components=2, init='pca', random_state=32,
                        **{tsne_iter_arg: 3500})

# t-SNE expects a 2-D matrix: flatten all clusters, project to 2-D, then restore the per-key grouping
embeddings_en_2d = np.array(
    tsne_model_en_2d.fit_transform(embedding_clusters.reshape(n_keys * n_neighbours, n_dims))
).reshape(n_keys, n_neighbours, 2)

In [None]:
def tsne_plot_similar_words(title, labels, embedding_clusters, word_clusters, a, filename=None):
    """Scatter-plot 2-D word embeddings, one colour per cluster, with each point annotated.

    Parameters
    ----------
    title : str
        Title of the figure.
    labels : sequence of str
        One legend label per cluster.
    embedding_clusters : array-like
        One 2-D array per cluster; column 0 is used as x, column 1 as y.
    word_clusters : sequence of sequences of str
        The words belonging to the points of each cluster, in the same order.
    a : float
        Alpha (opacity) of the scatter points.
    filename : str, optional
        If given, the figure is also saved to this path as a 150-dpi PNG.
    """
    plt.figure(figsize=(6, 6))
    colors = cm.rainbow(np.linspace(0, 1, len(labels)))
    for label, embeddings, words, color in zip(labels, embedding_clusters, word_clusters, colors):
        x = embeddings[:, 0]
        y = embeddings[:, 1]
        # Use `color=` rather than `c=`: a single RGBA row passed as `c` is ambiguous and is
        # treated as four values to colour-map when the cluster happens to contain 4 points.
        plt.scatter(x, y, color=color, alpha=a, label=label)
        for i, word in enumerate(words):
            plt.annotate(word, alpha=0.5, xy=(x[i], y[i]), xytext=(5, 2),
                         textcoords='offset points', ha='right', va='bottom', size=8)
    plt.legend(loc=4)
    plt.title(title)
    plt.grid(True)
    if filename:
        plt.savefig(filename, format='png', dpi=150, bbox_inches='tight')
    plt.show()

In [None]:
# Build the output path inside outdir. Plain string concatenation (str(outdir) + dataset)
# drops the path separator, so the PNG would be written next to the output folder, not into it.
tsne_png = outdir / ('%s_%s_tsne.png' % (dataset, to_string(keys)))

tsne_plot_similar_words('Word clusters for %s' %(to_title(keys)),
                        keys,
                        embeddings_en_2d,
                        word_clusters,
                        0.7,
                        str(tsne_png))

### Step 8: Visualize most similar words in a dendrogram

In [None]:
def plot_dendrogram_for_similar_words(search_term, model, top_n=100):
    """Draw a Ward-linkage dendrogram of a search term and its most similar words.

    Parameters
    ----------
    search_term : str
        Word whose neighbourhood is clustered; it is included as a leaf itself.
    model : object exposing `wv.most_similar` and `wv[word]` (the notebook passes its Word2Vec model)
        Source of the neighbours and of the word vectors.
    top_n : int, default 100
        Number of most similar words to retrieve.
    """
    # Leaf labels: the top_n neighbours followed by the search term itself
    neighbours = model.wv.most_similar(search_term, topn=top_n)
    words = [neighbour for neighbour, _score in neighbours] + [search_term]

    # Hierarchical clustering of the corresponding vectors
    word_vectors = [model.wv[label] for label in words]
    linkage_matrix = linkage(word_vectors, 'ward')

    plt.figure(figsize=(15, 10))
    dendrogram(linkage_matrix, labels=words, leaf_rotation=90)
    plt.title(f"Dendrogram for the top {top_n} words similar to '{search_term}'")
    plt.xlabel("Words")
    plt.ylabel("Distance")
    plt.show()

In [None]:
search_term = "word"
plot_dendrogram_for_similar_words(search_term, model, top_n=30)

### Step 9: Visualize most similar words in a heatmap

In [None]:
def plot_heatmap_for_similar_words(search_term, model, top_n=100):
    """Draw a heatmap of pairwise cosine similarities for a search term and its neighbours.

    Parameters
    ----------
    search_term : str
        Word whose neighbourhood is shown; it is included as the last row/column.
    model : object exposing `wv.most_similar` and `wv[word]` (the notebook passes its Word2Vec model)
        Source of the neighbours and of the word vectors.
    top_n : int, default 100
        Number of most similar words to retrieve.
    """
    # Row/column labels: the top_n neighbours followed by the search term itself
    neighbours = model.wv.most_similar(search_term, topn=top_n)
    words = [neighbour for neighbour, _score in neighbours] + [search_term]
    word_vectors = np.array([model.wv[label] for label in words])

    # Cosine similarity = dot product divided by the product of the two vector lengths.
    # The result lies in [-1, 1], with 1 on the diagonal.
    norms = np.linalg.norm(word_vectors, axis=1)
    similarities = np.inner(word_vectors, word_vectors)
    similarities = similarities / norms[:, np.newaxis] / norms[np.newaxis, :]

    plt.figure(figsize=(15, 12))
    sns.heatmap(similarities, xticklabels=words, yticklabels=words, cmap='coolwarm', annot=False)
    plt.title(f"Cosine Similarity Heatmap for the top {top_n} words similar to '{search_term}'")
    plt.xlabel("Words")
    plt.ylabel("Words")
    plt.show()

In [None]:
search_term = "word"
plot_heatmap_for_similar_words(search_term, model, top_n=30)

### Step 10: Visualize most similar words as a network

In [None]:
def get_similar_words(model, keyword, top_n):
    """Return the `top_n` words most similar to `keyword`, without their similarity scores.

    The words are returned in the order in which `model.wv.most_similar` yields them.
    """
    neighbours = []
    for neighbour, _score in model.wv.most_similar(keyword, topn=top_n):
        neighbours.append(neighbour)
    return neighbours

def build_network(model, seed_words, top_n, top_n2=None):
    """Build a two-level similarity network around the given seed words.

    Every seed word is linked to its `top_n` most similar words, and each of
    those words is in turn linked to its own `top_n2` most similar words.

    Parameters
    ----------
    model : object accepted by `get_similar_words` (the notebook passes its Word2Vec model)
    seed_words : iterable of str
        Words the network starts from.
    top_n : int
        Number of neighbours per seed word.
    top_n2 : int, optional
        Number of neighbours per first-level neighbour. When omitted, the
        notebook-level variable `top_n2` is used, which is how this function
        obtained the value before the parameter existed.

    Returns
    -------
    networkx.Graph
        Undirected graph with one node per word.

    Raises
    ------
    NameError
        If `top_n2` is neither passed nor defined at notebook level.
    """
    if top_n2 is None:
        # Backward-compatible fallback to the notebook-level variable; prefer passing it explicitly
        try:
            top_n2 = globals()['top_n2']
        except KeyError:
            raise NameError("top_n2 is not defined: pass top_n2=... or set it before calling build_network") from None

    network = nx.Graph()

    for seed_word in seed_words:
        similar_words = get_similar_words(model, seed_word, top_n)
        network.add_node(seed_word)  # keep the seed even when it has no neighbours

        for word in similar_words:
            # add_edge creates any node that is not in the graph yet
            network.add_edge(seed_word, word)

            for second_word in get_similar_words(model, word, top_n2):
                network.add_edge(word, second_word)

    return network

In [None]:
def visualize_network(network):
    """Draw the word network, save it as a PNG in `outdir`, and show it.

    Node size is proportional to node degree. Besides its argument, the
    function reads the notebook-level variables `seed_words`, `top_n`,
    `top_n2`, `dataset` and `outdir` to build the title and the file name.
    """
    # Fruchterman-Reingold (spring) layout with a fixed seed, so the picture is reproducible.
    # Alternatives: nx.shell_layout(network), nx.circular_layout(network)
    layout = nx.spring_layout(network, seed=42)
    sizes = [degree * 150 for _node, degree in network.degree()]

    plt.figure(figsize=(12, 12))
    nx.draw_networkx_nodes(network, layout, node_size=sizes, node_color='skyblue')
    nx.draw_networkx_edges(network, layout, width=1.0, alpha=0.5)
    nx.draw_networkx_labels(network, layout, font_size=10, font_color='black')

    plt.title("Network of most similar words of %s in %s" % (to_title(seed_words), dataset))
    png_name = "%s_%s_%s_network.png" % (to_string(seed_words), top_n, top_n2)
    plt.savefig(str(outdir / png_name), dpi=150, bbox_inches='tight')
    plt.show()

In [None]:
# Network parameters. visualize_network reads seed_words, top_n and top_n2 as
# notebook-level variables for its title and file name, so keep these names.
seed_words = ['word']  # starting word(s) of the network
top_n = 10  # number of similar words linked to each seed word
top_n2 = 2  # number of similar words linked to each of those; build_network reads this notebook-level variable
network_words = build_network(model, seed_words, top_n)

In [None]:
visualize_network(network_words)