<a href="https://colab.research.google.com/github/Rotem2411/alephBERTgimmelDalet/blob/main/word_embedding1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import matplotlib.pyplot as plt
import nltk
import numpy as np
import pandas as pd
from gensim.models import Word2Vec
from sklearn.cluster import KMeans
from sklearn.cluster import MiniBatchKMeans
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
from sklearn.metrics import silhouette_samples, silhouette_score

nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...


In [3]:
# Relative path: the CSV has to be present in the notebook's working directory
# (the captured output below shows a FileNotFoundError when it is not).
file_path = 'sentiments.csv'
df = pd.read_csv(filepath_or_buffer=file_path)

FileNotFoundError: [Errno 2] No such file or directory: 'sentiments.csv'

Convert words to embeddings with a Word2Vec model, without any text cleaning

In [19]:
# Raw documents, taken as-is from the 'text' column (no cleaning step).
docs = df['text'].tolist()

# One token list per document.
tokenized_text = list(map(nltk.word_tokenize, docs))

# min_count=1 keeps every token in the vocabulary, including words seen once.
model = Word2Vec(
    sentences=tokenized_text,
    vector_size=20,
    window=5,
    min_count=1,
)

Example of most similar words to 'נתניהו'

In [8]:
# Ten nearest neighbours of the query word in the embedding space,
# returned as (word, similarity) pairs.
display(model.wv.most_similar(positive='נתניהו', topn=10))

[('הליכוד', 0.9680353999137878),
 ('פוטין', 0.9610236287117004),
 ('לפיד', 0.9584319591522217),
 ('הפרצוף', 0.9553976058959961),
 ('בנט', 0.9455683827400208),
 ('טראמפ', 0.9421078562736511),
 ('ביב', 0.9407416582107544),
 ('המבחן', 0.9383149147033691),
 ('יאיר', 0.9372230768203735),
 ('ליברמן', 0.9363155961036682)]

Document Vectors from Word Embedding

In [20]:
def vectorize(list_of_docs, model):
    """Turn tokenized documents into fixed-size vectors by averaging word embeddings.

    Each document is represented by the element-wise mean of the embeddings
    of those of its tokens that the model knows. A document with no known
    token (including an empty document) is represented by an all-zero vector
    of length ``model.vector_size``.

    Args:
        list_of_docs: Iterable of documents, each an iterable of tokens.
        model: Gensim's Word Embedding; ``model.wv`` is used for membership
            tests and lookups, ``model.vector_size`` for the zero vector.

    Returns:
        List with one vector per document, in input order.
    """
    doc_vectors = []

    for doc in list_of_docs:
        known = []
        for word in doc:
            if word not in model.wv:
                continue
            try:
                known.append(model.wv[word])
            except KeyError:
                # Lookup failed despite the membership test; skip the token.
                pass

        if not known:
            doc_vectors.append(np.zeros(model.vector_size))
        else:
            doc_vectors.append(np.asarray(known).mean(axis=0))

    return doc_vectors

# One averaged embedding per document.
vectorized_text = vectorize(list_of_docs=tokenized_text, model=model)

# Sanity check: (number of documents, embedding size).
(len(vectorized_text), len(vectorized_text[0]))

(75151, 20)

Apply K-means clustering

In [21]:
def mbkmeans_clusters(
    X,
    k,
    mb,
    print_silhouette_values,
    random_state=None,
):
    """Generate clusters and print Silhouette metrics using MBKmeans

    Args:
        X: Matrix of features (array-like, one row per sample).
        k: Number of clusters.
        mb: Size of mini-batches.
        print_silhouette_values: Print silhouette values per cluster.
        random_state: Optional seed forwarded to MiniBatchKMeans so a run
            can be reproduced. Defaults to None (unseeded).

    Returns:
        Trained clustering model and labels based on X.
    """
    # Convert once so a list of vectors is not re-converted by every sklearn call.
    X = np.asarray(X)
    km = MiniBatchKMeans(
        n_clusters=k, batch_size=mb, random_state=random_state
    ).fit(X)
    labels = km.labels_

    print(f"For n_clusters = {k}")
    # The silhouette coefficient is the mean of the per-sample silhouette
    # values. Computing the samples once and averaging them avoids running
    # the expensive pairwise-distance pass a second time.
    sample_silhouette_values = silhouette_samples(X, labels)
    print(f"Silhouette coefficient: {sample_silhouette_values.mean():0.2f}")
    print(f"Inertia:{km.inertia_}")

    if print_silhouette_values:
        print("Silhouette values:")
        silhouette_values = []
        for i in range(k):
            cluster_silhouette_values = sample_silhouette_values[labels == i]
            if cluster_silhouette_values.size == 0:
                # A cluster with no members has no silhouette statistics;
                # min()/max() on an empty array would raise.
                continue
            silhouette_values.append(
                (
                    i,
                    cluster_silhouette_values.shape[0],
                    cluster_silhouette_values.mean(),
                    cluster_silhouette_values.min(),
                    cluster_silhouette_values.max(),
                )
            )
        # Best-separated clusters (highest average silhouette) first.
        silhouette_values.sort(key=lambda tup: tup[2], reverse=True)
        for s in silhouette_values:
            print(
                f"    Cluster {s[0]}: Size:{s[1]} | Avg:{s[2]:.2f} | Min:{s[3]:.2f} | Max: {s[4]:.2f}"
            )
    return km, labels

In [26]:
# Cluster the document vectors into 50 groups and report per-cluster silhouettes.
clustering, cluster_labels = mbkmeans_clusters(
    X=vectorized_text,
    k=50,
    mb=500,
    print_silhouette_values=True,
)

# Keep the raw text, its space-joined tokens and the assigned cluster side by side.
df_clusters = pd.DataFrame(
    {
        "text": docs,
        "tokens": list(map(" ".join, tokenized_text)),
        "cluster": cluster_labels,
    }
)



For n_clusters = 50
Silhouette coefficient: 0.04
Inertia:155906.8827806821
Silhouette values:
    Cluster 37: Size:1312 | Avg:0.33 | Min:0.02 | Max: 0.53
    Cluster 19: Size:555 | Avg:0.14 | Min:-0.11 | Max: 0.38
    Cluster 30: Size:1021 | Avg:0.11 | Min:-0.10 | Max: 0.33
    Cluster 38: Size:2316 | Avg:0.10 | Min:-0.06 | Max: 0.33
    Cluster 20: Size:4522 | Avg:0.10 | Min:-0.02 | Max: 0.26
    Cluster 24: Size:587 | Avg:0.10 | Min:-0.11 | Max: 0.37
    Cluster 7: Size:1918 | Avg:0.09 | Min:-0.14 | Max: 0.29
    Cluster 47: Size:2987 | Avg:0.09 | Min:-0.04 | Max: 0.27
    Cluster 36: Size:1382 | Avg:0.08 | Min:-0.09 | Max: 0.32
    Cluster 2: Size:1090 | Avg:0.08 | Min:-0.16 | Max: 0.32
    Cluster 27: Size:2161 | Avg:0.08 | Min:-0.13 | Max: 0.31
    Cluster 13: Size:1727 | Avg:0.08 | Min:-0.11 | Max: 0.27
    Cluster 3: Size:1773 | Avg:0.07 | Min:-0.04 | Max: 0.26
    Cluster 22: Size:3201 | Avg:0.07 | Min:-0.04 | Max: 0.25
    Cluster 11: Size:2655 | Avg:0.06 | Min:-0.04 | Max: 0.

In [8]:
# Cluster the individual word embeddings (one row per vocabulary entry).
word_vectors = model.wv.vectors

# K-means starts from random centroids; a fixed seed makes the cluster
# assignment (and the sizes printed below) repeatable across runs.
kmeans = KMeans(n_clusters=4, n_init='auto', random_state=42).fit(word_vectors)

cluster_ids, cluster_sizes = np.unique(kmeans.labels_, return_counts=True)
print(f"Number of elements assigned to each cluster: {cluster_sizes}")

Number of elements assigned to each cluster: [94881  8699   935  2404]


Reduce dimensionality to two dimensions using t-SNE

In [None]:
# Query word whose neighbourhood is inspected (the same word used in the
# most-similar example above) and optional extra words to overlay.
# Every entry of list_names must exist in the model vocabulary.
word = 'נתניהו'
list_names = []

# gets list of most similar words as (word, similarity) pairs
close_words = model.wv.most_similar([word])
neighbour_words = [wrd_score[0] for wrd_score in close_words]

# labels and colors: query word in red, its neighbours in blue, extra words in green
word_labels = [word] + neighbour_words + list(list_names)
color_list = (
    ['red']
    + ['blue'] * len(neighbour_words)
    + ['green'] * len(list_names)
)

# One row per label, fetched in a single lookup. The width follows the
# trained model's vector size instead of a hard-coded dimension, and the
# array is not re-allocated for every appended word.
arrays = model.wv[word_labels]

# Optional PCA step before t-SNE. The number of components cannot exceed
# the number of rows or columns of `arrays`, so it is capped accordingly.
n_pca_components = min(20, arrays.shape[0], arrays.shape[1])
reduc = PCA(n_components=n_pca_components).fit_transform(arrays)

# Print floats in fixed-point rather than scientific notation
np.set_printoptions(suppress=True)

In [None]:
# Project every word vector to 2-D. t-SNE is stochastic, so the seed is fixed
# to make the embedding (and the plot built from it) reproducible.
# NOTE(review): recent scikit-learn releases rename `n_iter` to `max_iter` —
# confirm against the installed version.
tsne = TSNE(
    n_components=2,
    angle=0.2,
    perplexity=30,
    learning_rate=200,
    n_iter=250,
    random_state=42,
)
tsne_results = tsne.fit_transform(word_vectors)

Visualize the t-SNE representation of the word vectors, colored by K-means cluster

In [None]:
# Plot the 2-D t-SNE coordinates, colored by the K-means cluster of each word.
#
# One scatter call per cluster (instead of one per point) keeps the plot fast,
# gives every cluster its own color from the default color cycle, and lets the
# legend map each color to its cluster id.
fig, ax = plt.subplots(figsize=(10, 7))
for cluster_id in cluster_ids:
    in_cluster = kmeans.labels_ == cluster_id
    ax.scatter(
        tsne_results[in_cluster, 0],
        tsne_results[in_cluster, 1],
        alpha=0.5,
        label=str(cluster_id),
    )
ax.set_title('Visualization of Word Clusters using T-SNE')
ax.set_xlabel('Dimension 1')
ax.set_ylabel('Dimension 2')
ax.legend(title='Cluster')
plt.show()