<a href="https://colab.research.google.com/github/Rotem2411/alephBERTgimmelDalet/blob/main/word_embedding.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [3]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import nltk
from gensim.models import Word2Vec
from sklearn.cluster import KMeans
from sklearn.manifold import TSNE

In [4]:
file_path = 'sentiments.csv'
df = pd.read_csv(file_path)

Convert words to embeddings without text cleaning, using Word2Vec model

In [17]:
tokenized_text = [nltk.word_tokenize(text) for text in df['text'].tolist()]
model = Word2Vec(tokenized_text, min_count=1, vector_size=20, window=5)
word_embeddings = model.wv

Example of most similar words to 'נתניהו'

In [18]:
display(word_embeddings.most_similar('נתניהו'))

[('לפיד', 0.9667624831199646),
 ('פוטין', 0.9572262763977051),
 ('טראמפ', 0.9514205455780029),
 ('הליכוד', 0.9411493539810181),
 ('ליברמן', 0.9389529824256897),
 ('ביבי', 0.936684787273407),
 ('בנט', 0.9331871271133423),
 ('האוצר', 0.9327473044395447),
 ('גנץ', 0.9278356432914734),
 ('התפקיד', 0.9270130395889282)]

Apply K-means clustering

In [19]:
word_vectors = word_embeddings.vectors
kmeans = KMeans(n_clusters=4, n_init='auto').fit(word_vectors)
cluster_ids, cluster_sizes = np.unique(kmeans.labels_, return_counts=True)
print(f"Number of elements assigned to each cluster: {cluster_sizes}")

Number of elements assigned to each cluster: [95040  2373  8603   903]


Reduce dimensionality to two dimensions using T-SNE

In [36]:
tsne = TSNE(n_components=2, angle=0.2, perplexity=30, learning_rate=200, n_iter=250)
tsne_results = tsne.fit_transform(word_vectors)

KeyboardInterrupt: 

Visualize t-SNE representations of the most common words

In [None]:
# prompt: Display on graph/plot the results based on the clustering performed in k-mins before

# Create a scatter plot with different colors for each cluster
plt.figure(figsize=(10, 7))
for i in range(len(tsne_results)):
  plt.scatter(tsne_results[i, 0], tsne_results[i, 1], c=kmeans.labels_[i], alpha=0.5)
plt.title('Visualization of Word Clusters using T-SNE')
plt.xlabel('Dimension 1')
plt.ylabel('Dimension 2')
plt.legend(cluster_ids)
plt.show()