In [10]:
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from scipy.sparse.linalg import svds

from IPython.display import clear_output
from tqdm.notebook import tqdm

In [11]:
class Vectorizer():
    def __init__(self, corp_path):
        self.corp_path = corp_path
    
    def upload_corp(self):
        with open(self.corp_path, 'r', encoding='utf-8') as f:
            self.corp = list(set(f.read().split('\n')[:-1]))
        
        self.corp = [text.replace(',', ' ') for text in self.corp]
    
    
    def log(self, part):
        clear_output(wait=True)
        print(f'{part} is processing')
        
    def make_tf_idf_matrix(self, token_pattern=None):
        if token_pattern:
            self.tfidf = TfidfVectorizer(token_pattern=token_pattern)
        else:
            self.tfidf = TfidfVectorizer()
            
        self.A = self.tfidf.fit_transform(self.corp)
        self.feature_list = self.tfidf.get_feature_names_out()
        
    def make_svd(self, n=30):
        self.u, self.sigma, self.vT = svds(self.A, n)
        self.singular_indicies = np.argsort(-self.sigma)
        
        self.u = self.u[:, self.singular_indicies]
        self.sigma = np.diag(self.sigma[self.singular_indicies])
        self.vT = self.vT[self.singular_indicies, :]
        
        self.embedded_matrix = self.sigma@self.vT
        
        self.words_embedding_dict = dict(zip(self.feature_list, self.embedded_matrix.T))
    
    def get_emb_dict(self):
        
        self.log('Upload')
        self.upload_corp()
        self.log('TfIdf')
        self.make_tf_idf_matrix()
        self.log('SVD')
        self.make_svd(n=20)
        
        return self.words_embedding_dict

In [12]:
vect = Vectorizer('/content/persian_cleaned_corpus.txt')
emb_dict = vect.get_emb_dict()


SVD is processing


In [22]:
import random

# Select 20 random keys from the embedding dictionary
keys = random.sample(emb_dict.keys(), 20)

# Print the embeddings of the selected keys, rounded to 4 decimal places
for k in keys:
    print(f'{k}: {emb_dict[k].round(4)}')

میکنیمر: [ 0.0001  0.      0.0001 -0.0002  0.     -0.      0.     -0.      0.0001
 -0.0001  0.     -0.0001  0.      0.     -0.      0.      0.      0.
  0.      0.0001]
بندآخر: [ 0.0004 -0.0001  0.     -0.0003  0.0002 -0.0001  0.0001  0.0002 -0.0003
 -0.0002 -0.0003  0.0002  0.0002  0.     -0.0001  0.0001  0.      0.0001
  0.0003  0.0001]
مندیگک: [ 0.0003  0.0001 -0.0005  0.0002 -0.      0.0004 -0.0001 -0.0002 -0.0001
 -0.0002  0.      0.      0.     -0.      0.     -0.0001  0.      0.
  0.     -0.    ]
ابدیزد: [ 0.0001 -0.     -0.      0.0001  0.     -0.      0.0001  0.      0.
 -0.     -0.     -0.     -0.0001  0.     -0.      0.      0.      0.
 -0.0001 -0.    ]
گزپن: [ 0.0003 -0.0001  0.     -0.0002  0.0001 -0.      0.     -0.0001 -0.0001
 -0.     -0.0001  0.     -0.0001 -0.0001  0.     -0.      0.0001 -0.0001
  0.0001 -0.0003]
برایبتلاشدن: [ 0.0002 -0.0001  0.     -0.     -0.0002 -0.0001  0.     -0.      0.
  0.     -0.0001  0.     -0.      0.     -0.0001  0.      0.      0.
 -0.00

since Python 3.9 and will be removed in a subsequent version.
  keys = random.sample(emb_dict.keys(), 20)
