In [2]:
import numpy as np
import json
from sklearn.feature_extraction.text import TfidfVectorizer
from scipy.sparse.linalg import svds

from IPython.display import clear_output
from tqdm.notebook import tqdm

In [1]:
# Mount Google Drive at /content/drive. The `google.colab` package is only
# available inside a Google Colab runtime, so this cell is Colab-specific.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [3]:
class Vectorizer():
    def __init__(self, corp_path):
        self.corp_path = corp_path
    
    def upload_corp(self):
        with open(self.corp_path, 'r') as f:
            self.corp = list(set(f.read().split('\n')[:-1]))
        
        self.corp = [text.replace(',', ' ') for text in self.corp]
    
    
    def log(self, part):
        clear_output(wait=True)
        print(f'{part} is processing')
        
    def make_tf_idf_matrix(self, token_pattern=None):
        if token_pattern:
            self.tfidf = TfidfVectorizer(token_pattern=token_pattern)
        else:
            self.tfidf = TfidfVectorizer()
            
        self.A = self.tfidf.fit_transform(self.corp)
        self.feature_list = self.tfidf.get_feature_names_out()
        
    def make_svd(self, n=30):
        self.u, self.sigma, self.vT = svds(self.A, n)
        self.singular_indicies = np.argsort(-self.sigma)
        
        self.u = self.u[:, self.singular_indicies]
        self.sigma = np.diag(self.sigma[self.singular_indicies])
        self.vT = self.vT[self.singular_indicies, :]
        
        self.embedded_matrix = self.sigma@self.vT
        
        self.words_embedding_dict = dict(zip(self.feature_list, self.embedded_matrix.T))
    
    def get_emb_dict(self):
        
        self.log('Upload')
        self.upload_corp()
        self.log('TfIdf')
        self.make_tf_idf_matrix()
        self.log('SVD')
        self.make_svd(n=20)
        
        return self.words_embedding_dict

In [7]:
# Load the pre-processed corpus from Drive (stemmed, stop-words removed,
# judging by the file name). The encoding is given explicitly because the
# texts are non-ASCII and the platform default is not guaranteed to be UTF-8.
with open("/content/drive/MyDrive/Spot the Bot/Wiki-Files/stemmed_no_stopwords_corpus.json", 'r', encoding='utf-8') as f:
    clean_corp = json.load(f)

In [8]:
# Number of entries in the loaded corpus.
len(clean_corp)

596137

In [27]:
# Dump the corpus to a local text file, one text per line, which is the
# format Vectorizer reads. UTF-8 is set explicitly so non-ASCII text is
# written correctly regardless of the platform's default encoding.
# NOTE(review): a text that itself contains '\n' would span several lines
# here -- confirm the corpus entries are single-line.
with open('corpus.txt', 'w', encoding='utf-8') as f:
    for text in clean_corp:
        f.write(text + '\n')

In [28]:
# Constructing the Vectorizer only stores the path; the file is read later.
vect = Vectorizer('corpus.txt')

In [29]:
# Run the full pipeline (read corpus -> TF-IDF -> SVD) and keep the resulting
# {token: embedding vector} dictionary.
emb_dict = vect.get_emb_dict()

SVD is processing


In [30]:
# Peek at the first 21 embeddings (positions 0..20 inclusive), rounded to
# four decimals for readability.
for position, (token, vector) in enumerate(emb_dict.items()):
    if position > 20:
        break
    print(f'{token}: ', vector.round(4))

ءامنوا:  [-0.     -0.0001 -0.0001  0.      0.     -0.     -0.0001 -0.     -0.0001
  0.0001  0.0002  0.      0.     -0.      0.0001  0.0001  0.      0.
 -0.     -0.    ]
ءفیزیکی:  [-0. -0. -0.  0.  0.  0. -0.  0. -0.  0.  0. -0.  0. -0.  0.  0.  0. -0.
 -0. -0.]
آآ:  [-0.0029 -0.006  -0.0032 -0.0007  0.0056  0.0023 -0.0012  0.0296  0.007
 -0.0048  0.0039  0.0001  0.     -0.0013  0.0016  0.0027  0.0067  0.0005
  0.0007 -0.001 ]
آآرآرسی:  [-0.0001 -0.0002 -0.0002 -0.0001  0.0001  0.     -0.0001 -0.0001 -0.0001
  0.0001  0.0004  0.0002  0.0001 -0.0002  0.0001  0.0001  0.0001  0.0001
 -0.0003 -0.    ]
آآروی:  [-0.0001 -0.0002 -0.0002  0.0001  0.     -0.0001 -0.0001 -0.     -0.0002
  0.0002  0.0004  0.0001 -0.0002 -0.0001  0.0001  0.0001  0.0001 -0.0001
 -0.0001 -0.0001]
آآمدده:  [-0.     -0.0001 -0.0001  0.      0.0001  0.     -0.0001 -0.     -0.0001
  0.0001  0.0001 -0.      0.     -0.0001  0.      0.     -0.     -0.
 -0.     -0.0001]
آئاسی:  [-0.0001 -0.0002 -0.0002  0.      0.0002 -0.   

In [32]:
# Vocabulary size: one embedding per TF-IDF feature.
len(emb_dict)

279005

In [33]:
# Sanity check: the length of one row of V^T should equal len(emb_dict),
# since each column of V^T corresponds to one vocabulary term.
len(vect.vT[1,:])

279005