# TF-IDF matrix

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
def make_matrix_W_list_of_words(corpus_path, min_df, max_df=None, token_pattern = None, use_idf = True):
  '''
  Build a TF-IDF matrix (texts x words) from a corpus file and return it
  together with the vocabulary.

  corpus_path - is a path to the corpus, where one line - one text
  (the file is read as UTF-8)

  min_df - lower document-frequency bound passed to TfidfVectorizer:
  an int is an absolute number of texts, a float is a fraction of the texts;
  words found in fewer texts are dropped

  max_df - upper document-frequency bound with the same int/float meaning;
  words found in more texts are dropped.
  If it is None, there is no upper bound (the TfidfVectorizer default is kept)

  token_pattern - regular expression describing what counts as a token,
  e.g. one that matches only the letters of the language and digits.
  If it is None (or empty), the TfidfVectorizer default pattern is used

  use_idf - is bool value whether to use idf

  Returns (data_vectorized, feature_names): the matrix produced by
  fit_transform and the array returned by get_feature_names_out().
  '''
  # Collect only the options the caller actually set, so that the sklearn
  # defaults stay in force for everything else.
  vectorizer_options = {'analyzer': 'word', 'min_df': min_df, 'use_idf': use_idf}
  if max_df is not None:
      # Previously max_df was accepted but silently ignored.
      vectorizer_options['max_df'] = max_df
  if token_pattern:
      vectorizer_options['token_pattern'] = token_pattern
  vectorizer = TfidfVectorizer(**vectorizer_options)

  # Explicit encoding: the corpus contains non-ASCII letters, and the platform
  # default encoding is not UTF-8 everywhere.
  with open(corpus_path, 'r', encoding='utf-8') as corpus_file:
      # Iterating over the file yields one line, i.e. one text, at a time.
      data_vectorized = vectorizer.fit_transform(corpus_file)
  return data_vectorized, vectorizer.get_feature_names_out()

In [None]:
# Baseline matrix: min_df=1 keeps every word that occurs in at least one text.
W, words_list  = make_matrix_W_list_of_words('filtered_slovenian_corpus.txt', 1)

In [None]:
W.shape

(4323, 243950)

In [None]:
# Same corpus with min_df=3.
# NOTE(review): the shape recorded for W3 below, (4323, 243950), is identical to
# the min_df=1 shape, while min_df=5 and min_df=10 shrink the vocabulary —
# presumably stale output from an earlier run; re-run to confirm.
W3, words_list3  = make_matrix_W_list_of_words('filtered_slovenian_corpus.txt', 3)

In [None]:
W3.shape

(4323, 243950)

In [None]:
# Same corpus with min_df=10.
W10, words_list10  = make_matrix_W_list_of_words('filtered_slovenian_corpus.txt', 10)

In [None]:
W10.shape

(4323, 91336)

In [None]:
# Same corpus with min_df=5; W5 and words_list5 are the inputs of the SVD section.
W5, words_list5  = make_matrix_W_list_of_words('filtered_slovenian_corpus.txt', 5)

In [None]:
W5.shape

(4323, 155250)

In [None]:
# First 1000 entries of the vocabulary built with min_df=5
words_list5[:1000]

array(['aa', 'aaa', 'aaaa', 'aaaaa', 'aaaaaa', 'aac', 'aachen', 'aad',
       'aae', 'aah', 'aai', 'aaiti', 'aaj', 'aajati', 'aak', 'aal', 'aam',
       'aama', 'aamo', 'aan', 'aanovitno', 'aanovitnoft', 'aao', 'aaor',
       'aar', 'aaronov', 'aas', 'aat', 'aate', 'aati', 'aato', 'aau',
       'aav', 'aavolj', 'ab', 'aba', 'abadon', 'abak', 'abanski',
       'abarthov', 'abas', 'abat', 'abati', 'abba', 'abbaden', 'abbas',
       'abbe', 'abbiama', 'abc', 'abcd', 'abcda', 'abceden', 'abces',
       'abdijev', 'abdikacija', 'abdul', 'abe', 'abec', 'abeca', 'abece',
       'abeced', 'abeceda', 'abecedar', 'abecedario', 'abecedarium',
       'abecedarček', 'abeceden', 'abecednica', 'abecednik',
       'abecednikov', 'abecedno', 'abel', 'abeljen', 'abeljnov', 'abelka',
       'abelnov', 'abelov', 'aben', 'abesinija', 'abesinski', 'abet',
       'abeta', 'abeti', 'abeundi', 'abev', 'abfoluzion', 'abgesonderten',
       'abi', 'abies', 'abiit', 'abija', 'abijev', 'abil', 'abilejev',
       '

In [None]:
# First 500 entries of the vocabulary built with min_df=10
words_list10[:500]

array(['aa', 'aaa', 'aaaa', 'aai', 'aaj', 'aal', 'aar', 'aas', 'ab',
       'aba', 'abbe', 'abc', 'abe', 'abeceda', 'abecedar', 'abeceden',
       'abecednik', 'abecedno', 'abel', 'abeljen', 'abeljnov', 'abelnov',
       'abelov', 'aben', 'abesinski', 'abi', 'abisinski', 'abiturient',
       'abiturijent', 'ablativ', 'abnormalen', 'abo', 'abortus', 'abot',
       'abota', 'aboten', 'abotnež', 'abotnica', 'abotnik', 'abotno',
       'abotnost', 'abra', 'abraham', 'abrahamov', 'abs', 'absa',
       'absint', 'abso', 'absol', 'absolut', 'absoluten', 'absolutionem',
       'absolutist', 'absolutističen', 'absolutizem', 'absolutno',
       'absolutnost', 'absolutorij', 'absolvent', 'absolviran',
       'absolvirati', 'absorbirati', 'abstinenca', 'abstinent',
       'abstrahirati', 'abstrakcija', 'abstrakten', 'abstraktno',
       'abstraktnost', 'absurden', 'absurdnost', 'absurdum', 'ac',
       'acaulis', 'acclamationem', 'ace', 'acer', 'ach', 'aci', 'act',
       'acta', 'actus', 'ad', 'a

In [None]:
# First 500 entries of the vocabulary built with min_df=1, i.e. without frequency filtering
words_list[:500]

array(['aa', 'aaa', 'aaaa', 'aaaaa', 'aaaaaa', 'aaaaaaaa',
       'aaaaaaaaaaaaaa', 'aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaan',
       'aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaenea',
       'aaaaaaaaaaaaaaaaaaaaaaaaaaaac', 'aaaaaaazazazazazazvvavev',
       'aaaaaifj', 'aaaaatli', 'aaaac', 'aaaagng', 'aaaat', 'aaaaš',
       'aaac', 'aaae', 'aaaeco', 'aaaeut', 'aaaftgd', 'aaah',
       'aaahaaažihažhahahaažihaaaaaažažiaš', 'aaahati', 'aaahe', 'aaahhk',
       'aaaimmaja', 'aaaioa', 'aaaiti', 'aaaj', 'aaajati', 'aaajnt',
       'aaak', 'aaaka', 'aaakactp', 'aaako', 'aaalall', 'aaament',
       'aaanali', 'aaaniti', 'aaanta', 'aaao', 'aaaofjen', 'aaaoo',
       'aaaoonaao', 'aaaov', 'aaap', 'aaarfk', 'aaaruc', 'aaasa', 'aaatn',
       'aaatp', 'aaatra', 'aaatral', 'aaatri', 'aaatrlal', 'aaav', 'aaaz',
       'aaazaj', 'aaazz', 'aaačaez', 'aab', 'aabbcc', 'aaben', 'aabi',
       'aabuhnjen', 'aac', 'aace', 'aacelec', 'aacfe', 'aachen',
       'aachenski', 'aacjts', 'aacl', 'aacp', 'aacpakegtep', 'aacr

# Check the words in words_list with CLASSLA

In [None]:
!pip install classla

In [None]:
import os
import re
import glob
import time
from tqdm import tqdm
import classla

# Fetch the CLASSLA models for 'sl', then build the pipeline used by
# is_valid_lemma with the tokenize, lemma and pos processors.
classla.download('sl')
nlp = classla.Pipeline('sl', processors='tokenize,lemma,pos')

In [None]:
# Hand-copied sample of the filtered vocabulary: it matches the beginning of the
# words_list10[:500] output shown earlier in the notebook.
wordss = ['aa', 'aaa', 'aaaa', 'aai', 'aaj', 'aal', 'aar', 'aas', 'ab',
       'aba', 'abbe', 'abc', 'abe', 'abeceda', 'abecedar', 'abeceden',
       'abecednik', 'abecedno', 'abel', 'abeljen', 'abeljnov', 'abelnov',
       'abelov', 'aben', 'abesinski', 'abi', 'abisinski', 'abiturient',
       'abiturijent', 'ablativ', 'abnormalen', 'abo', 'abortus', 'abot',
       'abota', 'aboten', 'abotnež', 'abotnica', 'abotnik', 'abotno',
       'abotnost', 'abra', 'abraham', 'abrahamov', 'abs', 'absa',
       'absint', 'abso', 'absol', 'absolut', 'absoluten', 'absolutionem',
       'absolutist', 'absolutističen', 'absolutizem', 'absolutno',
       'absolutnost', 'absolutorij', 'absolvent', 'absolviran',
       'absolvirati', 'absorbirati', 'abstinenca', 'abstinent',
       'abstrahirati', 'abstrakcija', 'abstrakten', 'abstraktno',
       'abstraktnost', 'absurden', 'absurdnost', 'absurdum', 'ac',
       'acaulis', 'acclamationem', 'ace', 'acer', 'ach', 'aci', 'act',
       'acta', 'actus', 'ad', 'ada', 'adaj', 'adam']

In [None]:
def is_valid_lemma(word):
    """Heuristically decide whether CLASSLA recognises `word`.

    The word is run through the module-level `nlp` pipeline and only the first
    word of the first token of the first sentence is inspected:

    * if its lemma differs from the input (case-insensitively), the word is
      accepted, because the lemmatizer mapped it to another form;
    * if the lemma equals the input, the word is accepted only when its UPOS
      tag is not X, SYM or PUNCT and its features contain neither
      'Foreign=Yes' nor 'Typo=Yes'.

    Returns False for empty / non-string input, for an analysis that contains
    no token, and for a token without a lemma.

    Errors raised by the pipeline itself are NOT swallowed: the previous bare
    `except:` turned every failure (e.g. a missing pipeline, or a TypeError on
    a missing feature string) into False, which made all words look invalid.
    """
    if not isinstance(word, str) or not word.strip():
        return False

    doc = nlp(word)
    try:
        token = doc.sentences[0].tokens[0].words[0]
    except IndexError:
        # Nothing was tokenized, so there is nothing to validate.
        return False

    lemma = token.lemma
    if lemma is None:
        return False

    # A lemma that differs from the input counts as "known".
    if lemma.lower() != word.lower():
        return True

    # When lemma matches input, verify CLASSLA actually knows it.
    # NOTE(review): feats is presumably None when the tagger assigns no
    # features; treat that as "no features" instead of failing on `in None`.
    feats = token.feats or ''
    return (
        token.upos not in ('X', 'SYM', 'PUNCT') and
        'Foreign=Yes' not in feats and
        'Typo=Yes' not in feats
    )

In [None]:
# Print each sample word next to the verdict of is_valid_lemma.
# NOTE(review): the output recorded below is False for every word, including
# ordinary ones such as 'abeceda' — presumably an error was being swallowed
# inside is_valid_lemma on that run; re-run and verify.
for word in wordss:
  print(word, is_valid_lemma(word))

aa False
aaa False
aaaa False
aai False
aaj False
aal False
aar False
aas False
ab False
aba False
abbe False
abc False
abe False
abeceda False
abecedar False
abeceden False
abecednik False
abecedno False
abel False
abeljen False
abeljnov False
abelnov False
abelov False
aben False
abesinski False
abi False
abisinski False
abiturient False
abiturijent False
ablativ False
abnormalen False
abo False
abortus False
abot False
abota False
aboten False
abotnež False
abotnica False
abotnik False
abotno False
abotnost False
abra False
abraham False
abrahamov False
abs False
absa False
absint False
abso False
absol False
absolut False
absoluten False
absolutionem False
absolutist False
absolutističen False
absolutizem False
absolutno False
absolutnost False
absolutorij False
absolvent False
absolviran False
absolvirati False
absorbirati False
abstinenca False
abstinent False
abstrahirati False
abstrakcija False
abstrakten False
abstraktno False
abstraktnost False
absurden False
absurdnost False

# SVD

In [None]:
W5.shape

(4323, 155250)

=> k must be strictly smaller than 4323, the smaller dimension of W5 (`svds` requires k < min(W.shape))

In [None]:
from scipy.sparse.linalg import svds
import numpy as np

In [None]:
def apply_svd(W, k, output_folder):
  '''
  Compute a rank-k truncated SVD of W, save the factors to disk and return
  the word vectors (rows of V scaled by the singular values).

  W - matrix texts x words
  k - the rank of the SVD, must be less than any dimension of W
  output_folder - folder for the saved .npy files; it is created if missing

  Files written, all prefixed with k:
    <k>_sigma_vt.npy - the returned array
    <k>_sigma.npy    - singular values, largest first
    <k>_u.npy        - left singular vectors, shape (texts, k)
    <k>_vt.npy       - right singular vectors, shape (k, words)

  Returns an array of shape (words, k) equal to (diag(sigma) @ vt).T
  '''
  import os  # local import: this cell should not rely on imports made in another section

  # Apply the SVD function
  u, sigma, vt = svds(W, k)

  # Reorder the factors so that the singular values are in descending order
  descending_order_of_inds = np.flip(np.argsort(sigma))
  u = u[:,descending_order_of_inds]
  vt = vt[descending_order_of_inds]
  sigma = sigma[descending_order_of_inds]

  # Check that sizes are ok
  assert sigma.shape == (k,)
  assert vt.shape == (k, W.shape[1])
  assert u.shape == (W.shape[0], k)

  # Scale column j of V by sigma[j]. Same values as (diag(sigma) @ vt).T but
  # computed once and without the dense k x k matrix product.
  sigma_vt = vt.T * sigma

  # Save all the matrices in folder (just in case)
  if output_folder:
    os.makedirs(output_folder, exist_ok=True)
  matrices_to_save = {
      '_sigma_vt.npy': sigma_vt,
      '_sigma.npy': sigma,
      '_u.npy': u,
      '_vt.npy': vt,
  }
  for suffix, matrix in matrices_to_save.items():
    with open(os.path.join(output_folder, str(k) + suffix), 'wb') as f:
      np.save(f, matrix)
  return sigma_vt

In [None]:
# Rank-1024 SVD of the min_df=5 matrix; the factor files are written to /content.
vv1024 = apply_svd(W5, 1024, '/content')

In [None]:
vv1024.shape

(155250, 1024)

In [None]:
# k is set equal to the word2vec (w2v) embedding dimension
# (presumably so the SVD vectors can be compared with w2v ones — TODO confirm).
vv100 = apply_svd(W5, 100, '/content')

In [None]:
vv100.shape

(155250, 100)

In [None]:
!ls

1024_sigma.npy	   filtered_slovenian_corpus.txt
1024_sigma_vt.npy  sample_data
1024_u.npy	   slovenian_lit_SVD_dictionary.npy
1024_vt.npy


In [None]:
def create_dictionary(words_list, vv, output_file):
  '''
  Map every word to its vector, save the mapping and return it.

  words_list - iterable of words; the i-th word belongs to the i-th row of vv
  vv - matrix words x k (e.g. the result of apply_svd)
  output_file - path passed to np.save

  The dict is stored by np.save as a pickled object array, so it has to be
  read back with np.load(output_file, allow_pickle=True).item()

  Raises ValueError if the number of words differs from the number of rows:
  zip would otherwise silently truncate and hide a vocabulary / matrix mismatch.
  '''
  words = list(words_list)
  vectors = list(vv)
  if len(words) != len(vectors):
    raise ValueError(
        'words_list has %d entries but vv has %d rows' % (len(words), len(vectors)))
  dictionary = dict(zip(words, vectors))
  np.save(output_file, dictionary)
  return dictionary

In [None]:
# Pair the min_df=5 vocabulary with the rank-1024 vectors and save the mapping.
dictionary1024 = create_dictionary(words_list5, vv1024, 'slovenian_lit_SVD_1024_dictionary.npy')

In [None]:
# Pair the min_df=5 vocabulary with the rank-100 vectors and save the mapping.
dictionary100 = create_dictionary(words_list5, vv100, 'slovenian_lit_SVD_100_dictionary.npy')

In [None]:
# Preview: every 100th entry among the first 1000 words of the rank-1024 dictionary.
dict(list(dictionary1024.items())[:1000:100])

{'aa': array([ 0.01166676,  0.01487115, -0.00332046, ...,  0.00208464,
         0.00149882, -0.00070132]),
 'absolutizem': array([0.00517345, 0.00199884, 0.0030556 , ..., 0.00014632, 0.00062643,
        0.00032747]),
 'adieu': array([ 6.61194470e-05, -4.77929765e-05,  4.10827639e-05, ...,
        -8.53863839e-06, -1.47809404e-05,  1.14821625e-05]),
 'aee': array([ 2.77370156e-04,  1.96003210e-04, -1.30709146e-04, ...,
         6.59237971e-06,  1.18746917e-08,  1.64078674e-04]),
 'aglo': array([ 1.39387699e-03, -9.55595431e-04,  7.08297496e-04, ...,
        -7.18160492e-05, -1.53550357e-04,  6.32690943e-04]),
 'ajdati': array([ 1.96954923e-04,  1.84797020e-04, -1.91813020e-04, ...,
         1.00163002e-04,  6.93518387e-05,  1.22636021e-04]),
 'akopra': array([ 3.83434687e-04, -8.05688841e-05, -4.57984345e-05, ...,
         2.02577669e-04, -3.42835397e-05,  1.54111371e-05]),
 'albertinski': array([ 4.42174206e-04,  9.49407197e-04,  5.62833532e-04, ...,
         1.99680786e-04, -8.6469431

In [None]:
# Preview: every 100th entry among the first 1000 words of the rank-100 dictionary.
dict(list(dictionary100.items())[:1000:100])

{'aa': array([ 1.16667608e-02,  1.48711484e-02,  3.32046489e-03,  2.79877055e-04,
         4.27913941e-03, -1.71556131e-03,  3.35493795e-03,  3.10396850e-03,
         5.08858687e-04,  2.65847288e-03, -5.66621687e-03,  2.81115235e-03,
         1.34865197e-03,  7.83445898e-04, -2.51663374e-03, -1.92776228e-03,
         2.17892158e-03, -1.37528134e-03,  1.31366435e-03,  6.58399842e-04,
         4.24229503e-03,  5.38501427e-04, -5.78482655e-04, -1.12400276e-04,
         6.99674876e-04, -6.13379230e-04,  1.31217493e-03, -1.08123126e-03,
         1.69162255e-03,  9.62268565e-04, -8.22852554e-04, -4.65228361e-03,
        -1.92766555e-03,  9.99238869e-04,  2.27307411e-03, -9.09182361e-05,
        -4.85215896e-03,  6.22573153e-04, -8.54713720e-04, -3.72407317e-03,
         1.86805032e-03,  3.15073410e-04,  6.95349873e-04,  3.06849007e-03,
        -1.72103565e-03,  1.48549820e-03,  6.29087408e-04,  7.13767009e-04,
        -1.08279588e-03,  2.44485587e-03,  1.58516115e-03, -1.74326678e-04,
      