In [1]:
import tqdm
import json
import os
import scipy
import nltk
import pickle
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.corpus import stopwords

In [2]:
# ---------------------------------------------------------------------------
# Pass 1: collect the title of every document that survives the token filter
# and pickle the titles one batch (BATCH_SIZE input files) at a time.
# ---------------------------------------------------------------------------

# os.listdir returns entries in arbitrary order; sorting makes the
# file -> batch assignment reproducible across kernel restarts.
paths = sorted(os.listdir('data'))
BATCH_SIZE = 72  # number of input files (of any extension) per batch

# Create the output folder up front so a missing directory cannot abort the
# run after hours of tokenisation.
os.makedirs('./svd_matrix/titles', exist_ok=True)

# One list of titles per batch; the "+ 1" leaves room for the trailing
# partial batch.
titles = [[] for _ in range(len(paths) // BATCH_SIZE + 1)]
eng_stopwords = set(stopwords.words('english'))
eng_words = set(nltk.corpus.words.words())

small_it = 0  # position of the current file inside its batch
big_it = 0    # index of the batch currently being filled
for file in tqdm.tqdm(
    paths,
    desc='Loading data',
    unit='file'
    ):
    if file.endswith('.json'):
        # Only the parse needs the handle; release it before the slow
        # tokenisation loop.
        with open(os.path.join('data', file)) as f:
            data = json.load(f)
        for part in data:
            text = part['text'].lower()
            text = ''.join([c for c in text if c.isascii()])
            text = nltk.word_tokenize(text)
            text = [word for word in text if word not in eng_stopwords and word in eng_words]
            # Documents with fewer than 150 kept tokens are dropped.
            if len(text) < 150: continue
            titles[big_it].append(part['title'])

    # Flush when the batch is full OR when this is the very last file.
    # Without the second condition the trailing partial batch is collected
    # in memory but never written to disk.
    if small_it % BATCH_SIZE == BATCH_SIZE - 1 or file == paths[-1]:
        with open(f'./svd_matrix/titles/titles_{big_it}.pkl', 'wb') as f:
            pickle.dump(titles[big_it], f)
        small_it = 0
        big_it += 1
    else:
        small_it += 1

Loading data: 100%|██████████| 605/605 [3:56:44<00:00, 23.48s/file]  


In [3]:
# Report how many titles each batch holds, followed by the grand total.
batch_sizes = [len(batch_titles) for batch_titles in titles]
for big_it, n_titles in enumerate(batch_sizes):
    # Batches are reported 1-based.
    print(f'Total number of titles in batch {big_it + 1}:', n_titles)
print('Total number of titles in all batches:', sum(batch_sizes))

Total number of titles in batch 1: 243529
Total number of titles in batch 2: 241976
Total number of titles in batch 3: 242344
Total number of titles in batch 4: 238768
Total number of titles in batch 5: 244482
Total number of titles in batch 6: 218092
Total number of titles in batch 7: 226141
Total number of titles in batch 8: 229814
Total number of titles in batch 9: 92336
Total number of titles in all batches: 1977482


In [6]:
# ---------------------------------------------------------------------------
# Pass 2: for every batch of BATCH_SIZE input files, build a TF-IDF matrix
# over the filtered documents, take a truncated SVD of it, and save the
# factors plus the fitted vocabulary under ./svd_matrix/.
# ---------------------------------------------------------------------------

# Create the output folders up front so a missing directory cannot abort the
# run after hours of work.
for out_dir in ('v', 's', 'u', 'voc'):
    os.makedirs(f'./svd_matrix/{out_dir}', exist_ok=True)

mini_it = 0   # total number of documents kept so far
small_it = 0  # position of the current file inside its batch
big_it = 0    # index of the batch currently being accumulated
matrix_data = []  # one whitespace-joined token string per kept document
for file in tqdm.tqdm(
    paths,
    desc='Making tf-idf matrix',
    unit='file'
    ):
    if file.endswith('.json'):
        # Only the parse needs the handle; release it before tokenising.
        with open(os.path.join('data', file)) as f:
            data = json.load(f)
        for part in data:
            text = part['text'].lower()
            text = ''.join([c for c in text if c.isascii()])
            text = nltk.word_tokenize(text)
            text = [word for word in text if word not in eng_stopwords and word in eng_words]
            # Documents with fewer than 150 kept tokens are dropped.
            if len(text) < 150: continue
            matrix_data.append(' '.join(text))
            mini_it += 1

    # Flush when the batch is full or when this is the last file.
    if small_it % BATCH_SIZE == BATCH_SIZE - 1 or file == paths[-1]:
        if matrix_data:
            tfidf = TfidfVectorizer()
            tfidf_matrix = tfidf.fit_transform(matrix_data)
            # svds requires k < min(n_docs, n_terms); clamp so a small
            # batch degrades gracefully instead of raising.
            k = min(300, min(tfidf_matrix.shape) - 1)
        else:
            # fit_transform raises on an empty corpus, so skip it.
            k = 0

        if k >= 1:
            # The third value returned by svds is V^T (rows are singular
            # vectors), not V.
            u, s, v = scipy.sparse.linalg.svds(tfidf_matrix, k=k)
            np.save(f'./svd_matrix/v/v_{big_it}.npy', v)
            np.save(f'./svd_matrix/s/s_{big_it}.npy', s)
            np.save(f'./svd_matrix/u/u_{big_it}.npy', u)
            with open(f'./svd_matrix/voc/voc_{big_it}.pkl', 'wb') as f:
                pickle.dump(tfidf.vocabulary_, f)
        else:
            print(f'Batch {big_it}: too few documents for an SVD, nothing saved')

        # Advance the batch index even when nothing was saved so the batch
        # numbering stays tied to the position of the files in `paths`.
        small_it = 0
        big_it += 1
        matrix_data = []
    else:
        small_it += 1

Making tf-idf matrix: 100%|██████████| 605/605 [4:32:58<00:00, 27.07s/file]    
