In [1]:
import json
import os
import pickle
from time import sleep

import nltk
import numpy as np
import scipy
import scipy.sparse.linalg
import tqdm
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer

In [2]:
# Stage 1: collect the title of every article that survives the text filter,
# grouped into batches of BATCH_SIZE input files, and pickle each full batch.
#
# sorted() pins the file order: os.listdir returns names in arbitrary order and
# the contents of each batch depend on that order.
paths = sorted(os.listdir('data'))
BATCH_SIZE = 72
TITLES_DIR = './svd_matrix/titles'

# One title list per batch; the extra slot receives the trailing partial batch.
titles = [[] for _ in range(len(paths) // BATCH_SIZE + 1)]
eng_stopwords = set(stopwords.words('english'))
eng_words = set(nltk.corpus.words.words())

# Create the output directory up front so the first flush cannot fail on a
# missing directory after a whole batch has already been tokenized.
os.makedirs(TITLES_DIR, exist_ok=True)

small_it = 0  # files seen so far in the current batch (JSON or not)
big_it = 0    # index of the batch currently being filled
for file in tqdm.tqdm(paths, desc='Loading data', unit='file'):
    if file.endswith('.json'):
        # JSON is read as UTF-8 explicitly instead of the platform default.
        with open(os.path.join('data', file), encoding='utf-8') as f:
            data = json.load(f)
        for part in data:
            text = part['text'].lower()
            text = ''.join(c for c in text if c.isascii())
            text = nltk.word_tokenize(text)
            text = [word for word in text if word not in eng_stopwords and word in eng_words]
            # Keep only articles with at least 200 tokens left after filtering.
            if len(text) < 200:
                continue
            titles[big_it].append(part['title'])

    small_it += 1
    if small_it == BATCH_SIZE:
        # A full batch of files has been read: persist its titles.
        with open(f'{TITLES_DIR}/titles_{big_it:03d}.pkl', 'wb') as f:
            pickle.dump(titles[big_it], f)
        small_it = 0
        big_it += 1

# NOTE(review): titles of the trailing partial batch stay in memory only; a
# batch is pickled only once BATCH_SIZE files have been seen. Confirm whether
# the tail should be written as well.

Loading data: 100%|███████████████████████████████████████████████| 605/605 [5:02:47<00:00, 30.03s/file]


In [4]:
# Report the number of collected titles per batch (batches are numbered from
# 001 in the output), followed by the grand total over all batches.
for big_it, batch_titles in enumerate(titles):
    print(f'Total number of titles in batch {big_it + 1:03d}:', len(batch_titles))
print('Total number of titles in all batches:', sum(map(len, titles)))

Total number of titles in batch 001: 188537
Total number of titles in batch 002: 185658
Total number of titles in batch 003: 184584
Total number of titles in batch 004: 184896
Total number of titles in batch 005: 185757
Total number of titles in batch 006: 165758
Total number of titles in batch 007: 175535
Total number of titles in batch 008: 174688
Total number of titles in batch 009: 69965
Total number of titles in all batches: 1515378


In [None]:
# Stage 2: for every full batch of BATCH_SIZE input files, build a TF-IDF matrix
# over the filtered article texts and store its truncated SVD (u, s, v) together
# with the fitted vocabulary.
#
# The text filter and the iteration order over `paths` are the same as in the
# title-collection cell, so documents are appended here in the same order in
# which their titles were appended there.
N_COMPONENTS = 100  # number of singular triplets requested from svds per batch
SVD_DIR = './svd_matrix'
# Only complete batches are factorized; once the last full batch is written the
# loop stops, so files after it are never read.
N_FULL_BATCHES = len(paths) // BATCH_SIZE

# Create every output directory before the long-running loop starts.
for subdir in ('u', 's', 'v', 'voc'):
    os.makedirs(f'{SVD_DIR}/{subdir}', exist_ok=True)

mini_it = 0   # documents kept so far, across all batches
small_it = 0  # files seen so far in the current batch (JSON or not)
big_it = 0    # index of the batch currently being filled
matrix_data = []
for file in tqdm.tqdm(paths, desc='Making tf-idf matrix', unit='file'):
    if file.endswith('.json'):
        # JSON is read as UTF-8 explicitly instead of the platform default.
        with open(os.path.join('data', file), encoding='utf-8') as f:
            data = json.load(f)
        for part in data:
            text = part['text'].lower()
            text = ''.join(c for c in text if c.isascii())
            text = nltk.word_tokenize(text)
            text = [word for word in text if word not in eng_stopwords and word in eng_words]

            # Keep only articles with at least 200 tokens left after filtering.
            if len(text) < 200:
                continue

            # TfidfVectorizer expects one string per document.
            matrix_data.append(' '.join(text))
            mini_it += 1

    small_it += 1
    if small_it == BATCH_SIZE:
        batch_tag = f'{big_it:03d}'
        tfidf = TfidfVectorizer()
        tfidf_matrix = tfidf.fit_transform(matrix_data)
        u, s, v = scipy.sparse.linalg.svds(tfidf_matrix, k=N_COMPONENTS)
        np.save(f'{SVD_DIR}/v/v_{batch_tag}.npy', v)
        np.save(f'{SVD_DIR}/s/s_{batch_tag}.npy', s)
        np.save(f'{SVD_DIR}/u/u_{batch_tag}.npy', u)
        # The vocabulary maps each term to its column index in tfidf_matrix.
        with open(f'{SVD_DIR}/voc/voc_{batch_tag}.pkl', 'wb') as f:
            pickle.dump(tfidf.vocabulary_, f)
        small_it = 0
        big_it += 1
        matrix_data = []
        if big_it == N_FULL_BATCHES:
            break

Making tf-idf matrix:   0%|          | 2/605 [01:26<7:14:05, 43.19s/file]


In [26]:
# Delete every .npy and .pkl file found anywhere under ./svd_matrix.
# Directories and files with other extensions are left untouched.
stale_files = [
    os.path.join(dirpath, filename)
    for dirpath, _subdirs, filenames in os.walk('./svd_matrix')
    for filename in filenames
    if filename.endswith(('.npy', '.pkl'))
]
for stale_file in stale_files:
    os.remove(stale_file)

In [30]:
# Tag every file under svd_matrix with a 'test_' prefix. Names that already
# carry the prefix are skipped, so re-running the cell does not stack prefixes
# (test_test_...).
PREFIX = 'test_'
for root, dirs, files in os.walk('svd_matrix'):
    for file in files:
        if file.startswith(PREFIX):
            continue
        os.rename(os.path.join(root, file), os.path.join(root, PREFIX + file))