In [1]:
from nltk import WordPunctTokenizer
from tqdm import tqdm
import numpy as np
import pickle as pkl
import collections

In [2]:
tokenizer = WordPunctTokenizer()

In [3]:
def acquire_lemmas(filename):
    with open(filename+'.txt','r',encoding='utf-8') as l:
        lemmas = l.read().split('\n')
    l.close()
    tokens = tokenizer.tokenize_sents(lemmas)
    return tokens

In [4]:
train_lemmas = acquire_lemmas('Train_lemmas')

In [5]:
with open('Train_labels.txt', 'r', encoding='utf-8') as i:
    train_labels = i.read().split('\n')
i.close()

In [6]:
def return_last(listData, x):
    return len(listData)-listData[::-1].index(x)-1

In [7]:
def texts(label):
    return train_lemmas[train_labels.index(label) : return_last(train_labels, label) + 1]

In [8]:
anime = texts('Anime')

In [9]:
art = texts('Art')

In [10]:
books = texts('Books')

In [11]:
films = texts('Films')

In [12]:
food = texts('Food')

In [13]:
football = texts('Football')

In [14]:
games = texts('Games')

In [15]:
music = texts('Music')

In [16]:
nature = texts('Nature')

In [17]:
travel = texts('Travel')

In [18]:
def topdocuments(corpus):
    vocabulary = list(set(sum(corpus, [])))
    corpus = [set(t) for t in corpus]
    counts = {}
    for v in vocabulary:
        counts[v] = 0
    for v in tqdm(vocabulary):
        for c in corpus:
            if v in c:
                counts[v] += 1
    counts = {key:val for key, val in counts.items()}
    return counts

In [19]:
animects = topdocuments(anime)

100%|██████████████████████████████████| 16822/16822 [00:03<00:00, 4776.92it/s]


In [20]:
artcts = topdocuments(art)

100%|█████████████████████████████████| 11460/11460 [00:00<00:00, 15680.69it/s]


In [21]:
bookscts = topdocuments(books)

100%|███████████████████████████████████| 39987/39987 [00:44<00:00, 902.97it/s]


In [22]:
filmscts = topdocuments(films)

100%|██████████████████████████████████| 21919/21919 [00:11<00:00, 1978.25it/s]


In [23]:
foodcts = topdocuments(food)

100%|██████████████████████████████████| 23543/23543 [00:12<00:00, 1938.02it/s]


In [24]:
footballcts = topdocuments(football)

100%|███████████████████████████████████| 41982/41982 [02:40<00:00, 260.85it/s]


In [25]:
gamescts = topdocuments(games)

100%|███████████████████████████████████| 57836/57836 [03:37<00:00, 266.17it/s]


In [26]:
musiccts = topdocuments(music)

100%|██████████████████████████████████| 29103/29103 [00:23<00:00, 1263.86it/s]


In [28]:
naturects = topdocuments(nature)

100%|███████████████████████████████████| 8430/8430 [00:00<00:00, 20101.13it/s]


In [27]:
travelcts = topdocuments(travel)

100%|██████████████████████████████████| 16213/16213 [00:01<00:00, 9121.29it/s]


In [29]:
all = anime + art + books + films + food + football + games + music + nature + travel

In [31]:
allunnest = list(set(sum(all, [])))

In [32]:
animeset = set(sum(anime, []))

In [33]:
artset = set(sum(art, []))

In [34]:
booksset = set(sum(books, []))

In [35]:
filmsset = set(sum(films, []))

In [36]:
foodset = set(sum(food, []))

In [38]:
footballset = set(sum(football, []))

In [37]:
gamesset = set(sum(games, []))

In [40]:
musicset = set(sum(music, []))

In [39]:
natureset = set(sum(nature, []))

In [41]:
travelset = set(sum(travel, []))

In [42]:
allset = list(set(allunnest))

In [43]:
matrix_absolute_nonsmooth = {}

In [44]:
for i in tqdm(allset):
    counts = {}
    try:
        counts['anime'] = animects[i]
    except KeyError:
        counts['anime'] = 0
    try:
        counts['art'] = artcts[i]
    except KeyError:
        counts['art'] = 0
    try:
        counts['books'] = bookscts[i]
    except KeyError:
        counts['books'] = 0
    try:
        counts['films'] = filmscts[i]
    except KeyError:
        counts['films'] = 0
    try:
        counts['food'] = foodcts[i]
    except KeyError:
        counts['food'] = 0
    try:
        counts['football'] = footballcts[i]
    except KeyError:
        counts['football'] = 0
    try:
        counts['games'] = gamescts[i]
    except KeyError:
        counts['games'] = 0
    try:
        counts['music'] = musiccts[i]
    except KeyError:
        counts['music'] = 0
    try:
        counts['nature'] = naturects[i]
    except KeyError:
        counts['nature'] = 0
    try:
        counts['travel'] = travelcts[i]
    except KeyError:
        counts['travel'] = 0
    matrix_absolute_nonsmooth[i] = counts

100%|██████████████████████████████| 121101/121101 [00:00<00:00, 144507.32it/s]


In [45]:
matrix_absolute_nonsmooth['аниме']

{'anime': 1051,
 'art': 5,
 'books': 4,
 'films': 11,
 'food': 0,
 'football': 0,
 'games': 39,
 'music': 4,
 'nature': 0,
 'travel': 1}

In [46]:
matrix_absolute_add2 = {key:{k:(v+2) for k, v in val.items()} for key, val in matrix_absolute_nonsmooth.items()}

In [47]:
matrix_absolute_add2['аниме']

{'anime': 1053,
 'art': 7,
 'books': 6,
 'films': 13,
 'food': 2,
 'football': 2,
 'games': 41,
 'music': 6,
 'nature': 2,
 'travel': 3}

In [48]:
with open('Matrix Absolute Nonsmooth.pkl', 'wb') as m1:
    pkl.dump(matrix_absolute_nonsmooth, m1)
m1.close()

In [49]:
with open('Matrix Absolute Add2.pkl', 'wb') as m2:
    pkl.dump(matrix_absolute_add2, m2)
m2.close()

In [50]:
matrix_relative_nonsmooth = {}

In [51]:
for i in tqdm(allset):
    counts = {}
    
    try:
        c1 = animects[i]
    except KeyError:
        c1 = 0
    counts['anime'] = c1 / len(anime)
    
    try:
        c2 = artcts[i]
    except KeyError:
        c2 = 0
    counts['art'] = c2 / len(art)
    
    try:
        c3 = bookscts[i]
    except KeyError:
        c3 = 0
    counts['books'] = c3 / len(books)
    
    try:
        c4 = filmscts[i]
    except KeyError:
        c4 = 0
    counts['films'] = c4 / len(films)
    
    try:
        c5 = foodcts[i]
    except KeyError:
        c5 = 0
    counts['food'] = c5 / len(food)
    
    try:
        c6 = footballcts[i]
    except KeyError:
        c6 = 0
    counts['football'] = c6 / len(football)    
        
    try:
        c7 = gamescts[i]
    except KeyError:
        c7 = 0
    counts['games'] = c7 / len(games)   
        
    try:
        c8= musiccts[i]
    except KeyError:
        c8 = 0
    counts['music'] = c8 / len(music)    
        
    try:
        c9 = naturects[i]
    except KeyError:
        c9 = 0
    counts['nature'] = c9 / len(nature)
        
    try:
        c10 = travelcts[i]
    except KeyError:
        c10 = 0
    counts['travel'] = c10 / len(travel)    
        
    matrix_relative_nonsmooth[i] = counts

100%|██████████████████████████████| 121101/121101 [00:01<00:00, 113374.65it/s]


In [52]:
matrix_relative_nonsmooth['аниме']

{'anime': 0.34905347060777153,
 'art': 0.0051387461459403904,
 'books': 0.0004088725339875294,
 'films': 0.0019618334225075798,
 'food': 0.0,
 'football': 0.0,
 'games': 0.0013293792821351877,
 'music': 0.0005157962604771116,
 'nature': 0.0,
 'travel': 0.0005841121495327102}

In [53]:
with open('Matrix Relative Nonsmooth.pkl', 'wb') as m3:
    pkl.dump(matrix_relative_nonsmooth, m3)
m3.close()

In [54]:
matrix_relative_add2 = {}

In [55]:
for i in tqdm(allset):
    counts = {}
    
    try:
        c1 = animects[i] + 2
    except KeyError:
        c1 = 2
    counts['anime'] = c1 / len(anime)
    
    try:
        c2 = artcts[i] + 2
    except KeyError:
        c2 = 2
    counts['art'] = c2 / len(art)
    
    try:
        c3 = bookscts[i] + 2
    except KeyError:
        c3 = 2
    counts['books'] = c3 / len(books)
    
    try:
        c4 = filmscts[i] + 2
    except KeyError:
        c4 = 2
    counts['films'] = c4 / len(films)
    
    try:
        c5 = foodcts[i] + 2
    except KeyError:
        c5 = 2
    counts['food'] = c5 / len(food)
    
    try:
        c6 = footballcts[i] + 2
    except KeyError:
        c6 = 2
    counts['football'] = c6 / len(football)    
        
    try:
        c7 = gamescts[i] + 2
    except KeyError:
        c7 = 2
    counts['games'] = c7 / len(games)   
        
    try:
        c8= musiccts[i] + 2
    except KeyError:
        c8 = 2
    counts['music'] = c8 / len(music)    
        
    try:
        c9 = naturects[i] + 2
    except KeyError:
        c9 = 2
    counts['nature'] = c9 / len(nature)
        
    try:
        c10 = travelcts[i] + 2
    except KeyError:
        c10 = 2
    counts['travel'] = c10 / len(travel)    
        
    matrix_relative_add2[i] = counts

100%|██████████████████████████████| 121101/121101 [00:01<00:00, 113398.97it/s]


In [56]:
matrix_relative_add2['аниме']

{'anime': 0.34971770176021255,
 'art': 0.007194244604316547,
 'books': 0.0006133088009812941,
 'films': 0.002318530408418049,
 'food': 0.0003531696980399082,
 'football': 8.686587908269631e-05,
 'games': 0.0013975525786549408,
 'music': 0.0007736943907156673,
 'nature': 0.00234192037470726,
 'travel': 0.0017523364485981308}

In [57]:
with open('Matrix Relative Add2.pkl', 'wb') as m4:
    pkl.dump(matrix_relative_add2, m4)
m4.close()

In [58]:
len(matrix_absolute_add2)

121101

In [59]:
for k, val in matrix_relative_add2.items():
    if max(list(val.values())) < 0.01: #слова которые встречаются меньше чем в 1% текстов
        del matrix_absolute_add2[k]

In [60]:
len(matrix_absolute_add2)

2408

In [61]:
with open('Matrix Absolute Add2 Lessthan0.01.pkl', 'wb') as m5:
    pkl.dump(matrix_absolute_add2, m5)
m5.close()