In [2]:
from tqdm import tqdm
import numpy as np
import pickle as pkl
import collections

In [3]:
import feat_extr

In [4]:
def acquire_trigrams(filename):
    with open(filename+'.txt','r',encoding='utf-8') as l:
        sentences = l.read().split('\n')
    l.close()
    trigrams = [list(feat_extr.get_char_trigrams(s).keys()) for s in tqdm(sentences)]
    return(trigrams)

In [5]:
tris = acquire_trigrams('Train_lemmas')

100%|██████████████████████████████████| 87719/87719 [00:14<00:00, 5956.31it/s]


In [13]:
with open('Train_labels.txt', 'r', encoding='utf-8') as i:
    train_labels = i.read().split('\n')
i.close()

In [14]:
def return_last(listData, x):
    return len(listData)-listData[::-1].index(x)-1

In [16]:
def texts(label):
    return tris[train_labels.index(label) : return_last(train_labels, label) + 1]

In [18]:
anime = texts('Anime')
art = texts('Art')
books = texts('Books')
films = texts('Films')
food = texts('Food')
football = texts('Football')
games = texts('Games')
music = texts('Music')
nature = texts('Nature')
travel = texts('Travel')

In [19]:
def topdocuments(corpus):
    vocabulary = list(set(sum(corpus, [])))
    corpus = [set(t) for t in corpus]
    counts = {}
    for v in vocabulary:
        counts[v] = 0
    for v in tqdm(vocabulary):
        for c in corpus:
            if v in c:
                counts[v] += 1
    counts = {key:val for key, val in counts.items()}
    return counts

In [20]:
animects = topdocuments(anime)

100%|████████████████████████████████████| 8688/8688 [00:02<00:00, 3301.96it/s]


In [21]:
artcts = topdocuments(art)

100%|███████████████████████████████████| 7476/7476 [00:00<00:00, 10172.44it/s]


In [22]:
bookscts = topdocuments(books)

100%|███████████████████████████████████| 11650/11650 [00:19<00:00, 590.23it/s]


In [23]:
filmscts = topdocuments(films)

100%|████████████████████████████████████| 9466/9466 [00:08<00:00, 1086.80it/s]


In [24]:
foodcts = topdocuments(food)

100%|████████████████████████████████████| 9258/9258 [00:08<00:00, 1047.32it/s]


In [25]:
footballcts = topdocuments(football)

100%|███████████████████████████████████| 12315/12315 [00:46<00:00, 267.54it/s]


In [26]:
gamescts = topdocuments(games)

100%|███████████████████████████████████| 13890/13890 [01:05<00:00, 211.81it/s]


In [27]:
musiccts = topdocuments(music)

100%|███████████████████████████████████| 10849/10849 [00:11<00:00, 968.28it/s]


In [28]:
naturects = topdocuments(nature)

100%|███████████████████████████████████| 6417/6417 [00:00<00:00, 12387.43it/s]


In [29]:
travelcts = topdocuments(travel)

100%|████████████████████████████████████| 8417/8417 [00:01<00:00, 5831.50it/s]


In [30]:
all = anime + art + books + films + food + football + games + music + nature + travel

In [31]:
allunnest = list(set(sum(all, [])))

In [32]:
animeset = set(sum(anime, []))
artset = set(sum(art, []))
booksset = set(sum(books, []))
filmsset = set(sum(films, []))
foodset = set(sum(food, []))
footballset = set(sum(football, []))
gamesset = set(sum(games, []))
musicset = set(sum(music, []))
natureset = set(sum(nature, []))
travelset = set(sum(travel, []))
allset = list(set(allunnest))

In [65]:
matrix_absolute_nonsmooth = {}

In [66]:
for i in tqdm(allset):
    counts = {}
    try:
        counts['anime'] = animects[i]
    except KeyError:
        counts['anime'] = 0
    try:
        counts['art'] = artcts[i]
    except KeyError:
        counts['art'] = 0
    try:
        counts['books'] = bookscts[i]
    except KeyError:
        counts['books'] = 0
    try:
        counts['films'] = filmscts[i]
    except KeyError:
        counts['films'] = 0
    try:
        counts['food'] = foodcts[i]
    except KeyError:
        counts['food'] = 0
    try:
        counts['football'] = footballcts[i]
    except KeyError:
        counts['football'] = 0
    try:
        counts['games'] = gamescts[i]
    except KeyError:
        counts['games'] = 0
    try:
        counts['music'] = musiccts[i]
    except KeyError:
        counts['music'] = 0
    try:
        counts['nature'] = naturects[i]
    except KeyError:
        counts['nature'] = 0
    try:
        counts['travel'] = travelcts[i]
    except KeyError:
        counts['travel'] = 0
    matrix_absolute_nonsmooth[i] = counts

100%|████████████████████████████████| 18017/18017 [00:00<00:00, 138494.40it/s]


In [67]:
matrix_absolute_nonsmooth[' не']

{'anime': 1083,
 'art': 452,
 'books': 4689,
 'films': 2038,
 'food': 2223,
 'football': 7855,
 'games': 10982,
 'music': 2981,
 'nature': 338,
 'travel': 781}

In [68]:
matrix_absolute_add2 = {key:{k:(v+2) for k, v in val.items()} for key, val in matrix_absolute_nonsmooth.items()}

In [69]:
matrix_absolute_nonsmooth[' не']

{'anime': 1083,
 'art': 452,
 'books': 4689,
 'films': 2038,
 'food': 2223,
 'football': 7855,
 'games': 10982,
 'music': 2981,
 'nature': 338,
 'travel': 781}

In [70]:
with open('Tris Absolute Add2.pkl', 'wb') as m2:
    pkl.dump(matrix_absolute_add2, m2)
m2.close()

In [71]:
matrix_relative_nonsmooth = {}

In [72]:
for i in tqdm(allset):
    counts = {}
    
    try:
        c1 = animects[i]
    except KeyError:
        c1 = 0
    counts['anime'] = c1 / len(anime)
    
    try:
        c2 = artcts[i]
    except KeyError:
        c2 = 0
    counts['art'] = c2 / len(art)
    
    try:
        c3 = bookscts[i]
    except KeyError:
        c3 = 0
    counts['books'] = c3 / len(books)
    
    try:
        c4 = filmscts[i]
    except KeyError:
        c4 = 0
    counts['films'] = c4 / len(films)
    
    try:
        c5 = foodcts[i]
    except KeyError:
        c5 = 0
    counts['food'] = c5 / len(food)
    
    try:
        c6 = footballcts[i]
    except KeyError:
        c6 = 0
    counts['football'] = c6 / len(football)    
        
    try:
        c7 = gamescts[i]
    except KeyError:
        c7 = 0
    counts['games'] = c7 / len(games)   
        
    try:
        c8= musiccts[i]
    except KeyError:
        c8 = 0
    counts['music'] = c8 / len(music)    
        
    try:
        c9 = naturects[i]
    except KeyError:
        c9 = 0
    counts['nature'] = c9 / len(nature)
        
    try:
        c10 = travelcts[i]
    except KeyError:
        c10 = 0
    counts['travel'] = c10 / len(travel)    
        
    matrix_relative_nonsmooth[i] = counts

100%|████████████████████████████████| 18017/18017 [00:00<00:00, 116109.07it/s]


In [73]:
matrix_relative_nonsmooth[' не']

{'anime': 0.3596811690468283,
 'art': 0.4645426515930113,
 'books': 0.47930082796688134,
 'films': 0.36347422864276796,
 'food': 0.3925481193713579,
 'football': 0.3411657400972898,
 'games': 0.3743395711899649,
 'music': 0.3843971631205674,
 'nature': 0.3957845433255269,
 'travel': 0.45619158878504673}

In [74]:
matrix_relative_add2 = {}

In [75]:
for i in tqdm(allset):
    counts = {}
    
    try:
        c1 = animects[i] + 2
    except KeyError:
        c1 = 2
    counts['anime'] = c1 / len(anime)
    
    try:
        c2 = artcts[i] + 2
    except KeyError:
        c2 = 2
    counts['art'] = c2 / len(art)
    
    try:
        c3 = bookscts[i] + 2
    except KeyError:
        c3 = 2
    counts['books'] = c3 / len(books)
    
    try:
        c4 = filmscts[i] + 2
    except KeyError:
        c4 = 2
    counts['films'] = c4 / len(films)
    
    try:
        c5 = foodcts[i] + 2
    except KeyError:
        c5 = 2
    counts['food'] = c5 / len(food)
    
    try:
        c6 = footballcts[i] + 2
    except KeyError:
        c6 = 2
    counts['football'] = c6 / len(football)    
        
    try:
        c7 = gamescts[i] + 2
    except KeyError:
        c7 = 2
    counts['games'] = c7 / len(games)   
        
    try:
        c8= musiccts[i] + 2
    except KeyError:
        c8 = 2
    counts['music'] = c8 / len(music)    
        
    try:
        c9 = naturects[i] + 2
    except KeyError:
        c9 = 2
    counts['nature'] = c9 / len(nature)
        
    try:
        c10 = travelcts[i] + 2
    except KeyError:
        c10 = 2
    counts['travel'] = c10 / len(travel)    
        
    matrix_relative_add2[i] = counts

100%|████████████████████████████████| 18017/18017 [00:00<00:00, 123359.67it/s]


In [76]:
matrix_relative_add2[' не']

{'anime': 0.36034540019926936,
 'art': 0.46659815005138744,
 'books': 0.4795052642338751,
 'films': 0.3638309256286784,
 'food': 0.39290128906939786,
 'football': 0.34125260597637247,
 'games': 0.3744077444864846,
 'music': 0.38465506125080595,
 'nature': 0.3981264637002342,
 'travel': 0.45735981308411217}

In [77]:
len(matrix_absolute_add2)

18017

In [78]:
for k, val in matrix_relative_add2.items():
    if max(list(val.values())) < 0.005: # триграммы, которые встречаются меньше чем в .5% текстов какой-либо из категорий
        del matrix_absolute_add2[k]

In [79]:
len(matrix_absolute_add2)

6647

In [81]:
with open('Tris Absolute Add2.pkl', 'wb') as m5:
    pkl.dump(matrix_absolute_add2, m5)
m5.close()