In [1]:
from tqdm import tqdm
import numpy as np
import pickle as pkl
import collections

In [2]:
import feat_extr

In [3]:
def acquire_trigrams(filename):
    with open(filename+'.txt','r',encoding='utf-8') as l:
        sentences = l.read().split('\n')
    l.close()
    trigrams = [list(feat_extr.get_char_trigrams(s).keys()) for s in tqdm(sentences)]
    return(trigrams)

In [4]:
tris = acquire_trigrams('Train_lemmas')

100%|██████████████████████████████████| 87844/87844 [00:13<00:00, 6438.99it/s]


In [5]:
with open('Train_labels.txt', 'r', encoding='utf-8') as i:
    train_labels = i.read().split('\n')
i.close()

In [6]:
def return_last(listData, x):
    return len(listData)-listData[::-1].index(x)-1

In [7]:
def texts(label):
    return tris[train_labels.index(label) : return_last(train_labels, label) + 1]

In [8]:
anime = texts('Anime')
art = texts('Art')
books = texts('Books')
films = texts('Films')
food = texts('Food')
football = texts('Football')
games = texts('Games')
music = texts('Music')
nature = texts('Nature')
travel = texts('Travel')

In [9]:
def topdocuments(corpus):
    vocabulary = list(set(sum(corpus, [])))
    corpus = [set(t) for t in corpus]
    counts = {}
    for v in vocabulary:
        counts[v] = 0
    for v in tqdm(vocabulary):
        for c in corpus:
            if v in c:
                counts[v] += 1
    counts = {key:val for key, val in counts.items()}
    return counts

In [10]:
animects = topdocuments(anime)

100%|████████████████████████████████████| 9048/9048 [00:02<00:00, 3541.28it/s]


In [11]:
artcts = topdocuments(art)

100%|███████████████████████████████████| 7780/7780 [00:00<00:00, 10228.18it/s]


In [12]:
bookscts = topdocuments(books)

100%|███████████████████████████████████| 12215/12215 [00:18<00:00, 646.95it/s]


In [13]:
filmscts = topdocuments(films)

100%|████████████████████████████████████| 9888/9888 [00:06<00:00, 1498.79it/s]


In [14]:
foodcts = topdocuments(food)

100%|████████████████████████████████████| 9810/9810 [00:06<00:00, 1461.09it/s]


In [15]:
footballcts = topdocuments(football)

100%|███████████████████████████████████| 12900/12900 [00:44<00:00, 289.71it/s]


In [16]:
gamescts = topdocuments(games)

100%|███████████████████████████████████| 14562/14562 [01:06<00:00, 220.15it/s]


In [18]:
musiccts = topdocuments(music)

100%|██████████████████████████████████| 11365/11365 [00:11<00:00, 1004.72it/s]


In [17]:
naturects = topdocuments(nature)

100%|███████████████████████████████████| 6965/6965 [00:00<00:00, 12270.58it/s]


In [19]:
travelcts = topdocuments(travel)

100%|████████████████████████████████████| 8807/8807 [00:01<00:00, 6199.81it/s]


In [20]:
all = anime + art + books + films + food + football + games + music + nature + travel

In [21]:
allunnest = list(set(sum(all, [])))

In [22]:
animeset = set(sum(anime, []))
artset = set(sum(art, []))
booksset = set(sum(books, []))
filmsset = set(sum(films, []))
foodset = set(sum(food, []))
footballset = set(sum(football, []))
gamesset = set(sum(games, []))
musicset = set(sum(music, []))
natureset = set(sum(nature, []))
travelset = set(sum(travel, []))
allset = list(set(allunnest))

In [23]:
matrix_absolute_nonsmooth = {}

In [24]:
for i in tqdm(allset):
    counts = {}
    try:
        counts['anime'] = animects[i]
    except KeyError:
        counts['anime'] = 0
    try:
        counts['art'] = artcts[i]
    except KeyError:
        counts['art'] = 0
    try:
        counts['books'] = bookscts[i]
    except KeyError:
        counts['books'] = 0
    try:
        counts['films'] = filmscts[i]
    except KeyError:
        counts['films'] = 0
    try:
        counts['food'] = foodcts[i]
    except KeyError:
        counts['food'] = 0
    try:
        counts['football'] = footballcts[i]
    except KeyError:
        counts['football'] = 0
    try:
        counts['games'] = gamescts[i]
    except KeyError:
        counts['games'] = 0
    try:
        counts['music'] = musiccts[i]
    except KeyError:
        counts['music'] = 0
    try:
        counts['nature'] = naturects[i]
    except KeyError:
        counts['nature'] = 0
    try:
        counts['travel'] = travelcts[i]
    except KeyError:
        counts['travel'] = 0
    matrix_absolute_nonsmooth[i] = counts

100%|████████████████████████████████| 18898/18898 [00:00<00:00, 123354.20it/s]


In [25]:
matrix_absolute_nonsmooth[' не']

{'anime': 1075,
 'art': 444,
 'books': 4649,
 'films': 2002,
 'food': 2196,
 'football': 7763,
 'games': 10856,
 'music': 2949,
 'nature': 318,
 'travel': 766}

In [26]:
matrix_absolute_add2 = {key:{k:(v+2) for k, v in val.items()} for key, val in matrix_absolute_nonsmooth.items()}

In [27]:
matrix_absolute_nonsmooth[' не']

{'anime': 1075,
 'art': 444,
 'books': 4649,
 'films': 2002,
 'food': 2196,
 'football': 7763,
 'games': 10856,
 'music': 2949,
 'nature': 318,
 'travel': 766}

In [28]:
with open('Tris Absolute Add2.pkl', 'wb') as m2:
    pkl.dump(matrix_absolute_add2, m2)
m2.close()

In [29]:
matrix_relative_nonsmooth = {}

In [30]:
for i in tqdm(allset):
    counts = {}
    
    try:
        c1 = animects[i]
    except KeyError:
        c1 = 0
    counts['anime'] = c1 / len(anime)
    
    try:
        c2 = artcts[i]
    except KeyError:
        c2 = 0
    counts['art'] = c2 / len(art)
    
    try:
        c3 = bookscts[i]
    except KeyError:
        c3 = 0
    counts['books'] = c3 / len(books)
    
    try:
        c4 = filmscts[i]
    except KeyError:
        c4 = 0
    counts['films'] = c4 / len(films)
    
    try:
        c5 = foodcts[i]
    except KeyError:
        c5 = 0
    counts['food'] = c5 / len(food)
    
    try:
        c6 = footballcts[i]
    except KeyError:
        c6 = 0
    counts['football'] = c6 / len(football)    
        
    try:
        c7 = gamescts[i]
    except KeyError:
        c7 = 0
    counts['games'] = c7 / len(games)   
        
    try:
        c8= musiccts[i]
    except KeyError:
        c8 = 0
    counts['music'] = c8 / len(music)    
        
    try:
        c9 = naturects[i]
    except KeyError:
        c9 = 0
    counts['nature'] = c9 / len(nature)
        
    try:
        c10 = travelcts[i]
    except KeyError:
        c10 = 0
    counts['travel'] = c10 / len(travel)    
        
    matrix_relative_nonsmooth[i] = counts

100%|████████████████████████████████| 18898/18898 [00:00<00:00, 107810.73it/s]


In [31]:
matrix_relative_nonsmooth[' не']

{'anime': 0.3570242444370641,
 'art': 0.4563206577595067,
 'books': 0.47521210262700603,
 'films': 0.3570536828963795,
 'food': 0.3877803284478192,
 'football': 0.33716990965948573,
 'games': 0.37004465350922044,
 'music': 0.3802707930367505,
 'nature': 0.37236533957845436,
 'travel': 0.44742990654205606}

In [32]:
matrix_relative_add2 = {}

In [33]:
for i in tqdm(allset):
    counts = {}
    
    try:
        c1 = animects[i] + 2
    except KeyError:
        c1 = 2
    counts['anime'] = c1 / len(anime)
    
    try:
        c2 = artcts[i] + 2
    except KeyError:
        c2 = 2
    counts['art'] = c2 / len(art)
    
    try:
        c3 = bookscts[i] + 2
    except KeyError:
        c3 = 2
    counts['books'] = c3 / len(books)
    
    try:
        c4 = filmscts[i] + 2
    except KeyError:
        c4 = 2
    counts['films'] = c4 / len(films)
    
    try:
        c5 = foodcts[i] + 2
    except KeyError:
        c5 = 2
    counts['food'] = c5 / len(food)
    
    try:
        c6 = footballcts[i] + 2
    except KeyError:
        c6 = 2
    counts['football'] = c6 / len(football)    
        
    try:
        c7 = gamescts[i] + 2
    except KeyError:
        c7 = 2
    counts['games'] = c7 / len(games)   
        
    try:
        c8= musiccts[i] + 2
    except KeyError:
        c8 = 2
    counts['music'] = c8 / len(music)    
        
    try:
        c9 = naturects[i] + 2
    except KeyError:
        c9 = 2
    counts['nature'] = c9 / len(nature)
        
    try:
        c10 = travelcts[i] + 2
    except KeyError:
        c10 = 2
    counts['travel'] = c10 / len(travel)    
        
    matrix_relative_add2[i] = counts

100%|████████████████████████████████| 18898/18898 [00:00<00:00, 112408.98it/s]


In [34]:
matrix_relative_add2[' не']

{'anime': 0.3576884755895052,
 'art': 0.45837615621788286,
 'books': 0.4754165388939998,
 'films': 0.35741037988229,
 'food': 0.38813349814585907,
 'football': 0.3372567755385685,
 'games': 0.3701128268057402,
 'music': 0.38052869116698906,
 'nature': 0.3747072599531616,
 'travel': 0.4485981308411215}

In [35]:
len(matrix_absolute_add2)

18898

In [36]:
for k, val in matrix_relative_add2.items():
    if max(list(val.values())) < 0.005: # триграммы, которые встречаются меньше чем в .5% текстов какой-либо из категорий
        del matrix_absolute_add2[k]

In [37]:
len(matrix_absolute_add2)

6825

In [38]:
with open('Tris Absolute Add2.pkl', 'wb') as m5:
    pkl.dump(matrix_absolute_add2, m5)
m5.close()