In [14]:
import numpy as np
import nltk
from nltk.stem import PorterStemmer
import re
from scipy.sparse import csr_matrix
import sklearn
from collections import Counter
from sklearn.feature_extraction.text import TfidfTransformer
import random
from sklearn.linear_model import LogisticRegression

class PorterRussian:
    PERFECTIVEGROUND =  re.compile(u"((ив|ивши|ившись|ыв|ывши|ывшись)|((?<=[ая])(в|вши|вшись)))$")
    REFLEXIVE = re.compile(u"(с[яь])$")
    ADJECTIVE = re.compile(u"(ее|ие|ые|ое|ими|ыми|ей|ий|ый|ой|ем|им|ым|ом|его|ого|ему|ому|их|ых|ую|юю|ая|яя|ою|ею)$")
    PARTICIPLE = re.compile(u"((ивш|ывш|ующ)|((?<=[ая])(ем|нн|вш|ющ|щ)))$")
    VERB = re.compile(u"((ила|ыла|ена|ейте|уйте|ите|или|ыли|ей|уй|ил|ыл|им|ым|ен|ило|ыло|ено|ят|ует|уют|ит|ыт|ены|ить|ыть|ишь|ую|ю)|((?<=[ая])(ла|на|ете|йте|ли|й|л|ем|н|ло|но|ет|ют|ны|ть|ешь|нно)))$")
    NOUN = re.compile(u"(а|ев|ов|ие|ье|е|иями|ями|ами|еи|ии|и|ией|ей|ой|ий|й|иям|ям|ием|ем|ам|ом|о|у|ах|иях|ях|ы|ь|ию|ью|ю|ия|ья|я)$")
    RVRE = re.compile(u"^(.*?[аеиоуыэюя])(.*)$")
    DERIVATIONAL = re.compile(u".*[^аеиоуыэюя]+[аеиоуыэюя].*ость?$")
    DER = re.compile(u"ость?$")
    SUPERLATIVE = re.compile(u"(ейше|ейш)$")
    I = re.compile(u"и$")
    P = re.compile(u"ь$")
    NN = re.compile(u"нн$")

    def stem(word):
        word = word.lower()
        word = word.replace(u'ё', u'е')
        m = re.match(PorterRussian.RVRE, word)
        if m and m.groups():
            pre = m.group(1)
            rv = m.group(2)
            temp = PorterRussian.PERFECTIVEGROUND.sub('', rv, 1)
            if temp == rv:
                rv = PorterRussian.REFLEXIVE.sub('', rv, 1)
                temp = PorterRussian.ADJECTIVE.sub('', rv, 1)
                if temp != rv:
                    rv = temp
                    rv = PorterRussian.PARTICIPLE.sub('', rv, 1)
                else:
                    temp = PorterRussian.VERB.sub('', rv, 1)
                    if temp == rv:
                        rv = PorterRussian.NOUN.sub('', rv, 1)
                    else:
                        rv = temp
            else:
                rv = temp
            
            rv = PorterRussian.I.sub('', rv, 1)

            if re.match(PorterRussian.DERIVATIONAL, rv):
                rv = PorterRussian.DER.sub('', rv, 1)

            temp = PorterRussian.P.sub('', rv, 1)
            if temp == rv:
                rv = PorterRussian.SUPERLATIVE.sub('', rv, 1)
                rv = PorterRussian.NN.sub(u'н', rv, 1)
            else:
                rv = temp
            word = pre+rv
        return word
    stem=staticmethod(stem)

    


class Solution:
    
    def __init__(self, C=2000, tfidf=True, train_titles=True, titles_significance = 0.2):
        self.tfidf = tfidf
        self.C = C
        self.train_titles = train_titles
        self.titles_significance = titles_significance
        
        self.labels = ['Вредоносное ПО', 'Инцидент', 'Прочее', 'Угроза', 'Уязвимость', 'Эксплойт'];
        self.PorterEnglish = PorterStemmer()
        self.PorterRussian = PorterRussian()
        
        self.word_position_en = {}
        self.word_position_ru = {}
        
        self.word_position_en_titles = {}
        self.word_position_ru_titles = {}
        
    def stem(self, word):
        if re.search(r'[а-яА-ЯёЁ]', word):
            return self.PorterRussian.stem(word)
        else:
            return self.PorterEnglish.stem(word)
        
    def get_lang(self, text):
        if re.search(r'[а-яА-ЯёЁ]', text):
            return 'russian'
        else:
            return 'english'
    
    def make_matrix(self, texts, lang, mode='train', for_titles=False):
        indptr = [0]
        column_indices = []
        values = []
        
        if for_titles:
            if lang == 'english':
                word_position = self.word_position_en_titles
            else:
                word_position = self.word_position_ru_titles
        else:
            if lang == 'english':
                word_position = self.word_position_en
            else:
                word_position = self.word_position_ru
                
        if mode == 'train':
            for text in texts:
                for word in text:
                    index = word_position.setdefault(word, len(word_position))
                    column_indices.append(index)
                    values.append(1)
                indptr.append(len(column_indices))
        else:
            for text in texts:
                for word in text:
                    if word in word_position:
                        index = word_position[word]
                        column_indices.append(index)
                        values.append(1)
                    
                    index = len(word_position) - 1
                    column_indices.append(index)
                    values.append(0)
                    
                indptr.append(len(column_indices))
            
        M = csr_matrix((values, column_indices, indptr), dtype=int)
        
        if self.tfidf:
            trans = TfidfTransformer()
            trans.fit(M)
            M = trans.transform(M)
        return M

    def get_answers(self, train_corpus):
        answers = []
        for obj in train_corpus:
            all_users_labels = []
            for user in obj[2]:
                all_users_labels += list(obj[2][user])
            
            count = Counter(all_users_labels)
        
#             max_count = max(count.values())
#             candidates = [label for label in count.keys() if count[label] == max_count]
#             answers.append(random.choice(candidates))
            answers.append(count.most_common(1)[0][0])
        return np.array(answers)
    
    def train(self, train_corpus):
        texts = np.array([list(map(lambda word : self.stem(word), nltk.word_tokenize(re.sub(r'\W', ' ', obj[1].lower())))) for obj in train_corpus])
        texts_langs = np.array([self.get_lang(obj[1]) for obj in train_corpus])
        
        answers = self.get_answers(train_corpus)
        
        M_en = self.make_matrix(texts[texts_langs=='english'], 'english')
        M_ru = self.make_matrix(texts[texts_langs=='russian'], 'russian')
        
        self.en_clf = LogisticRegression(max_iter=10000, C=self.C)
        self.en_clf.fit(M_en, answers[texts_langs=='english'])
        
        self.ru_clf = LogisticRegression(max_iter=10000, C=self.C)
        self.ru_clf.fit(M_ru, answers[texts_langs=='russian'])
    
        
        if self.train_titles:
            titles = np.array([list(map(lambda word : self.stem(word), nltk.word_tokenize(re.sub(r'\W', ' ', obj[0].lower())))) for obj in train_corpus])
            titles_langs = np.array([self.get_lang(obj[0]) for obj in train_corpus])
            
            M_en = self.make_matrix(titles[titles_langs=='english'], 'english', for_titles = True)
            M_ru = self.make_matrix(titles[titles_langs=='russian'], 'russian', for_titles = True)
            
            self.en_clf_titles = LogisticRegression(max_iter=10000, C=self.C)
            self.en_clf_titles.fit(M_en, answers[titles_langs=='english'])
            
            self.ru_clf_titles = LogisticRegression(max_iter=10000, C=self.C)
            self.ru_clf_titles.fit(M_ru, answers[titles_langs=='russian'])
    
        
    def predict(self, news):
        texts = np.array([list(map(lambda word : self.stem(word), nltk.word_tokenize(re.sub(r'\W', ' ', obj[1].lower())))) for obj in news])
        texts_langs = np.array([self.get_lang(obj[1]) for obj in news])
        texts_probas = np.array([[None, None, None, None, None, None] for obj in news])
        
        en_indices = [i for i in range(len(texts_langs)) if texts_langs[i] == 'english']
        ru_indices = [i for i in range(len(texts_langs)) if texts_langs[i] == 'russian']
         
        if (texts_langs == 'english').max():
            M_en = self.make_matrix(texts[texts_langs=='english'], 'english', mode='predict')
            texts_probas[en_indices] = self.en_clf.predict_proba(M_en)
        
        if (texts_langs == 'russian').max():
            M_ru = self.make_matrix(texts[texts_langs=='russian'], 'russian', mode='predict')
            texts_probas[ru_indices] = self.ru_clf.predict_proba(M_ru)
        
        probas = texts_probas
        
        
        if self.train_titles:
            titles = np.array([list(map(lambda word : self.stem(word), nltk.word_tokenize(re.sub(r'\W', ' ', obj[0].lower())))) for obj in news])
            titles_langs = np.array([self.get_lang(obj[0]) for obj in news])
            titles_probas = np.array([[None, None, None, None, None, None] for obj in news])
            
            en_indices = [i for i in range(len(titles_langs)) if titles_langs[i] == 'english']
            ru_indices = [i for i in range(len(titles_langs)) if titles_langs[i] == 'russian']
            
            if (titles_langs == 'english').max():
                M_en = self.make_matrix(titles[titles_langs=='english'], 'english', mode='predict', for_titles = True)
                titles_probas[en_indices] = self.en_clf_titles.predict_proba(M_en)
        
            if (titles_langs == 'russian').max():
                M_ru = self.make_matrix(titles[titles_langs=='russian'], 'russian', mode='predict', for_titles = True)
                titles_probas[ru_indices] = self.ru_clf_titles.predict_proba(M_ru) 
            
            
            probas = (1 - self.titles_significance) * texts_probas + self.titles_significance * titles_probas
            
        answers_indices = np.argmax(probas, axis = 1)
        answers = [set([self.labels[i]]) for i in answers_indices]
        return answers

        

In [15]:
import json
import pandas as pd

In [16]:
with open("q.json", "r", encoding = "utf-8") as f:
    data = json.load(f)

df = pd.DataFrame(data)
texts_df = pd.DataFrame(df.groupby('text'))

texts_corpus = []
for elem in texts_df.values:
    text = elem[0]
    title = elem[1]['title'].values[0]
    user_labels = {}
    i = 0
    for labels in elem[1]['labels'].values:
        user_labels[i] = set(labels)
        i +=1
    obj = tuple([title, text, user_labels])
    texts_corpus.append(obj)

In [17]:
clf = Solution()
clf.train(texts_corpus[:800])

In [18]:
a=clf.predict(texts_corpus[800:818])

In [19]:
a

[{'Вредоносное ПО'},
 {'Угроза'},
 {'Угроза'},
 {'Вредоносное ПО'},
 {'Прочее'},
 {'Угроза'},
 {'Инцидент'},
 {'Угроза'},
 {'Угроза'},
 {'Угроза'},
 {'Вредоносное ПО'},
 {'Вредоносное ПО'},
 {'Уязвимость'},
 {'Вредоносное ПО'},
 {'Угроза'},
 {'Вредоносное ПО'},
 {'Вредоносное ПО'},
 {'Угроза'}]