In [66]:
from nltk.corpus import reuters
from sklearn.datasets import fetch_20newsgroups

import re
from collections import Counter
import pandas as pd
import numpy as np

import nltk
import pymorphy2
from nltk.stem import WordNetLemmatizer
import spacy
from stop_words import get_stop_words
from nltk import word_tokenize
import string

# spaCy pipeline "en_core_web_sm"; `en_preprocessing` runs it over every English text to obtain lemmas
nlp = spacy.load("en_core_web_sm")

**FAQ sample**

In [14]:
# Load the FAQ sample: tab-separated, the first row holds the column names.
faq_df = pd.read_csv('list.txt', sep='\t', header=0)
# Morphological analyzer used by `preprocessing` to obtain normal forms.
morph = pymorphy2.MorphAnalyzer()
# Load stop words: every whitespace-separated token of the file is one entry.
# NOTE(review): the file is opened with the platform default encoding - confirm
# that it matches the encoding stop_words.txt was saved in.
with open('stop_words.txt', 'r') as stop_words_file:
    stop_list = {word for word in stop_words_file.read().split()}

def preprocessing(message):
    '''
    Pull the Russian (Cyrillic) words out of a message, reduce each one to
    its normal form and drop the stop-words.

    The message is converted to a string and lower-cased first. The result is
    the surviving terms joined by single spaces (an empty string if none are left).
    '''
    lowered = str(message).lower()
    kept_terms = []
    for word in re.findall(r'[а-яА-Яё]+', lowered):
        # take the first parse variant that `morph` returns for the word
        normal_form = morph.parse(word)[0].normal_form
        if normal_form not in stop_list:
            kept_terms.append(normal_form)
    return ' '.join(kept_terms)

In [15]:
# replace every first message with its normalized, stop-word-free form
faq_df['first_msg'] = faq_df['first_msg'].map(preprocessing)

In [18]:
# Remove the rows whose first_msg is an empty string
is_empty = faq_df['first_msg'].str.len() == 0
faq_df.drop(index=faq_df.loc[is_empty].index, inplace=True)

In [20]:
# delete small categories
small_queues = ['Postmaster', 'Уровень ПЦК::Песочница', 'Уровень ПЦК::ВФК и ВФА', 'Уровень ПЦК::ИС Сборы',
                'Уровень ПЦК::Перечни услуг', 'Misc']
faq_df.drop(index=faq_df[faq_df['queue'].isin(small_queues)].index, inplace=True)

# merge few categories into one: general name -> queues that are folded into it
merged_queues = {
    'Уровень ПЦК::ПОУ': ['Уровень ПЦК::ИС ПОУ', 'Уровень ПЦК::ПОУ'],
    'Уровень ПЦК:: Бюджетирование и ПФХД': ['Уровень ПЦК::АСУ ПФХД', 'Уровень ПЦК::Переход на АСУ ПФХД',
                                            'Уровень ПЦК::Бюджетирование и планирование ФХД'],
    'Уровень ПЦК::ГЗ': ['Уровень ПЦК::ГЗ', 'Уровень ПЦК::ИС ГЗ'],
    'Уровень ПЦК::Поступай правильно': ['Уровень ПЦК::Навигатор абитуриента', 'Уровень ПЦК::Поступай правильно 2.0',
                                        'Уровень ПЦК::Поступай правильно 2.0 - МЭИ'],
    'Уровень ПЦК::АК КСУФ': ['Уровень ПЦК::АК КСУФ', 'Уровень ПЦК::АК КСУФ МЭИ'],
    'Уровень ПЦК::Расчет субсидий': ['Уровень ПЦК::Расчет субсидии', 'Уровень ПЦК::Стипендии',
                                     'Уровень ПЦК::Публичные обязательства', 'Уровень ПЦК::Текущее финансирование'],
    'Уровень ПЦК::Нормативные затраты': ['Уровень ПЦК::Нормативные затраты', 'Уровень ПЦК::"ИАЦ УГС"'],
    'Уровень ПЦК::Повышение квалификации и семинары': ['Уровень ПЦК::Семинары-совещания',
                                                       'Уровень ПЦК::Повышение квалификации'],
    'Разное': ['Спам'],
}
for general_name, names in merged_queues.items():
    # relabel every row whose queue belongs to this group
    faq_df.loc[faq_df['queue'].isin(names), 'queue'] = general_name

In [37]:
# keep only the queue and the message text, renamed to the target / data pair
faq_df.drop(columns=['number', 'status', 'last_mgs'], inplace=True)
faq_df.rename(columns={'queue': 'target', 'first_msg': 'data'}, inplace=True)

In [39]:
# statistics by categories
# value_counts() tallies every category in a single pass instead of filtering
# the whole frame once per category; reindexing by unique() keeps the rows in
# order of first appearance before they are sorted by size.
categories = faq_df['target'].unique()
category_sizes = faq_df['target'].value_counts().reindex(categories, fill_value=0).to_numpy()
# count() ignores missing targets, so fractions are relative to labelled rows only
total_labelled = faq_df['target'].count()

info_df = pd.DataFrame({
    'Category': categories,
    'Count': category_sizes,
    'Fraction, %': category_sizes / total_labelled * 100,
}).sort_values(by='Count', ascending=False)
info_df

Unnamed: 0,Category,Count,"Fraction, %"
0,Центр приема обращений,10028,46.122712
3,Уровень ПЦК::Поступай правильно,5132,23.604084
5,Уровень ПЦК::ПОУ,2066,9.502346
2,Уровень ПЦК:: Бюджетирование и ПФХД,1260,5.795235
9,Уровень ПЦК::ГЗ,1180,5.427284
11,Уровень ПЦК::Повышение квалификации и семинары,767,3.527734
10,Уровень ПЦК::Расчет субсидий,377,1.733971
8,Уровень ПЦК::ЗПМОН,232,1.067059
12,Уровень ПЦК::Отчеты о результатах деятельности,220,1.011866
1,Разное,134,0.616319


In [49]:
# save to csv (tab-separated, with a header row and without the index column)
# A forward slash is used as the separator: it works on Windows too, whereas a
# backslash is an ordinary file-name character on POSIX systems.
faq_df.to_csv('datasets/final_FAQ_dataset.csv', header=True, sep='\t', index=False)

**Reuters sample**

In [25]:
# volume of each category: the number of file ids listed under it
cat_dict = Counter({cat: len(reuters.fileids(cat)) for cat in reuters.categories()})
# NOTE: despite its name, top_7 holds only the five largest categories
top_7 = dict(cat_dict.most_common(5))
print('Категория', 'объем')
for category, volume in top_7.items():
    print(category, volume, sep='\t')

Категория объем
earn	3964
acq	2369
money-fx	717
grain	582
crude	578


In [29]:
# Give every document a single label: among its categories that are present
# in `top_7`, take the one with the smallest volume. Documents that have no
# such category are left out.
y_whole = []
indexes_to_save = []
for file_id in reuters.fileids():
    candidates = [(top_7[cat], cat) for cat in reuters.categories(file_id) if cat in top_7]
    if not candidates:
        continue
    # tuples compare by volume first, then by category name
    y_whole.append(min(candidates)[1])
    indexes_to_save.append(file_id)
# class sizes after the single-label assignment
for label, size in Counter(y_whole).most_common(5):
    print(label, size, sep='\t')

earn	3931
acq	2345
money-fx	713
grain	580
crude	578


In [42]:
# raw text of every kept document, in the same order as the labels in y_whole
sampled_texts = list(map(reuters.raw, indexes_to_save))
# frac=1 with a fixed random_state shuffles all rows reproducibly
reuters_df = pd.DataFrame({'data': sampled_texts, 'target': y_whole})
reuters_df = reuters_df.sample(frac=1, random_state=42)

In [44]:
# English stop words plus every single latin letter, so stray one-character tokens are dropped too
en_stop_words = get_stop_words('english') + list(string.ascii_lowercase)
# Hashed copy for constant-time membership tests; `en_stop_words` itself stays a list
_en_stop_words_set = frozenset(en_stop_words)

def en_preprocessing(message):
    '''
    Extracting English terms, lemmatizing them and erasing stop-words

    The message is lower-cased and reduced to runs of latin letters plus
    two-part dotted abbreviations (such as "u.s."). The surviving tokens are
    lemmatized with `nlp`, and the lemmas that are not stop words are joined
    by single spaces.
    A missing message (None or NaN) yields an empty string; any other
    non-string value is converted with str() first.
    '''
    # NaN is the only float that is not equal to itself
    if message is None or (isinstance(message, float) and message != message):
        return ''
    terms = ' '.join(re.findall(r'\w\.\w\.|[a-z]+', str(message).lower()))
    doc = nlp(terms)
    return ' '.join(token.lemma_ for token in doc if token.lemma_ not in _en_stop_words_set)

In [45]:
# preprocessing: replace every Reuters text with its lemmatized, stop-word-free form
reuters_df['data'] = reuters_df['data'].map(en_preprocessing)

In [75]:
# save to csv (tab-separated, with a header row and without the index column)
# A forward slash is used as the separator: it works on Windows too, whereas a
# backslash is an ordinary file-name character on POSIX systems.
reuters_df.to_csv('datasets/reuters_dataset.csv', header=True, sep='\t', index=False)

**20Newsgroups sample**

In [52]:
# the ten newsgroups kept for this sample
cats = [
    'alt.atheism',
    'comp.sys.ibm.pc.hardware',
    'comp.windows.x',
    'misc.forsale',
    'rec.autos',
    'rec.sport.hockey',
    'sci.crypt',
    'sci.space',
    'soc.religion.christian',
    'talk.religion.misc',
]
# fetch the train and the test subset restricted to those groups
newsgroups_train, newsgroups_test = (
    fetch_20newsgroups(subset=split, categories=cats) for split in ('train', 'test')
)

In [68]:
# put the train and test subsets into one frame, then shuffle it with a fixed seed
all_texts = newsgroups_train['data'] + newsgroups_test['data']
all_targets = np.concatenate([newsgroups_train['target'], newsgroups_test['target']])
newsgroups_df = pd.DataFrame({'data': all_texts, 'target': all_targets})
newsgroups_df = newsgroups_df.sample(frac=1, random_state=42)

In [70]:
# preprocessing: replace every newsgroup post with its lemmatized, stop-word-free form
newsgroups_df['data'] = newsgroups_df['data'].map(en_preprocessing)

In [74]:
# save to csv (tab-separated, with a header row and without the index column)
# A forward slash is used as the separator: it works on Windows too, whereas a
# backslash is an ordinary file-name character on POSIX systems.
newsgroups_df.to_csv('datasets/newsgroups.csv', header=True, sep='\t', index=False)

**Amazon sample**

In [82]:
# load sample (read with the pandas defaults: comma-separated, first row as header)
amazon_df = pd.read_csv('train_40k.csv')

In [86]:
# keep only the 'Text' and 'Cat1' columns and give them the data / target
# names that the other datasets use
amazon_df = amazon_df[['Text', 'Cat1']].rename(columns={'Text': 'data', 'Cat1': 'target'})

In [88]:
# preprocessing: replace every Amazon text with its lemmatized, stop-word-free form
amazon_df['data'] = amazon_df['data'].map(en_preprocessing)

In [89]:
# save to csv (tab-separated, with a header row and without the index column)
# A forward slash is used as the separator: it works on Windows too, whereas a
# backslash is an ordinary file-name character on POSIX systems.
amazon_df.to_csv('datasets/amazon.csv', header=True, sep='\t', index=False)