In [100]:
import re
import string as string_lib

import numpy as np
import pandas as pd
import sklearn
import sklearn.linear_model
import sklearn.metrics
import sklearn.model_selection

from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer
from nltk.tokenize import word_tokenize
from pymorphy2 import MorphAnalyzer

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer

from sklearn.naive_bayes import MultinomialNB

# from catboost import CatBoostClassifier

In [6]:
# Shared pymorphy2 analyzer; first_form() uses it to lemmatize tokens.
morph = MorphAnalyzer()

In [78]:
# Load the message log; the file is semicolon-separated. Later cells read its
# "message" and "category" columns.
data = pd.read_csv(
    "./messages/30_09_2017_19_37.csv",
    sep=";",
)

In [29]:
stemmer = SnowballStemmer("russian")


def stem(string):
    """Return `string` with every token replaced by its Russian Snowball stem.

    The text is split with `word_tokenize`, each token is passed through the
    module-level `stemmer`, and the stems are joined with single spaces.
    """
    return " ".join(stemmer.stem(word) for word in word_tokenize(string))

In [30]:
def first_form(string):
    """Lemmatize every token of `string` with the shared `morph` analyzer.

    Tokens equal to the placeholder "NUM" are kept verbatim. Every other token
    is replaced by the `normal_form` of the first parse returned by
    `morph.parse`. The resulting tokens are joined with single spaces.
    """
    lemmas = [
        word if word == "NUM" else morph.parse(word)[0].normal_form
        for word in word_tokenize(string)
    ]
    return " ".join(lemmas)

In [31]:
# Canonical word -> informal variants that should be folded into it.
synonyms = {
    "зарплата": ["зп"],
    "электричка": ["элка"],
}

# Lookup from a lemmatized variant to its lemmatized canonical word; both sides
# are passed through first_form() before being stored.
replace_map = {
    first_form(variant): first_form(canonical)
    for canonical, variants in synonyms.items()
    for variant in variants
}

def replace_synonyms(string):
    """Swap every token found in `replace_map` for its canonical word.

    Tokens without an entry in `replace_map` are left as they are. The result
    is the token sequence joined with single spaces.
    """
    return " ".join(replace_map.get(word, word) for word in word_tokenize(string))

In [108]:
def replace_numbers(string, replace_on="NUM"):
    """Replace every number in `string` with `replace_on`.

    A number is a run of ASCII digits, optionally followed by a dot and more
    digits (e.g. "42" or "3.14"). `replace_on` is handed to `re.sub` as the
    replacement template.
    """
    number_pattern = re.compile(r"([0-9]+)(\.[0-9]+)?")
    return number_pattern.sub(replace_on, string)

In [33]:
def replace_letterdigits(string):
    """Expand "thousands" shorthand tokens such as ``5k`` or ``1.5к`` into digits.

    The text is split with `word_tokenize`. A token that consists solely of a
    number (digits with an optional ``.digits`` part) followed by a Latin ``k``
    or a Cyrillic ``к`` is replaced by that number multiplied by 1000, rounded
    to the nearest integer. All other tokens are kept, and the tokens are
    joined with single spaces.
    """
    # Compiled once per call instead of once per token.
    shorthand = re.compile(r"^(([0-9]+)(\.[0-9]+)?)(k|к)$")

    tokens = word_tokenize(string)
    for i, token in enumerate(tokens):
        match = shorthand.match(token)
        if match:
            # round(), not int(): the float product can land just below the
            # exact value (1.005 * 1000 == 1004.9999999999999), and int()
            # would truncate that to 1004.
            tokens[i] = str(round(float(match.group(1)) * 1000))

    return " ".join(tokens)

In [34]:
russian_stopwords = stopwords.words("russian")

# Set copy of the list above for constant-time membership checks.
_russian_stopwords_set = frozenset(russian_stopwords)


def delete_stopwords(string):
    """Return `string` without the tokens listed in `russian_stopwords`.

    The text is split with `word_tokenize`; every token that is a stop word is
    dropped and the remaining tokens are joined with single spaces.

    The filtered list is built fresh rather than calling `remove()` on the
    list being iterated: removing during iteration skips the token right
    after each removed stop word, so consecutive stop words survived.
    """
    kept = [
        token
        for token in word_tokenize(string)
        if token not in _russian_stopwords_set
    ]
    return " ".join(kept)

In [35]:
def print_conf_matrix(y_test, y_predicted, categories):
    """Print a labelled confusion matrix as left-aligned text columns.

    Parameters:
        y_test: true labels.
        y_predicted: predicted labels, in the same order as `y_test`.
        categories: class labels; they name the rows and columns and fix
            their order.

    Rows are true classes and columns are predicted classes. Non-zero
    off-diagonal counts (misclassifications) are wrapped in ``<...>``.
    """
    labels = list(categories)

    # Passing labels= pins the matrix to len(labels) x len(labels) in exactly
    # this order, even when a class is absent from both y_test and y_predicted.
    matrix = sklearn.metrics.confusion_matrix(y_test, y_predicted, labels=labels)

    # Columns are 10 wide as before, but grow if a label would not fit.
    width = max([10] + [len(str(label)) + 1 for label in labels])
    cell = "{0:<" + str(width) + "}"

    # Header: empty corner cell followed by the predicted-class names.
    print(cell.format("") + "".join(cell.format(label) for label in labels))

    for row_idx, label in enumerate(labels):
        print(cell.format(label), end="")
        for col_idx, count in enumerate(matrix[row_idx]):
            text = str(count)
            if row_idx != col_idx and count != 0:
                text = "<" + text + ">"
            print(cell.format(text), end="")
        print()

In [36]:
def print_scores(y_test, y_predicted, categories):
    """Print accuracy plus per-class precision, recall and F1 as a text table.

    Parameters:
        y_test: true labels.
        y_predicted: predicted labels, in the same order as `y_test`.
        categories: class labels; they head the columns and fix their order.

    Scores are rounded to 3 decimals. No newline is printed after the last
    row.
    """
    labels = list(categories)

    print("Accuracy: {0}".format(sklearn.metrics.accuracy_score(y_test, y_predicted)), end="\n\n")

    # Columns are 10 wide as before, but grow if a label would not fit.
    width = max([10] + [len(str(label)) + 1 for label in labels])
    cell = "{0:<" + str(width) + "}"

    print(" " * 15, end="")
    for cat in labels:
        print(cell.format(cat), end="")

    metrics = (
        ("Precision:", sklearn.metrics.precision_score),
        ("Recall:", sklearn.metrics.recall_score),
        ("F1:", sklearn.metrics.f1_score),
    )
    for title, metric in metrics:
        # Row titles are padded to the same 15 characters as the header indent.
        print("\n{0:<15}".format(title), end="")
        # labels= makes the returned scores line up with the header columns
        # even when a class is missing from y_test and y_predicted.
        for score in metric(y_test, y_predicted, labels=labels, average=None):
            print(cell.format(round(score, 3)), end="")

In [109]:
# Translation table for str.translate that deletes every ASCII punctuation character.
translate_table = str.maketrans("", "", string_lib.punctuation)

In [110]:
# Normalize the raw text in place: lowercase, expand "5k"-style shorthand,
# then delete punctuation.
data.loc[:, "message"] = (
    data["message"]
    .apply(str.lower)
    .apply(replace_letterdigits)
    .apply(lambda text: text.translate(translate_table))
)

In [111]:
# Derived column: numbers in the cleaned message become the "NUM" placeholder.
data["message_no_num"] = data["message"].map(replace_numbers)

In [112]:
# Derived column: the number-free text with Russian stop words removed.
data["message_no_stopwords"] = data["message_no_num"].map(delete_stopwords)

In [113]:
# Derived column: every remaining token reduced to its normal form.
data["message_first_form"] = data["message_no_stopwords"].map(first_form)

In [41]:
# data["message_stem"] = data["message_no_stopwords"].apply(stem)

In [114]:
# Derived column: known informal variants folded into their canonical word.
data["message_synonyms"] = data["message_first_form"].map(replace_synonyms)

In [115]:
# Sanity check: print the frame's dimensions, then display the first 10 rows
# with every intermediate preprocessing column side by side.
print(data.shape)
data.head(10)

(266, 6)


Unnamed: 0,message,category,message_no_num,message_no_stopwords,message_first_form,message_synonyms
0,23800 доплаты за отпуска в крок,income,NUM доплаты за отпуска в крок,NUM доплаты отпуска крок,NUM доплата отпуск крок,NUM доплата отпуск крок
1,5478252 зарплата,income,NUM зарплата,NUM зарплата,NUM зарплата,NUM зарплата
2,60865 зарплата,income,NUM зарплата,NUM зарплата,NUM зарплата,NUM зарплата
3,1500 кошулька дала немного денег,income,NUM кошулька дала немного денег,NUM кошулька дала немного денег,NUM кошулька дать немного деньга,NUM кошулька дать немного деньга
4,зп 36000,income,зп NUM,зп NUM,зп NUM,зарплата NUM
5,9 сентября олег дал 30000,income,NUM сентября олег дал NUM,NUM сентября олег дал NUM,NUM сентябрь олег дать NUM,NUM сентябрь олег дать NUM
6,ещё зп 4000,income,ещё зп NUM,ещё зп NUM,ещё зп NUM,ещё зарплата NUM
7,вчера ходил в бассейн проплыл 600 метров,other,вчера ходил в бассейн проплыл NUM метров,вчера ходил бассейн проплыл NUM метров,вчера ходить бассейн проплыть NUM метр,вчера ходить бассейн проплыть NUM метр
8,вес вчера 747,weight,вес вчера NUM,вес вчера NUM,вес вчера NUM,вес вчера NUM
9,сегодня читаю три товарища,event,сегодня читаю три товарища,сегодня читаю товарища,сегодня читать товарищ,сегодня читать товарищ


In [116]:
# Model input is the fully preprocessed text; the target is the category label.
X, Y = data["message_synonyms"], data["category"]

In [117]:
# Bag-of-words token counts over the preprocessed messages.
# NOTE(review): the vocabulary is fitted on all rows of X, before the
# train_test_split call further down, so held-out rows contribute to the
# features. Consider fitting on the training part only.
count_vectorizer = CountVectorizer()
X_count = count_vectorizer.fit_transform(X)
X_count.shape

(266, 162)

In [118]:
# Re-weight the raw counts with TF-IDF.
# NOTE(review): the transformer is fitted on the full count matrix, so the IDF
# weights also see the rows that later end up in the test split.
tfidf = TfidfTransformer()
X_tfidf = tfidf.fit_transform(X_count)
X_tfidf.shape

(266, 162)

In [119]:
# Hold out 30% of the rows for evaluation; the fixed seed keeps the split stable.
X_train, X_test, Y_train, Y_test = sklearn.model_selection.train_test_split(
    X_tfidf,
    Y,
    test_size=0.3,
    random_state=16,
)

#### Наивный Байес

In [120]:
# Multinomial Naive Bayes on the TF-IDF features; fit() returns the estimator itself.
bayes = MultinomialNB(alpha=0.01).fit(X_train, Y_train)
predicted = bayes.predict(X_test)

In [121]:
# Per-class metrics and confusion matrix for the Naive Bayes predictions.
bayes_labels = bayes.classes_.tolist()
print_scores(Y_test, predicted, bayes_labels)
print("\n\n\nConfusion matrix:\n")
print_conf_matrix(Y_test, predicted, bayes_labels)

Accuracy: 0.85

               event     income    other     outcome   timespend weight    
Precision:     1.0       0.6       0.0       0.983     1.0       0.1       
Recall:        0.5       0.75      0.0       0.866     1.0       1.0       
F1:            0.667     0.667     0.0       0.921     1.0       0.182     


Confusion matrix:

          event     income    other     outcome   timespend weight    
event     1         <1>       0         0         0         0         
income    0         3         0         <1>       0         0         
other     0         0         0         0         0         <1>       
outcome   0         <1>       0         58        0         <8>       
timespend 0         0         0         0         5         0         
weight    0         0         0         0         0         1         


  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


#### SVM

In [122]:
# Linear SVM (hinge loss) trained with SGD on the TF-IDF features.
# random_state pins SGD's data shuffling so that re-running the cell yields
# the same model; 16 is the same seed the train/test split uses.
svm = sklearn.linear_model.SGDClassifier(
    loss="hinge",
    alpha=1e-3,
    max_iter=5,
    random_state=16,
)
svm.fit(X_train, Y_train)

predicted = svm.predict(X_test)

In [123]:
# Per-class metrics and confusion matrix for the SVM predictions. Labels come
# from the SVM itself so this cell does not depend on the Naive Bayes cell
# having been run.
print_scores(Y_test, predicted, svm.classes_.tolist())
print("\n\n\nConfusion matrix:\n")
print_conf_matrix(Y_test, predicted, svm.classes_.tolist())

Accuracy: 0.9125

               event     income    other     outcome   timespend weight    
Precision:     1.0       1.0       0.0       0.905     1.0       1.0       
Recall:        0.5       0.25      0.0       1.0       0.6       1.0       
F1:            0.667     0.4       0.0       0.95      0.75      1.0       


Confusion matrix:

          event     income    other     outcome   timespend weight    
event     1         0         0         <1>       0         0         
income    0         1         0         <3>       0         0         
other     0         0         0         <1>       0         0         
outcome   0         0         0         67        0         0         
timespend 0         0         0         <2>       3         0         
weight    0         0         0         0         0         1         


  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


#### CatBoost - неуместно использовать с некатегориальными фичами, нужно применить для другого способа классификации

In [107]:
# X_train_array = X_train.toarray()
# X_test_array = X_test.toarray()

In [108]:
# le = sklearn.preprocessing.LabelEncoder()
# le.fit(Y_test.append(Y_train))

In [109]:
# Y_train_digits = le.transform(Y_train)

In [110]:
# catboost = CatBoostClassifier(iterations=2, depth=2, learning_rate=1, loss_function='Logloss', verbose=True)
# catboost.fit(X_train_array, Y_train_digits, verbose=True, cat_features=None)

# predicted = catboost.predict(X_test_array)