In [99]:
import pandas as pd
import sklearn
import re
import numpy as np

from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer

from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import CountVectorizer

from sklearn.naive_bayes import MultinomialNB

In [100]:
data = pd.read_csv("./messages/30_09_2017_19_37.csv", delimiter=";")

In [101]:
stemmer = SnowballStemmer("russian")

def stem(string):
    
    tokens = [stemmer.stem(token) for token in word_tokenize(string)]

    return " ".join(tokens)

In [102]:
synonyms = {
        "зарплата": ["зп"],
        "электричка": ["элка"]
    }

replace_map = {}

for key, value in synonyms.items():
    for syn in value:
        replace_map[stem(syn)] = stem(key)

def replace_synonyms(string):
    
    tokens = word_tokenize(string)
    
    for i in range(len(tokens)):
        tokens[i] = replace_map.get(tokens[i], tokens[i])
    
    
    return " ".join(tokens)

In [103]:
def replace_numbers(string, replace_on="NUM"):
    
    return re.sub(r"([0-9]+)(\.[0-9]+)?", replace_on, string)

In [104]:
def replace_letterdigits(string):
    
    tokens = word_tokenize(string)
    
    for i in range(len(tokens)):
        match = re.match(r"^(([0-9]+)(\.[0-9]+)?)(k|к)$", tokens[i], re.UNICODE)
        
        if match:
            tokens[i] = str(int(float(match.group(1)) * 1000))
    
    return " ".join(tokens)

In [105]:
russian_stopwords = stopwords.words("russian")

def delete_stopwords(string):
    
    tokens = word_tokenize(string)
    
    for token in tokens:
        if token in russian_stopwords:
            tokens.remove(token)
    
    return " ".join(tokens)

In [106]:
def print_conf_matrix(y_test, y_predicted, categories):
    
    ma = sklearn.metrics.confusion_matrix(y_test, y_predicted).astype(str)
    ma = np.insert(ma, 0, categories, axis=1)
    ma = np.insert(ma, 0, [""] + categories, axis=0)
    
    for row in range(ma.shape[0]):
        for col in range(ma.shape[1]):
            if col != row and ma[row, col] != "0" and col > 0 and row > 0:
                print("{0:<10}".format("<" + ma[row, col] + ">"), end="")
            else:
                print("{0:<10}".format(ma[row, col]), end="")
        print()

In [107]:
def print_scores(y_test, y_predicted, categories):
    
    print("Accuracy: {0}".format(sklearn.metrics.accuracy_score(y_test, y_predicted)), end="\n\n")
    
    print(" " * 15, end="")
    for cat in categories:
        print("{0:<10}".format(cat), end="")
    
    print("\nPrecision:     ", end="")
    for digit in sklearn.metrics.precision_score(y_test, y_predicted, average=None):
        print("{0:<10}".format(round(digit, 3)), end="")
    
    print("\nRecall:        ", end="")
    for digit in sklearn.metrics.recall_score(y_test, y_predicted, average=None):
        print("{0:<10}".format(round(digit, 3)), end="")
        
    print("\nF1:            ", end="")
    for digit in sklearn.metrics.f1_score(y_test, y_predicted, average=None):
        print("{0:<10}".format(round(digit, 3)), end="")

In [108]:
data.loc[:, "message"] = data["message"].apply(str.lower).apply(replace_letterdigits)

In [109]:
data["message_no_num"] = data["message"].apply(replace_numbers)

In [110]:
data["message_no_stopwords"] = data["message_no_num"].apply(delete_stopwords)

In [111]:
data["message_stem"] = data["message_no_stopwords"].apply(stem)

In [112]:
data["message_synonyms"] = data["message_stem"].apply(replace_synonyms)

In [113]:
print(data.shape)
data.head()

(266, 6)


Unnamed: 0,message,category,message_no_num,message_no_stopwords,message_stem,message_synonyms
0,23800 доплаты за отпуска в крок,income,NUM доплаты за отпуска в крок,NUM доплаты отпуска крок,NUM доплат отпуск крок,NUM доплат отпуск крок
1,54782.52 зарплата,income,NUM зарплата,NUM зарплата,NUM зарплат,NUM зарплат
2,6086.5 зарплата,income,NUM зарплата,NUM зарплата,NUM зарплат,NUM зарплат
3,1500 кошулька дала немного денег,income,NUM кошулька дала немного денег,NUM кошулька дала немного денег,NUM кошульк дал немн денег,NUM кошульк дал немн денег
4,зп 36000,income,зп NUM,зп NUM,зп NUM,зарплат NUM


In [114]:
X = data["message_stem"]
Y = data["category"]

In [115]:
count_vectorizer = CountVectorizer()
X_count = count_vectorizer.fit_transform(X)
X_count.shape

(266, 165)

In [116]:
tfidf = TfidfTransformer()
X_tfidf = tfidf.fit_transform(X_count)
X_tfidf.shape

(266, 165)

In [117]:
X_train, X_test, Y_train, Y_test = sklearn.model_selection.train_test_split(X_tfidf, Y, test_size=0.3, random_state=16)

#### Наивный Байес

In [118]:
bayes = MultinomialNB(alpha=0.01)
bayes.fit(X_train, Y_train)

predicted = bayes.predict(X_test)

In [119]:
print_scores(Y_test, predicted, bayes.classes_.tolist())
print("\n\n\nConfusion matrix:\n")
print_conf_matrix(Y_test, predicted, bayes.classes_.tolist())

Accuracy: 0.8375

               event     income    other     outcome   timespend weight    
Precision:     1.0       0.6       0.0       0.983     1.0       0.091     
Recall:        0.5       0.75      0.0       0.851     1.0       1.0       
F1:            0.667     0.667     0.0       0.912     1.0       0.167     


Confusion matrix:

          event     income    other     outcome   timespend weight    
event     1         <1>       0         0         0         0         
income    0         3         0         <1>       0         0         
other     0         0         0         0         0         <1>       
outcome   0         <1>       0         57        0         <9>       
timespend 0         0         0         0         5         0         
weight    0         0         0         0         0         1         


  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


#### SVM

In [90]:
svm = sklearn.linear_model.SGDClassifier(loss="hinge", alpha=1e-3, max_iter=5)
svm.fit(X_train, Y_train)

predicted = svm.predict(X_test)

In [91]:
print_scores(Y_test, predicted, bayes.classes_.tolist())
print("\n\n\nConfusion matrix:\n")
print_conf_matrix(Y_test, predicted, bayes.classes_.tolist())

Accuracy: 0.925

               event     income    other     outcome   timespend weight    
Precision:     1.0       1.0       0.0       0.918     1.0       1.0       
Recall:        0.5       0.25      0.0       1.0       0.8       1.0       
F1:            0.667     0.4       0.0       0.957     0.889     1.0       


Confusion matrix:

          event     income    other     outcome   timespend weight    
event     1         0         0         <1>       0         0         
income    0         1         0         <3>       0         0         
other     0         0         0         <1>       0         0         
outcome   0         0         0         67        0         0         
timespend 0         0         0         <1>       4         0         
weight    0         0         0         0         0         1         


  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
