In [200]:
import pandas as pd
import sklearn
import numpy as np

from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import SnowballStemmer
from pymorphy2 import MorphAnalyzer

from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import CountVectorizer

from sklearn.naive_bayes import MultinomialNB

from catboost import CatBoostClassifier

from sklearn.ensemble import RandomForestClassifier

import common
import extractors
import constants

In [3]:
morph = MorphAnalyzer()

In [81]:
stemmer = SnowballStemmer("russian")

def stem(string):
    
    tokens = [stemmer.stem(token) for token in word_tokenize(string)]

    return " ".join(tokens)

In [82]:
def first_form(string):
    
    tokens = word_tokenize(string)
    
    for i in range(len(tokens)):
        if tokens[i].islower():
            parse = morph.parse(tokens[i])
            tokens[i] = parse[0].normal_form
    
    return " ".join(tokens)

In [84]:
synonyms = {
        "зарплата": ["зп"],
        "электричка": ["элка"]
    }

replace_map = {}

for key, value in synonyms.items():
    for syn in value:
        replace_map[first_form(syn)] = first_form(key)

def replace_synonyms(string):
    
    tokens = word_tokenize(string)
    
    for i, token in enumerate(tokens):
        tokens[i] = replace_map.get(token, token)
    
    
    return " ".join(tokens)

In [85]:
russian_stopwords = stopwords.words("russian")

def delete_stopwords(string):
    
    tokens = word_tokenize(string)
    
    for token in tokens:
        if token in russian_stopwords:
            tokens.remove(token)
    
    return " ".join(tokens)

In [86]:
def print_conf_matrix(y_test, y_predicted, categories):
    
    ma = sklearn.metrics.confusion_matrix(y_test, y_predicted).astype(str)
    ma = np.insert(ma, 0, categories, axis=1)
    ma = np.insert(ma, 0, [""] + categories, axis=0)
    
    for row in range(ma.shape[0]):
        for col in range(ma.shape[1]):
            if col != row and ma[row, col] != "0" and col > 0 and row > 0:
                print("{0:<10}".format("<" + ma[row, col] + ">"), end="")
            else:
                print("{0:<10}".format(ma[row, col]), end="")
        print()

In [87]:
def print_scores(y_test, y_predicted, categories):
    
    print("Accuracy: {0}".format(sklearn.metrics.accuracy_score(y_test, y_predicted)), end="\n\n")
    
    print(" " * 15, end="")
    for cat in categories:
        print("{0:<10}".format(cat), end="")
    
    print("\nPrecision:     ", end="")
    for digit in sklearn.metrics.precision_score(y_test, y_predicted, average=None):
        print("{0:<10}".format(round(digit, 3)), end="")
    
    print("\nRecall:        ", end="")
    for digit in sklearn.metrics.recall_score(y_test, y_predicted, average=None):
        print("{0:<10}".format(round(digit, 3)), end="")
        
    print("\nF1:            ", end="")
    for digit in sklearn.metrics.f1_score(y_test, y_predicted, average=None):
        print("{0:<10}".format(round(digit, 3)), end="")

In [88]:
data = pd.read_csv("./messages/30_09_2017_19_37.csv", delimiter=";")

In [89]:
data.loc[:, "message"] = data["message"].apply(str.lower).apply(common.replace_letterdigits)
data.loc[:, "message"] = data["message"].apply(common.delete_punctuation)

In [90]:
data["timepointer"] = data["message"].apply(common.replace_timepointer)

In [91]:
data["datepointer"] = data["timepointer"].apply(common.replace_datepointer)

In [92]:
data["weightpointer"] = data["datepointer"].apply(common.replace_weightpointer)

In [93]:
data["no_num"] = data["weightpointer"].apply(common.replace_numbers)

In [94]:
data["no_stopwords"] = data["no_num"].apply(delete_stopwords)

In [95]:
data["first_form"] = data["no_stopwords"].apply(first_form)

In [96]:
data["synonyms"] = data["first_form"].apply(replace_synonyms)

In [97]:
print(data.shape)
data.head(10)

(266, 9)


Unnamed: 0,message,category,timepointer,datepointer,weightpointer,no_num,no_stopwords,first_form,synonyms
0,23800 доплаты за отпуска в крок,income,23800 доплаты за отпуска в крок,23800 доплаты за отпуска в крок,23800 доплаты за отпуска в крок,NUM доплаты за отпуска в крок,NUM доплаты отпуска крок,NUM доплата отпуск крок,NUM доплата отпуск крок
1,54782.52 зарплата,income,54782.52 зарплата,54782.52 зарплата,54782.52 зарплата,NUM зарплата,NUM зарплата,NUM зарплата,NUM зарплата
2,6086.5 зарплата,income,6086.5 зарплата,6086.5 зарплата,6086.5 зарплата,NUM зарплата,NUM зарплата,NUM зарплата,NUM зарплата
3,1500 кошулька дала немного денег,income,1500 кошулька дала немного денег,1500 кошулька дала немного денег,1500 кошулька дала немного денег,NUM кошулька дала немного денег,NUM кошулька дала немного денег,NUM кошулька дать немного деньга,NUM кошулька дать немного деньга
4,зп 36000,income,зп 36000,зп 36000,зп 36000,зп NUM,зп NUM,зп NUM,зарплата NUM
5,9 сентября олег дал 30000,income,9 сентября олег дал 30000,DATEPOINTER олег дал 30000,DATEPOINTER олег дал 30000,DATEPOINTER олег дал NUM,DATEPOINTER олег дал NUM,DATEPOINTER олег дать NUM,DATEPOINTER олег дать NUM
6,ещё зп 4000,income,ещё зп 4000,ещё зп 4000,ещё зп 4000,ещё зп NUM,ещё зп NUM,ещё зп NUM,ещё зарплата NUM
7,вчера ходил в бассейн проплыл 600 метров,other,вчера ходил в бассейн проплыл 600 метров,DATEPOINTER ходил в бассейн проплыл 600 метров,DATEPOINTER ходил в бассейн проплыл 600 метров,DATEPOINTER ходил в бассейн проплыл NUM метров,DATEPOINTER ходил бассейн проплыл NUM метров,DATEPOINTER ходить бассейн проплыть NUM метр,DATEPOINTER ходить бассейн проплыть NUM метр
8,вес вчера 74.7,weight,вес вчера 74.7,вес DATEPOINTER 74.7,WEIGHTPOINTER DATEPOINTER,WEIGHTPOINTER DATEPOINTER,WEIGHTPOINTER DATEPOINTER,WEIGHTPOINTER DATEPOINTER,WEIGHTPOINTER DATEPOINTER
9,сегодня читаю три товарища,event,сегодня читаю три товарища,DATEPOINTER читаю три товарища,DATEPOINTER читаю три товарища,DATEPOINTER читаю три товарища,DATEPOINTER читаю товарища,DATEPOINTER читать товарищ,DATEPOINTER читать товарищ


In [98]:
X = data["synonyms"]
Y = data["category"]

In [99]:
count_vectorizer = CountVectorizer()
tfidf_vectorizer = TfidfTransformer()

X_count = count_vectorizer.fit_transform(X)
X_tfidf = tfidf_vectorizer.fit_transform(X_count)
X_tfidf.shape

(266, 159)

In [100]:
X_train, X_test, Y_train, Y_test = sklearn.model_selection.train_test_split(X_tfidf, Y, test_size=0.3, random_state=16)

In [101]:
X_train_text, X_test_text, *_ = sklearn.model_selection.train_test_split(X, Y, test_size=0.3, random_state=16)

#### SVM

In [207]:
svm = sklearn.linear_model.SGDClassifier(loss="hinge", alpha=1e-3, max_iter=5)
svm.fit(X_train, Y_train)

predicted = svm.predict(X_test)

In [208]:
classes = svm.classes_.tolist()

print_scores(Y_test, predicted, classes)
print("\n\n\nConfusion matrix:\n")
print_conf_matrix(Y_test, predicted, classes)

Accuracy: 0.9375

               event     income    other     outcome   timespend weight    
Precision:     0.5       1.0       0.0       0.944     1.0       1.0       
Recall:        0.5       0.25      0.0       1.0       1.0       1.0       
F1:            0.5       0.4       0.0       0.971     1.0       1.0       


Confusion matrix:

          event     income    other     outcome   timespend weight    
event     1         0         0         <1>       0         0         
income    0         1         0         <3>       0         0         
other     <1>       0         0         0         0         0         
outcome   0         0         0         67        0         0         
timespend 0         0         0         0         5         0         
weight    0         0         0         0         0         1         


  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


##### Что именно не так классифицировалось

In [116]:
results = pd.DataFrame()

results["text"] = X_test_text
results["y_test"] = Y_test
results["predicted"] = predicted

results[~(results.y_test == results.predicted)]

Unnamed: 0,text,y_test,predicted
7,DATEPOINTER ходить бассейн проплыть NUM метр,other,event
0,NUM доплата отпуск крок,income,outcome
10,сделать зарядка глаз,event,outcome
3,NUM кошулька дать немного деньга,income,outcome
5,DATEPOINTER олег дать NUM,income,outcome


# Классификация без использования text-based фичей

Фичи:
- ~~есть ли указатели даты~~
- ~~если ли указатели времени~~
- ~~если ли указатель веса~~
- сколько слов, которые уже встречались в сообщениях категории outcome
- ~~есть ли цифры~~
- ~~сколько цифр~~
- ~~сколько слов~~

In [141]:
def isin(series, pattern):
    
    return series.str.contains(pattern).astype(int)

def count_tokens(string):
    
    return len(word_tokenize(string))

def count_num_pointers(string):
    
    return len([token for token in word_tokenize(string) if token == constants.NUM_POINTER])

In [162]:
data_two = pd.DataFrame({"message": data["synonyms"], "category": data["category"]})

In [163]:
data_two["has_num"] = isin(data_two["message"], constants.NUM_POINTER)
data_two["has_datepointer"] = isin(data_two["message"], constants.DATE_POINTER)
data_two["has_weightpointer"] = isin(data_two["message"], constants.WEIGHT_POINTER)
data_two["has_timepointer"] = isin(data_two["message"], constants.TIME_POINTER)

data_two["num_pointers_1"] = data_two["message"].apply(lambda x: count_num_pointers(x) == 1).astype(int)
data_two["num_pointers_2"] = data_two["message"].apply(lambda x: count_num_pointers(x) == 2).astype(int)
data_two["num_pointers_3_more"] = data_two["message"].apply(lambda x: count_num_pointers(x) > 2).astype(int)

In [164]:
data_two["count_tokens"] = data_two["message"].apply(count_tokens)

for i in range(data_two.count_tokens.min(), data_two.count_tokens.max() + 1):
    data_two["count_tokens_{0}".format(i)] = (data_two["count_tokens"] == i).astype(int)

data_two = data_two.drop(["count_tokens"], axis=1)

### CatBoost

In [212]:
X_two = data_two.drop(["message", "category"], axis=1)
Y_two = data_two["category"]

In [213]:
X_train_two, X_test_two, Y_train_two, Y_test_two = sklearn.model_selection.train_test_split(
    X_two, Y_two, test_size=0.3, random_state=16)

In [214]:
le = sklearn.preprocessing.LabelEncoder()
le = le.fit(Y_two)

Y_train_two = le.transform(Y_train_two)

In [282]:
catboost = CatBoostClassifier(iterations=10, depth=2, learning_rate=1, loss_function='MultiClass', verbose=True)
catboost.fit(X_train_two, Y_train_two, verbose=True, cat_features=list(range(len(X_train_two.columns))))

predicted = catboost.predict(X_test_two).astype(int)

feature 11 is redundant categorical feature, skipping it
feature 12 is redundant categorical feature, skipping it
feature 6 is redundant categorical feature, skipping it
0:	learn -0.2806056773	total: 11.8ms	remaining: 106ms
1:	learn -0.2346779348	total: 25.9ms	remaining: 104ms
2:	learn -0.204786185	total: 36.4ms	remaining: 84.9ms
3:	learn -0.1823656592	total: 46.3ms	remaining: 69.5ms
4:	learn -0.166523977	total: 69.9ms	remaining: 69.9ms
5:	learn -0.1549167181	total: 86.6ms	remaining: 57.7ms
6:	learn -0.1488596623	total: 97ms	remaining: 41.6ms
7:	learn -0.144635182	total: 104ms	remaining: 26.1ms
8:	learn -0.1393458718	total: 112ms	remaining: 12.5ms
9:	learn -0.1328258752	total: 120ms	remaining: 0us


In [283]:
predicted = le.inverse_transform(predicted)

In [284]:
print_scores(Y_test_two, predicted, le.inverse_transform(catboost.classes_))

Accuracy: 0.925

               event     income    other     outcome   timespend weight    
Precision:     0.667     0.0       0.0       0.93      1.0       1.0       
Recall:        1.0       0.0       0.0       0.985     1.0       1.0       
F1:            0.8       0.0       0.0       0.957     1.0       1.0       

  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


### RandomForest

In [236]:
rf = RandomForestClassifier(n_estimators=100)
rf.fit(X_train_two, Y_train_two)
predicted = rf.predict(X_test_two)

In [237]:
predicted = le.inverse_transform(predicted)
classes_two = le.inverse_transform(rf.classes_)

In [265]:
print_scores(Y_test_two, predicted, classes_two)

Accuracy: 0.925

               event     income    other     outcome   timespend weight    
Precision:     0.667     0.0       0.0       0.943     1.0       1.0       
Recall:        1.0       0.0       0.0       0.985     1.0       1.0       
F1:            0.8       0.0       0.0       0.964     1.0       1.0       

  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
