In [250]:
from gensim.models import KeyedVectors
from langdetect import detect
from sklearn.neighbors import KNeighborsClassifier
import numpy as np
import pandas as pd
import codecs
import nltk
import os
import re

import warnings
warnings.filterwarnings('ignore')

In [133]:
def read_requests_data():
    all_requests = []
    for filename in os.listdir("./1551/"):
        category = filename[:-4]
        with open("./1551/" + filename) as f:
            file_content = f.read()
            file_requests = re.split(r'\n{2,}\d{6,}\n', file_content)
            for request in file_requests:
                request_split = re.split(r'\n', request)
                if request_split and request_split[0] != "":
                    if re.match("^\d{6,}$", request_split[0].strip()):
                        request_text = " ".join(request_split[1:])
                    else:
                        request_text = " ".join(request_split)
                    
                    request_text = request_text.replace("\n", " ")\
                            .replace("[\d\+\\\/\-\.]+", " ").strip().lower()
                    try:
                        if detect(request_text) != 'uk':
                            continue
                        all_requests.append((request_text, category))
                    except Exception:
                        print("Not detected")
                        print("-----------------------")
                        print(request_split)
                        
    return pd.DataFrame(all_requests, columns=["request", "category"])
                
def read_embeddings():
    emgeddings = KeyedVectors.load_word2vec_format("./news.lowercased.tokenized.word2vec.300d", binary=False)
    return emgeddings

In [70]:
embeddings = read_embeddings()

In [134]:
def get_stopwords():
    with open("./stopwords.txt") as f:
        return f.readlines()

In [135]:
requests_data = read_requests_data()

Not detected
-----------------------
['2847929']
Not detected
-----------------------
['2839007']


In [136]:
requests_data.count()

request     62078
category    62078
dtype: int64

In [137]:
requests_data.head()

Unnamed: 0,request,category
0,згідно даних сайту містобудівного кадастру м. ...,Контроль-за-станом-рекламних-засобів
1,добрий день. прохання навести порядок з рекла...,Контроль-за-станом-рекламних-засобів
2,"скарга. на нижній набережній, що йде вздовж в...",Контроль-за-станом-рекламних-засобів
3,на огородженні мосту(каховский путепровод №1) ...,Контроль-за-станом-рекламних-засобів
4,прошу перевірити законність конструкцій та вив...,Контроль-за-станом-рекламних-засобів


In [46]:
def cleanupDoc(s):
    stopset = set(get_stopwords())
    tokens = nltk.word_tokenize(s)
    cleanup = [token.lower() for token in tokens if token.lower() not in stopset and  len(token)>2]
    return cleanup

In [138]:
requests_data['cleaned_request'] = requests_data['request'].apply(cleanupDoc)

In [265]:
def request_to_vector(req):
    req_words_embeddings=[]
    for word in req:
        # filter out numbers and links
        if re.match(r"^[\d\-\.\#\№\\\/\s]+$", word) or word.startswith("http") or word.startswith("//"):
            continue
        try:
            emb = embeddings.get_vector(word)
            req_words_embeddings.append(emb)
        except:
            continue
    req_words_embeddings = np.array(req_words_embeddings)
    return np.sum(req_words_embeddings, axis=0).reshape(-1)
    

In [266]:
requests_data['request_embeddings'] = requests_data['cleaned_request'].apply(request_to_vector)

In [282]:
from sklearn.preprocessing import LabelEncoder


le = LabelEncoder()
le.fit(requests_data['category'].unique())
requests_data['category_encoded'] = le.transform(requests_data['category'])

In [293]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(requests_data['request_embeddings'].to_list(), 
                                                    requests_data['category_encoded'].to_list(), 
                                                    test_size=0.3, random_state=1, 
                                                    stratify=requests_data['category_encoded'].to_list())

In [294]:
knn_model = KNeighborsClassifier(n_neighbors=188, metric='cosine')
knn_model.fit(X=X_train, y=y_train)

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='cosine',
           metric_params=None, n_jobs=1, n_neighbors=188, p=2,
           weights='uniform')

In [295]:
predictions = knn_model.predict(X_test)

In [301]:
from sklearn.metrics import f1_score, classification_report

print("F1 score =  ", f1_score(y_test, predictions, average="micro"))

F1 score =   0.3562896804046056


In [302]:
print(classification_report(y_test, predictions))

             precision    recall  f1-score   support

          0       0.30      0.03      0.06        93
          1       0.30      0.40      0.34       209
          2       0.00      0.00      0.00        84
          3       0.00      0.00      0.00        22
          4       0.00      0.00      0.00        25
          5       0.00      0.00      0.00        28
          6       0.00      0.00      0.00        15
          7       0.28      0.71      0.40       191
          8       1.00      0.13      0.23        38
          9       1.00      0.07      0.14        27
         10       0.00      0.00      0.00        24
         11       0.35      0.09      0.15        96
         12       0.00      0.00      0.00        26
         13       0.84      0.42      0.56        38
         14       0.67      0.03      0.06        65
         15       0.00      0.00      0.00        17
         16       0.48      0.25      0.33       123
         17       0.68      0.12      0.20   

As we can see, the f1 score for KNN classifier with cosine similarity metric is only 0.36.  But as we can see, that lot of categories has 0 recall and precision. We can try to filter out categories without enough requests represented.

In [316]:
requests_data.groupby("category").count()

Unnamed: 0_level_0,request,cleaned_request,request_embeddings,is_request_embeddings_empty,category_encoded
category,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Інші-Подяки,310,310,310,310,310
Інші-технічні-недоліки-стану-ліфту,696,696,696,696,696
Аварійний--травмонебезпечний-стан-утримання-об-єктів-благоустрою,280,280,280,280,280
Бажаючі-отримати--Картки-киянина--КК--,72,72,72,72,72
Будівництво--дооблаштування-спортивних-майданчиків,82,82,82,82,82
Будівництво-АЗС,93,93,93,93,93
Будівництво-в-нічний-час,51,51,51,51,51
Будівництво-дооблаштування-дитячого-майданчику,636,636,636,636,636
Будівництво-та-реконструкція-об-єктів-освіти,128,128,128,128,128
Взаємовідносини-з-сусідами,91,91,91,91,91


In [340]:
s = requests_data.groupby("category").count()['request']
categories_to_use = s[s > 500].index.to_list()

requests_data = requests_data[requests_data['category'].isin(categories_to_use)]
requests_data.count()

request                        36133
category                       36133
cleaned_request                36133
request_embeddings             36133
is_request_embeddings_empty    36133
category_encoded               36133
request_tfidf                  36133
dtype: int64

In [434]:
le = LabelEncoder()
le.fit(requests_data['category'].unique())
requests_data['category_encoded'] = le.transform(requests_data['category'])
X_train, X_test, y_train, y_test = train_test_split(requests_data['request_embeddings'].to_list(), 
                                                    requests_data['category_encoded'].to_list(), 
                                                    test_size=0.3, random_state=1, 
                                                    stratify=requests_data['category_encoded'].to_list())

knn_model = KNeighborsClassifier(n_neighbors=26, metric='cosine')
knn_model.fit(X=X_train, y=y_train)
predictions = knn_model.predict(X_test)
print("F1 score =  ", f1_score(y_test, predictions, average="micro"))
print(classification_report(y_test, predictions))

F1 score =   0.5955719557195572
             precision    recall  f1-score   support

          0       0.44      0.47      0.45       209
          1       0.66      0.80      0.72       191
          2       0.50      0.46      0.48       388
          3       0.75      0.09      0.16       234
          4       0.61      0.92      0.73      1970
          5       0.61      0.63      0.62       942
          6       0.60      0.18      0.28       297
          7       0.44      0.17      0.24       162
          8       0.51      0.53      0.52       379
          9       0.88      0.42      0.57       248
         10       0.68      0.60      0.64       364
         11       0.64      0.56      0.60       335
         12       0.65      0.32      0.43       192
         13       0.36      0.13      0.20       223
         14       0.61      0.79      0.69       663
         15       0.69      0.36      0.48       193
         16       0.72      0.67      0.69       331
         17  

As we can see for categories with enough number of requests we can see much better accuracy with f1 score 0.59. So for first 26 categories with more than 500 requests we have accuracy 0.59.


Next we are going to add some enhancements to improve the accuracy.


### Improvements

As we actually did already previous preprocessing and filtering out documents with small number of elements and filtered out numbers, links, etc we are going to add lemmatization with lemmatized embeddings, use TF-IDF and try to use another models, like SVM, RandomForest and Logistic Regression.

##### TF-IDF

As it is super hard for my laptop to compute TF-IDF and KNN for all 36 thousands of records, i will take a half of it and try to calculate it on smaller amount of data

In [353]:
from sklearn.feature_extraction.text import TfidfVectorizer

tfidf = TfidfVectorizer(min_df=0.0, dtype=np.float32)

X_train, X_test, y_train, y_test = train_test_split(requests_data['cleaned_request'], 
                                                    requests_data['category_encoded'], 
                                                    test_size=0.5, random_state=1, 
                                                    stratify=requests_data['category_encoded'].to_list())

X_train, X_test, y_train, y_test = train_test_split(X_train,
                                                    y_train, 
                                                    test_size=0.3, random_state=1, 
                                                    stratify=y_train.to_list())

tfidf_train = tfidf.fit_transform(X_train.str.join(" ")).toarray()
tfidf_test = tfidf.transform(X_test.str.join(" ")).toarray()

In [354]:
knn_model = KNeighborsClassifier(n_neighbors=26, metric='cosine')
knn_model.fit(X=tfidf_train, y=y_train)
predictions = knn_model.predict(tfidf_test)
print("F1 score =  ", f1_score(y_test, predictions, average="micro"))
print(classification_report(y_test, predictions))

F1 score =   0.6453874538745388
             precision    recall  f1-score   support

          0       0.55      0.11      0.18       104
          1       0.85      0.91      0.88        95
          2       0.79      0.29      0.43       194
          3       0.83      0.25      0.38       117
          4       0.63      0.96      0.76       985
          5       0.61      0.85      0.71       471
          6       0.69      0.17      0.27       149
          7       0.52      0.41      0.46        81
          8       0.41      0.69      0.52       189
          9       0.86      0.66      0.75       124
         10       0.63      0.91      0.74       182
         11       0.81      0.26      0.39       167
         12       0.90      0.20      0.32        96
         13       0.30      0.21      0.25       112
         14       0.75      0.78      0.77       332
         15       0.90      0.44      0.59        97
         16       0.80      0.51      0.62       165
         17  

So we can see increasing of f1 score by 0.05 with using TF-IDF instead of sum of embeddings of all the tokens only on the half of the dataset.

#### Embeddings of normalized form

In [371]:
def read_lemmatized_embeddings():
    emgeddings = KeyedVectors.load_word2vec_format("./fiction.cased.lemmatized.word2vec.300d/data", binary=False)
    return emgeddings

lemmatized_embeddings = read_lemmatized_embeddings()

In [467]:
import pymorphy2

pymorphy = pymorphy2.MorphAnalyzer(lang='uk')

def get_lemmatized_embeddings(s):
    string = " ".join(s)
    words = pymorphy.parse(string)[0].normal_form.split(" ")
    req_words_embeddings=[]
    for word in words:
        # filter out numbers and links
        if re.match(r"^[\d\-\.\#\№\\\/\s]+$", word) or word.startswith("http") or word.startswith("//"):
            continue
        try:
            emb = lemmatized_embeddings.get_vector(word)
            req_words_embeddings.append(emb)
        except:
            continue
    req_words_embeddings = np.array(req_words_embeddings)
    return np.sum(req_words_embeddings, axis=0).reshape(-1)
        

requests_data['request_lemm_embeddings'] = requests_data['cleaned_request'].apply(get_lemmatized_embeddings)

In [469]:
X_train, X_test, y_train, y_test = train_test_split(requests_data['request_lemm_embeddings'].to_list(), 
                                                    requests_data['category_encoded'].to_list(), 
                                                    test_size=0.3, random_state=1, 
                                                    stratify=requests_data['category_encoded'].to_list())

knn_model = KNeighborsClassifier(n_neighbors=26, metric='cosine')
knn_model.fit(X=X_train, y=y_train)
predictions = knn_model.predict(X_test)
print("F1 score =  ", f1_score(y_test, predictions, average="micro"))
print(classification_report(y_test, predictions))

I don't know what is the issue, but i've wasted more than severall hours to find the reason of the problem with lemmatized embeddings. The logic is the same, but i receive the problem with shapes. Didn't resolve it, so left as it is because of the lack of time.

##### Logistic regression

In [412]:
from sklearn.linear_model import LogisticRegression

X_train, X_test, y_train, y_test = train_test_split(requests_data['request_embeddings'].to_list(), 
                                                    requests_data['category_encoded'].to_list(), 
                                                    test_size=0.3, random_state=1, 
                                                    stratify=requests_data['category_encoded'].to_list())
lr = LogisticRegression(random_state=0, solver='lbfgs',
                        multi_class='multinomial')
lr.fit(X=X_train, y=y_train)
predictions = lr.predict(X_test)
print("F1 score =  ", f1_score(y_test, predictions, average="micro"))
print(classification_report(y_test, predictions))

F1 score =   0.7163284132841328
             precision    recall  f1-score   support

          0       0.60      0.57      0.59       209
          1       0.82      0.90      0.86       191
          2       0.57      0.57      0.57       388
          3       0.55      0.49      0.52       234
          4       0.83      0.85      0.84      1970
          5       0.75      0.77      0.76       942
          6       0.59      0.53      0.56       297
          7       0.49      0.49      0.49       162
          8       0.60      0.67      0.63       379
          9       0.81      0.78      0.80       248
         10       0.74      0.77      0.75       364
         11       0.68      0.64      0.66       335
         12       0.62      0.63      0.63       192
         13       0.39      0.26      0.31       223
         14       0.84      0.84      0.84       663
         15       0.76      0.70      0.73       193
         16       0.77      0.74      0.75       331
         17  

##### SVM

In [413]:
from sklearn.svm import LinearSVC

X_train, X_test, y_train, y_test = train_test_split(requests_data['request_embeddings'].to_list(), 
                                                    requests_data['category_encoded'].to_list(), 
                                                    test_size=0.3, random_state=1, 
                                                    stratify=requests_data['category_encoded'].to_list())
svm = LinearSVC(random_state=0, tol=1e-5)
svm.fit(X=X_train, y=y_train)
predictions = svm.predict(X_test)
print("F1 score =  ", f1_score(y_test, predictions, average="micro"))
print(classification_report(y_test, predictions))

F1 score =   0.5593173431734317
             precision    recall  f1-score   support

          0       0.71      0.08      0.15       209
          1       0.86      0.73      0.79       191
          2       0.64      0.12      0.21       388
          3       0.44      0.48      0.46       234
          4       0.83      0.67      0.74      1970
          5       0.35      0.93      0.51       942
          6       0.53      0.08      0.14       297
          7       0.42      0.33      0.37       162
          8       0.52      0.50      0.51       379
          9       0.86      0.65      0.74       248
         10       0.78      0.61      0.68       364
         11       0.76      0.48      0.59       335
         12       0.59      0.08      0.15       192
         13       0.37      0.17      0.23       223
         14       0.84      0.66      0.74       663
         15       0.76      0.59      0.66       193
         16       0.73      0.60      0.66       331
         17  

#### Non-linear SVM

In [415]:
from sklearn.svm import SVC

X_train, X_test, y_train, y_test = train_test_split(requests_data['request_embeddings'].to_list(), 
                                                    requests_data['category_encoded'].to_list(), 
                                                    test_size=0.3, random_state=1, 
                                                    stratify=requests_data['category_encoded'].to_list())
svm = SVC(random_state=0, degree=3, C=3, tol=1e-5)
svm.fit(X=X_train, y=y_train)
predictions = svm.predict(X_test)
print("F1 score =  ", f1_score(y_test, predictions, average="micro"))
print(classification_report(y_test, predictions))

F1 score =   0.3092250922509225
             precision    recall  f1-score   support

          0       1.00      0.06      0.11       209
          1       1.00      0.17      0.29       191
          2       0.97      0.09      0.16       388
          3       0.98      0.20      0.33       234
          4       0.21      0.99      0.34      1970
          5       0.91      0.20      0.33       942
          6       1.00      0.10      0.18       297
          7       1.00      0.15      0.26       162
          8       0.91      0.19      0.31       379
          9       1.00      0.14      0.25       248
         10       1.00      0.17      0.29       364
         11       1.00      0.09      0.16       335
         12       0.97      0.19      0.32       192
         13       0.77      0.15      0.25       223
         14       1.00      0.11      0.20       663
         15       1.00      0.10      0.19       193
         16       1.00      0.19      0.32       331
         17  

#### Random Forest

In [441]:
from sklearn.ensemble import RandomForestClassifier

X_train, X_test, y_train, y_test = train_test_split(requests_data['request_embeddings'].to_list(), 
                                                    requests_data['category_encoded'].to_list(), 
                                                    test_size=0.3, random_state=1, 
                                                    stratify=requests_data['category_encoded'].to_list())
rf = RandomForestClassifier(n_estimators=100,random_state=0)
rf.fit(X=X_train, y=y_train)
predictions = rf.predict(X_test)
print("F1 score =  ", f1_score(y_test, predictions, average="micro"))
print(classification_report(y_test, predictions))

F1 score =   0.582380073800738
             precision    recall  f1-score   support

          0       0.54      0.27      0.36       209
          1       0.88      0.50      0.64       191
          2       0.62      0.25      0.35       388
          3       0.94      0.19      0.32       234
          4       0.56      0.96      0.70      1970
          5       0.63      0.63      0.63       942
          6       0.77      0.21      0.33       297
          7       0.93      0.23      0.37       162
          8       0.54      0.55      0.54       379
          9       0.87      0.35      0.50       248
         10       0.69      0.59      0.64       364
         11       0.81      0.32      0.46       335
         12       0.85      0.27      0.41       192
         13       0.59      0.20      0.30       223
         14       0.53      0.81      0.64       663
         15       0.74      0.26      0.39       193
         16       0.78      0.64      0.70       331
         17   

### Neural network approach

#### Feedforward neural net

In [449]:
from sklearn.neural_network import MLPClassifier

X_train, X_test, y_train, y_test = train_test_split(requests_data['request_embeddings'].to_list(), 
                                                    requests_data['category_encoded'].to_list(), 
                                                    test_size=0.3, random_state=1, 
                                                    stratify=requests_data['category_encoded'].to_list())

mlp = MLPClassifier(solver='adam', alpha=1e-5, learning_rate="adaptive",
                    hidden_layer_sizes=(1000, 100), random_state=1)


mlp.fit(X_train, y_train)
predictions = mlp.predict(X_test)
print("F1 score =  ", f1_score(y_test, predictions, average="micro"))
print(classification_report(y_test, predictions))

F1 score =   0.7125461254612546
             precision    recall  f1-score   support

          0       0.58      0.54      0.56       209
          1       0.84      0.83      0.83       191
          2       0.58      0.59      0.59       388
          3       0.51      0.62      0.56       234
          4       0.82      0.87      0.84      1970
          5       0.76      0.78      0.77       942
          6       0.63      0.50      0.56       297
          7       0.47      0.59      0.52       162
          8       0.65      0.59      0.62       379
          9       0.73      0.76      0.75       248
         10       0.73      0.76      0.74       364
         11       0.71      0.60      0.65       335
         12       0.59      0.70      0.64       192
         13       0.46      0.45      0.46       223
         14       0.80      0.82      0.81       663
         15       0.65      0.73      0.69       193
         16       0.81      0.73      0.77       331
         17  

#### LSTM 
classifier template was taken from medium article about multiclass classifier with LSTM:
https://towardsdatascience.com/multi-class-text-classification-with-lstm-1590bee1bd17

In [483]:
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Dense, Embedding, LSTM, SpatialDropout1D
from keras.utils.np_utils import to_categorical
from keras.callbacks import EarlyStopping
from keras.layers import Dropout

MAX_NB_WORDS = 50000
MAX_SEQUENCE_LENGTH = 250
EMBEDDING_DIM = 300


tokenizer = Tokenizer(num_words=MAX_NB_WORDS, filters='!"#$%&()*+,-./:;<=>?@[\]^_`{|}~', lower=True)
tokenizer.fit_on_texts(requests_data['cleaned_request'].str.join(" ").values)
word_index = tokenizer.word_index

X = tokenizer.texts_to_sequences(requests_data['cleaned_request'].str.join(" ").values)
X = pad_sequences(X, maxlen=MAX_SEQUENCE_LENGTH)


Y = pd.get_dummies(requests_data['category']).values


X_train, X_test, Y_train, Y_test = train_test_split(X,Y, test_size = 0.30, random_state = 1)
print(X_train.shape,Y_train.shape)
print(X_test.shape,Y_test.shape)


model = Sequential()
model.add(Embedding(MAX_NB_WORDS, EMBEDDING_DIM, input_length=X.shape[1]))
model.add(SpatialDropout1D(0.2))
model.add(LSTM(100, dropout=0.2, recurrent_dropout=0.2))
model.add(Dense(26, activation='softmax'))
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

epochs = 5
batch_size = 128

history = model.fit(X_train, Y_train, epochs=epochs, batch_size=batch_size,validation_split=0.1,callbacks=[EarlyStopping(monitor='val_loss', patience=3, min_delta=0.0001)])

accr = model.evaluate(X_test,Y_test)
print('Test set\n  Loss: {:0.3f}\n  Accuracy: {:0.3f}'.format(accr[0],accr[1]))


(25293, 250) (25293, 26)
(10840, 250) (10840, 26)
Train on 22763 samples, validate on 2530 samples
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Test set
  Loss: 0.831
  Accuracy: 0.734


### Conclusion

As result we have next accuracy(f1 score) for different models:
    - baseline (KNN with sum of embeddings) - 0.59
    - KNN with TF-IDF document vectors - 0.64
    - KNN with sum of lemmatized embeddings - (had errors with it)
    - Logistic regression with sum of embeddings - 0.71
    - Linear SVM - 0.55
    - Non-Linear SVM (degree = 3) - 0.3
    - Random Forest - 0.58
    - FeedForward network - 0.71
    - LSTM - 0.734
    
Next improvements, which can be done is to fix lemmatized embeddings sum usage as a feature vector, also to combine different vectors (simple embeddings, normalized embeddings, tf-idf vectors) with models we used to find the best accuracy and also use cross-validation to find the best hyperparameters for every model.

Also it would be good to try BiLSTM in our case and also make some experiments with different network topologies.