In [9]:
import os
from preprocessing import TextCleaner
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.metrics import accuracy_score, precision_score, f1_score, recall_score
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split, cross_val_score, KFold
from sklearn.feature_selection import SelectKBest, chi2
import eli5
import json
import joblib

# Handle doc/docx/pdf/rtf
import textract
import docx2txt
import fitz
from striprtf.striprtf import rtf_to_text

In [2]:
def read_file(filename: str):
    if filename.endswith("docx"):
        text = docx2txt.process(filename)
    elif filename.endswith("pdf"):
        doc = fitz.open(filename)
        text = []
        for page in doc:
            text.append(page.get_text())
        text = " ".join(text)
    elif filename.endswith("doc"):
        text = reinterpret(textract.process(filename))
        text = remove_convert_info(text)
    elif filename.endswith("rtf"):
        with open(filename) as f:
            content = f.read()
            text = rtf_to_text(content)
    else:
        raise ValueError(
            "Does not support the current file extension, currently supported files: 'docx', 'doc' and 'pdf'"
        )
    return text


def read_json(filename: str):
    with open(filename, "r") as f:
        classes = json.load(f)
    return classes


def reinterpret(text: str):
    return text.decode('utf8')


def remove_convert_info(text: str):
    for i, s in enumerate(text):
        if s == ":":
            break
    return text[i + 6:]


def save_cls2index(filename: str = "hacka-aka-embedika/cls2index.json"):
    with open(filename, "w") as f:
        json.dump(cls2index, f, ensure_ascii=False)


def metrics(true, preds):
    acc = accuracy_score(true, preds)
    p = precision_score(true, preds, average='micro')
    f1 = f1_score(true, preds, average='micro')
    recall = recall_score(true, preds, average='micro')
    return {
        "accuracy_score": acc,
        "precision_score": p,
        "f1_score": f1,
        "recall_score": recall,
    }

In [3]:
json_file = "hacka-aka-embedika/classes.json"
dataset_path = "hacka-aka-embedika/docs"
classes = read_json(json_file)

In [4]:
data = {"filename": [], "text": [], "class": []}
cls2index = {}
for filename, cls in classes.items():
    print(filename)
    data["filename"].append(filename)
    text = read_file(os.path.join(dataset_path, filename))
    data["text"].append(text)
    cls = cls.strip()
    data["class"].append(cls)
    if cls not in cls2index:
        cls2index[cls] = len(cls2index)

5908cb5da047d6c9e6dfea6337fb3189.doc
14711e4fc8e56f0c75856c8837ec04cb.doc
7eb67b5aecf3f3190aab0a5f8ea32172.docx
b40a9d048b199d5f4db62a6a2335f2a0.pdf
84fec112d02288861e7af59f468131fb.docx
f6377999f8a5aa9a09b03e428ac93153.doc
a525f050cef10dee3a42468daec064ff.doc
bec0aa38d1383172690a18d16b07f154.doc
214d620d9c54bc83111277dd872d3cb2.pdf
d143c89d002fcef3e2bd2efdb4966f55.doc
2fd747f38e30ae7ce1c9d6e3b907ac5d.doc
4c2c295e81f4a6c3e669e8f76c6ce423.docx
64f58bc6e1207a570a38d771609b2cf1.docx
7ecd641f2ad81961c17455ed3ebeb2ab.doc
4e583dc5a5f1499fd2408f3152589f2d.doc
79104075f8b2ff971d51c495e67af52c.pdf
19e2becdb0f10e1c16a5a2460f3a84a2.pdf
0f7f507d0af90aba3c35484de016d8b4.doc
69ab7557dee21939aa7432b23a54cb2b.doc
8b82f3c800e486d9da9a13c98f7a40d6.doc
856860329f573bbaf158e1eafa885ba5.docx
35b6a0f57d909507c5aa9a8972b15f35.pdf
f28d4a853be12515dae73a5912bc5b41.doc
2c758805e2917306e6cbb079e2adcfcf.rtf
4db6b233fda895c3bffcb5fdc5b8e1de.rtf
18d573815c15b4e798bdfbfb52fb2f43.docx
d54c0b06162cf7cf57e8b1e7356aa204

In [5]:
df = pd.DataFrame.from_dict(data)
df["cls"] = df["class"].apply(lambda row: cls2index[row])
df

Unnamed: 0,filename,text,class,cls
0,5908cb5da047d6c9e6dfea6337fb3189.doc,\n﻿\t\tДОГОВОР \n\nг. Москва\n«__» ________ 20...,Договоры поставки,0
1,14711e4fc8e56f0c75856c8837ec04cb.doc,"\n﻿\n\nДоговор №______________\n\n\n Дата, мес...",Договоры поставки,0
2,7eb67b5aecf3f3190aab0a5f8ea32172.docx,ДОГОВОР ЗАКУПКИ № __________/\n\n\n\nг.\t\t\t\...,Договоры поставки,0
3,b40a9d048b199d5f4db62a6a2335f2a0.pdf,\n \n \nДОГОВОР ПОСТАВКИ № 1 \nг. Москва \n ...,Договоры поставки,0
4,84fec112d02288861e7af59f468131fb.docx,Договор № {НомерДокумента}\n\n\n\n{ДатаДокуме...,Договоры поставки,0
...,...,...,...,...
114,f57fe87f15a6dee2b17e804421be63b5.pdf,Страница 1 из 8 \n \n \nДОГОВОР КУПЛИ-ПРОДАЖИ ...,Договоры купли-продажи,4
115,1ea8809d696a4bd6a2076fbc6fd28c23.doc,\n﻿ДОГОВОР\n\nг. Москва\n«_____» _____________...,Договоры купли-продажи,4
116,57962abd8dbe0ce0c7056896ee4501f1.doc,\n﻿ПРЕДВАРИТЕЛЬНЫЙ ДОГОВОР\n\nг. Москва\n«____...,Договоры купли-продажи,4
117,2c81df29db63aebf495106881a52188f.doc,\n﻿ДОГОВОР\n№ __\n\nг.________________\t\t\t\t...,Договоры купли-продажи,4


In [44]:
cleaner = TextCleaner()
df["clean_text"] = df["text"].apply(lambda row: cleaner.execute(row))
df

Unnamed: 0,filename,text,class,cls,clean_text
0,5908cb5da047d6c9e6dfea6337fb3189.doc,\n﻿\t\tДОГОВОР \n\nг. Москва\n«__» ________ 20...,Договоры поставки,0,договор г москва г общество ограничить ответст...
1,14711e4fc8e56f0c75856c8837ec04cb.doc,"\n﻿\n\nДоговор №______________\n\n\n Дата, мес...",Договоры поставки,0,договор дата место заключение населенный пункт...
2,7eb67b5aecf3f3190aab0a5f8ea32172.docx,ДОГОВОР ЗАКУПКИ № __________/\n\n\n\nг.\t\t\t\...,Договоры поставки,0,договор закупка г г общество ограничить ответс...
3,b40a9d048b199d5f4db62a6a2335f2a0.pdf,\n \n \nДОГОВОР ПОСТАВКИ № 1 \nг. Москва \n ...,Договоры поставки,0,договор поставка г москва декабрь г общество о...
4,84fec112d02288861e7af59f468131fb.docx,Договор № {НомерДокумента}\n\n\n\n{ДатаДокуме...,Договоры поставки,0,договор номердокумент датадокумент названиекон...
...,...,...,...,...,...
114,f57fe87f15a6dee2b17e804421be63b5.pdf,Страница 1 из 8 \n \n \nДОГОВОР КУПЛИ-ПРОДАЖИ ...,Договоры купли-продажи,4,страница договор куплипродажа оборудование быв...
115,1ea8809d696a4bd6a2076fbc6fd28c23.doc,\n﻿ДОГОВОР\n\nг. Москва\n«_____» _____________...,Договоры купли-продажи,4,договор г москва год гр рф год рождение фамили...
116,57962abd8dbe0ce0c7056896ee4501f1.doc,\n﻿ПРЕДВАРИТЕЛЬНЫЙ ДОГОВОР\n\nг. Москва\n«____...,Договоры купли-продажи,4,предварительный договор г москва год гр рф год...
117,2c81df29db63aebf495106881a52188f.doc,\n﻿ДОГОВОР\n№ __\n\nг.________________\t\t\t\t...,Договоры купли-продажи,4,договор г г именовать дальнейший продавец лицо...


In [59]:
log_reg = LogisticRegression(solver='newton-cg')
vectorizer = CountVectorizer()
pipe = Pipeline(
  steps=[
    ("vect", vectorizer),
    # ("feature_selection", SelectKBest(chi2, k=10)),
    ("reg", log_reg)])

In [60]:
X_train, X_test, y_train, y_test = train_test_split(df["clean_text"].values.tolist(), df["cls"].values.tolist(), test_size=0.2, random_state=42)

In [74]:
kf = KFold(n_splits=5, random_state=42, shuffle=True)
y = np.array(y_train)
X = np.array(X_train)
for i, (train_index, test_index) in enumerate(kf.split(X_train)):
    print(f"Fold {i}:")
    log_reg = LogisticRegression(solver='newton-cg')
    vectorizer = CountVectorizer()
    pipe = Pipeline(
      steps=[
        ("vect", vectorizer),
        ("reg", log_reg)]
    )
    pipe.fit(X[train_index], y[train_index])
    preds = pipe.predict(X[test_index])
    metrics_output = metrics(y[test_index], preds)
    print("Val metrics:")
    print(metrics_output)

    print("Test metrics:")
    preds = pipe.predict(X_test)
    metrics_output = metrics(y_test, preds)
    print(metrics_output, end="\n\n")

Fold 0:
Val metrics:
{'accuracy_score': 1.0, 'precision_score': 1.0, 'f1_score': 1.0, 'recall_score': 1.0}
Test metrics:
{'accuracy_score': 0.9583333333333334, 'precision_score': 0.9583333333333334, 'f1_score': 0.9583333333333334, 'recall_score': 0.9583333333333334}

Fold 1:
Val metrics:
{'accuracy_score': 1.0, 'precision_score': 1.0, 'f1_score': 1.0, 'recall_score': 1.0}
Test metrics:
{'accuracy_score': 0.9583333333333334, 'precision_score': 0.9583333333333334, 'f1_score': 0.9583333333333334, 'recall_score': 0.9583333333333334}

Fold 2:
Val metrics:
{'accuracy_score': 1.0, 'precision_score': 1.0, 'f1_score': 1.0, 'recall_score': 1.0}
Test metrics:
{'accuracy_score': 0.9583333333333334, 'precision_score': 0.9583333333333334, 'f1_score': 0.9583333333333334, 'recall_score': 0.9583333333333334}

Fold 3:
Val metrics:
{'accuracy_score': 0.8947368421052632, 'precision_score': 0.8947368421052632, 'f1_score': 0.8947368421052632, 'recall_score': 0.8947368421052632}
Test metrics:
{'accuracy_scor

In [48]:
log_reg = LogisticRegression(solver='newton-cg')
vectorizer = CountVectorizer()

log_reg.fit(vectorizer.fit_transform(X_train), y_train)
preds = log_reg.predict(vectorizer.transform(X_test))
metrics_output = metrics(y_test, preds)
print("Val metrics:")
print(metrics_output)

print("Test metrics:")
preds = log_reg.predict(vectorizer.transform(X_test))
metrics_output = metrics(y_test, preds)
print(metrics_output, end="\n\n")

Val metrics:
{'accuracy_score': 0.9583333333333334, 'precision_score': 0.9583333333333334, 'f1_score': 0.9583333333333334, 'recall_score': 0.9583333333333334}
Test metrics:
{'accuracy_score': 0.9583333333333334, 'precision_score': 0.9583333333333334, 'f1_score': 0.9583333333333334, 'recall_score': 0.9583333333333334}



In [49]:
eli5.show_weights(estimator=log_reg, feature_names=list(vectorizer.get_feature_names_out()), top=(50, 5))

Weight?,Feature,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0
Weight?,Feature,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Weight?,Feature,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
Weight?,Feature,Unnamed: 2_level_3,Unnamed: 3_level_3,Unnamed: 4_level_3
Weight?,Feature,Unnamed: 2_level_4,Unnamed: 3_level_4,Unnamed: 4_level_4
+0.167,поставщик,,,
+0.086,поставка,,,
+0.081,товар,,,
+0.065,покупатель,,,
+0.043,партия,,,
+0.039,каждый,,,
+0.038,приемка,,,
+0.026,поставить,,,
+0.025,ответственность,,,
+0.025,спецификация,,,

Weight?,Feature
+0.167,поставщик
+0.086,поставка
+0.081,товар
+0.065,покупатель
+0.043,партия
+0.039,каждый
+0.038,приемка
+0.026,поставить
+0.025,ответственность
+0.025,спецификация

Weight?,Feature
+0.154,услуга
+0.097,исполнитель
+0.066,заказчик
+0.064,настоящий
+0.044,оказать
+0.038,оказание
+0.034,банкет
+0.030,проведение
+0.027,договор
+0.027,исполнение

Weight?,Feature
+3.368,<BIAS>
+0.174,работа
+0.072,выполнение
+0.059,подпись
+0.052,выполнить
+0.051,подрядчик
+0.047,заказчик
+0.032,наименование
+0.031,фамилия
+0.029,лицо

Weight?,Feature
+0.132,арендатор
+0.098,арендодатель
+0.079,аренда
+0.041,квартира
+0.038,плата
+0.036,договор
+0.036,помещение
+0.035,дом
+0.030,арендный
+0.030,имущество

Weight?,Feature
+0.196,продавец
+0.081,покупатель
+0.074,средство
+0.067,транспортный
+0.051,собственность
+0.051,право
+0.046,регистрация
+0.045,выдать
+0.043,тс
+0.041,передать


In [50]:
# joblib.dump(log_reg, 'model_v1_natasha.joblib')

['model_v1_natasha.joblib']

In [51]:
# joblib.dump(vectorizer, 'vectorizer_v1_natasha.joblib')

['vectorizer_v1_natasha.joblib']

In [76]:
# joblib.dump(pipe, 'pipe_v1_natasha.joblib')

['pipe_v1_natasha.joblib']