In [63]:
import numpy as np
import pandas as pd
import os
import math
import fnmatch as fn
import json
from collections import Counter
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
import re
import gensim.downloader as api
from gensim.models import TfidfModel, Word2Vec
from gensim.corpora import Dictionary
import gensim
from sklearn.preprocessing import StandardScaler
from gensim.test.utils import common_texts
from gensim.models import KeyedVectors
from sklearn.linear_model import LogisticRegression
import matplotlib.pyplot as plt
from sklearn.metrics import log_loss, roc_curve, auc
from imblearn.over_sampling import SMOTE
from sklearn import svm
from sklearn.metrics import accuracy_score
from gensim.models import FastText
import eli5
from eli5.sklearn import PermutationImportance
import fitz
import spacy

In [101]:
def get_spacy_features(text, document):
    tokens = [token for token in document if str(token) == text]
    tokens_pos = [token.pos_ for token in document if str(token) == text]
    features = {}
    for i in tokens_pos:
        features[i] = 1
    date_pattern = r'(\d{1,2})\s*(?:st|nd|rd|th)?\s+(?:jan(?:uary)?|feb(?:ruary)?|mar(?:ch)?|apr(?:il)?|may|jun(?:e)?|jul(?:y)?|aug(?:ust)?|sep(?:tember)?|oct(?:ober)?|(nov|dec)(?:ember)?|\S+)\s+(\d{4}|\d{2}(?!\d))|\d{1,2}[.\/-]\d{1,2}[.\/-]\d{2,4}|(\d{1,2})\s+(?:январ(?:я|ь)|феврал(?:я|ь)|март(?:а)?|апрел(?:я|ь)|май(?:я)?|июн(?:я|ь)|июл(?:я|ь)|август(?:а)?|сентябр(?:я|ь)|октябр(?:я|ь)|ноябр(?:я|ь)|декабр(?:я|ь))\s+(\d{4}|\d{2}(?!\d))|^\d{4}$'
    for token in tokens:
        if (token.vector_norm > 0.0):
            print(token.lemma_, token.pos_, token.vector_norm, token.ent_type_)
        if str(token).isupper():
            features["ABB"] = 1
            continue
        if token.ent_type_ == "ORG":
            features["ORG"] = 1
            if "PROPN" in features:
                del features["PROPN"]
            if "NOUN" in features:
                del features["NOUN"]
            continue
        if token.ent_type_ == "PER":
            features["PERSON"] = 1
            if "ORG" in features:
                del features["ORG"]
            if "NOUN" in features:
                del features["NOUN"]
            continue
        if re.match(date_pattern, token.text):
            features["DATE"] = 1
            if "NUM" in features:
                del features["NUM"]
            if "PERSON" in features:
                del features["PERSON"]
                if "PROPN" in features:
                    del features["PROPN"]
            if "ADJ" in features:
                del features["ADJ"]
            continue
        if token.like_url:
            features["URL"] = 1
    return features

def getDf(wv, words, document):
    unique_features = set()
    for page in words['pages']:
        for block in page['blocks']:
            for annotation in block['annotations']:
                row = {
                    "text": annotation.get("text"),
                    "is_italic": annotation.get("is_italic"),
                    "x_top_left": annotation.get("x_top_left"),
                    "font_size": annotation.get("font_size"),
                    "start": annotation.get("start"),
                    "is_bold": annotation.get("is_bold"),
                    "font_name": annotation.get("font_name"),
                    "is_normal": annotation.get("is_normal"),
                    "y_top_left": annotation.get("y_top_left"),
                    "width": annotation.get("width"),
                    "end": annotation.get("end"),
                    "height": annotation.get("height"),
                    "w2v": wv[annotation.get("text")].mean(),
                    "ADJ": 0,
                    "NOUN": 0,
                    "NUM": 0,
                    "PROPN": 0,
                    "DATE": 0,
                    "URL": 0,
                    "PERSON": 0,
                    "ORG": 0,
                    "ABB": 0,
                }
                spacy_features = get_spacy_features(annotation.get("text"), document)
                unique_features.update(spacy_features.keys())
                for feature in unique_features:
                    row[feature] = 1 if feature in spacy_features else 0
                df.loc[len(df)] = row
                unique_features = set()
                
def getDf3(wv, words, document):
    unique_features = set()
    # , "ADJ", "PROPN", "NUM", "NOUN", "DATE", "URL", "PERSON", "ORG", "ABB"
    df = pd.DataFrame(columns=["text", "is_italic", "x_top_left", "font_size", "start", "is_bold", "font_name", "is_normal", "y_top_left", "width", "end", "height", "w2v", "ADJ", "PROPN", "NUM", "NOUN", "DATE", "URL", "PERSON", "ORG", "ABB"])
    for page in words['pages']:
        for block in page['blocks']:
            for annotation in block['annotations']:
                row = {
                    "text": annotation.get("text"),
                    "is_italic": annotation.get("is_italic"),
                    "x_top_left": annotation.get("x_top_left"),
                    "font_size": annotation.get("font_size"),
                    "start": annotation.get("start"),
                    "is_bold": annotation.get("is_bold"),
                    "font_name": annotation.get("font_name"),
                    "is_normal": annotation.get("is_normal"),
                    "y_top_left": annotation.get("y_top_left"),
                    "width": annotation.get("width"),
                    "end": annotation.get("end"),
                    "height": annotation.get("height"),
                    "w2v": wv[annotation.get("text")].mean(),
                    "ADJ": 0,
                    "NOUN": 0,
                    "NUM": 0,
                    "PROPN": 0,
                    "DATE": 0,
                    "URL": 0,
                    "PERSON": 0,
                    "ORG": 0,
                    "ABB": 0,
                }
                spacy_features = get_spacy_features(annotation.get("text"), document)
                unique_features.update(spacy_features.keys())
                for feature in unique_features:
                    row[feature] = 1 if feature in spacy_features else 0
                df.loc[len(df)] = row
                unique_features = set()
    return df

In [132]:
def getText2(words):
    str1 = []
    str2 = ""
    for i in range(len(words['pages'])):
        for j in range (len(words['pages'][i]["blocks"])):
            for k in range (len(words['pages'][i]["blocks"][j]["annotations"])):
                str1.append(words['pages'][i]["blocks"][j]["annotations"][k]["text"])
                str2 = str2 + words['pages'][i]["blocks"][j]["annotations"][k]["text"] + " "
    return [str1,str2]

path = "C:\\Users\\Sheri\\IdeaProjects\\Work\\files"
listOfFiles = os.listdir(r"C:\\Users\\Sheri\\IdeaProjects\\Work\\files")
docs = []
q = []
for i in range(1,11):
    frame = path + "\\" + str(i) + "Full.json"
    with open(frame, "r", encoding="utf-8") as file:
        temp = getText2(json.load(file))
        docs.append(temp[0])
        q.append(temp[1])
nlp = spacy.load('ru_core_news_md')
documents = list(nlp.pipe(q))
documents[1].ents

(Викулин Валентин Андреевич Бизнес коммуникаций и информатики 12.02.2018,
 Проректор,
 Индуинова Анна Валерьевна Группа)

In [118]:
from pathlib import Path 

model = Word2Vec(sentences=docs, vector_size=100, min_count=1, workers=4, epochs=10)
wv = model.wv

path_to_dir = path
# , "ADJ", "PROPN", "NUM", "NOUN", "DATE", "URL", "PERSON", "ORG", "ABB"
df = pd.DataFrame(columns=["text", "is_italic", "x_top_left", "font_size", "start", "is_bold", "font_name", "is_normal", "y_top_left", "width", "end", "height", "w2v", "ADJ", "PROPN", "NUM", "NOUN", "DATE", "URL", "PERSON", "ORG", "ABB"])

for i in range(1,11):
    frame = path + "\\" + str(i) + "Full.json"
    entry = path + "\\" + str(i)
    with open(frame, "r", encoding="utf-8") as file:
        s = json.load(file)
        dtf = getDf3(wv, s, documents[i-1])
        getDf(wv, s, documents[i-1])
        filepath = Path(entry + "Fullidk" + ".csv")  
        filepath.parent.mkdir(parents=True, exist_ok=True)  
        dtf.to_csv(filepath, sep=',')  
        print(frame)

математика NOUN 5.492016 
и CCONJ 3.7161891 
информационный ADJ 6.041673 
технология NOUN 6.0638013 
математика NOUN 5.492016 
и CCONJ 3.7161891 
информационный ADJ 6.041673 
технология NOUN 6.0638013 
C:\Users\Sheri\IdeaProjects\Work\files\1Full.json
коммуникация NOUN 6.3451147 PER
и CCONJ 3.7161891 PER
информатика NOUN 6.0638013 PER
коммуникация NOUN 6.3451147 PER
и CCONJ 3.7161891 PER
информатика NOUN 6.0638013 PER
C:\Users\Sheri\IdeaProjects\Work\files\2Full.json
система NOUN 5.9339876 
система NOUN 5.9339876 
C:\Users\Sheri\IdeaProjects\Work\files\3Full.json
транспорт NOUN 5.496902 
транспорт NOUN 5.496902 
C:\Users\Sheri\IdeaProjects\Work\files\4Full.json
электроника NOUN 6.170791 ORG
и CCONJ 3.7161891 ORG
электроника NOUN 6.170791 ORG
и CCONJ 3.7161891 ORG
C:\Users\Sheri\IdeaProjects\Work\files\5Full.json
C:\Users\Sheri\IdeaProjects\Work\files\6Full.json
и CCONJ 3.7161891 
и CCONJ 3.7161891 
C:\Users\Sheri\IdeaProjects\Work\files\7Full.json
C:\Users\Sheri\IdeaProjects\Work\files

In [119]:
def get_name_keys(words):
    lst = []
    for i in range(len(words['pages'])):
        lst.append(list(words['pages'][i]["fields"].keys()))
    keys = [item for sublist in lst for item in sublist]
    return keys

def get_key_classes(keys):
    classes = {"other":0}
    counter = 1
    for key in keys:
        classes[key+"key"] = counter
        counter+=1
        classes[key+"value"] = counter
        counter+=1
    return classes

def is_intersect(x1, y1, w1, h1, x2, y2, w2, h2):
    return not (x1 + w1 < x2 or
                x2 + w2 < x1 or
                y1 + h1 < y2 or
                y2 + h2 < y1)

def getClass(row,x,name, keys_classes):
    r_top_x = row['x_top_left']
    r_top_y = row['y_top_left']
    r_width = row['width']
    r_height = row['height']
    
    key_top_x = x['key']['x_top_left']
    key_top_y = x['key']['y_top_left']
    key_width = x['key']['width']
    key_height = x['key']['height']
    
    value_top_x = x['value']['x_top_left']
    value_width = x['value']['width']
    value_top_y = x['value']['y_top_left']
    value_height = x['value']['height']
    
    if (is_intersect(r_top_x, r_top_y, r_width, r_height, key_top_x, key_top_y, key_width, key_height)):
        return keys_classes.get(name+"key")
    elif(is_intersect(r_top_x, r_top_y, r_width, r_height, value_top_x, value_top_y, value_width, value_height)):
        return keys_classes.get(name+'value')
    else:
        if 'Y' in row.index:
            if (row['Y']!= keys_classes.get("other")):
                return row['Y']
            else:
                return keys_classes.get("other")
        else:
            return keys_classes.get("other")
        
def getTrainingSet_Pr(json_str, words, keys, keys_classes):
    for i in range(len(json_str['pages'])):
        for j in keys:
            key_obj = json_str['pages'][i]['fields'][j]
            words['Y'] = words.apply(getClass, args=(key_obj, j, keys_classes), axis=1)
            
def getTrainingSet(json_str,words):
    keys = get_name_keys(json_str)
    keys_classes = get_key_classes(keys)
    getTrainingSet_Pr(json_str, words, keys, keys_classes)            

In [135]:
for i in range(1,9):
    scaler = StandardScaler()
    frame = path + "\\" + str(i) + "Fullidk.csv"
    json_str = path + "\\" + str(i) + ".json"
    word = pd.read_csv(frame)
    print(frame)
    with open(json_str, "r", encoding="utf-8") as f:
        template = json.load(f)
    getTrainingSet(template,word)
    X = word
    X = pd.concat([word[word['Y']!=0], word[word['Y']!=0], word[word['Y']!=0]], ignore_index=True)
    Y = X['Y']
    X = scaler.fit_transform(X.drop(columns=['text','font_name','end','start','Y', "Unnamed: 0"]), Y)
    binCount = np.bincount(Y.values)
    binMax = np.max(binCount)
    dic = {1:binMax, 2:binMax, 3:binMax, 4:binMax, 5:binMax, 6:binMax, 7:binMax, 8:binMax, 9:binMax, 10:binMax}
    oversampling = SMOTE(sampling_strategy=dic, k_neighbors=2)
    X,Y = oversampling.fit_resample(X,Y)
    model = svm.SVC(kernel='rbf')
    # model = RandomForestClassifier(n_estimators=100,
    #                               random_state=42).fit(X, Y)
    model.fit(X,Y)

C:\Users\Sheri\IdeaProjects\Work\files\1Fullidk.csv
C:\Users\Sheri\IdeaProjects\Work\files\2Fullidk.csv
C:\Users\Sheri\IdeaProjects\Work\files\3Fullidk.csv
C:\Users\Sheri\IdeaProjects\Work\files\4Fullidk.csv
C:\Users\Sheri\IdeaProjects\Work\files\5Fullidk.csv
C:\Users\Sheri\IdeaProjects\Work\files\6Fullidk.csv
C:\Users\Sheri\IdeaProjects\Work\files\7Fullidk.csv
C:\Users\Sheri\IdeaProjects\Work\files\8Fullidk.csv


In [136]:
for i in range(9, 11):
    word = pd.read_csv(r"C" + path[1:] + "\\" + str(i) + "Fullidk.csv")
    temp = word
    with open(r"C" + path[1:] + "\\" + str(i) + ".json", "r", encoding="utf-8") as f:
            template = json.load(f)
    getTrainingSet(template,word)
    Y_true = word['Y']
    word = scaler.fit_transform(word.drop(columns=['text','font_name','end','start','Y', "Unnamed: 0"]), Y)
    print(get_key_classes(get_name_keys(template)))
    Y_pred = model.predict(word)
    temp["Y"] = Y_pred
    print(accuracy_score(Y_true,Y_pred))
    t = pd.DataFrame()
    t['Y'] = Y_pred
    t['text'] = temp['text']
    t['Y_true'] = Y_true
    temp["Y_true"] = Y_true
    print(t[t['Y'] != t['Y_true']][['Y', 'text', 'Y_true']])
    temp2 = temp.drop(columns=['text','font_name','end','start','Y', 'Unnamed: 0', "Y_true"], axis = 1)
    perm = PermutationImportance(model, random_state=44).fit(word, t["Y"])
    display(eli5.show_weights(perm, feature_names = temp2.columns.tolist()))
    
    doc = fitz.open('C:/Users/Sheri/IdeaProjects/Work/temp_files/example' + str(i) + '.pdf')
    ttt = get_key_classes(get_name_keys(template))
    for page in doc:
        page.wrap_contents()
        for idx, row in temp.iterrows():
            for key, value in ttt.items():
                if value == row["Y"] & row["Y"] == row["Y_true"]:
                    rect_x1 = row['x_top_left']
                    rect_y1 = row["y_top_left"]
                    rect = [rect_x1, rect_y1]
                    page.insert_text([rect_x1, rect_y1-10], str(value), fontsize=14, color = (1, 0, 0))
                    normalized_number1 = row["Y"] * 3 / 100
                    normalized_number2 = row["Y"] * 2 / 100
                    normalized_number3 = 1
                    clr = (normalized_number1, normalized_number2, normalized_number3)
                    page.draw_rect([rect_x1, rect_y1, rect_x1 + row["width"], rect_y1 + row["height"]],  color = clr, width = 2)
    doc.save('C:\\Users\\Sheri\\IdeaProjects\\Work\\temp_files\\correct' + str(i) + '.pdf')
    
    doc = fitz.open('C:/Users/Sheri/IdeaProjects/Work/temp_files/example' + str(i) + '.pdf')
    ttt = get_key_classes(get_name_keys(template))
    for page in doc:
        page.wrap_contents()
        for idx, row in temp.iterrows():
            for key, value in ttt.items():
                if value == row["Y"] & row["Y"] != row["Y_true"]:
                    rect_x1 = row['x_top_left']
                    rect_y1 = row["y_top_left"]
                    rect = [rect_x1, rect_y1]
                    page.insert_text([rect_x1, rect_y1-10], str(value), fontsize=14, color = (1, 0, 0))
                    normalized_number1 = 1
                    normalized_number2 = row["Y"] * 2 / 100
                    normalized_number3 = row["Y"] * 3 / 100
                    clr = (normalized_number1, normalized_number2, normalized_number3)
                    page.draw_rect([rect_x1, rect_y1, rect_x1 + row["width"], rect_y1 + row["height"]],  color = clr, width = 2)
    doc.save('C:\\Users\\Sheri\\IdeaProjects\\Work\\temp_files\\incorrect' + str(i) + '.pdf')
    
    doc = fitz.open('C:/Users/Sheri/IdeaProjects/Work/temp_files/example' + str(i) + '.pdf')
    ttt = get_key_classes(get_name_keys(template))
    for page in doc:
        page.wrap_contents()
        for idx, row in temp.iterrows():
            for key, value in ttt.items():
                if value == row["Y"] & row["Y"] == row["Y_true"]:
                    rect_x1 = row['x_top_left']
                    rect_y1 = row["y_top_left"]
                    rect = [rect_x1, rect_y1]
                    page.insert_text([rect_x1, rect_y1-10], str(value), fontsize=14, color = (1, 0, 0))
                    normalized_number1 = row["Y"] * 3 / 100
                    normalized_number2 = row["Y"] * 2 / 100
                    normalized_number3 = 1
                    clr = (normalized_number1, normalized_number2, normalized_number3)
                    page.draw_rect([rect_x1, rect_y1, rect_x1 + row["width"], rect_y1 + row["height"]],  color = clr, width = 2)
                elif value == row["Y"] & row["Y"] != row["Y_true"]:
                    rect_x1 = row['x_top_left']
                    rect_y1 = row["y_top_left"]
                    rect = [rect_x1, rect_y1]
                    page.insert_text([rect_x1, rect_y1-10], str(value), fontsize=14, color = (1, 0, 0))
                    normalized_number1 = 1
                    normalized_number2 = row["Y"] * 2 / 100
                    normalized_number3 = row["Y"] * 3 / 100
                    clr = (normalized_number1, normalized_number2, normalized_number3)
                    page.draw_rect([rect_x1, rect_y1, rect_x1 + row["width"], rect_y1 + row["height"]],  color = clr, width = 2)
    doc.save('C:\\Users\\Sheri\\IdeaProjects\\Work\\temp_files\\corandincor' + str(i) + '.pdf')

{'other': 0, 'ФИОkey': 1, 'ФИОvalue': 2, 'Номер приказаkey': 3, 'Номер приказаvalue': 4, 'Группаkey': 5, 'Группаvalue': 6, 'Год поступленияkey': 7, 'Год поступленияvalue': 8, 'Факультетkey': 9, 'Факультетvalue': 10}
0.5652173913043478
     Y          text  Y_true
4    2      Институт      10
6    8    31.08.2021       0
7    2       Студент       0
11   3       3876-НГ       4
12  10    Проректор:       0
13   2      Васильев       0
14   2        Андрей       0
15   2     Сергеевич       0
20   9  Действителен       0
21   9           до:       0


Weight,Feature
0.2348  ± 0.1180,y_top_left
0.2000  ± 0.0426,x_top_left
0.1826  ± 0.1594,font_size
0.1652  ± 0.0651,is_italic
0.1478  ± 0.0887,width
0.1478  ± 0.0887,w2v
0.1478  ± 0.0426,PERSON
0.1478  ± 0.1180,height
0.1478  ± 0.1180,is_bold
0.1217  ± 0.0852,PROPN


{'other': 0, 'ФИОkey': 1, 'ФИОvalue': 2, 'Номер приказаkey': 3, 'Номер приказаvalue': 4, 'Группаkey': 5, 'Группаvalue': 6, 'Год поступленияkey': 7, 'Год поступленияvalue': 8, 'Факультетkey': 9, 'Факультетvalue': 10}
0.5833333333333334
     Y          text  Y_true
7    8    31.08.2020       0
8    2       Студент       0
12   3       3876-НГ       4
13  10    Проректор:       0
14   2        Зайцев       0
15   2        Сергей       0
16   2     Андреевич       0
21   9  Действителен       0
22   9           до:       0
23   3        2212ХР       6


Weight,Feature
0.2000  ± 0.1333,y_top_left
0.1667  ± 0.1179,PROPN
0.1583  ± 0.0624,is_italic
0.1583  ± 0.0333,x_top_left
0.1417  ± 0.1354,font_size
0.1417  ± 0.1354,is_bold
0.1333  ± 0.1333,width
0.1250  ± 0.0527,is_normal
0.1167  ± 0.0624,PERSON
0.1083  ± 0.0408,NUM


In [251]:
"ADJ", "ADP", "ADV", "AUX", "CONJ", "CCONJ", "DET", "INTJ", "NOUN", "NUM", "PART", "PRON", "PROPN", "PUNCT", "SCONJ", "SYM", "VERB", "X", "SPACE"

SyntaxError: invalid syntax (521946612.py, line 1)

In [113]:
for i in documents:
    for named_entity in i.ents:
        # help(named_entity)
        print(named_entity, named_entity.label_)
        # break

Иванов Иван Иванович PER
Власов PER
Дмитрий Юрьевич Группа PER
Викулин Валентин Андреевич Бизнес коммуникаций и информатики 12.02.2018 PER
Проректор PER
Индуинова Анна Валерьевна Группа PER
Евгений Дорман Александрович PER
Уринбойева PER
Нигина Баходировна Группа PER
Евгений Сертанович PER
Институт электроники и инноватики 31.08.2024 Студент ФИО: ORG
Михайлов PER
Александр Николаевич Группа PER
Сушнякова PER
Алина Ивановна Физико-математический PER
Ацер Каземир Александрович Группа PER
Кузнецов Даниил Игоревич PER
Институт Истории ORG
Антропологии 31.08.2024 Студент ФИО ORG
Смирнова PER
Оксана Павловна Группа PER
Алина Викторовона Биолого-почвенный 21.03.2012 PER
Сахаров Клим Сергеев Группа PER
Захаров PER
Артем Олегович Институт Экономики PER
Васильев PER
Андрей Сергеевич Группа PER
Новикова PER
Елена Дмитриевна PER
Институт химии растворов ORG
Зайцев Сергей Андреевич Группа PER


In [137]:
text = "Викулин Валентин Андреевич Бизнес коммуникаций и информатики 12.02.2018"
doc = nlp(text)
doc[0].lefts

<generator at 0x240bdb5b010>