In [1]:
import re
import os
from os import listdir
from os.path import isfile, join
import numpy as np
import requests
import json
import time
from typing import List, Protocol, TypeVar, Union
from datetime import datetime, timedelta

from langchain.text_splitter import CharacterTextSplitter
from langchain.vectorstores import Chroma
from langchain.schema import Document
from chromadb import Documents, EmbeddingFunction, Embeddings

from tqdm.notebook import tqdm
from IPython.display import clear_output
import warnings
warnings.filterwarnings('ignore')

import sys
sys.path.append('../src')
sys.path.append('../data')
from parsing.parser import parse_pdf
from embeddings import CustomEmbeddings, TogetherEmbeddings, YandexEmbeddings

import arxiv
import pickle

## Download PDFs from ArXiv

In [128]:
papers_ = arxiv.Client().results(arxiv.Search(query='text mining'))
N_PAPERS = 50
save_dir = "./data/raw/text_mining/articles"
downloaded_files = [f for f in listdir(save_dir) if f.endswith('.pdf')]

for i, paper in tqdm(enumerate(papers_), total=N_PAPERS):
    paper = next(papers_)
    
    if f'{paper.title}.pdf' not in downloaded_files:
        try:
            paper.download_pdf(dirpath=save_dir, filename=f'{paper.title}.pdf')
            downloaded_files.append(paper.title + '.pdf')
        except OSError as e:
            print(f'Gor error downloading paper "{paper.title}":\n{e}')
            
    if i == N_PAPERS - 1:
        break

  0%|          | 0/50 [00:00<?, ?it/s]

Gor error downloading paper "Can You Explain That, Better? Comprehensible Text Analytics for SE Applications":
[Errno 22] Invalid argument: './data/raw/text_mining/articles\\Can You Explain That, Better? Comprehensible Text Analytics for SE Applications.pdf'
Gor error downloading paper ""In vivo" spam filtering: A challenge problem for data mining":
[Errno 22] Invalid argument: './data/raw/text_mining/articles\\"In vivo" spam filtering: A challenge problem for data mining.pdf'


## Parse text from Arxiv PDFs

In [3]:
DATADIR = 'data/raw/text_mining/articles/'
PARSED_FILES_DIR = 'data/processed/texts/'

filenames = os.listdir(DATADIR)

for i, filename in enumerate(filenames):
    try:
        print(f'Process file {i + 1} of {len(filenames)}: {filename}...')
        res = parse_pdf(DATADIR + filename,
                        conf_threshold=0.55,
                        verbose=False,
                        debug=False,
                        debug_dir=PARSED_FILES_DIR)
        text = '\n'.join([r[0] for r in res])
        
        name = filename.split('/')
        with open(f'data/processed/texts/{filename.removesuffix("pdf") + "txt"}', 'w', errors='ignore') as f:
            f.write(text)
    except Exception as e:
        print(f'File {filename} failed to process..')
        print(e)

Process file 1 of 57: A Mining-Based Compression Approach for Constraint Satisfaction Problems.pdf...
Statistics dictionary not passed, calculate statistics...


100%|██████████████████████████████████████████████████████████████████████████████████| 10/10 [00:01<00:00,  7.47it/s]


Statistics calculated.
Results:
{'min_length': 50, 'style': 'Standart', 'size': 10, 'font': 'UCJIPT+CMR10', 'mean_length': 205.66153846153847, 'width': {'hist': array([14,  9,  4,  4,  9,  9,  7,  3,  5, 66], dtype=int64), 'bin_edges': array([  2.75965128,  38.35765528,  73.95565928, 109.55366328,
       145.15166728, 180.74967128, 216.34767528, 251.94567928,
       287.54368328, 323.14168728, 358.73969128])}, 'x0': {'hist': array([ 5,  0, 80, 20, 10,  6,  2,  4,  2,  1], dtype=int64), 'bin_edges': array([ 16.34    ,  59.429976, 102.519952, 145.609928, 188.699904,
       231.78988 , 274.879856, 317.969832, 361.059808, 404.149784,
       447.23976 ])}, 'x1': {'hist': array([ 5,  0,  1,  9,  4,  2, 15, 14,  9, 71], dtype=int64), 'bin_edges': array([ 36.34      ,  83.55595313, 130.77190626, 177.98785938,
       225.20381251, 272.41976564, 319.63571877, 366.8516719 ,
       414.06762502, 461.28357815, 508.49953128])}}


100%|██████████████████████████████████████████████████████████████████████████████████| 10/10 [00:01<00:00,  6.37it/s]


Process file 2 of 57: A Survey on Web Multimedia Mining.pdf...
Statistics dictionary not passed, calculate statistics...


100%|██████████████████████████████████████████████████████████████████████████████████| 13/13 [00:01<00:00,  6.60it/s]


Statistics calculated.
Results:
{'min_length': 50, 'style': 'Standart', 'size': 9, 'font': 'LNTYRL+Times-Roman', 'mean_length': 214.7828282828283, 'width': {'hist': array([ 22,   6,  13,  10,   4,   5,   6,   5,  18, 109], dtype=int64), 'bin_edges': array([  4.55998678,  45.23362368,  85.90726058, 126.58089748,
       167.25453439, 207.92817129, 248.60180819, 289.2754451 ,
       329.949082  , 370.6227189 , 411.29635581])}, 'x0': {'hist': array([179,   3,   1,   0,   1,   1,   0,   0,   1,  12], dtype=int64), 'bin_edges': array([105.8355163 , 145.55590925, 185.2763022 , 224.99669515,
       264.7170881 , 304.43748105, 344.157874  , 383.87826695,
       423.59865989, 463.31905284, 503.03944579])}, 'x1': {'hist': array([  7,   8,   9,  11,   4,   5,   6,   7,   4, 137], dtype=int64), 'bin_edges': array([119.21624464, 159.00827981, 198.80031497, 238.59235014,
       278.38438531, 318.17642048, 357.96845564, 397.76049081,
       437.55252598, 477.34456115, 517.13659631])}}


100%|██████████████████████████████████████████████████████████████████████████████████| 13/13 [00:01<00:00,  7.04it/s]


Process file 3 of 57: Accessing accurate documents by mining auxiliary document information.pdf...
Statistics dictionary not passed, calculate statistics...


100%|████████████████████████████████████████████████████████████████████████████████████| 8/8 [00:00<00:00, 10.96it/s]


Statistics calculated.
Results:
{'min_length': 50, 'style': 'Standart', 'size': 9, 'font': 'ONTVEI+NimbusRomNo9L-Regu', 'mean_length': 229.95454545454547, 'width': {'hist': array([21, 10,  6,  3,  3,  4,  5,  1,  2, 33], dtype=int64), 'bin_edges': array([  4.9813    ,  44.08350874,  83.18571748, 122.28792622,
       161.39013496, 200.4923437 , 239.59455244, 278.69676118,
       317.79896992, 356.90117866, 396.0033874 ])}, 'x0': {'hist': array([ 6,  0, 54,  5,  3,  1,  6, 11,  1,  1], dtype=int64), 'bin_edges': array([ 16.34      ,  55.39416368,  94.44832736, 133.50249104,
       172.55665472, 211.6108184 , 250.66498208, 289.71914576,
       328.77330944, 367.82747312, 406.8816368 ])}, 'x1': {'hist': array([ 6,  1,  2,  6,  6, 13,  7,  8,  3, 36], dtype=int64), 'bin_edges': array([ 36.34      ,  83.10633874, 129.87267748, 176.63901622,
       223.40535496, 270.1716937 , 316.93803244, 363.70437118,
       410.47070992, 457.23704866, 504.0033874 ])}}


100%|████████████████████████████████████████████████████████████████████████████████████| 8/8 [00:00<00:00, 11.61it/s]


Process file 4 of 57: Active Mining Sample Pair Semantics for Image-text Matching.pdf...
Statistics dictionary not passed, calculate statistics...


100%|████████████████████████████████████████████████████████████████████████████████████| 7/7 [00:02<00:00,  2.59it/s]


Statistics calculated.
Results:
{'min_length': 50, 'style': 'Standart', 'size': 10, 'font': 'DQVMIR+NimbusRomNo9L-Regu', 'mean_length': 143.10687022900763, 'width': {'hist': array([114,  23,  11,  11,  97,   1,   1,   2,   0,   2], dtype=int64), 'bin_edges': array([  3.4869   ,  55.1435249, 106.8001498, 158.4567747, 210.1133996,
       261.7700245, 313.4266494, 365.0832743, 416.7398992, 468.3965241,
       520.053149 ])}, 'x0': {'hist': array([80, 11,  9, 18, 10, 72, 16, 15, 18, 13], dtype=int64), 'bin_edges': array([ 16.34  ,  69.5429, 122.7458, 175.9487, 229.1516, 282.3545,
       335.5574, 388.7603, 441.9632, 495.1661, 548.369 ])}, 'x1': {'hist': array([11, 13, 12, 23, 63, 13,  8, 21, 24, 74], dtype=int64), 'bin_edges': array([ 36.34     ,  88.4727636, 140.6055272, 192.7382908, 244.8710544,
       297.003818 , 349.1365816, 401.2693452, 453.4021088, 505.5348724,
       557.667636 ])}}


100%|████████████████████████████████████████████████████████████████████████████████████| 7/7 [00:02<00:00,  2.70it/s]


Process file 5 of 57: Affect as a proxy for literary mood.pdf...
Statistics dictionary not passed, calculate statistics...


100%|██████████████████████████████████████████████████████████████████████████████████| 13/13 [00:01<00:00,  8.12it/s]


Statistics calculated.
Results:
{'min_length': 50, 'style': 'Standart', 'size': 10, 'font': 'APLMCZ+NimbusRomNo9L-Regu', 'mean_length': 220.81428571428572, 'width': {'hist': array([29, 11, 22, 19,  7,  5,  4, 10,  4, 99], dtype=int64), 'bin_edges': array([  4.6505728 ,  49.95819552,  95.26581824, 140.57344096,
       185.88106368, 231.1886864 , 276.49630912, 321.80393184,
       367.11155456, 412.41917728, 457.7268    ])}, 'x0': {'hist': array([  4, 160,   4,   4,   4,   4,  10,   2,  17,   1], dtype=int64), 'bin_edges': array([ 16.34     ,  62.5712256, 108.8024512, 155.0336768, 201.2649024,
       247.496128 , 293.7273536, 339.9585792, 386.1898048, 432.4210304,
       478.652256 ])}, 'x1': {'hist': array([  5,   8,   8,  17,   8,  18,   8,  10,   9, 119], dtype=int64), 'bin_edges': array([ 36.34   ,  85.56528, 134.79056, 184.01584, 233.24112, 282.4664 ,
       331.69168, 380.91696, 430.14224, 479.36752, 528.5928 ])}}


100%|██████████████████████████████████████████████████████████████████████████████████| 13/13 [00:01<00:00,  8.11it/s]


Process file 6 of 57: An Annotated Dataset of Stack Overflow Post Edits.pdf...
Statistics dictionary not passed, calculate statistics...


100%|████████████████████████████████████████████████████████████████████████████████████| 3/3 [00:00<00:00,  3.65it/s]


Statistics calculated.
Results:
{'min_length': 50, 'style': 'Standart', 'size': 9, 'font': 'MXHOLS+LinLibertineT', 'mean_length': 230.5151515151515, 'width': {'hist': array([ 9,  3,  2,  6,  2, 43,  0,  0,  0,  1], dtype=int64), 'bin_edges': array([ 20.        ,  58.01031114,  96.02062228, 134.03093342,
       172.04124456, 210.0515557 , 248.06186684, 286.07217798,
       324.08248912, 362.09280026, 400.1031114 ])}, 'x0': {'hist': array([22,  3,  1,  1,  0,  0, 35,  1,  0,  3], dtype=int64), 'bin_edges': array([ 16.34      ,  66.04595625, 115.75191249, 165.45786874,
       215.16382498, 264.86978123, 314.57573747, 364.28169372,
       413.98764996, 463.69360621, 513.39956246])}, 'x1': {'hist': array([ 4,  0,  1,  2, 19,  0,  0,  3,  8, 29], dtype=int64), 'bin_edges': array([ 36.34      ,  88.68655984, 141.03311968, 193.37967952,
       245.72623936, 298.0727992 , 350.41935904, 402.76591888,
       455.11247872, 507.45903856, 559.8055984 ])}}


100%|████████████████████████████████████████████████████████████████████████████████████| 3/3 [00:00<00:00,  4.11it/s]


Process file 7 of 57: Arabic Text Mining.pdf...
Statistics dictionary not passed, calculate statistics...


100%|██████████████████████████████████████████████████████████████████████████████████| 15/15 [00:02<00:00,  5.38it/s]


Statistics calculated.
Results:
{'min_length': 50, 'style': 'Standart', 'size': 10, 'font': 'ABCDEE+Times New Roman', 'mean_length': 91.10652173913043, 'width': {'hist': array([ 98,  12,  20,  31,  27,  26,  23,  39, 182,   2], dtype=int64), 'bin_edges': array([  7.32888 ,  60.826432, 114.323984, 167.821536, 221.319088,
       274.81664 , 328.314192, 381.811744, 435.309296, 488.806848,
       542.3044  ])}, 'x0': {'hist': array([  2, 215, 149,  24,  10,  11,  22,  19,   7,   1], dtype=int64), 'bin_edges': array([ -9.   ,  32.383,  73.766, 115.149, 156.532, 197.915, 239.298,
       280.681, 322.064, 363.447, 404.83 ])}, 'x1': {'hist': array([ 35,  17,  16,  35,  41,  37,  32,  26, 129,  92], dtype=int64), 'bin_edges': array([ 61.776   , 115.230976, 168.685952, 222.140928, 275.595904,
       329.05088 , 382.505856, 435.960832, 489.415808, 542.870784,
       596.32576 ])}}


100%|██████████████████████████████████████████████████████████████████████████████████| 15/15 [00:02<00:00,  5.87it/s]


Process file 8 of 57: Architecture of Text Mining Application in Analyzing Public Sentiments of West Java Governor Election using Naive Bayes Classification.pdf...
Statistics dictionary not passed, calculate statistics...


  0%|                                                                                            | 0/5 [00:00<?, ?it/s]


File Architecture of Text Mining Application in Analyzing Public Sentiments of West Java Governor Election using Naive Bayes Classification.pdf failed to process..
('Unhandled', 6)
Process file 9 of 57: Assessing Text Mining and Technical Analyses on Forecasting Financial Time Series.pdf...
Statistics dictionary not passed, calculate statistics...


100%|██████████████████████████████████████████████████████████████████████████████████| 15/15 [00:02<00:00,  7.14it/s]


Statistics calculated.
Results:
{'min_length': 50, 'style': 'Standart', 'size': 12, 'font': 'AAAAAI+TimesNewRomanPSMT', 'mean_length': 241.3184713375796, 'width': {'hist': array([36, 13,  6,  2, 10,  0,  2,  3,  2, 83], dtype=int64), 'bin_edges': array([  6.888 ,  53.5992, 100.3104, 147.0216, 193.7328, 240.444 ,
       287.1552, 333.8664, 380.5776, 427.2888, 474.    ])}, 'x0': {'hist': array([111,   1,   0,   7,   4,   5,  21,   3,   3,   2], dtype=int64), 'bin_edges': array([ 72.     , 108.19968, 144.39936, 180.59904, 216.79872, 252.9984 ,
       289.19808, 325.39776, 361.59744, 397.79712, 433.9968 ])}, 'x1': {'hist': array([ 6,  5,  6,  5, 26, 10,  8,  3,  3, 85], dtype=int64), 'bin_edges': array([110.677728 , 154.2099552, 197.7421824, 241.2744096, 284.8066368,
       328.338864 , 371.8710912, 415.4033184, 458.9355456, 502.4677728,
       546.       ])}}


100%|██████████████████████████████████████████████████████████████████████████████████| 15/15 [00:02<00:00,  7.47it/s]


Process file 10 of 57: Augmenting Text Mining Approaches with Social Network Analysis to Understand the Complex Relationships among Users' Requests...
File Augmenting Text Mining Approaches with Social Network Analysis to Understand the Complex Relationships among Users' Requests failed to process..
No /Root object! - Is this really a PDF?
Process file 11 of 57: AutoAM...
File AutoAM failed to process..
No /Root object! - Is this really a PDF?
Process file 12 of 57: BioBERT...
File BioBERT failed to process..
No /Root object! - Is this really a PDF?
Process file 13 of 57: Combining Text Mining and Visualization Techniques to Study Teams' Behavioral Processes.pdf...
Statistics dictionary not passed, calculate statistics...


100%|████████████████████████████████████████████████████████████████████████████████████| 6/6 [00:01<00:00,  3.88it/s]


Statistics calculated.
Results:
{'min_length': 50, 'style': 'Standart', 'size': 10, 'font': 'AAAAAO+TimesNewRomanPSMT', 'mean_length': 250.775, 'width': {'hist': array([28,  8,  9,  6, 66,  1,  1,  0,  0,  1], dtype=int64), 'bin_edges': array([  5.980008 ,  52.7103288,  99.4406496, 146.1709704, 192.9012912,
       239.631612 , 286.3619328, 333.0922536, 379.8225744, 426.5528952,
       473.283216 ])}, 'x0': {'hist': array([53,  4,  6,  4,  1, 38,  4,  2,  3,  5], dtype=int64), 'bin_edges': array([ 55.670624 , 100.2307208, 144.7908176, 189.3509144, 233.9110112,
       278.471108 , 323.0312048, 367.5913016, 412.1513984, 456.7114952,
       501.271592 ])}, 'x1': {'hist': array([ 7,  4,  9,  7, 38,  1,  1,  5, 12, 36], dtype=int64), 'bin_edges': array([ 61.650632 , 109.7568608, 157.8630896, 205.9693184, 254.0755472,
       302.181776 , 350.2880048, 398.3942336, 446.5004624, 494.6066912,
       542.71292  ])}}


100%|████████████████████████████████████████████████████████████████████████████████████| 6/6 [00:01<00:00,  3.77it/s]


Process file 14 of 57: Comparative Opinion Mining...
File Comparative Opinion Mining failed to process..
No /Root object! - Is this really a PDF?
Process file 15 of 57: Comparison of Syntactic Parsers on Biomedical Texts.pdf...
Statistics dictionary not passed, calculate statistics...


100%|██████████████████████████████████████████████████████████████████████████████████| 13/13 [00:02<00:00,  5.41it/s]


Statistics calculated.
Results:
{'min_length': 50, 'style': 'Standart', 'size': 10, 'font': 'XMPYBY+NimbusRomNo9L-Regu', 'mean_length': 184.17880794701986, 'width': {'hist': array([ 95,  23,  19,  17,  25, 120,   0,   1,   1,   1], dtype=int64), 'bin_edges': array([  4.4233944 ,  48.10178397,  91.78017355, 135.45856312,
       179.13695269, 222.81534227, 266.49373184, 310.17212142,
       353.85051099, 397.52890056, 441.20729014])}, 'x0': {'hist': array([113,  30,  15,   7,  12,  76,  16,  13,   7,  13], dtype=int64), 'bin_edges': array([ 16.34  ,  70.3679, 124.3958, 178.4237, 232.4516, 286.4795,
       340.5074, 394.5353, 448.5632, 502.5911, 556.619 ])}, 'x1': {'hist': array([29, 12, 14,  9, 24, 86, 19, 20, 17, 72], dtype=int64), 'bin_edges': array([ 36.34      ,  89.04084041, 141.74168083, 194.44252124,
       247.14336166, 299.84420207, 352.54504249, 405.2458829 ,
       457.94672332, 510.64756373, 563.34840414])}}


100%|██████████████████████████████████████████████████████████████████████████████████| 13/13 [00:02<00:00,  5.38it/s]


Process file 16 of 57: Cross-topic Argument Mining from Heterogeneous Sources Using Attention-based Neural Networks.pdf...
Statistics dictionary not passed, calculate statistics...


100%|██████████████████████████████████████████████████████████████████████████████████| 10/10 [00:02<00:00,  4.21it/s]


Statistics calculated.
Results:
{'min_length': 50, 'style': 'Standart', 'size': 9, 'font': 'VRWWLH+TeXGyreTermes-Regular', 'mean_length': 130.5462962962963, 'width': {'hist': array([163,  31,   6,  11, 104,   0,   3,   3,   1,   2], dtype=int64), 'bin_edges': array([  2.727275  ,  47.8423315 ,  92.95738801, 138.07244451,
       183.18750102, 228.30255752, 273.41761403, 318.53267053,
       363.64772704, 408.76278354, 453.87784005])}, 'x0': {'hist': array([ 4, 84, 22, 24, 33, 70, 26, 21, 22, 18], dtype=int64), 'bin_edges': array([ 16.34  ,  65.9883, 115.6366, 165.2849, 214.9332, 264.5815,
       314.2298, 363.8781, 413.5264, 463.1747, 512.823 ])}, 'x1': {'hist': array([ 5, 18, 15, 30, 39, 66, 14, 27, 28, 82], dtype=int64), 'bin_edges': array([ 36.34    ,  85.625957, 134.911914, 184.197871, 233.483828,
       282.769785, 332.055742, 381.341699, 430.627656, 479.913613,
       529.19957 ])}}


100%|██████████████████████████████████████████████████████████████████████████████████| 10/10 [00:02<00:00,  4.51it/s]


Process file 17 of 57: Delay Impact on Stubborn Mining Attack Severity in Imperfect Bitcoin Network.pdf...
Statistics dictionary not passed, calculate statistics...


100%|████████████████████████████████████████████████████████████████████████████████████| 5/5 [00:02<00:00,  2.23it/s]


Statistics calculated.
Results:
{'min_length': 50, 'style': 'Standart', 'size': 8, 'font': 'BCDEEE+TimesNewRomanPSMT', 'mean_length': 138.73595505617976, 'width': {'hist': array([77, 15, 13,  6, 65,  1,  0,  0,  0,  1], dtype=int64), 'bin_edges': array([  3.99  ,  55.3206, 106.6512, 157.9818, 209.3124, 260.643 ,
       311.9736, 363.3042, 414.6348, 465.9654, 517.296 ])}, 'x0': {'hist': array([61, 11,  5,  1,  6, 55, 14, 13,  7,  5], dtype=int64), 'bin_edges': array([ 45.36,  93.61, 141.86, 190.11, 238.36, 286.61, 334.86, 383.11,
       431.36, 479.61, 527.86])}, 'x1': {'hist': array([31,  4,  9,  7, 32, 10,  8, 12, 13, 52], dtype=int64), 'bin_edges': array([ 55.215   , 106.901776, 158.588552, 210.275328, 261.962104,
       313.64888 , 365.335656, 417.022432, 468.709208, 520.395984,
       572.08276 ])}}


100%|████████████████████████████████████████████████████████████████████████████████████| 5/5 [00:02<00:00,  2.17it/s]


Process file 18 of 57: Discriminative Topic Mining via Category-Name Guided Text Embedding.pdf...
Statistics dictionary not passed, calculate statistics...


100%|██████████████████████████████████████████████████████████████████████████████████| 11/11 [00:04<00:00,  2.51it/s]


Statistics calculated.
Results:
{'min_length': 50, 'style': 'Standart', 'size': 9, 'font': 'FWGLIC+LinLibertineT', 'mean_length': 126.17131474103586, 'width': {'hist': array([225,  44,  29,  18,  83,  98,   0,   3,   1,   1], dtype=int64), 'bin_edges': array([  2.2416    ,  49.98987364,  97.73814728, 145.48642092,
       193.23469456, 240.9829682 , 288.73124184, 336.47951548,
       384.22778912, 431.97606276, 479.7243364 ])}, 'x0': {'hist': array([140,  41,  36,  22,  15, 118,  41,  39,  29,  21], dtype=int64), 'bin_edges': array([ 16.34  ,  69.9559, 123.5718, 177.1877, 230.8036, 284.4195,
       338.0354, 391.6513, 445.2672, 498.8831, 552.499 ])}, 'x1': {'hist': array([ 16,  30,  35,  39, 127,  14,  33,  46,  49, 113], dtype=int64), 'bin_edges': array([ 36.34      ,  89.28631528, 142.23263057, 195.17894585,
       248.12526113, 301.07157642, 354.0178917 , 406.96420698,
       459.91052227, 512.85683755, 565.80315283])}}


100%|██████████████████████████████████████████████████████████████████████████████████| 11/11 [00:04<00:00,  2.62it/s]


Process file 19 of 57: Editorial for the First Workshop on Mining Scientific Papers...
File Editorial for the First Workshop on Mining Scientific Papers failed to process..
No /Root object! - Is this really a PDF?
Process file 20 of 57: Efficient Analysis of Pattern and Association Rule Mining Approaches.pdf...
Statistics dictionary not passed, calculate statistics...


100%|██████████████████████████████████████████████████████████████████████████████████| 14/14 [00:05<00:00,  2.76it/s]


Statistics calculated.
Results:
{'min_length': 50, 'style': 'Standart', 'size': 10, 'font': 'FNTSBS+TimesNewRomanPSMT', 'mean_length': 131.47732181425485, 'width': {'hist': array([203,  68,  14,   5,  25, 132,   0,   0,   7,   9], dtype=int64), 'bin_edges': array([  3.35026238,  46.97424191,  90.59822143, 134.22220095,
       177.84618047, 221.47015999, 265.09413951, 308.71811904,
       352.34209856, 395.96607808, 439.5900576 ])}, 'x0': {'hist': array([139,  71,  58,  13,  10,  91,  12,  40,  17,  12], dtype=int64), 'bin_edges': array([ 72.   , 117.072, 162.144, 207.216, 252.288, 297.36 , 342.432,
       387.504, 432.576, 477.648, 522.72 ])}, 'x1': {'hist': array([44, 51, 26, 60, 94,  4,  7, 44, 39, 94], dtype=int64), 'bin_edges': array([ 92.6096616 , 137.85035133, 183.09104105, 228.33173078,
       273.57242051, 318.81311023, 364.05379996, 409.29448968,
       454.53517941, 499.77586914, 545.01655886])}}


100%|██████████████████████████████████████████████████████████████████████████████████| 14/14 [00:04<00:00,  2.85it/s]


Process file 21 of 57: Efficient Generalized Temporal Pattern Mining in Big Time Series Using Mutual Information.pdf...
Statistics dictionary not passed, calculate statistics...


100%|██████████████████████████████████████████████████████████████████████████████████| 17/17 [00:09<00:00,  1.75it/s]


Statistics calculated.
Results:
{'min_length': 50, 'style': 'Standart', 'size': 10, 'font': 'TAPVUX+URWPalladioL-Roma', 'mean_length': 75.19143968871596, 'width': {'hist': array([859,  89,  45,  38, 246,   1,   1,   0,   1,   5], dtype=int64), 'bin_edges': array([  2.1967533 ,  53.18827797, 104.17980264, 155.17132731,
       206.16285198, 257.15437665, 308.14590132, 359.13742599,
       410.12895066, 461.12047533, 512.112     ])}, 'x0': {'hist': array([237,  82, 115, 125,  71, 234,  82, 123, 126,  90], dtype=int64), 'bin_edges': array([ 16.34  ,  70.8685, 125.397 , 179.9255, 234.454 , 288.9825,
       343.511 , 398.0395, 452.568 , 507.0965, 561.625 ])}, 'x1': {'hist': array([ 67,  76, 112, 145, 226,  70,  64, 140, 134, 251], dtype=int64), 'bin_edges': array([ 36.34  ,  89.6074, 142.8748, 196.1422, 249.4096, 302.677 ,
       355.9444, 409.2118, 462.4792, 515.7466, 569.014 ])}}


100%|██████████████████████████████████████████████████████████████████████████████████| 17/17 [00:08<00:00,  2.01it/s]


Process file 22 of 57: Enhancing Multimodal Compositional Reasoning of Visual Language Models with Generative Negative Mining.pdf...
Statistics dictionary not passed, calculate statistics...


100%|██████████████████████████████████████████████████████████████████████████████████| 17/17 [00:02<00:00,  5.99it/s]


Statistics calculated.
Results:
{'min_length': 50, 'style': 'Standart', 'size': 10, 'font': 'TQDLRY+NimbusRomNo9L-Regu', 'mean_length': 196.95136778115503, 'width': {'hist': array([123,  34,   6,  18, 135,   1,   2,   1,   0,   9], dtype=int64), 'bin_edges': array([  2.81880996,  52.03730912, 101.25580829, 150.47430745,
       199.69280662, 248.91130578, 298.12980494, 347.34830411,
       396.56680327, 445.78530244, 495.0038016 ])}, 'x0': {'hist': array([ 97,  25,  20,  19,  13, 109,  17,  12,   7,  10], dtype=int64), 'bin_edges': array([ 16.34  ,  67.1366, 117.9332, 168.7298, 219.5264, 270.323 ,
       321.1196, 371.9162, 422.7128, 473.5094, 524.306 ])}, 'x1': {'hist': array([11, 17, 25, 19, 86, 25, 17, 16, 17, 96], dtype=int64), 'bin_edges': array([ 36.34      ,  87.21766064, 138.09532128, 188.97298192,
       239.85064256, 290.7283032 , 341.60596384, 392.48362448,
       443.36128512, 494.23894576, 545.1166064 ])}}


100%|██████████████████████████████████████████████████████████████████████████████████| 17/17 [00:02<00:00,  6.10it/s]


Process file 23 of 57: Evaluation of text data mining for database curation...
File Evaluation of text data mining for database curation failed to process..
No /Root object! - Is this really a PDF?
Process file 24 of 57: Exploring Diseases and Syndromes in Neurology Case Reports from 1955 to 2017 with Text Mining.pdf...
Statistics dictionary not passed, calculate statistics...


100%|██████████████████████████████████████████████████████████████████████████████████| 19/19 [00:25<00:00,  1.32s/it]


Statistics calculated.
Results:
{'min_length': 50, 'style': 'Standart', 'size': 10, 'font': 'URYRZB+NimbusRomNo9L-Regu', 'mean_length': 153.65682656826567, 'width': {'hist': array([ 71,  24,  22,  11,   4,  11,  10,  10,   5, 103], dtype=int64), 'bin_edges': array([  5.9776    ,  52.00558816,  98.03357632, 144.06156448,
       190.08955264, 236.1175408 , 282.14552896, 328.17351712,
       374.20150528, 420.22949344, 466.2574816 ])}, 'x0': {'hist': array([112,  61,  13,  19,  22,  28,   6,   3,   3,   4], dtype=int64), 'bin_edges': array([ 16.34  ,  67.0535, 117.767 , 168.4805, 219.194 , 269.9075,
       320.621 , 371.3345, 422.048 , 472.7615, 523.475 ])}, 'x1': {'hist': array([  6,  13,  16,  15,  16,  32,  24,  20,  12, 117], dtype=int64), 'bin_edges': array([ 36.34      ,  85.78294816, 135.22589632, 184.66884448,
       234.11179264, 283.5547408 , 332.99768896, 382.44063712,
       431.88358528, 481.32653344, 530.7694816 ])}}


100%|██████████████████████████████████████████████████████████████████████████████████| 19/19 [00:25<00:00,  1.34s/it]


Process file 25 of 57: Exploring term-document matrices from matrix models in text mining.pdf...
Statistics dictionary not passed, calculate statistics...


100%|██████████████████████████████████████████████████████████████████████████████████| 12/12 [00:05<00:00,  2.08it/s]


Statistics calculated.
Results:
{'min_length': 50, 'style': 'Standart', 'size': 10, 'font': 'Helvetica', 'mean_length': 64.6, 'width': {'hist': array([463,  36,  17,  15, 109,   1,   0,   2,   1,   1], dtype=int64), 'bin_edges': array([  1.668,  50.387,  99.106, 147.825, 196.544, 245.263, 293.982,
       342.701, 391.42 , 440.139, 488.858])}, 'x0': {'hist': array([ 78,  98,  53,  61,  33,  73, 141,  37,  41,  30], dtype=int64), 'bin_edges': array([ 16.34  ,  68.2292, 120.1184, 172.0076, 223.8968, 275.786 ,
       327.6752, 379.5644, 431.4536, 483.3428, 535.232 ])}, 'x1': {'hist': array([ 14,  76,  51,  64,  52, 163,  56,  38,  51,  80], dtype=int64), 'bin_edges': array([ 36.34   ,  88.11981, 139.89962, 191.67943, 243.45924, 295.23905,
       347.01886, 398.79867, 450.57848, 502.35829, 554.1381 ])}}


100%|██████████████████████████████████████████████████████████████████████████████████| 12/12 [00:05<00:00,  2.31it/s]


Process file 26 of 57: Extracting Body Text from Academic PDF Documents for Text Mining.pdf...
Statistics dictionary not passed, calculate statistics...


100%|████████████████████████████████████████████████████████████████████████████████████| 8/8 [00:01<00:00,  5.82it/s]


Statistics calculated.
Results:
{'min_length': 50, 'style': 'Standart', 'size': 10, 'font': 'THDCPZ+NimbusRomNo9L-Regu', 'mean_length': 238.8136645962733, 'width': {'hist': array([42,  6, 10,  8, 91,  0,  2,  0,  0,  2], dtype=int64), 'bin_edges': array([  4.9813   ,  47.7082706,  90.4352412, 133.1622118, 175.8891824,
       218.616153 , 261.3431236, 304.0700942, 346.7970648, 389.5240354,
       432.251006 ])}, 'x0': {'hist': array([ 6, 52,  5,  2,  0,  1, 66, 14,  5, 10], dtype=int64), 'bin_edges': array([ 16.34  ,  64.3026, 112.2652, 160.2278, 208.1904, 256.153 ,
       304.1156, 352.0782, 400.0408, 448.0034, 495.966 ])}, 'x1': {'hist': array([ 6,  2,  7,  7,  5, 35,  6,  9, 12, 72], dtype=int64), 'bin_edges': array([ 36.34      ,  84.86832318, 133.39664636, 181.92496955,
       230.45329273, 278.98161591, 327.50993909, 376.03826227,
       424.56658546, 473.09490864, 521.62323182])}}


100%|████████████████████████████████████████████████████████████████████████████████████| 8/8 [00:01<00:00,  5.89it/s]


Process file 27 of 57: Finding Sequential Patterns from Large Sequence Data.pdf...
Statistics dictionary not passed, calculate statistics...


100%|████████████████████████████████████████████████████████████████████████████████████| 4/4 [00:01<00:00,  2.99it/s]


Statistics calculated.
Results:
{'min_length': 50, 'style': 'Standart', 'size': 9, 'font': 'FGDKKG+TimesNewRomanPSMT', 'mean_length': 240.06741573033707, 'width': {'hist': array([28, 17,  7,  4,  3, 24,  3,  0,  2,  1], dtype=int64), 'bin_edges': array([  2.3442867 ,  45.27025803,  88.19622936, 131.12220069,
       174.04817202, 216.97414335, 259.90011468, 302.82608601,
       345.75205734, 388.67802867, 431.604     ])}, 'x0': {'hist': array([25, 12,  6,  8,  2, 19,  4,  6,  2,  5], dtype=int64), 'bin_edges': array([ 53.87857 , 102.556706, 151.234842, 199.912978, 248.591114,
       297.26925 , 345.947386, 394.625522, 443.303658, 491.981794,
       540.65993 ])}, 'x1': {'hist': array([ 6, 13,  2,  8, 15,  3,  5,  8,  6, 23], dtype=int64), 'bin_edges': array([ 78.66    , 126.131541, 173.603082, 221.074623, 268.546164,
       316.017705, 363.489246, 410.960787, 458.432328, 505.903869,
       553.37541 ])}}


100%|████████████████████████████████████████████████████████████████████████████████████| 4/4 [00:01<00:00,  3.22it/s]


Process file 28 of 57: Hierarchical Topic Mining via Joint Spherical Tree and Text Embedding.pdf...
Statistics dictionary not passed, calculate statistics...


100%|██████████████████████████████████████████████████████████████████████████████████| 10/10 [00:04<00:00,  2.40it/s]


Statistics calculated.
Results:
{'min_length': 50, 'style': 'Standart', 'size': 9, 'font': 'DJYMUP+LinLibertineT', 'mean_length': 114.82096069868996, 'width': {'hist': array([235,  43,  22,  23, 131,   0,   0,   1,   0,   3], dtype=int64), 'bin_edges': array([  1.819194 ,  52.1055578, 102.3919216, 152.6782854, 202.9646492,
       253.251013 , 303.5373768, 353.8237406, 404.1101044, 454.3964682,
       504.682832 ])}, 'x0': {'hist': array([ 78,  39,  43,  31,  22, 119,  44,  35,  26,  21], dtype=int64), 'bin_edges': array([ 16.34  ,  71.1603, 125.9806, 180.8009, 235.6212, 290.4415,
       345.2618, 400.0821, 454.9024, 509.7227, 564.543 ])}, 'x1': {'hist': array([  9,  36,  37,  43,  80,   3,  39,  38,  51, 122], dtype=int64), 'bin_edges': array([ 36.34    ,  89.498588, 142.657176, 195.815764, 248.974352,
       302.13294 , 355.291528, 408.450116, 461.608704, 514.767292,
       567.92588 ])}}


100%|██████████████████████████████████████████████████████████████████████████████████| 10/10 [00:03<00:00,  2.58it/s]


Process file 29 of 57: Improving LSTM-based Video Description with Linguistic Knowledge Mined from Text.pdf...
Statistics dictionary not passed, calculate statistics...


100%|████████████████████████████████████████████████████████████████████████████████████| 6/6 [00:05<00:00,  1.02it/s]


Statistics calculated.
Results:
{'min_length': 50, 'style': 'Standart', 'size': 10, 'font': 'VHTHBE+NimbusRomNo9L-Regu', 'mean_length': 178.5503875968992, 'width': {'hist': array([32,  8,  4,  4,  6,  2,  5,  6, 61,  1], dtype=int64), 'bin_edges': array([  3.05812737,  29.87810959,  56.69809182,  83.51807404,
       110.33805626, 137.15803848, 163.97802071, 190.79800293,
       217.61798515, 244.43796738, 271.2579496 ])}, 'x0': {'hist': array([17, 32,  5,  7,  7, 42,  5,  3,  6,  5], dtype=int64), 'bin_edges': array([ 16.34 ,  67.434, 118.528, 169.622, 220.716, 271.81 , 322.904,
       373.998, 425.092, 476.186, 527.28 ])}, 'x1': {'hist': array([ 5,  5,  5,  7, 10, 36,  2, 10,  5, 44], dtype=int64), 'bin_edges': array([ 36.34      ,  86.97632448, 137.61264896, 188.24897344,
       238.88529792, 289.5216224 , 340.15794688, 390.79427136,
       441.43059584, 492.06692032, 542.7032448 ])}}


100%|████████████████████████████████████████████████████████████████████████████████████| 6/6 [00:06<00:00,  1.03s/it]


Process file 30 of 57: Knowledge extraction, modeling and formalization...
File Knowledge extraction, modeling and formalization failed to process..
No /Root object! - Is this really a PDF?
Process file 31 of 57: Meta-learning of textual representations.pdf...
Statistics dictionary not passed, calculate statistics...


100%|████████████████████████████████████████████████████████████████████████████████████| 7/7 [00:00<00:00,  9.43it/s]


Statistics calculated.
Results:
{'min_length': 50, 'style': 'Standart', 'size': 10, 'font': 'GRKSXP+NimbusRomNo9L-Regu', 'mean_length': 198.47413793103448, 'width': {'hist': array([22, 13, 12,  3,  2,  4,  3,  4,  5, 48], dtype=int64), 'bin_edges': array([  4.9813    ,  51.50466162,  98.02802323, 144.55138485,
       191.07474646, 237.59810808, 284.12146969, 330.64483131,
       377.16819292, 423.69155454, 470.21491616])}, 'x0': {'hist': array([ 6, 69,  6,  4,  3,  9, 10,  1,  1,  7], dtype=int64), 'bin_edges': array([ 16.34  ,  58.7361, 101.1322, 143.5283, 185.9244, 228.3205,
       270.7166, 313.1127, 355.5088, 397.9049, 440.301 ])}, 'x1': {'hist': array([ 7,  2,  4,  6,  2, 15,  9,  6,  6, 59], dtype=int64), 'bin_edges': array([ 36.34      ,  86.88071588, 137.42143175, 187.96214763,
       238.5028635 , 289.04357938, 339.58429525, 390.12501113,
       440.665727  , 491.20644288, 541.74715876])}}


100%|████████████████████████████████████████████████████████████████████████████████████| 7/7 [00:00<00:00,  7.83it/s]


Process file 32 of 57: Mining Healthcare Procurement Data Using Text Mining and Natural Language Processing -- Reflection From An Industrial Project.pdf...
Statistics dictionary not passed, calculate statistics...


100%|██████████████████████████████████████████████████████████████████████████████████| 26/26 [00:11<00:00,  2.33it/s]


Statistics calculated.
Results:
{'min_length': 50, 'style': 'Standart', 'size': 11, 'font': 'BAAAAA+TimesNewRomanPSMT', 'mean_length': 240.76976744186047, 'width': {'hist': array([139,  26,  19,   9,   2,   5,   2,  12,  19, 197], dtype=int64), 'bin_edges': array([  2.75000006,  50.65183722,  98.55367438, 146.45551153,
       194.35734869, 242.25918585, 290.16102301, 338.06286017,
       385.96469732, 433.86653448, 481.76837164])}, 'x0': {'hist': array([256,  35,  18,  23,  28,  22,  18,  15,  11,   4], dtype=int64), 'bin_edges': array([ 56.692917 , 103.2382613, 149.7836056, 196.3289499, 242.8742942,
       289.4196385, 335.9649828, 382.5103271, 429.0556714, 475.6010157,
       522.14636  ])}, 'x1': {'hist': array([ 25,  19,  23,  24,  22,  19,  22,  21,  33, 222], dtype=int64), 'bin_edges': array([ 82.93901081, 128.49123859, 174.04346638, 219.59569416,
       265.14792194, 310.70014973, 356.25237751, 401.80460529,
       447.35683307, 492.90906086, 538.46128864])}}


100%|██████████████████████████████████████████████████████████████████████████████████| 26/26 [00:11<00:00,  2.28it/s]


Process file 33 of 57: Multi-granularity Argument Mining in Legal Texts.pdf...
Statistics dictionary not passed, calculate statistics...


100%|██████████████████████████████████████████████████████████████████████████████████| 10/10 [00:00<00:00, 10.60it/s]


Statistics calculated.
Results:
{'min_length': 50, 'style': 'Standart', 'size': 8, 'font': 'ZTSCII+NimbusRomNo9L-Regu', 'mean_length': 132.09947643979058, 'width': {'hist': array([57, 15,  8,  9,  3,  9,  6,  7,  6, 71], dtype=int64), 'bin_edges': array([  3.98505   ,  38.65537942,  73.32570884, 107.99603826,
       142.66636768, 177.3366971 , 212.00702652, 246.67735594,
       281.34768536, 316.01801478, 350.6883442 ])}, 'x0': {'hist': array([  4,   0, 129,  11,   5,  10,   8,   8,   8,   8], dtype=int64), 'bin_edges': array([ 16.34      ,  59.50786122, 102.67572244, 145.84358366,
       189.01144488, 232.1793061 , 275.34716732, 318.51502854,
       361.68288976, 404.85075098, 448.0186122 ])}, 'x1': {'hist': array([ 4,  0, 14, 14, 12, 15, 10, 16, 21, 85], dtype=int64), 'bin_edges': array([ 36.34      ,  79.96383442, 123.58766884, 167.21150326,
       210.83533768, 254.4591721 , 298.08300652, 341.70684094,
       385.33067536, 428.95450978, 472.5783442 ])}}


100%|██████████████████████████████████████████████████████████████████████████████████| 10/10 [00:01<00:00,  9.96it/s]


Process file 34 of 57: Novel text categorization by amalgamation of augmented k-nearest neighborhood classification and k-medoids clustering.pdf...
Statistics dictionary not passed, calculate statistics...


100%|██████████████████████████████████████████████████████████████████████████████████| 13/13 [00:02<00:00,  4.78it/s]


Statistics calculated.
Results:
{'min_length': 50, 'style': 'Standart', 'size': 11, 'font': 'Times New Roman', 'mean_length': 167.37704918032787, 'width': {'hist': array([74, 28, 17,  6, 11,  6,  7,  4,  8, 83], dtype=int64), 'bin_edges': array([  5.52   ,  48.51184,  91.50368, 134.49552, 177.48736, 220.4792 ,
       263.47104, 306.46288, 349.45472, 392.44656, 435.4384 ])}, 'x0': {'hist': array([139,  16,  11,  10,   7,  14,  15,   6,  12,  14], dtype=int64), 'bin_edges': array([ 84.984 , 127.2436, 169.5032, 211.7628, 254.0224, 296.282 ,
       338.5416, 380.8012, 423.0608, 465.3204, 507.58  ])}, 'x1': {'hist': array([ 15,  17,  10,  10,  17,  16,  14,  15,  18, 112], dtype=int64), 'bin_edges': array([124.1    , 163.73224, 203.36448, 242.99672, 282.62896, 322.2612 ,
       361.89344, 401.52568, 441.15792, 480.79016, 520.4224 ])}}


100%|██████████████████████████████████████████████████████████████████████████████████| 13/13 [00:02<00:00,  4.77it/s]


Process file 35 of 57: On Utilization and Importance of Perl Status Reporter (SRr) in Text Mining.pdf...
Statistics dictionary not passed, calculate statistics...


100%|████████████████████████████████████████████████████████████████████████████████████| 7/7 [00:00<00:00,  8.84it/s]


Statistics calculated.
Results:
{'min_length': 50, 'style': 'Standart', 'size': 12, 'font': 'TimesNewRomanPSMT', 'mean_length': 185.93203883495147, 'width': {'hist': array([21, 17, 10,  8, 41,  3,  1,  1,  0,  1], dtype=int64), 'bin_edges': array([  5.8509 ,  44.99781,  84.14472, 123.29163, 162.43854, 201.58545,
       240.73236, 279.87927, 319.02618, 358.17309, 397.32   ])}, 'x0': {'hist': array([31, 11,  8,  9,  1,  0, 29,  5,  3,  6], dtype=int64), 'bin_edges': array([ 89.99424 , 125.955816, 161.917392, 197.878968, 233.840544,
       269.80212 , 305.763696, 341.725272, 377.686848, 413.648424,
       449.61    ])}, 'x1': {'hist': array([ 6, 13, 11, 27,  2,  0,  3,  8,  9, 24], dtype=int64), 'bin_edges': array([144.996   , 184.268835, 223.54167 , 262.814505, 302.08734 ,
       341.360175, 380.63301 , 419.905845, 459.17868 , 498.451515,
       537.72435 ])}}


100%|████████████████████████████████████████████████████████████████████████████████████| 7/7 [00:00<00:00,  8.97it/s]


Process file 36 of 57: Open Data Platform for Knowledge Access in Plant Health Domain ...
File Open Data Platform for Knowledge Access in Plant Health Domain  failed to process..
[Errno 2] No such file or directory: 'data/raw/text_mining/articles/Open Data Platform for Knowledge Access in Plant Health Domain '
Process file 37 of 57: Opinion Mining on Non-English Short Text.pdf...
Statistics dictionary not passed, calculate statistics...


100%|████████████████████████████████████████████████████████████████████████████████████| 9/9 [00:00<00:00, 10.46it/s]


Statistics calculated.
Results:
{'min_length': 50, 'style': 'Standart', 'size': 12, 'font': 'TOQLHD+CMR12', 'mean_length': 160.3739837398374, 'width': {'hist': array([25,  8,  6,  8,  2,  4,  5,  6,  2, 57], dtype=int64), 'bin_edges': array([  5.85326592,  48.46853274,  91.08379955, 133.69906637,
       176.31433318, 218.9296    , 261.54486682, 304.16013363,
       346.77540045, 389.39066726, 432.00593408])}, 'x0': {'hist': array([ 4, 63, 27,  7,  4,  4, 11,  2,  0,  1], dtype=int64), 'bin_edges': array([ 16.34      ,  67.49559274, 118.65118547, 169.80677821,
       220.96237094, 272.11796368, 323.27355642, 374.42914915,
       425.58474189, 476.74033462, 527.89592736])}, 'x1': {'hist': array([ 4,  1,  2,  3, 14, 14,  9,  8,  8, 60], dtype=int64), 'bin_edges': array([ 36.34      ,  86.99199507, 137.64399014, 188.29598522,
       238.94798029, 289.59997536, 340.25197043, 390.9039655 ,
       441.55596058, 492.20795565, 542.85995072])}}


100%|████████████████████████████████████████████████████████████████████████████████████| 9/9 [00:00<00:00, 12.64it/s]


Process file 38 of 57: Opinion Mining Using Population-tuned Generative Language Models.pdf...
Statistics dictionary not passed, calculate statistics...


100%|██████████████████████████████████████████████████████████████████████████████████| 24/24 [00:03<00:00,  7.92it/s]


Statistics calculated.
Results:
{'min_length': 50, 'style': 'Standart', 'size': 10, 'font': 'AIADQR+NimbusRomNo9L-Regu', 'mean_length': 131.48178137651823, 'width': {'hist': array([168, 111,  56,  37,  16,  15,   7,   8,  14,  62], dtype=int64), 'bin_edges': array([  2.49065   ,  49.26262183,  96.03459366, 142.80656549,
       189.57853732, 236.35050915, 283.12248098, 329.89445282,
       376.66642465, 423.43839648, 470.21036831])}, 'x0': {'hist': array([  5, 208,  27,  91,  54,  35,  23,  13,   7,  31], dtype=int64), 'bin_edges': array([ 16.34  ,  68.4676, 120.5952, 172.7228, 224.8504, 276.978 ,
       329.1056, 381.2332, 433.3608, 485.4884, 537.616 ])}, 'x1': {'hist': array([  6,  15,  87,  37,  32, 128,  32,  20,  32, 105], dtype=int64), 'bin_edges': array([ 36.34      ,  90.01025711, 143.68051422, 197.35077133,
       251.02102844, 304.69128555, 358.36154266, 412.03179977,
       465.70205688, 519.37231399, 573.0425711 ])}}


100%|██████████████████████████████████████████████████████████████████████████████████| 24/24 [00:02<00:00,  8.31it/s]


Process file 39 of 57: Overview of Web Content Mining Tools.pdf...
Statistics dictionary not passed, calculate statistics...


100%|████████████████████████████████████████████████████████████████████████████████████| 6/6 [00:01<00:00,  4.04it/s]


Statistics calculated.
Results:
{'min_length': 50, 'style': 'Standart', 'size': 10, 'font': 'Times-Roman', 'mean_length': 159.928, 'width': {'hist': array([35,  8,  5, 15,  5,  3,  2,  2,  6, 44], dtype=int64), 'bin_edges': array([  2.99711613,  47.84867347,  92.7002308 , 137.55178813,
       182.40334546, 227.25490279, 272.10646012, 316.95801745,
       361.80957479, 406.66113212, 451.51268945])}, 'x0': {'hist': array([64, 15,  3,  6, 11,  8, 11,  6,  0,  1], dtype=int64), 'bin_edges': array([ 72. , 112.5, 153. , 193.5, 234. , 274.5, 315. , 355.5, 396. ,
       436.5, 477. ])}, 'x1': {'hist': array([ 4,  5,  1,  9, 13, 20, 11,  5,  5, 52], dtype=int64), 'bin_edges': array([ 93.94541227, 137.11924713, 180.29308199, 223.46691685,
       266.64075171, 309.81458657, 352.98842143, 396.16225629,
       439.33609115, 482.50992601, 525.68376087])}}


100%|████████████████████████████████████████████████████████████████████████████████████| 6/6 [00:01<00:00,  4.10it/s]


Process file 40 of 57: Pbm...
File Pbm failed to process..
No /Root object! - Is this really a PDF?
Process file 41 of 57: PMC text mining subset in BioC...
File PMC text mining subset in BioC failed to process..
No /Root object! - Is this really a PDF?
Process file 42 of 57: Population-based metaheuristics for Association Rule Text Mining.pdf...
Statistics dictionary not passed, calculate statistics...


100%|████████████████████████████████████████████████████████████████████████████████████| 5/5 [00:01<00:00,  3.41it/s]


Statistics calculated.
Results:
{'min_length': 50, 'style': 'Standart', 'size': 9, 'font': 'BMGYSA+LinLibertineT', 'mean_length': 104.21900826446281, 'width': {'hist': array([117,  24,  11,   7,  11,  71,   0,   0,   0,   1], dtype=int64), 'bin_edges': array([  1.939743 ,  47.3665787,  92.7934144, 138.2202501, 183.6470858,
       229.0739215, 274.5007572, 319.9275929, 365.3544286, 410.7812643,
       456.2081   ])}, 'x0': {'hist': array([49, 11,  8,  9,  9, 71, 26, 25, 22, 12], dtype=int64), 'bin_edges': array([ 16.34  ,  69.5748, 122.8096, 176.0444, 229.2792, 282.514 ,
       335.7488, 388.9836, 442.2184, 495.4532, 548.688 ])}, 'x1': {'hist': array([ 7,  6, 12, 13, 52, 14, 24, 21, 25, 68], dtype=int64), 'bin_edges': array([ 36.34      ,  88.68657072, 141.03314144, 193.37971216,
       245.72628288, 298.0728536 , 350.41942432, 402.76599504,
       455.11256576, 507.45913648, 559.8057072 ])}}


100%|████████████████████████████████████████████████████████████████████████████████████| 5/5 [00:01<00:00,  4.26it/s]


Process file 43 of 57: RUArt...
File RUArt failed to process..
No /Root object! - Is this really a PDF?
Process file 44 of 57: RuBioRoBERTa...
File RuBioRoBERTa failed to process..
No /Root object! - Is this really a PDF?
Process file 45 of 57: Scalable and Accurate Self-supervised Multimodal Representation Learning without Aligned Video and Text Data.pdf...
Statistics dictionary not passed, calculate statistics...


100%|██████████████████████████████████████████████████████████████████████████████████| 11/11 [00:01<00:00,  5.54it/s]


Statistics calculated.
Results:
{'min_length': 50, 'style': 'Standart', 'size': 10, 'font': 'RRVVUJ+NimbusRomNo9L-Regu', 'mean_length': 249.88717948717948, 'width': {'hist': array([ 25,  19,   8,   6, 131,   1,   0,   0,   2,   3], dtype=int64), 'bin_edges': array([  4.9813    ,  54.15469253, 103.32808506, 152.50147759,
       201.67487012, 250.84826265, 300.02165518, 349.19504771,
       398.36844024, 447.54183277, 496.7152253 ])}, 'x0': {'hist': array([83,  7,  2,  7,  0,  2, 83,  3,  4,  4], dtype=int64), 'bin_edges': array([ 16.34  ,  64.7322, 113.1244, 161.5166, 209.9088, 258.301 ,
       306.6932, 355.0854, 403.4776, 451.8698, 500.262 ])}, 'x1': {'hist': array([ 5,  7,  6,  5, 70,  1, 10,  7,  6, 78], dtype=int64), 'bin_edges': array([ 36.34      ,  87.39172359, 138.44344718, 189.49517077,
       240.54689436, 291.59861794, 342.65034153, 393.70206512,
       444.75378871, 495.8055123 , 546.85723589])}}


100%|██████████████████████████████████████████████████████████████████████████████████| 11/11 [00:02<00:00,  5.07it/s]


Process file 46 of 57: Scalable Text Mining with Sparse Generative Models.pdf...
Statistics dictionary not passed, calculate statistics...


100%|████████████████████████████████████████████████████████████████████████████████| 205/205 [00:32<00:00,  6.40it/s]


Statistics calculated.
Results:
{'min_length': 50, 'style': 'Standart', 'size': 12, 'font': 'OJLBCP+CMR12', 'mean_length': 112.0606823906572, 'width': {'hist': array([2393,  312,  135,  141,  101,   87,  104,   62,   58,  974],
      dtype=int64), 'bin_edges': array([  2.52731871,  58.33664754, 114.14597638, 169.95530521,
       225.76463404, 281.57396287, 337.38329171, 393.19262054,
       449.00194937, 504.81127821, 560.62060704])}, 'x0': {'hist': array([1663,  167,  310,  413,  498,  658,  263,  221,   62,  112],
      dtype=int64), 'bin_edges': array([ 16.34  ,  72.3989, 128.4578, 184.5167, 240.5756, 296.6345,
       352.6934, 408.7523, 464.8112, 520.8701, 576.929 ])}, 'x1': {'hist': array([  67,  217,  373,  555,  823,  502,  381,  271,  133, 1045],
      dtype=int64), 'bin_edges': array([ 34.98830912,  94.73647408, 154.48463904, 214.232804  ,
       273.98096896, 333.72913392, 393.47729888, 453.22546384,
       512.9736288 , 572.72179376, 632.46995872])}}


100%|████████████████████████████████████████████████████████████████████████████████| 205/205 [00:29<00:00,  7.00it/s]


Process file 47 of 57: Selectively Hard Negative Mining for Alleviating Gradient Vanishing in Image-Text Matching.pdf...
Statistics dictionary not passed, calculate statistics...


100%|██████████████████████████████████████████████████████████████████████████████████| 12/12 [00:03<00:00,  3.12it/s]


Statistics calculated.
Results:
{'min_length': 50, 'style': 'Standart', 'size': 10, 'font': 'VQHGDW+NimbusRomNo9L-Regu', 'mean_length': 126.26004728132388, 'width': {'hist': array([202,  31,  23,  14, 150,   0,   1,   0,   1,   1], dtype=int64), 'bin_edges': array([  2.6540433 ,  50.56420953,  98.47437576, 146.38454199,
       194.29470822, 242.20487445, 290.11504068, 338.02520691,
       385.93537314, 433.84553937, 481.7557056 ])}, 'x0': {'hist': array([107,  26,  27,  29,  21, 103,  23,  35,  26,  26], dtype=int64), 'bin_edges': array([ 16.34  ,  68.6528, 120.9656, 173.2784, 225.5912, 277.904 ,
       330.2168, 382.5296, 434.8424, 487.1552, 539.468 ])}, 'x1': {'hist': array([  8,  29,  23,  31, 116,  15,  20,  33,  26, 122], dtype=int64), 'bin_edges': array([ 36.34      ,  88.07046103, 139.80092206, 191.53138308,
       243.26184411, 294.99230514, 346.72276617, 398.4532272 ,
       450.18368822, 501.91414925, 553.64461028])}}


100%|██████████████████████████████████████████████████████████████████████████████████| 12/12 [00:03<00:00,  3.37it/s]


Process file 48 of 57: Sentiment Analysis...
File Sentiment Analysis failed to process..
No /Root object! - Is this really a PDF?
Process file 49 of 57: Subjectivity Classification using Machine Learning Techniques for Mining Feature-Opinion Pairs from Web Opinion Sources.pdf...
Statistics dictionary not passed, calculate statistics...


100%|██████████████████████████████████████████████████████████████████████████████████| 10/10 [00:05<00:00,  1.96it/s]


Statistics calculated.
Results:
{'min_length': 50, 'style': 'Standart', 'size': 8, 'font': 'Times-Roman', 'mean_length': 105.02227722772277, 'width': {'hist': array([258,  33,  13,  17,  82,   0,   0,   0,   0,   1], dtype=int64), 'bin_edges': array([  1.94231439,  49.90321947,  97.86412456, 145.82502964,
       193.78593472, 241.7468398 , 289.70774488, 337.66864997,
       385.62955505, 433.59046013, 481.55136521])}, 'x0': {'hist': array([87, 36, 29, 19,  9, 77, 46, 29, 39, 33], dtype=int64), 'bin_edges': array([ 53.879997 , 102.2279958, 150.5759946, 198.9239934, 247.2719922,
       295.619991 , 343.9679898, 392.3159886, 440.6639874, 489.0119862,
       537.359985 ])}, 'x1': {'hist': array([26, 41, 30, 29, 52, 15, 31, 45, 39, 96], dtype=int64), 'bin_edges': array([ 60.10494139, 109.41097493, 158.71700848, 208.02304202,
       257.32907557, 306.63510911, 355.94114266, 405.2471762 ,
       454.55320975, 503.8592433 , 553.16527684])}}


100%|██████████████████████████████████████████████████████████████████████████████████| 10/10 [00:04<00:00,  2.21it/s]


Process file 50 of 57: Text Classification using the Concept of Association Rule of Data Mining.pdf...
Statistics dictionary not passed, calculate statistics...


100%|████████████████████████████████████████████████████████████████████████████████████| 8/8 [00:01<00:00,  4.19it/s]


Statistics calculated.
Results:
{'min_length': 50, 'style': 'Standart', 'size': 11, 'font': 'DHJAPE+TimesNewRoman', 'mean_length': 134.5873015873016, 'width': {'hist': array([118,  34,  17,   6,  73,   2,   0,   0,   0,   2], dtype=int64), 'bin_edges': array([  5.52  ,  52.3233,  99.1266, 145.9299, 192.7332, 239.5365,
       286.3398, 333.1431, 379.9464, 426.7497, 473.553 ])}, 'x0': {'hist': array([58,  6, 12, 22, 22, 57, 13, 12, 22, 28], dtype=int64), 'bin_edges': array([ 59.16   , 104.46974, 149.77948, 195.08922, 240.39896, 285.7087 ,
       331.01844, 376.32818, 421.63792, 466.94766, 512.2574 ])}, 'x1': {'hist': array([ 5, 13, 10, 43, 48, 10,  9, 31, 23, 60], dtype=int64), 'bin_edges': array([ 78.982184 , 128.2354356, 177.4886872, 226.7419388, 275.9951904,
       325.248442 , 374.5016936, 423.7549452, 473.0081968, 522.2614484,
       571.5147   ])}}


100%|████████████████████████████████████████████████████████████████████████████████████| 8/8 [00:01<00:00,  4.56it/s]


Process file 51 of 57: Text Data Mining from the Author's Perspective...
File Text Data Mining from the Author's Perspective failed to process..
No /Root object! - Is this really a PDF?
Process file 52 of 57: Text mining and visualization using VOSviewer.pdf...
Statistics dictionary not passed, calculate statistics...


100%|████████████████████████████████████████████████████████████████████████████████████| 5/5 [00:00<00:00,  9.48it/s]


Statistics calculated.
Results:
{'min_length': 50, 'style': 'Standart', 'size': 11, 'font': 'Times-Roman', 'mean_length': 230.52, 'width': {'hist': array([ 9,  3,  3,  2,  1,  1,  0,  3,  1, 27], dtype=int64), 'bin_edges': array([  5.63928817,  47.87343407,  90.10757996, 132.34172585,
       174.57587174, 216.81001764, 259.04416353, 301.27830942,
       343.51245531, 385.7466012 , 427.9807471 ])}, 'x0': {'hist': array([37,  3,  0,  1,  1,  0,  0,  1,  1,  6], dtype=int64), 'bin_edges': array([ 93.59854115, 135.53863394, 177.47872673, 219.41881951,
       261.3589123 , 303.29900509, 345.23909788, 387.17919067,
       429.11928346, 471.05937625, 512.99946904])}, 'x1': {'hist': array([ 5,  0,  1,  2,  0,  1,  2,  2,  4, 33], dtype=int64), 'bin_edges': array([150.77887323, 187.85905687, 224.93924052, 262.01942416,
       299.0996078 , 336.17979144, 373.25997509, 410.34015873,
       447.42034237, 484.50052601, 521.58070966])}}


100%|████████████████████████████████████████████████████████████████████████████████████| 5/5 [00:00<00:00,  9.07it/s]


Process file 53 of 57: Text Mining System for Non-Expert Miners.pdf...
Statistics dictionary not passed, calculate statistics...


100%|████████████████████████████████████████████████████████████████████████████████████| 5/5 [00:01<00:00,  4.37it/s]


Statistics calculated.
Results:
{'min_length': 50, 'style': 'Standart', 'size': 10, 'font': 'Times New Roman', 'mean_length': 136.52238805970148, 'width': {'hist': array([26, 20,  7, 10,  5,  1, 58,  0,  1,  6], dtype=int64), 'bin_edges': array([  6.24624 ,  36.688616,  67.130992,  97.573368, 128.015744,
       158.45812 , 188.900496, 219.342872, 249.785248, 280.227624,
       310.67    ])}, 'x0': {'hist': array([39, 20,  4,  9,  2, 30, 11, 12,  4,  3], dtype=int64), 'bin_edges': array([ 72.024 , 115.1836, 158.3432, 201.5028, 244.6624, 287.822 ,
       330.9816, 374.1412, 417.3008, 460.4604, 503.62  ])}, 'x1': {'hist': array([ 8,  6, 11, 12, 30,  1,  7,  5, 14, 40], dtype=int64), 'bin_edges': array([108.8262 , 150.72658, 192.62696, 234.52734, 276.42772, 318.3281 ,
       360.22848, 402.12886, 444.02924, 485.92962, 527.83   ])}}


100%|████████████████████████████████████████████████████████████████████████████████████| 5/5 [00:00<00:00,  5.12it/s]


Process file 54 of 57: TILFA...
File TILFA failed to process..
No /Root object! - Is this really a PDF?
Process file 55 of 57: Towards a Near Universal Time Series Data Mining Tool...
File Towards a Near Universal Time Series Data Mining Tool failed to process..
No /Root object! - Is this really a PDF?
Process file 56 of 57: Transitive Text Mining for Information Extraction and Hypothesis Generation.pdf...
Statistics dictionary not passed, calculate statistics...


100%|██████████████████████████████████████████████████████████████████████████████████| 12/12 [00:01<00:00,  8.65it/s]


Statistics calculated.
Results:
{'min_length': 50, 'style': 'Bold', 'size': 12, 'font': 'Helvetica', 'mean_length': 93.42574257425743, 'width': {'hist': array([124,   3,  17,   6,   2,   1,   2,   2,   4,  41], dtype=int64), 'bin_edges': array([  2.75965128,  36.88349176,  71.00733224, 105.13117273,
       139.25501321, 173.37885369, 207.50269417, 241.62653466,
       275.75037514, 309.87421562, 343.9980561 ])}, 'x0': {'hist': array([ 6,  0, 75, 37,  6, 16, 33,  9, 10, 10], dtype=int64), 'bin_edges': array([ 16.34  ,  57.6776,  99.0152, 140.3528, 181.6904, 223.028 ,
       264.3656, 305.7032, 347.0408, 388.3784, 429.716 ])}, 'x1': {'hist': array([ 6,  0, 51,  6, 14, 12, 26, 18, 14, 55], dtype=int64), 'bin_edges': array([ 36.34      ,  79.589944  , 122.839888  , 166.089832  ,
       209.33977601, 252.58972001, 295.83966401, 339.08960801,
       382.33955201, 425.58949601, 468.83944001])}}


100%|██████████████████████████████████████████████████████████████████████████████████| 12/12 [00:01<00:00, 11.64it/s]


Process file 57 of 57: Using compression to identify acronyms in text.pdf...
Statistics dictionary not passed, calculate statistics...


100%|██████████████████████████████████████████████████████████████████████████████████| 11/11 [00:01<00:00,  6.19it/s]


Statistics calculated.
Results:
{'min_length': 50, 'style': 'Standart', 'size': 11, 'font': 'DQPICQ+CMR10', 'mean_length': 152.1809523809524, 'width': {'hist': array([78, 20,  3,  9, 11,  7,  5,  2,  5, 70], dtype=int64), 'bin_edges': array([  3.0218207 ,  47.46328297,  91.90474523, 136.3462075 ,
       180.78766976, 225.22913203, 269.67059429, 314.11205656,
       358.55351882, 402.99498109, 447.43644336])}, 'x0': {'hist': array([  8, 110,  20,  23,   6,  13,  11,   2,  14,   3], dtype=int64), 'bin_edges': array([ 16.34  ,  65.4275, 114.515 , 163.6025, 212.69  , 261.7775,
       310.865 , 359.9525, 409.04  , 458.1275, 507.215 ])}, 'x1': {'hist': array([ 8, 19, 18, 10,  9, 20, 20, 14, 12, 80], dtype=int64), 'bin_edges': array([ 36.34      ,  86.20958434, 136.07916867, 185.94875301,
       235.81833734, 285.68792168, 335.55750601, 385.42709035,
       435.29667468, 485.16625902, 535.03584336])}}


100%|██████████████████████████████████████████████████████████████████████████████████| 11/11 [00:01<00:00,  7.37it/s]


## Split text to chunks

In [2]:
def get_pdf_text(pdf_filename):
    text = ""
    pdf_reader = PdfReader(pdf_filename)
    for page in pdf_reader.pages:
        text += page.extract_text()
    return text


def process_text(text: str) -> str:
    text = re.sub(r'\[.*?\]', '', text)
    text = text.replace('>>>', '\n>>>')
    return text


def validate_text(text: str) -> bool:    
    DIGITS_LETTERS_THRESHOLD = 0.5
    n_digits = sum([sym.isdigit() for sym in text])
    n_letters = sum([sym.isalpha() for sym in text])
    
    if n_digits / (n_letters + 1) > DIGITS_LETTERS_THRESHOLD:
        return False
    
    # Оглавления
    if re.search(r' {10,}|(?: \.){10,}', text):
        return False
    
    # Ссылки
#     if re.search(r'www\.|https?://', text):
#         return False
    return True

In [3]:
PARSED_TEXTS_DIR = '../data/processed/texts/'
filepaths = [PARSED_TEXTS_DIR + file for file in os.listdir(PARSED_TEXTS_DIR) if file.endswith('.txt')]

In [4]:
chink_size = 1500

text_splitter = CharacterTextSplitter(
        separator=" ",
        chunk_size=chink_size,
        chunk_overlap=400,
        length_function=len
    )

processed_docs = []

for filename in filepaths:
    with open(filename) as f:
        text = f.read()
        
    valid_chunks = 0
    text_chunks = text_splitter.split_text(text)
        
    for j, chunk in enumerate(text_chunks):            
        chunk = process_text(chunk)
        if len(chunk) > chink_size:
            continue
            
        if validate_text(chunk):
            valid_chunks += 1
            doc = Document(page_content=chunk, metadata=dict(docname=filename.split(sep='/')[-1].removesuffix('.txt')))
            processed_docs.append(doc)

    print(f'Text processed. Got {valid_chunks} texts. Validity is {(valid_chunks / len(text_chunks) * 100):.2f}%')

Text processed. Got 21 texts. Validity is 100.00%
Text processed. Got 34 texts. Validity is 100.00%
Text processed. Got 17 texts. Validity is 100.00%
Text processed. Got 31 texts. Validity is 100.00%
Text processed. Got 38 texts. Validity is 100.00%
Text processed. Got 12 texts. Validity is 100.00%
Text processed. Got 989 texts. Validity is 99.90%
Text processed. Got 131 texts. Validity is 98.50%
Text processed. Got 27 texts. Validity is 100.00%
Text processed. Got 31 texts. Validity is 100.00%
Text processed. Got 24 texts. Validity is 100.00%
Text processed. Got 44 texts. Validity is 100.00%
Text processed. Got 34 texts. Validity is 100.00%
Text processed. Got 20 texts. Validity is 100.00%
Text processed. Got 48 texts. Validity is 100.00%
Text processed. Got 46 texts. Validity is 100.00%
Text processed. Got 76 texts. Validity is 100.00%
Text processed. Got 55 texts. Validity is 100.00%
Text processed. Got 32 texts. Validity is 100.00%
Text processed. Got 31 texts. Validity is 100.00%


Created a chunk of size 16369, which is longer than the specified 1500


Text processed. Got 630 texts. Validity is 98.44%
Text processed. Got 43 texts. Validity is 100.00%
Text processed. Got 344 texts. Validity is 95.56%
Text processed. Got 42 texts. Validity is 100.00%
Text processed. Got 32 texts. Validity is 100.00%
Text processed. Got 25 texts. Validity is 100.00%
Text processed. Got 9 texts. Validity is 100.00%
Text processed. Got 13 texts. Validity is 100.00%
Text processed. Got 241 texts. Validity is 100.00%
Text processed. Got 16 texts. Validity is 100.00%
Text processed. Got 26 texts. Validity is 100.00%


In [5]:
print(f'Parsed {len(processed_docs)} documents.')

Parsed 5471 documents.


## Translate documents

In [6]:
class Translator(Embeddings):  
    def __init__(self, folder_id, oauth_token):
        self.folder_id = folder_id
        self.oauth_token = oauth_token
        
        self._iam_expires_at = None
        self._iam_token = None
        
        self._update_iam_token()
        
        self._url = "https://translate.api.cloud.yandex.net/translate/v2/translate"
        
        self.batch_size = 5

    
    def _update_iam_token(self):
        print('Updating Yandex IAM token.')
        query = {
            "yandexPassportOauthToken": self.oauth_token
        }
        response = requests.post("https://iam.api.cloud.yandex.net/iam/v1/tokens", data=json.dumps(query))
        if response.status_code != 200:
            print(f"Could not update Yandex IAM token. Status code: {response.status_code} Body:\n{response.text}")
            raise Exception('Failed to get Yandex IAM token!')

        response = response.json()
        self._iam_expires_at = datetime.strptime(response['expiresAt'].split('.')[0], "%Y-%m-%dT%H:%M:%S")
        self._iam_token = response['iamToken']
        print(f'IAM token succesully updated: {self._iam_token[:15]} Expires at: {self._iam_expires_at}')
    
    
    def __call__(self, texts: List[str], source='en', target='ru') -> List[str]:
        return self.translate(texts=texts, source=source, target=target)
    
    
    def translate(self, texts: List[str], source='en', target='ru') -> List[str]:
        headers = {
                "Content-Type": "application/json",
                "Authorization": f"Bearer {self._iam_token}",
                "x-folder-id": f"{self.folder_id}"
            }
        
        translated_texts = []
        
        for text in tqdm(range(0, len(texts), self.batch_size)):
            if datetime.utcnow() + timedelta(hours=1) > self._iam_expires_at:
                self._update_iam_token()
                headers['Authorization'] = f"Bearer {self._iam_token}"
                
            body = {
                "folderId": self.folder_id,
                "texts": texts[i: i + self.batch_size],
                "targetLanguageCode": target
            }

            retries = 0
            while retries < 5:
                try:
                    response = requests.post(self._url, json=body, headers=headers)
                    
                    if response.status_code == 200:
                        translated_texts += [translation['text'] for translation in response.json()['translations']]
                        break
                    else:
                        time.sleep(1)
                        retries += 1
                except Exception as e:
                    print(e)
                    print(response.text)
            else:
                raise Exception(f'Failed to translate text. Status code: {response.status_code}\nMessage:{response.text}')
        return translated_texts

In [7]:
OAUTH_TOKEN = 'y0_AgAAAABFOtKIAATuwQAAAAEAcNmeAABjadeGLERHyJNjoJ7uwRjlQ_oQjA'
FOLDER_ID = 'b1g6hkb495v635sf93ir'

translator = Translator(FOLDER_ID, OAUTH_TOKEN)

Updating Yandex IAM token.
IAM token succesully updated: t1.9euelZrPx5vN Expires at: 2024-04-07 07:36:53


In [8]:
en_texts = [doc.page_content for doc in processed_docs]

In [9]:
ru_texts = translator(en_texts)

  0%|          | 0/697 [00:00<?, ?it/s]

Exception: Failed to translate text. Status code: 429
Message:{
 "code": 8,
 "message": "limit on units was exceeded. Limit: 1000000, Interval: 1h0m0s",
 "details": [
  {
   "@type": "type.googleapis.com/google.rpc.RequestInfo",
   "requestId": "ea32ed77-30c3-4a41-a527-01eea4ac04b8"
  }
 ]
}


## Create storage

In [6]:
model = 'BAAI/bge-base-en-v1.5'
token = '4fbc8ea1ec756dd6d645a7db75ebb6e8a85cdaf81b09acb2bbe3143a017d11c8'

In [7]:
books_dir = '../data/raw/text_mining/books'
books = os.listdir(books_dir)
books = [book.split('.pdf')[0] for book in books]
books

['An Introduction To Information Retrieval',
 'Applied Text Analysis with Python-2016',
 'Foundations of Statistical Natural Language Processing - Christopher D. Manning',
 'Natural Language Processing with Python-2009',
 'Natural Language Understanding with Distributed Representation-2017',
 'Representation Learning for Natural Language Processing',
 'Text_Analytics_with_Python']

In [8]:
for doc in processed_docs:
    source_type = 'book' if doc.metadata['docname'] in books else 'article' 
    doc.metadata = doc.metadata | {'source_type': source_type}

In [9]:
from typing import List
import together

together.api_key = token
client = together.Together()

def get_embedding(text: List[str], model: str) -> List[List[float]]:
    text = text.replace("\n", " ")
    outputs = client.embeddings.create(input = [text], model=model)
    return outputs.data[0].embedding

In [11]:
save_directory = '../data/processed/embeddings'

with open(f"{save_directory}/total_embeddings.pickle", 'rb') as f:
    embeddings = pickle.load(f)

In [13]:
len(embeddings)

1907

In [None]:
n_exceptions = 0

for doc in tqdm(processed_docs[1788:]):
    text = doc.page_content
    
    while n_exceptions < 5:
        try:
            embedding = get_embedding(text, model=model)
            embeddings.append(embedding)
            n_exceptions = 0
            time.sleep(1)
            break
        except Exception as e:
            print(f'Got Exception: {e}')
            n_exceptions += 1
            time.sleep(1)
            
            text = text[:-150]
    else:
        print(f'Got five exceptions! Saving {len(embeddings)} book embeddings to persist directory!')
        break
        
with open(f"{save_directory}/total_embeddings.pickle", 'wb') as f:
    pickle.dump(embeddings, f)

  0%|          | 0/3683 [00:00<?, ?it/s]

Got Exception: 400 Client Error: Bad Request for url: https://api.together.xyz/api/v1/embeddings
Got Exception: 400 Client Error: Bad Request for url: https://api.together.xyz/api/v1/embeddings
Got Exception: 400 Client Error: Bad Request for url: https://api.together.xyz/api/v1/embeddings


In [31]:
print(f'Got five exceptions! Saving {len(book_embeddings)} book embeddings to persist directory!')
with open(f"{save_directory}/book_embeddings.pickle", 'wb') as f:
    pickle.dumps(book_embeddings)

Got five exceptions! Saving 1201 book embeddings to persist directory!


In [43]:
class DummyEmbeddings(Embeddings):
    def __init__(self, emb_mapper):
        self.emb_mapper = emb_mapper
        
    def __call__(self, texts: List[str]) -> Embeddings:
        return self.embed_documents(texts)
    
    def embed_query(self, text: str) -> List[float]:
        return self.emb_mapper[text]
    
    def embed_documents(self, texts: List[str]) -> List[List[float]]:
        outputs = []
        for text in texts:
            outputs.append(self.emb_mapper[text])
        return outputs

In [44]:
vectorstore = Chroma.from_documents(book_docs, DummyEmbeddings(emb_mapper), persist_directory="../data/vectorstores/storage_v0.4")

## Get vectorstore from memory

In [113]:
class DummyEmbeddings(Embeddings):
    def __init__(self, emb_mapper):
        self.emb_mapper = emb_mapper
        
    def __call__(self, texts: List[str]) -> Embeddings:
        return self.embed_documents(texts)
    
    def embed_query(self, text: str) -> List[float]:
        return self.emb_mapper[text]
    
    def embed_documents(self, texts: List[str]) -> List[List[float]]:
        outputs = []
        for text in texts:
            outputs.append(self.emb_mapper[text])
        return outputs

In [114]:
docs = vectorstore.get(include=['embeddings', 'metadatas', 'documents'])

emb_mapper = dict(zip(docs['documents'], docs['embeddings']))
emb = DummyEmbeddings(emb_mapper)

In [116]:
model = 'BAAI/bge-base-en-v1.5'
token = '4fbc8ea1ec756dd6d645a7db75ebb6e8a85cdaf81b09acb2bbe3143a017d11c8'
persist_directory = "../data/vectorstores/storage_v0.4"

embeddings = TogetherEmbeddings(model=model, token=token)

vectorstore = Chroma(persist_directory=persist_directory, embedding_function=emb)

In [117]:
for i, id_ in enumerate(docs['ids']):
    new_metadata = docs['metadatas'][i] | {'source_type': 'book'}
    new_doc = Document(page_content=docs['documents'][i], metadata=new_metadata, embedding=docs['embeddings'][i])
    
    vectorstore.update_document(id_, new_doc)

## Query to storage

In [118]:
vectorstore = Chroma(persist_directory=persist_directory, embedding_function=embeddings)

In [119]:
def get_relevant_documents(db, query, source=None, n_answers=3):
    docs = db.similarity_search_with_score(query, filter = {'docname': source} if source else None, k=n_answers)
    
    return [doc[0] for doc in docs]
    return docs

def process_prompt(prompt, folder_id):
    url = f'gpt://{folder_id}/yandexgpt/latest'


def process_question(question: str, vectorstore, yandex_folder_id: str, source=None, n_documents=3):
    docs = get_relevant_documents(vectorstore, question, source=source, n_answers=n_documents)
    context = '\n\n'.join([doc.page_content for doc in docs])
    
    TEMPLATE = """
You are an expert in Data Science, NLP and Text Mining. 
You answer the user's questions using the provided context. Below is the user's question, as well as several texts that form the context. Your task is to answer the user's question using only the information from the given texts. If the information contained in the texts is not relevant, you should try to answer it yourself, but in this case, be sure to indicate that there is no information in the context.
You must answer in the same language in which the user asked you the question. For example, if the question was in English, then the answer should also be in English. Russian Russian, then the answer is in Russian.

Users'question: {question}
Texts as context:
{context}
"""
    prompt = TEMPLATE.join(question=question, context=context)

In [120]:
def get_response(question, context, iam_token):
    url = f'gpt://{FOLDER_ID}/yandexgpt/latest'
    
    headers = {
                "Content-Type": "application/json",
                "Authorization": f"Bearer {iam_token}",
                "x-folder-id": f"{FOLDER_ID}"
            }
    
    body = {
      "modelUri": f"gpt://{FOLDER_ID}/yandexgpt/latest",
      "completionOptions": {
        "stream": False,
        "temperature": 0.6,
        "maxTokens": "2000"
      },
      "messages": [
        {
          "role": "system",
          "text": "You are an expert in Data Science, NLP and Text Mining NLP and Text Mining.\nYou answer the user's questions using the provided context. Below is the user's question, as well as several texts that form the context. Your task is to answer the user's question using only the information from the given texts. If the information contained in the texts is not relevant, you should try to answer it yourself, but in this case, be sure to indicate that there is no information in the context.\nYou must answer in the same language in which the user asked you the question. For example, if the question was in English, then the answer should also be in English. If question is in Russian, then the answer is in Russian."
          },
        {
          "role": "user",
          "text": f"Question: {question}\nContext:\n{context}"
        }
      ]
    }
    response = requests.post("https://llm.api.cloud.yandex.net/foundationModels/v1/completion", headers=headers, data=json.dumps(body))
    
    if response.status_code != 200:
        raise Exception(f'generation failed! Status code: {response.status_code}\n{response.text}')
            
    return response.json()['result']['alternatives'][0]['message']['text']

In [121]:
query = 'Explain TF IDF algorithm'
docs = get_relevant_documents(vectorstore, query, source=None, n_answers=3)
context = '\n\n'.join([doc.page_content for doc in docs])

for doc in docs:
    print(doc.metadata)
    print(doc.page_content)
    print('='*100)

{'docname': 'Text_Analytics_with_Python', 'source_type': 'book'}
for this algorithm:
 # define function to compute tfidf weighted averaged word vector for a document
def tfidf_wtd_avg_word_vectors(words, tfidf_vector, tfidf_vocabulary, model, 
num_features):
 weighted averaged word vector representation for a document. We also create a 
corresponding generic function tfidf_weighted_averaged_word_vectorizer() to 
perform TF-IDF weighted averaging of word vectors for a corpus of documents. We 
can see our implemented function in action on our sample corpora using the following 
snippet:
 # get tfidf weights and vocabulary from earlier results and compute result
In : corpus_tfidf = tdidf_features
 ...: vocab = tfidf_vectorizer.vocabulary_
 ...: wt_tfidf_word_vec_features = tfidf_weighted_averaged_word_
vectorizer(corpus=TOKENIZED_CORPUS, tfidf_vectors=corpus_tfidf,
 ...: tfidf_vocabulary=vocab, model=model, 
 ...: print np.round(wt_tfidf_word_vec_features, 3)

 
 
 ]
 # compute avgd word 

In [15]:
print(context[:10000])

for this algorithm:
 # define function to compute tfidf weighted averaged word vector for a document
def tfidf_wtd_avg_word_vectors(words, tfidf_vector, tfidf_vocabulary, model, 
num_features):
 weighted averaged word vector representation for a document. We also create a 
corresponding generic function tfidf_weighted_averaged_word_vectorizer() to 
perform TF-IDF weighted averaging of word vectors for a corpus of documents. We 
can see our implemented function in action on our sample corpora using the following 
snippet:
 # get tfidf weights and vocabulary from earlier results and compute result
In : corpus_tfidf = tdidf_features
 ...: vocab = tfidf_vectorizer.vocabulary_
 ...: wt_tfidf_word_vec_features = tfidf_weighted_averaged_word_
vectorizer(corpus=TOKENIZED_CORPUS, tfidf_vectors=corpus_tfidf,
 ...: tfidf_vocabulary=vocab, model=model, 
 ...: print np.round(wt_tfidf_word_vec_features, 3)

 
 
 ]
 # compute avgd word vector for test new_doc
In : nd_wt_tfidf_word_vec_features = tfid

## Good examples of questions

* Pros and cons of TF-IDF
* Pros of classic methods of machine learning against neural networks
* Main tasks of text analisys?
* Differences of classification and regression
* What is Word2Vec? Differences with FastText?

## Invalid articles and books

* Scalable Text Mining with Sparse Generative Models
* Natural Language Understanding with Distributed Representation-2017

## Выводы

Очень много мэтчей с текстами из книг, кажется, что в книгах находится очень много информации, которая может быть полезной в информационной системе. В статьях чаще всего можно встретить очень продвинутые и узконаправленные темы, нежели информацию интересную пользователю.

Можем попробовать расширить длину контекста для того, чтобы уместить больше информации, макс 8192 токена. \
Модель: https://huggingface.co/BAAI/bge-large-en-v1.5

На следующей итерации будем работать исключительно с книгами по text mining. Добавить более общие книги, например по python и анализу данных в целом. Необходимо научиться работать с формулами. 

In [135]:
from typing import List
import together

together.api_key = token
client = together.Together(api_key=token)

def get_embedding(text: List[str], model: str) -> List[List[float]]:
    text = text.replace("\n", " ")
    outputs = client.embeddings.create(input = [text], model=model)
    return outputs.data[0].embedding

arxiv_embeddings = []
n_exceptions = 0

save_directory = '../data/processed/embeddings'

for doc in tqdm(arxiv_docs):
    while n_exceptions < 5:
        try:
            embedding = get_embedding(doc.page_content, model=model)
            arxiv_embeddings.append(embedding)
            n_exceptions = 0
            time.sleep(1)
            break
        except Exception as e:
            print(f'Got Exception: {e}')
            n_exceptions += 1
    else:
        print(f'Got five exceptions! Saving {len(arxiv_embeddings)} book embeddings to persist directory!')
        break
        
with open(f"{save_directory}/arxiv_embeddings.pickle", 'wb') as f:
    pickle.dumps(arxiv_embeddings)

TypeError: Together.__init__() got an unexpected keyword argument 'api_key'

In [129]:
len(arxiv_docs)

483