In [1]:
from collections import Counter
import io
import os
from pathlib import Path

from pdfminer.converter import TextConverter
from pdfminer.pdfinterp import PDFPageInterpreter
from pdfminer.pdfinterp import PDFResourceManager
from pdfminer.pdfpage import PDFPage
from pdfminer.layout import LAParams

import spacy

In [2]:
nlp = spacy.load("pl_core_news_sm")

In [3]:
path = Path().parent.absolute()
pdf_name = "janko-muzykant.pdf"
pdf_file_path = Path.joinpath(path, "Source", pdf_name)

In [4]:
def extract_pdf_text(file_path):

        with open (file_path, "rb") as fh:
            resource_manager = PDFResourceManager()
            fake_file_handle = io.StringIO()
            laparams = LAParams()
            conventer = TextConverter(resource_manager, fake_file_handle, laparams=laparams)

            for page in PDFPage.get_pages(fh, caching=True, check_extractable=True):
                page_interpreter = PDFPageInterpreter(resource_manager, conventer)
                page_interpreter.process_page(page)
                text = fake_file_handle.getvalue()
            conventer.close()
            fake_file_handle.close()
        
        text = text.replace("\n", " ").replace("\n", "")
        text = text.replace("\x0c", "").replace("(cid:3095)", "j")
        text = text.replace("\uf768\uf765\uf76e\uf772\uf779\uf76b", "")
        text = text.replace("\uf773\uf769\uf765\uf76e\uf76b\uf769\uf765\uf777\uf769\uf763\uf77a", "")
        return text

In [5]:
doc = nlp(extract_pdf_text(pdf_file_path))

In [6]:
pos_counter = Counter(([token.pos_ for token in doc])).most_common(); pos_counter

[('NOUN', 796),
 ('PUNCT', 669),
 ('VERB', 485),
 ('ADP', 255),
 ('ADJ', 246),
 ('PART', 182),
 ('CCONJ', 163),
 ('ADV', 160),
 ('PRON', 96),
 ('SPACE', 74),
 ('SCONJ', 55),
 ('X', 29),
 ('INTJ', 12),
 ('NUM', 4)]

In [7]:
print("The most common grammatical class was", pos_counter[0][0])

The most common grammatical class was NOUN


In [8]:
pos_list = [t[0] for t in pos_counter]

In [9]:
def grammar_word_count(grammar_list):
    # create a dictionary where the keys are parts of speech
    pos_dict = dict()
    for i in grammar_list:
        pos_dict[i] = []
        
    # separate words according to parts of speech
    for token in doc:
        for k, v in pos_dict.items():
            if token.pos_ == k:
                v.append(token.text)
                
    # count words            
    pos_dict_words_counter = {}
    for k, v in pos_dict.items():
        pos_dict_words_counter[k] = Counter(v).most_common()
    return pos_dict_words_counter

In [10]:
pos_word_count = grammar_word_count(pos_list)

In [11]:
pos_word_count

{'NOUN': [('co', 15),
  ('Janek', 10),
  ('Janko', 8),
  ('to', 7),
  ('raz', 7),
  ('Bóg', 7),
  ('skrzypki', 7),
  ('Muzykant', 6),
  ('czasem', 6),
  ('matka', 6),
  ('oczy', 5),
  ('Matulu', 5),
  ('głos', 5),
  ('Lektury', 4),
  ('ǳień', 4),
  ('roku', 4),
  ('boru', 4),
  ('domu', 4),
  ('tym', 4),
  ('ciemności', 4),
  ('piersi', 4),
  ('drzwi', 4),
  ('kredensu', 4),
  ('Janku', 4),
  ('ǳiecka', 4),
  ('twarz', 4),
  ('stronie', 3),
  ('Polska', 3),
  ('świat', 3),
  ('nic', 3),
  ('roboty', 3),
  ('choǳił', 3),
  ('chłopak', 3),
  ('barǳo', 3),
  ('rzecz', 3),
  ('granie', 3),
  ('coś', 3),
  ('Co', 3),
  ('niebie', 3),
  ('Janka', 3),
  ('karczmie', 3),
  ('Bęǳiem', 3),
  ('bęǳiem', 3),
  ('gonta', 3),
  ('ścianie', 3),
  ('nikogo', 3),
  ('kredensie', 3),
  ('łopuchach', 3),
  ('słowik', 3),
  ('ramach', 2),
  ('projektu', 2),
  ('tapczanie', 2),
  ('gromnicę', 2),
  ('dobroǳieja', 2),
  ('chłopaka', 2),
  ('Jan', 2),
  ('dusza', 2),
  ('ciała', 2),
  ('Chłopak', 2),
  ('życ