In [None]:
# !pip install pytesseract
# !pip install tqdm
# !pip install pdf2image
# !pip install pytesseract
# !pip install Pillow

In [1]:
# imports
import cv2
import pytesseract
from tqdm import tqdm
from pdf2image import convert_from_path
from PIL import Image
from docx import Document
from docx.enum.text import WD_ALIGN_PARAGRAPH
from docx.shared import Inches
from docx.shared import Pt
import os
import tempfile

In [2]:

pytesseract.pytesseract.tesseract_cmd = r'C:\Program Files\Tesseract-OCR\tesseract.exe'

def pdf_to_word(pdf_path, output_dir, lang='fas', **kwargs):
    """ 
    A way to use google-tesseract ocr for extracting 
    texts from pdf file.
    
    Args:
        pdf_path (str): PDF file path.
        output_dir (str): Output directory.
        lang (str): tesseract language support.
    """
    
    pdf_name = os.path.splitext(os.path.basename(pdf_path))[0]
    pages = convert_from_path(pdf_path, **kwargs)
    texts = []
    
    print(f'PDF is preparing to convert into document [#{len(pages)} pages]')
    for i, page in tqdm(enumerate(pages), position=0):
        
        with tempfile.TemporaryDirectory() as img_dir:
            img_name = f'{pdf_name}-{i+1}.jpg'
            img_path = os.path.join(img_dir, img_name)
            
            page.save(img_path, 'JPEG')
            text = pytesseract.image_to_string(Image.open(img_path), lang=lang)
            texts.append(text)
    
    document = Document()
    style_normal = document.styles['Normal']
    font = style_normal.font
    font.name = 'Arial'
    font.rtl = True
    
    style_h1 = document.styles['Heading 1']
    font = style_h1.font
    font.name = 'Arial'
    font.rtl = True
    
    for i, text in tqdm(enumerate(texts), position=0):
        heading = document.add_heading(f'صفحه: {i+1}', level=1)
        heading.alignment = WD_ALIGN_PARAGRAPH.RIGHT
        heading.style = document.styles['Heading 1']
        
        paragraph = document.add_paragraph(text)
        paragraph.alignment = WD_ALIGN_PARAGRAPH.RIGHT
        paragraph.style = document.styles['Normal']
    
    output_path = os.path.join(output_dir, f'{pdf_name}.docx')
    document.save(output_path)
    print(f'Done! Your document can be found here "{output_path}"')

In [3]:
# pdf_dir = 'D:/Spot The Bot Data/9'
# word_dir = 'D:/Spot The Bot Data/Word files/9'



# for pdf_file in os.listdir(pdf_dir)[159:]: 
#     if pdf_file.endswith('.pdf') or pdf_file.endswith('.PDF'):
#         pdf_path = os.path.join(pdf_dir, pdf_file)
#         output_dir = word_dir
#         pdf_to_word(pdf_path, output_dir)

In [4]:
pdf_dir = 'D:/Spot The Bot Data/10'
word_dir = 'D:/Spot The Bot Data/Word files/10'



for pdf_file in os.listdir(pdf_dir)[143:]: 
    if pdf_file.endswith('.pdf') or pdf_file.endswith('.PDF'):
        pdf_path = os.path.join(pdf_dir, pdf_file)
        output_dir = word_dir
        pdf_to_word(pdf_path, output_dir)

PDF is preparing to convert into document [#238 pages]


238it [07:51,  1.98s/it]
238it [00:01, 236.48it/s]


Done! Your document can be found here "D:/Spot The Bot Data/Word files/10\سيرت جلال_الدين خوارزمشاه منكُبرنی.docx"
PDF is preparing to convert into document [#126 pages]


126it [03:52,  1.84s/it]
126it [00:00, 224.91it/s]


Done! Your document can be found here "D:/Spot The Bot Data/Word files/10\سپتامبر بی_باران.docx"
PDF is preparing to convert into document [#225 pages]


225it [07:10,  1.91s/it]
225it [00:00, 262.86it/s]


Done! Your document can be found here "D:/Spot The Bot Data/Word files/10\سپید.docx"
PDF is preparing to convert into document [#194 pages]


194it [04:35,  1.42s/it]
194it [00:00, 288.34it/s]


Done! Your document can be found here "D:/Spot The Bot Data/Word files/10\سیاست1111.docx"
PDF is preparing to convert into document [#70 pages]


70it [00:56,  1.25it/s]
70it [00:00, 294.71it/s]


Done! Your document can be found here "D:/Spot The Bot Data/Word files/10\سیاه_نامه مقدس.docx"
PDF is preparing to convert into document [#73 pages]


73it [01:27,  1.20s/it]
73it [00:00, 264.68it/s]


Done! Your document can be found here "D:/Spot The Bot Data/Word files/10\شازده.docx"
PDF is preparing to convert into document [#591 pages]


591it [13:09,  1.34s/it]
591it [00:04, 132.12it/s]


Done! Your document can be found here "D:/Spot The Bot Data/Word files/10\شاه-عباس-اثر-منوچهر-مطیعی.docx"
PDF is preparing to convert into document [#16 pages]


16it [00:31,  1.97s/it]
16it [00:00, 163.46it/s]


Done! Your document can be found here "D:/Spot The Bot Data/Word files/10\شخصيت عيسي مسيح در متون مانوي.docx"
PDF is preparing to convert into document [#31 pages]


31it [00:21,  1.45it/s]
31it [00:00, 314.05it/s]


Done! Your document can be found here "D:/Spot The Bot Data/Word files/10\شخصیت_شناسی به زبان ساده.docx"
PDF is preparing to convert into document [#106 pages]


106it [02:59,  1.69s/it]
106it [00:00, 245.14it/s]


Done! Your document can be found here "D:/Spot The Bot Data/Word files/10\شرح-زمانی-رویداد-های-مهم-سیاسی-افغانستان.docx"
PDF is preparing to convert into document [#500 pages]


500it [15:34,  1.87s/it]
500it [00:02, 240.07it/s]


Done! Your document can be found here "D:/Spot The Bot Data/Word files/10\شناخت و سنجش مارکسیسم.docx"
PDF is preparing to convert into document [#257 pages]


257it [05:32,  1.30s/it]
257it [00:01, 237.31it/s]


Done! Your document can be found here "D:/Spot The Bot Data/Word files/10\شناخت-اساطیر-یونان-اثر-جان-پین-سنت-.docx"
PDF is preparing to convert into document [#11 pages]


11it [00:24,  2.22s/it]
11it [00:00, 299.96it/s]


Done! Your document can be found here "D:/Spot The Bot Data/Word files/10\شهرستان_هاي ايران.docx"
PDF is preparing to convert into document [#85 pages]


85it [01:03,  1.33it/s]
85it [00:00, 326.03it/s]


Done! Your document can be found here "D:/Spot The Bot Data/Word files/10\شهرياران طبرستان.docx"
PDF is preparing to convert into document [#148 pages]


148it [03:39,  1.48s/it]
148it [00:00, 290.62it/s]


Done! Your document can be found here "D:/Spot The Bot Data/Word files/10\شهریار - ماکیاولی.docx"
PDF is preparing to convert into document [#39 pages]


39it [00:56,  1.45s/it]
39it [00:00, 283.63it/s]


Done! Your document can be found here "D:/Spot The Bot Data/Word files/10\شيخ صفي و تبارش.docx"
PDF is preparing to convert into document [#97 pages]


97it [03:39,  2.26s/it]
97it [00:00, 130.21it/s]


Done! Your document can be found here "D:/Spot The Bot Data/Word files/10\صد در نثر و سد در بندهش.docx"
PDF is preparing to convert into document [#81 pages]


81it [00:48,  1.68it/s]
81it [00:00, 336.31it/s]


Done! Your document can be found here "D:/Spot The Bot Data/Word files/10\صمد بهرنگی؛ با موج_های ارس به دریا پیوست....docx"
PDF is preparing to convert into document [#24 pages]


24it [00:23,  1.03it/s]
24it [00:00, 325.42it/s]


Done! Your document can be found here "D:/Spot The Bot Data/Word files/10\صنعان.docx"
PDF is preparing to convert into document [#95 pages]


95it [02:18,  1.46s/it]
95it [00:00, 257.36it/s]


Done! Your document can be found here "D:/Spot The Bot Data/Word files/10\ضمیر پنهان.docx"
PDF is preparing to convert into document [#81 pages]


81it [01:34,  1.16s/it]
81it [00:00, 251.43it/s]


Done! Your document can be found here "D:/Spot The Bot Data/Word files/10\ظهور تيمور.docx"
PDF is preparing to convert into document [#508 pages]


508it [08:35,  1.01s/it]
508it [00:01, 316.93it/s]


Done! Your document can be found here "D:/Spot The Bot Data/Word files/10\ظهور و سقوط اتحاد جماهیر شوروی.docx"
PDF is preparing to convert into document [#245 pages]


245it [04:41,  1.15s/it]
245it [00:00, 277.91it/s]


Done! Your document can be found here "D:/Spot The Bot Data/Word files/10\ظهور و سقوط مدرن.docx"
PDF is preparing to convert into document [#738 pages]


738it [13:14,  1.08s/it]
738it [00:02, 251.15it/s]


Done! Your document can be found here "D:/Spot The Bot Data/Word files/10\عالم آرای صفوی.docx"
PDF is preparing to convert into document [#16 pages]


16it [00:14,  1.12it/s]
16it [00:00, 295.80it/s]


Done! Your document can be found here "D:/Spot The Bot Data/Word files/10\عشق و شور زندگی.docx"
PDF is preparing to convert into document [#13 pages]


13it [00:19,  1.52s/it]
13it [00:00, 200.95it/s]


Done! Your document can be found here "D:/Spot The Bot Data/Word files/10\عهدنامه گلستان.docx"
PDF is preparing to convert into document [#136 pages]


136it [05:25,  2.39s/it]
136it [00:00, 234.09it/s]


Done! Your document can be found here "D:/Spot The Bot Data/Word files/10\عوارف المعارف_.docx"
PDF is preparing to convert into document [#304 pages]


304it [12:22,  2.44s/it]
304it [00:01, 212.39it/s]


Done! Your document can be found here "D:/Spot The Bot Data/Word files/10\فتوح_البلدان (بلاذري).docx"
PDF is preparing to convert into document [#225 pages]


225it [05:05,  1.36s/it]
225it [00:00, 243.53it/s]


Done! Your document can be found here "D:/Spot The Bot Data/Word files/10\فقر فلسفه.docx"
PDF is preparing to convert into document [#86 pages]


86it [02:25,  1.69s/it]
86it [00:00, 226.36it/s]


Done! Your document can be found here "D:/Spot The Bot Data/Word files/10\فلسفه فلوطین.docx"
PDF is preparing to convert into document [#100 pages]


100it [01:42,  1.02s/it]
100it [00:00, 266.98it/s]


Done! Your document can be found here "D:/Spot The Bot Data/Word files/10\فلسفه فیزیک.docx"
PDF is preparing to convert into document [#126 pages]


126it [04:02,  1.93s/it]
126it [00:00, 249.99it/s]


Done! Your document can be found here "D:/Spot The Bot Data/Word files/10\فلسفه و کلام اسلامی.docx"
PDF is preparing to convert into document [#101 pages]


101it [03:46,  2.24s/it]
101it [00:00, 243.88it/s]


Done! Your document can be found here "D:/Spot The Bot Data/Word files/10\فیه ما فیه.docx"
PDF is preparing to convert into document [#137 pages]


137it [02:40,  1.17s/it]
137it [00:00, 289.30it/s]


Done! Your document can be found here "D:/Spot The Bot Data/Word files/10\قلعه حيوانات.docx"
PDF is preparing to convert into document [#100 pages]


100it [02:01,  1.22s/it]
100it [00:00, 259.31it/s]


Done! Your document can be found here "D:/Spot The Bot Data/Word files/10\قلعه کارپاتها.docx"
PDF is preparing to convert into document [#540 pages]


540it [14:14,  1.58s/it]
540it [00:02, 236.30it/s]


Done! Your document can be found here "D:/Spot The Bot Data/Word files/10\قلندریه-در-تاریخ-اثر-محمدرضا-شفیعی-کدکنی-.docx"
PDF is preparing to convert into document [#341 pages]


341it [06:04,  1.07s/it]
341it [00:01, 260.28it/s]


Done! Your document can be found here "D:/Spot The Bot Data/Word files/10\قوم از ياد رفته.docx"
PDF is preparing to convert into document [#21 pages]


21it [00:38,  1.86s/it]
21it [00:00, 208.46it/s]


Done! Your document can be found here "D:/Spot The Bot Data/Word files/10\كارنامه اردشيربابكان.docx"
PDF is preparing to convert into document [#438 pages]


438it [15:00,  2.06s/it]
438it [00:02, 193.21it/s]


Done! Your document can be found here "D:/Spot The Bot Data/Word files/10\محتشم.docx"
PDF is preparing to convert into document [#362 pages]


362it [06:45,  1.12s/it]
362it [00:01, 237.12it/s]


Done! Your document can be found here "D:/Spot The Bot Data/Word files/10\ژاپن در گذشته و حال.docx"
