In [1]:
import fitz  # PyMuPDF
import os
import pandas as pd
import pytesseract
from PIL import Image

# Directory path
directory = '/home/pratimathapa/code/PratimaThapa/DisLex_Project/raw_data/scanned_data_eng'

# Function to extract text from PDF using Tesseract OCR
def extract_text_with_ocr(pdf_path):
    text = ''
    try:
        document = fitz.open(pdf_path)
        for page in document:
            image_list = page.get_pixmap(alpha=False, matrix=fitz.Matrix(3, 3))
            img = Image.frombytes("RGB", [image_list.width, image_list.height], image_list.samples)
            text += pytesseract.image_to_string(img)
    except Exception as e:
        print(f"Exception occurred: {str(e)}")
    return text

# Function to get country name from file name
def get_country_name(file_name):
    return file_name.split('_scanned.pdf')[0]

# List to store data
data = []

# Loop through the directory
for filename in os.listdir(directory):
    if filename.endswith('_scanned.pdf'):
        file_path = os.path.join(directory, filename)
        country = get_country_name(filename)
        text = extract_text_with_ocr(file_path)

        # Append data to list
        data.append({'Country': country, 'Language': 'English', 'Text': text})

# Create DataFrame from the list of dictionaries
df = pd.DataFrame(data)

print(df)

# Save DataFrame to a pickle file
#df.to_pickle('scanned_eng.pkl')


                          Country Language  \
0                        Eswatini  English   
1                         Jamaica  English   
2        Turks and Caicos Islands  English   
3                      The Gambia  English   
4                    Cook Islands  English   
5                        Tanzania  English   
6  Federated States of Micronesia  English   

                                                Text  
0  piae jn\nPi SAT OR face\nSUPPLEMENT TO\nTHE\nS...  
1  16.\n\n17.\n\nDISASTER RISK MANAGEMENT\n\nTHE ...  
2  Page | of 70\n\n  \n\n*\n\nTURKS AND CAICOS IS...  
3   \nNATIONAL DISASTER MANAGEMENT ACT, 2008\n\n...  
4   \n\n \n\n \n\n \n\n \n\n \n\n \n\n \n\nPWNS\n...  
5  THE DISASTER MANAGEMENT ACT, 2015\n\nARRANGEME...  
6  PRESIDENTIAL COMm. NUs_@=L22eey\nFSM CONGRESS\...  


In [3]:
# Updating the previous code to extract Spanish language text

import fitz  # PyMuPDF
import os
import pandas as pd
import pytesseract
from PIL import Image

# Directory path
directory = '/home/pratimathapa/code/PratimaThapa/DisLex_Project/raw_data/scanned_data_spa'

# Function to extract text from PDF using Tesseract OCR with a specified language
def extract_text_with_ocr(pdf_path, language='spa'):
    text = ''
    try:
        document = fitz.open(pdf_path)
        for page in document:
            image_list = page.get_pixmap(alpha=False, matrix=fitz.Matrix(3, 3))
            img = Image.frombytes("RGB", [image_list.width, image_list.height], image_list.samples)
            text += pytesseract.image_to_string(img, lang=language)
    except Exception as e:
        print(f"Exception occurred: {str(e)}")
    return text

# Function to get country name from file name
def get_country_name(file_name):
    return file_name.split('_scanned.pdf')[0]

# List to store data
data = []

# Loop through the directory
for filename in os.listdir(directory):
    if filename.endswith('_scanned.pdf'):
        file_path = os.path.join(directory, filename)
        country = get_country_name(filename)
        text = extract_text_with_ocr(file_path, language='spa')  # Extract text in Spanish

        # Append data to list
        data.append({'Country': country, 'Language': 'Spanish', 'Text': text})  # Updated Language to 'Spanish'

# Create DataFrame from the list of dictionaries
df = pd.DataFrame(data)

print(df)

# Save DataFrame to a pickle file
#df.to_pickle('scanned_spa.pkl')


          Country Language                                               Text
0  Argentina_2020  Spanish  BOLETÍN OFICIAL\n\nde la República Argentina\n...
1        Paraguay  Spanish   \n\nPODER LEGISLATIVO\nLEY N*. 2615\n\nQUE CR...
2        Colombia  Spanish   \n\nvero: 1523 2 4 ABRIOIÍ\n\n“POR EL CUAL SE...


In [6]:
# Updating the previous code to extract Portuguese language text

import fitz  # PyMuPDF
import os
import pandas as pd
import pytesseract
from PIL import Image

# Directory path
directory = '/home/pratimathapa/code/PratimaThapa/DisLex_Project/raw_data/scanned_data_por'

# Function to extract text from PDF using Tesseract OCR with a specified language
def extract_text_with_ocr(pdf_path, language='por'):
    text = ''
    try:
        document = fitz.open(pdf_path)
        for page in document:
            image_list = page.get_pixmap(alpha=False, matrix=fitz.Matrix(3, 3))
            img = Image.frombytes("RGB", [image_list.width, image_list.height], image_list.samples)
            text += pytesseract.image_to_string(img, lang=language)
    except Exception as e:
        print(f"Exception occurred: {str(e)}")
    return text

# Function to get country name from file name
def get_country_name(file_name):
    return file_name.split('_scanned.pdf')[0]

# List to store data
data = []

# Loop through the directory
for filename in os.listdir(directory):
    if filename.endswith('_scanned.pdf'):
        file_path = os.path.join(directory, filename)
        country = get_country_name(filename)
        text = extract_text_with_ocr(file_path, language='por')  # Extract text in Portuguese

        # Append data to list
        data.append({'Country': country, 'Language': 'Portuguese', 'Text': text})  # Updated Language to 'Portuguese'

# Create DataFrame from the list of dictionaries
df = pd.DataFrame(data)

print(df)

# Save DataFrame to a pickle file
#df.to_pickle('scanned_por.pkl')


      Country    Language                                               Text
0  Angola_200  Portuguese  2262\n\n \n\nSUMÁRIO\n\nAssembleia Nacional\nL...


In [7]:
# Updating the previous code to extract French language text

import fitz  # PyMuPDF
import os
import pandas as pd
import pytesseract
from PIL import Image

# Directory path
directory = '/home/pratimathapa/code/PratimaThapa/DisLex_Project/raw_data/scanned_data_fra'

# Function to extract text from PDF using Tesseract OCR with a specified language
def extract_text_with_ocr(pdf_path, language='fra'):
    text = ''
    try:
        document = fitz.open(pdf_path)
        for page in document:
            image_list = page.get_pixmap(alpha=False, matrix=fitz.Matrix(3, 3))
            img = Image.frombytes("RGB", [image_list.width, image_list.height], image_list.samples)
            text += pytesseract.image_to_string(img, lang=language)
    except Exception as e:
        print(f"Exception occurred: {str(e)}")
    return text

# Function to get country name from file name
def get_country_name(file_name):
    return file_name.split('_scanned.pdf')[0]

# List to store data
data = []

# Loop through the directory
for filename in os.listdir(directory):
    if filename.endswith('_scanned.pdf'):
        file_path = os.path.join(directory, filename)
        country = get_country_name(filename)
        text = extract_text_with_ocr(file_path, language='fra')  # Extract text in French

        # Append data to list
        data.append({'Country': country, 'Language': 'French', 'Text': text})  # Updated Language to 'French'

# Create DataFrame from the list of dictionaries
df = pd.DataFrame(data)

print(df)

# Save DataFrame to a pickle file
#df.to_pickle('scanned_fra.pkl')


   Country Language                                               Text
0  Tunisia   French   \n\ntraduction française\n\nVendredi 2 dhoul-...
