In [2]:
# The following code extracts text from the pdf files and the dataframe is then converted to a pickle file. The language used is English.

import os
import PyPDF2
import pandas as pd

directory = '/home/pratimathapa/code/PratimaThapa/DisLex_Project/raw_data/text_data_eng' # Add the path to your directory

# List all files in the directory
files = os.listdir(directory)

# Keep the PDF files, skipping those whose names end with "_scanned.pdf"
pdf_files = [file for file in files if file.endswith('.pdf') and not file.endswith('_scanned.pdf')]

data = []

for file_name in pdf_files:
    file_path = os.path.join(directory, file_name)
    # Country name is the file name without its extension; splitext strips
    # only the trailing '.pdf' and leaves the rest of the name untouched.
    country_name = os.path.splitext(file_name)[0]

    # Read the PDF while the file handle is open
    with open(file_path, 'rb') as pdf_file:
        pdf_reader = PyPDF2.PdfReader(pdf_file)
        # Concatenate the text of every page, in page order
        text = ''.join(page.extract_text() for page in pdf_reader.pages)

    # One record per PDF: country name, extracted text and language
    data.append({'Country': country_name, 'Text': text, 'Language': 'English'})

# Create a Pandas DataFrame with columns Country, Text and Language
df = pd.DataFrame(data)

# Save the DataFrame to a pickle file
df.to_pickle('text_eng.pkl')

# Load the saved pickle file back as a DataFrame to check the round trip
df_loaded = pd.read_pickle('text_eng.pkl')
print(df_loaded.head())


                  Country                                               Text  \
0                Cambodia  1 \n  \n \nKINGDOM OF CAMBODIA \nNATION  RELIG...   
1                   India  1 \n THE DISASTER MANAGEMENT ACT, 2005  \n____...   
2  Bosnia and Herzegovina   1  \n \nPursuant to Article IV4.a) of the Con...   
3                Barbados  CHAPTER 160A\nEMERGENCY MANAGEMENT\n2006-20\nT...   
4              Bangladesh   \nRegistered No. DA -1  \n \nBangladesh Gazet...   

  Language  
0  English  
1  English  
2  English  
3  English  
4  English  


In [3]:
# Extracting text from Spanish text data. Will be using PyMuPDF instead of PyPDF2.

import os
import fitz  # PyMuPDF
import pandas as pd

directory = '/home/pratimathapa/code/PratimaThapa/DisLex_Project/raw_data/text_data_spa'  # Update to the directory containing Spanish data

# List all files in the directory
files = os.listdir(directory)

# Keep only the PDF files
pdf_files = [file for file in files if file.endswith('.pdf')]

data = []

for file_name in pdf_files:
    file_path = os.path.join(directory, file_name)
    # Country name is the file name without its extension; splitext strips
    # only the trailing '.pdf' and leaves the rest of the name untouched.
    country_name = os.path.splitext(file_name)[0]

    # The context manager closes the document even if extraction raises
    with fitz.open(file_path) as pdf_document:
        # Concatenate the text of every page, in page order
        text = ''.join(page.get_text() for page in pdf_document)

    # One record per PDF: country name, extracted text and language
    data.append({'Country': country_name, 'Text': text, 'Language': 'Spanish'})

# Create a Pandas DataFrame with columns Country, Text and Language
df = pd.DataFrame(data)

# Display the DataFrame
print(df)

# Saving is currently disabled for this cell; uncomment to write the pickle
# file and reload it as a check.
#df.to_pickle('text_spa.pkl')
#df_loaded = pd.read_pickle('text_spa.pkl')
#print(df_loaded.head())


             Country                                               Text  \
0         Costa Rica  Nº  8488 \n \nLA ASAMBLEA LEGISLATIVA DE LA RE...   
1            Bolivia  LEY Nº 602\nLEY DE 14 DE NOVIEMBRE DE 2014\n \...   
2        El Salvador  ASAMBLEA LEGISLATIVA  -  REPUBLICA DE EL SALVA...   
3  Equatorial Guinea  j \nREPUBLICA DE GUIl\lEA ECUATORIAL \n, \nBOL...   
4              Spain  Ley 17/2015, de 9 de julio, del Sistema Nacion...   

  Language  
0  Spanish  
1  Spanish  
2  Spanish  
3  Spanish  
4  Spanish  


In [4]:
# Extracting text from Portuguese text data.

import os
import fitz  # PyMuPDF
import pandas as pd

directory = '/home/pratimathapa/code/PratimaThapa/DisLex_Project/raw_data/text_data_por'  # Update to the directory containing Portuguese data

# List all files in the directory
files = os.listdir(directory)

# Keep only the PDF files
pdf_files = [file for file in files if file.endswith('.pdf')]

data = []

for file_name in pdf_files:
    file_path = os.path.join(directory, file_name)
    # Country name is the file name without its extension; splitext strips
    # only the trailing '.pdf' and leaves the rest of the name untouched.
    country_name = os.path.splitext(file_name)[0]

    # The context manager closes the document even if extraction raises
    with fitz.open(file_path) as pdf_document:
        # Concatenate the text of every page, in page order
        text = ''.join(page.get_text() for page in pdf_document)

    # One record per PDF: country name, extracted text and language
    data.append({'Country': country_name, 'Text': text, 'Language': 'Portuguese'})

# Create a Pandas DataFrame with columns Country, Text and Language
df = pd.DataFrame(data)

# Save the DataFrame to a pickle file
df.to_pickle('text_por.pkl')

# Load the saved pickle file back as a DataFrame to check the round trip
df_loaded = pd.read_pickle('text_por.pkl')
print(df_loaded)


                 Country                                               Text  \
0                 Brazil  1 \n \nPresidência da República \nCasa Civil \...   
1               Portugal   \nInternal \nLei de Bases da Protecção Civil ...   
2            Angola_2020  Sexa-feira, 22 de Maio de 2020 \nI Serie-N.o 6...   
3  Sao Tome and Principe   \nTerça\nTerça\nTerça\nTerça        Feira\nFe...   
4             Cabo Verde  BOLETIM OFICIAL\nQuarta-feira, 7 de Março de 2...   

     Language  
0  Portuguese  
1  Portuguese  
2  Portuguese  
3  Portuguese  
4  Portuguese  


In [5]:
# Extracting text from French text data.

import os
import fitz  # PyMuPDF
import pandas as pd

directory = '/home/pratimathapa/code/PratimaThapa/DisLex_Project/raw_data/text_data_fra'  # Update to the directory containing French data

# List all files in the directory
files = os.listdir(directory)

# Keep only the PDF files
pdf_files = [file for file in files if file.endswith('.pdf')]

data = []

for file_name in pdf_files:
    file_path = os.path.join(directory, file_name)
    # Country name is the file name without its extension; splitext strips
    # only the trailing '.pdf' and leaves the rest of the name untouched.
    country_name = os.path.splitext(file_name)[0]

    # The context manager closes the document even if extraction raises
    with fitz.open(file_path) as pdf_document:
        # Concatenate the text of every page, in page order
        text = ''.join(page.get_text() for page in pdf_document)

    # One record per PDF: country name, extracted text and language
    data.append({'Country': country_name, 'Text': text, 'Language': 'French'})

# Create a Pandas DataFrame with columns Country, Text and Language
df = pd.DataFrame(data)

# Save the DataFrame to a pickle file
df.to_pickle('text_fra.pkl')

# Load the saved pickle file back as a DataFrame to check the round trip
df_loaded = pd.read_pickle('text_fra.pkl')
print(df_loaded)


        Country                                               Text Language
0  Burkina Faso       BURKINA  FASO  \n \n \n IVE REPUBLIQUE \n...   French
1       Belgium  LOI DE BASE \n08/2021  p. 1/56 \n \nLOI DU 15 ...   French
2   Switzerland  1 \nLoi fédérale \nsur la protection de la pop...   French
3    Madagascar   \n1 \n \n \n \n \n \n \nASSEMBLEE  NATIONALE ...   French
4        France  Ordonnance n  2012-351 du 12 mars 2012        ...   French
5          Togo  40\n. JOURNAL OFFICIEL DE LA REPUBLIQUE TOGOLA...   French
        Country                                               Text Language
0  Burkina Faso       BURKINA  FASO  \n \n \n IVE REPUBLIQUE \n...   French
1       Belgium  LOI DE BASE \n08/2021  p. 1/56 \n \nLOI DU 15 ...   French
2   Switzerland  1 \nLoi fédérale \nsur la protection de la pop...   French
3    Madagascar   \n1 \n \n \n \n \n \n \nASSEMBLEE  NATIONALE ...   French
4        France  Ordonnance n  2012-351 du 12 mars 2012        ...   French
5          T

In [6]:
# Extracting text for Arabic

import os
import fitz  # PyMuPDF
import pandas as pd

directory = '/home/pratimathapa/code/PratimaThapa/DisLex_Project/raw_data/text_data_arabic'  # Update to the directory containing Arabic data

# List all files in the directory
files = os.listdir(directory)

# Keep only the PDF files
pdf_files = [file for file in files if file.endswith('.pdf')]

data = []

for file_name in pdf_files:
    file_path = os.path.join(directory, file_name)
    # Country name is the file name without its extension; splitext strips
    # only the trailing '.pdf' and leaves the rest of the name untouched.
    country_name = os.path.splitext(file_name)[0]

    # The context manager closes the document even if extraction raises
    with fitz.open(file_path) as pdf_document:
        # Concatenate the text of every page, in page order
        text = ''.join(page.get_text() for page in pdf_document)

    # One record per PDF: country name, extracted text and language
    data.append({'Country': country_name, 'Text': text, 'Language': 'Arabic'})

# Create a Pandas DataFrame with columns Country, Text and Language
df = pd.DataFrame(data)

# Saving is currently disabled for this cell; uncomment to write the pickle
# file and reload it as a check.
#df.to_pickle('text_arabic.pkl')
#df_loaded = pd.read_pickle('text_arabic.pkl')
#print(df_loaded)


     Country                                               Text Language
0  Palestine   نوناق  ( مقر3  ةنسل )1998عافدلا نأشب م يندملا...   Arabic


In [7]:
# Extracting text for Russian language

import os
import fitz  # PyMuPDF
import pandas as pd

directory = '/home/pratimathapa/code/PratimaThapa/DisLex_Project/raw_data/text_data_russian'  # Update to the directory containing Russian data

# List all files in the directory
files = os.listdir(directory)

# Keep only the PDF files
pdf_files = [file for file in files if file.endswith('.pdf')]

data = []

for file_name in pdf_files:
    file_path = os.path.join(directory, file_name)
    # Country name is the file name without its extension; splitext strips
    # only the trailing '.pdf' and leaves the rest of the name untouched.
    country_name = os.path.splitext(file_name)[0]

    # The context manager closes the document even if extraction raises
    with fitz.open(file_path) as pdf_document:
        # Concatenate the text of every page, in page order
        text = ''.join(page.get_text() for page in pdf_document)

    # One record per PDF: country name, extracted text and language
    data.append({'Country': country_name, 'Text': text, 'Language': 'Russian'})

# Create a Pandas DataFrame with columns Country, Text and Language
df = pd.DataFrame(data)

# Save the DataFrame to a pickle file
df.to_pickle('text_russian.pkl')

# Load the saved pickle file back as a DataFrame to check the round trip
df_loaded = pd.read_pickle('text_russian.pkl')
print(df_loaded)


   Country                                               Text Language
0  Belarus   \n \nPublic \n \nЗакон Республики Беларусь от...  Russian


In [8]:
# Extracting text from Danish language data

import os
import fitz  # PyMuPDF
import pandas as pd

directory = '/home/pratimathapa/code/PratimaThapa/DisLex_Project/raw_data/text_data_danish'  # Update to the directory containing Danish data

# List all files in the directory
files = os.listdir(directory)

# Keep only the PDF files
pdf_files = [file for file in files if file.endswith('.pdf')]

data = []

for file_name in pdf_files:
    file_path = os.path.join(directory, file_name)
    # Country name is the file name without its extension; splitext strips
    # only the trailing '.pdf' and leaves the rest of the name untouched.
    country_name = os.path.splitext(file_name)[0]

    # The context manager closes the document even if extraction raises
    with fitz.open(file_path) as pdf_document:
        # Concatenate the text of every page, in page order
        text = ''.join(page.get_text() for page in pdf_document)

    # One record per PDF: country name, extracted text and language
    data.append({'Country': country_name, 'Text': text, 'Language': 'Danish'})

# Create a Pandas DataFrame with columns Country, Text and Language
df = pd.DataFrame(data)

# Saving is currently disabled for this cell; uncomment to write the pickle
# file and reload it as a check.
#df.to_pickle('text_danish.pkl')
#df_loaded = pd.read_pickle('text_danish.pkl')
#print(df_loaded)


   Country                                               Text Language
0  Denmark  Udskriftsdato: 22. februar 2022\nLBK nr 314 af...   Danish


In [9]:
# Extracting text from Finnish data

import os
import fitz  # PyMuPDF
import pandas as pd

directory = '/home/pratimathapa/code/PratimaThapa/DisLex_Project/raw_data/text_data_finnish'  # Update to the directory containing Finnish data

# List all files in the directory
files = os.listdir(directory)

# Keep only the PDF files
pdf_files = [file for file in files if file.endswith('.pdf')]

data = []

for file_name in pdf_files:
    file_path = os.path.join(directory, file_name)
    # Country name is the file name without its extension; splitext strips
    # only the trailing '.pdf' and leaves the rest of the name untouched.
    country_name = os.path.splitext(file_name)[0]

    # The context manager closes the document even if extraction raises
    with fitz.open(file_path) as pdf_document:
        # Concatenate the text of every page, in page order
        text = ''.join(page.get_text() for page in pdf_document)

    # One record per PDF: country name, extracted text and language
    data.append({'Country': country_name, 'Text': text, 'Language': 'Finnish'})

# Create a Pandas DataFrame with columns Country, Text and Language
df = pd.DataFrame(data)

# Saving is currently disabled for this cell; uncomment to write the pickle
# file and reload it as a check.
#df.to_pickle('text_finnish.pkl')
#df_loaded = pd.read_pickle('text_finnish.pkl')
#print(df_loaded)


   Country                                               Text Language
0  Finland  2/22/22, 4:12 PM\nValmiuslaki 1552/2011 - Ajan...  Finnish
   Country                                               Text Language
0  Finland  2/22/22, 4:12 PM\nValmiuslaki 1552/2011 - Ajan...  Finnish


In [10]:
# Extracting text from German language

import os
import fitz  # PyMuPDF
import pandas as pd

directory = '/home/pratimathapa/code/PratimaThapa/DisLex_Project/raw_data/text_data_german'  # Update to the directory containing German data

# List all files in the directory
files = os.listdir(directory)

# Keep only the PDF files
pdf_files = [file for file in files if file.endswith('.pdf')]

data = []

for file_name in pdf_files:
    file_path = os.path.join(directory, file_name)
    # Country name is the file name without its extension; splitext strips
    # only the trailing '.pdf' and leaves the rest of the name untouched.
    country_name = os.path.splitext(file_name)[0]

    # The context manager closes the document even if extraction raises
    with fitz.open(file_path) as pdf_document:
        # Concatenate the text of every page, in page order
        text = ''.join(page.get_text() for page in pdf_document)

    # One record per PDF: country name, extracted text and language
    data.append({'Country': country_name, 'Text': text, 'Language': 'German'})

# Create a Pandas DataFrame with columns Country, Text and Language
df = pd.DataFrame(data)

# Save the DataFrame to a pickle file
df.to_pickle('text_german.pkl')

# Load the saved pickle file back as a DataFrame to check the round trip
df_loaded = pd.read_pickle('text_german.pkl')
print(df_loaded)


   Country                                               Text Language
0  Germany  Ein Service des Bundesministeriums der Justiz ...   German


In [11]:
# Extracting text from Hungarian

import os
import fitz  # PyMuPDF
import pandas as pd

directory = '/home/pratimathapa/code/PratimaThapa/DisLex_Project/raw_data/text_data_hungarian'  # Update to the directory containing Hungarian data

# List all files in the directory
files = os.listdir(directory)

# Keep only the PDF files
pdf_files = [file for file in files if file.endswith('.pdf')]

data = []

for file_name in pdf_files:
    file_path = os.path.join(directory, file_name)
    # Country name is the file name without its extension; splitext strips
    # only the trailing '.pdf' and leaves the rest of the name untouched.
    country_name = os.path.splitext(file_name)[0]

    # The context manager closes the document even if extraction raises
    with fitz.open(file_path) as pdf_document:
        # Concatenate the text of every page, in page order
        text = ''.join(page.get_text() for page in pdf_document)

    # One record per PDF: country name, extracted text and language
    data.append({'Country': country_name, 'Text': text, 'Language': 'Hungarian'})

# Create a Pandas DataFrame with columns Country, Text and Language
df = pd.DataFrame(data)

# Saving is currently disabled for this cell; uncomment to write the pickle
# file and reload it as a check.
#df.to_pickle('text_hungarian.pkl')
#df_loaded = pd.read_pickle('text_hungarian.pkl')
#print(df_loaded)


   Country                                               Text   Language
0  Hungary  3/1/22, 9:48 AM\n2011. évi CXXVIII. törvény a ...  Hungarian
   Country                                               Text   Language
0  Hungary  3/1/22, 9:48 AM\n2011. évi CXXVIII. törvény a ...  Hungarian


In [12]:
# Extracting text from Latvian.

import os
import fitz  # PyMuPDF
import pandas as pd

directory = '/home/pratimathapa/code/PratimaThapa/DisLex_Project/raw_data/text_data_latvian'  # Update to the directory containing Latvian data

# List all files in the directory
files = os.listdir(directory)

# Keep only the PDF files
pdf_files = [file for file in files if file.endswith('.pdf')]

data = []

for file_name in pdf_files:
    file_path = os.path.join(directory, file_name)
    # Country name is the file name without its extension; splitext strips
    # only the trailing '.pdf' and leaves the rest of the name untouched.
    country_name = os.path.splitext(file_name)[0]

    # The context manager closes the document even if extraction raises
    with fitz.open(file_path) as pdf_document:
        # Concatenate the text of every page, in page order
        text = ''.join(page.get_text() for page in pdf_document)

    # One record per PDF: country name, extracted text and language
    data.append({'Country': country_name, 'Text': text, 'Language': 'Latvian'})

# Create a Pandas DataFrame with columns Country, Text and Language
df = pd.DataFrame(data)

# Saving is currently disabled for this cell; uncomment to write the pickle
# file and reload it as a check.
#df.to_pickle('text_latvian.pkl')
#df_loaded = pd.read_pickle('text_latvian.pkl')
#print(df_loaded)


  Country                                               Text Language
0  Latvia  Publicēts:\nLatvijas Vēstnesis, 100,\n25.05.20...  Latvian


In [17]:
# Extracting text from Dutch

import os
import fitz  # PyMuPDF
import pandas as pd

directory = '/home/pratimathapa/code/PratimaThapa/DisLex_Project/raw_data/text_data_dutch'  # Update to the directory containing Dutch data

# List all files in the directory
files = os.listdir(directory)

# Keep only the PDF files
pdf_files = [file for file in files if file.endswith('.pdf')]

data = []

for file_name in pdf_files:
    file_path = os.path.join(directory, file_name)
    # Country name is the file name without its extension; splitext strips
    # only the trailing '.pdf' and leaves the rest of the name untouched.
    country_name = os.path.splitext(file_name)[0]

    # The context manager closes the document even if extraction raises
    with fitz.open(file_path) as pdf_document:
        # Concatenate the text of every page, in page order
        text = ''.join(page.get_text() for page in pdf_document)

    # One record per PDF: country name, extracted text and language
    data.append({'Country': country_name, 'Text': text, 'Language': 'Dutch'})

# Create a Pandas DataFrame with columns Country, Text and Language
df = pd.DataFrame(data)

# Saving is currently disabled for this cell; uncomment to write the pickle
# file and reload it as a check.
#df.to_pickle('text_dutch.pkl')
#df_loaded = pd.read_pickle('text_dutch.pkl')
#print(df_loaded)


       Country                                               Text Language
0  Netherlands  2/21/22, 4:12 PM\nwetten.nl - Regeling - Wet v...    Dutch


In [16]:
# Extracting text from Norwegian

import os
import fitz  # PyMuPDF
import pandas as pd

directory = '/home/pratimathapa/code/PratimaThapa/DisLex_Project/raw_data/text_data_norwegian'  # Update to the directory containing Norwegian data

# List all files in the directory
files = os.listdir(directory)

# Keep only the PDF files
pdf_files = [file for file in files if file.endswith('.pdf')]

data = []

for file_name in pdf_files:
    file_path = os.path.join(directory, file_name)
    # Country name is the file name without its extension; splitext strips
    # only the trailing '.pdf' and leaves the rest of the name untouched.
    country_name = os.path.splitext(file_name)[0]

    # The context manager closes the document even if extraction raises
    with fitz.open(file_path) as pdf_document:
        # Concatenate the text of every page, in page order
        text = ''.join(page.get_text() for page in pdf_document)

    # One record per PDF: country name, extracted text and language
    data.append({'Country': country_name, 'Text': text, 'Language': 'Norwegian'})

# Create a Pandas DataFrame with columns Country, Text and Language
df = pd.DataFrame(data)

# Saving is currently disabled for this cell; uncomment to write the pickle
# file and reload it as a check.
#df.to_pickle('text_norwegian.pkl')
#df_loaded = pd.read_pickle('text_norwegian.pkl')
#print(df_loaded)


  Country                                               Text   Language
0  Norway  2/21/22, 4:12 PM\nwetten.nl - Regeling - Wet v...  Norwegian


In [18]:
# Extracting text from Polish

import os
import fitz  # PyMuPDF
import pandas as pd

directory = '/home/pratimathapa/code/PratimaThapa/DisLex_Project/raw_data/text_data_polish'  # Update to the directory containing Polish data

# List all files in the directory
files = os.listdir(directory)

# Keep only the PDF files
pdf_files = [file for file in files if file.endswith('.pdf')]

data = []

for file_name in pdf_files:
    file_path = os.path.join(directory, file_name)
    # Country name is the file name without its extension; splitext strips
    # only the trailing '.pdf' and leaves the rest of the name untouched.
    country_name = os.path.splitext(file_name)[0]

    # The context manager closes the document even if extraction raises
    with fitz.open(file_path) as pdf_document:
        # Concatenate the text of every page, in page order
        text = ''.join(page.get_text() for page in pdf_document)

    # One record per PDF: country name, extracted text and language
    data.append({'Country': country_name, 'Text': text, 'Language': 'Polish'})

# Create a Pandas DataFrame with columns Country, Text and Language
df = pd.DataFrame(data)

# Save the DataFrame to a pickle file
df.to_pickle('text_polish.pkl')

# Load the saved pickle file back as a DataFrame to check the round trip
df_loaded = pd.read_pickle('text_polish.pkl')
print(df_loaded)


  Country                                               Text Language
0  Poland  ©Kancelaria Sejmu \n \n \n \ns. 1/33 \n \n \n ...   Polish


In [19]:
# Extracting text from Serbian

import os
import fitz  # PyMuPDF
import pandas as pd

directory = '/home/pratimathapa/code/PratimaThapa/DisLex_Project/raw_data/text_data_serbian'  # Update to the directory containing Serbian data

# List all files in the directory
files = os.listdir(directory)

# Keep only the PDF files
pdf_files = [file for file in files if file.endswith('.pdf')]

data = []

for file_name in pdf_files:
    file_path = os.path.join(directory, file_name)
    # Country name is the file name without its extension; splitext strips
    # only the trailing '.pdf' and leaves the rest of the name untouched.
    country_name = os.path.splitext(file_name)[0]

    # The context manager closes the document even if extraction raises
    with fitz.open(file_path) as pdf_document:
        # Concatenate the text of every page, in page order
        text = ''.join(page.get_text() for page in pdf_document)

    # One record per PDF: country name, extracted text and language
    data.append({'Country': country_name, 'Text': text, 'Language': 'Serbian'})

# Create a Pandas DataFrame with columns Country, Text and Language
df = pd.DataFrame(data)

# Save the DataFrame to a pickle file
df.to_pickle('text_serbian.pkl')

# Load the saved pickle file back as a DataFrame to check the round trip
df_loaded = pd.read_pickle('text_serbian.pkl')
print(df_loaded)


  Country                                               Text Language
0  Serbia   \nPublic \n     Преузето са www.pravno-inform...  Serbian


In [20]:
# Extracting text from Swedish

import os
import fitz  # PyMuPDF
import pandas as pd

directory = '/home/pratimathapa/code/PratimaThapa/DisLex_Project/raw_data/text_data_swedish'  # Update to the directory containing Swedish data

# List all files in the directory
files = os.listdir(directory)

# Keep only the PDF files
pdf_files = [file for file in files if file.endswith('.pdf')]

data = []

for file_name in pdf_files:
    file_path = os.path.join(directory, file_name)
    # Country name is the file name without its extension; splitext strips
    # only the trailing '.pdf' and leaves the rest of the name untouched.
    country_name = os.path.splitext(file_name)[0]

    # The context manager closes the document even if extraction raises
    with fitz.open(file_path) as pdf_document:
        # Concatenate the text of every page, in page order
        text = ''.join(page.get_text() for page in pdf_document)

    # One record per PDF: country name, extracted text and language
    data.append({'Country': country_name, 'Text': text, 'Language': 'Swedish'})

# Create a Pandas DataFrame with columns Country, Text and Language
df = pd.DataFrame(data)

# Saving is currently disabled for this cell; uncomment to write the pickle
# file and reload it as a check.
#df.to_pickle('text_swedish.pkl')
#df_loaded = pd.read_pickle('text_swedish.pkl')
#print(df_loaded)


  Country                                               Text Language
0  Sweden  Start  Dokument & lagar  Lag (2003:778) om sky...  Swedish


In [21]:
# Extracting text from Japanese

import os
import fitz  # PyMuPDF
import pandas as pd

directory = '/home/pratimathapa/code/PratimaThapa/DisLex_Project/raw_data/text_data_japanese'  # Update to the directory containing Japanese data

# List all files in the directory
files = os.listdir(directory)

# Keep only the PDF files
pdf_files = [file for file in files if file.endswith('.pdf')]

data = []

for file_name in pdf_files:
    file_path = os.path.join(directory, file_name)
    # Country name is the file name without its extension; splitext strips
    # only the trailing '.pdf' and leaves the rest of the name untouched.
    country_name = os.path.splitext(file_name)[0]

    # The context manager closes the document even if extraction raises
    with fitz.open(file_path) as pdf_document:
        # Concatenate the text of every page, in page order
        text = ''.join(page.get_text() for page in pdf_document)

    # One record per PDF: country name, extracted text and language
    data.append({'Country': country_name, 'Text': text, 'Language': 'Japanese'})

# Create a Pandas DataFrame with columns Country, Text and Language
df = pd.DataFrame(data)

# Save the DataFrame to a pickle file
df.to_pickle('text_japanese.pkl')

# Load the saved pickle file back as a DataFrame to check the round trip
df_loaded = pd.read_pickle('text_japanese.pkl')
print(df_loaded)


  Country                                               Text  Language
0   Japan  11/22/21, 4:36 PM\n災害対策基本法 | e-Gov法令検索\nhttps:...  Japanese


In [23]:
# Extracting text from Dhivehi

import os
import fitz  # PyMuPDF
import pandas as pd

directory = '/home/pratimathapa/code/PratimaThapa/DisLex_Project/raw_data/text_data_dhivehi'  # Update to the directory containing Dhivehi data

# List all files in the directory
files = os.listdir(directory)

# Keep only the PDF files
pdf_files = [file for file in files if file.endswith('.pdf')]

data = []

for file_name in pdf_files:
    file_path = os.path.join(directory, file_name)
    # Country name is the file name without its extension; splitext strips
    # only the trailing '.pdf' and leaves the rest of the name untouched.
    country_name = os.path.splitext(file_name)[0]

    # The context manager closes the document even if extraction raises
    with fitz.open(file_path) as pdf_document:
        # Concatenate the text of every page, in page order
        text = ''.join(page.get_text() for page in pdf_document)

    # One record per PDF: country name, extracted text and language
    data.append({'Country': country_name, 'Text': text, 'Language': 'Dhivehi'})

# Create a Pandas DataFrame with columns Country, Text and Language
df = pd.DataFrame(data)

# Save the DataFrame to a pickle file
df.to_pickle('text_dhivehi.pkl')

# Load the saved pickle file back as a DataFrame to check the round trip
df_loaded = pd.read_pickle('text_dhivehi.pkl')
print(df_loaded)


    Country                                               Text Language
0  Maldives  ްސީފޮއ ެގާޔްއިރޫހްމުޖްލުސީއަރ\nުގަމުނާފުރުކަތު...  Dhivehi


In [24]:
# Extracting text from Mongolian

import os
import fitz  # PyMuPDF
import pandas as pd

directory = '/home/pratimathapa/code/PratimaThapa/DisLex_Project/raw_data/text_data_mongolian'  # Update to the directory containing Mongolian data

# List all files in the directory
files = os.listdir(directory)

# Keep only the PDF files
pdf_files = [file for file in files if file.endswith('.pdf')]

data = []

for file_name in pdf_files:
    file_path = os.path.join(directory, file_name)
    # Country name is the file name without its extension; splitext strips
    # only the trailing '.pdf' and leaves the rest of the name untouched.
    country_name = os.path.splitext(file_name)[0]

    # The context manager closes the document even if extraction raises
    with fitz.open(file_path) as pdf_document:
        # Concatenate the text of every page, in page order
        text = ''.join(page.get_text() for page in pdf_document)

    # One record per PDF: country name, extracted text and language
    data.append({'Country': country_name, 'Text': text, 'Language': 'Mongolian'})

# Create a Pandas DataFrame with columns Country, Text and Language
df = pd.DataFrame(data)

# Save the DataFrame to a pickle file
df.to_pickle('text_mongolian.pkl')

# Load the saved pickle file back as a DataFrame to check the round trip
df_loaded = pd.read_pickle('text_mongolian.pkl')
print(df_loaded)


    Country                                               Text   Language
0  Mongolia  МОНГОЛ УЛСЫН ХУУЛЬ\n2017 оны 02 сарын 02 өдөр\...  Mongolian


In [25]:
# Extracting text from Nepali language

import os
import fitz  # PyMuPDF
import pandas as pd

directory = '/home/pratimathapa/code/PratimaThapa/DisLex_Project/raw_data/text_data_nepali'  # Update to the directory containing Nepali data

# List all files in the directory
files = os.listdir(directory)

# Keep only the PDF files
pdf_files = [file for file in files if file.endswith('.pdf')]

data = []

for file_name in pdf_files:
    file_path = os.path.join(directory, file_name)
    # Country name is the file name without its extension; splitext strips
    # only the trailing '.pdf' and leaves the rest of the name untouched.
    country_name = os.path.splitext(file_name)[0]

    # The context manager closes the document even if extraction raises
    with fitz.open(file_path) as pdf_document:
        # Concatenate the text of every page, in page order
        text = ''.join(page.get_text() for page in pdf_document)

    # One record per PDF: country name, extracted text and language
    data.append({'Country': country_name, 'Text': text, 'Language': 'Nepali'})

# Create a Pandas DataFrame with columns Country, Text and Language
df = pd.DataFrame(data)

# Saving is currently disabled for this cell; uncomment to write the pickle
# file and reload it as a check.
#df.to_pickle('text_nepali.pkl')
#df_loaded = pd.read_pickle('text_nepali.pkl')
#print(df_loaded)


  Country                                               Text Language
0   Nepal   \n \nwww.lawcommission.gov.np \n1 \n \n \nविप...   Nepali


In [28]:
# Extracting text from the Vanuatu data (documents are in English)

import os
import fitz  # PyMuPDF
import pandas as pd

directory = '/home/pratimathapa/code/PratimaThapa/DisLex_Project/raw_data/text_data_vanuatu'  # Update to the directory containing the data

# List all files in the directory
files = os.listdir(directory)

# Keep only the PDF files
pdf_files = [file for file in files if file.endswith('.pdf')]

data = []

for file_name in pdf_files:
    file_path = os.path.join(directory, file_name)
    # Country name is the file name without its extension; splitext strips
    # only the trailing '.pdf' and leaves the rest of the name untouched.
    country_name = os.path.splitext(file_name)[0]

    # The context manager closes the document even if extraction raises
    with fitz.open(file_path) as pdf_document:
        # Concatenate the text of every page, in page order
        text = ''.join(page.get_text() for page in pdf_document)

    # One record per PDF: country name, extracted text and language
    data.append({'Country': country_name, 'Text': text, 'Language': 'English'})

# Create a Pandas DataFrame with columns Country, Text and Language
df = pd.DataFrame(data)

# Save the DataFrame to a pickle file
df.to_pickle('text_vanuatu.pkl')

# Load the saved pickle file back as a DataFrame to check the round trip
df_loaded = pd.read_pickle('text_vanuatu.pkl')
print(df_loaded)


   Country                                               Text Language
0  Vanuatu  REPUBLIC OF VANUATU \nDISASTER RISK MANAGEMENT...  English


In [1]:
# The following code extracts text from the Tuvalu pdf files and the dataframe is then converted to a pickle file. The language used is English.

import os
import PyPDF2
import pandas as pd

directory = '/home/pratimathapa/code/PratimaThapa/DisLex_Project/raw_data/text_data_tuvalu' # Add the path to your directory

# List all files in the directory
files = os.listdir(directory)

# Keep the PDF files, skipping those whose names end with "_scanned.pdf"
pdf_files = [file for file in files if file.endswith('.pdf') and not file.endswith('_scanned.pdf')]

data = []

for file_name in pdf_files:
    file_path = os.path.join(directory, file_name)
    # Country name is the file name without its extension; splitext strips
    # only the trailing '.pdf' and leaves the rest of the name untouched.
    country_name = os.path.splitext(file_name)[0]

    # Read the PDF while the file handle is open
    with open(file_path, 'rb') as pdf_file:
        pdf_reader = PyPDF2.PdfReader(pdf_file)
        # Concatenate the text of every page, in page order
        text = ''.join(page.extract_text() for page in pdf_reader.pages)

    # One record per PDF: country name, extracted text and language
    data.append({'Country': country_name, 'Text': text, 'Language': 'English'})

# Create a Pandas DataFrame with columns Country, Text and Language
df = pd.DataFrame(data)

# Save the DataFrame to a pickle file
df.to_pickle('text_tuvalu.pkl')

# Load the saved pickle file back as a DataFrame to check the round trip
df_loaded = pd.read_pickle('text_tuvalu.pkl')
print(df_loaded.head())


       Country                                               Text Language
0  Tuvalu_2007   \n2007 Revised Edition \n  CAP. 20.38   \n  \...  English
1  Tuvalu_2014  National Disaster Management (Compensation \nA...  English
2  Tuvalu_2021   \n \n     \n\n\nNATIONAL DISASTER MA NAGE...  English
