In [2]:
!pip install pdfplumber



In [3]:
import pdfplumber
import pandas as pd

In [4]:
pdf_path = "UrhoboDictionary(1).pdf"

In [5]:
with pdfplumber.open(pdf_path) as pdf:
    for page_num, page in enumerate(pdf.pages):
        text = page.extract_text()
        if text:
            print(f"\n--- Page {page_num + 1} ---\n{text}")


--- Page 1 ---
URHOBO - ENGLISH DICTIONARY
by
Anthony Obakpọnọvwẹ Ukere
B.A. (HONS) Linguistics (UNIBEN), ANIPR
Typed in by George Sider for Kay Williamson (†).
This version edited by Roger Blench (Cambridge 2005)

--- Page 2 ---
Introduction
The present Urhobo dictionary was locally published in Nigeria, printed by Ilupeju Press, Benin City. The
original is not dated, but the preface is dated 1986, so perhaps this is the date of publication. It was typed
into the computer, originally into Macintosh and later transferred to PC at the behest of Kay Williamson (†)
who has some role in encouraging it s original publication.
Changes in the manuscript
The following changes were made by Roger Blench for this circulation version;
1. English corrected
2. Fonts changed from IPA Kiel to Times New Roman
3. Text formatted as a table
The original has a grammatical introduction, and an appendix listing Urhobo proper names and towns. The
original also has an IPA-like transcription following the main

In [6]:
pdf_path = "UrhoboDictionary(1).pdf"
all_tables = []

with pdfplumber.open(pdf_path) as pdf:
    for page in pdf.pages:
        table = page.extract_table()
        if table:
            df = pd.DataFrame(table[1:], columns=table[0])
            all_tables.append(df)

if all_tables:
    combined_df = pd.concat(all_tables, ignore_index=True)
    print(combined_df.head())
else:
    print("No tables found.")

No tables found.


In [7]:
import pdfplumber
import pandas as pd

In [8]:
pdf_path = "UrhoboDictionary(1).pdf"
entries = []

with pdfplumber.open(pdf_path) as pdf:
    for page_num in [0, 1]:  # pages 1 and 2 (0-indexed)
        page = pdf.pages[page_num]
        table = page.extract_table()
        
        if table:
            header = table[0]
            rows = table[1:]  # skip header row if necessary

            # Try to detect Urhobo and English column positions
            urhobo_idx = None
            english_idx = None

            for i, col in enumerate(header):
                if col and "urhobo" in col.lower():
                    urhobo_idx = i
                if col and "english" in col.lower():
                    english_idx = i

            if urhobo_idx is not None and english_idx is not None:
                for row in rows:
                    if row:
                        urhobo_word = row[urhobo_idx]
                        english_word = row[english_idx]
                        entries.append((urhobo_word, english_word))
            else:
                print(f"Could not detect Urhobo or English columns on page {page_num + 1}")
        else:
            print(f"No table found on page {page_num + 1}")

# Convert to DataFrame
df = pd.DataFrame(entries, columns=["Urhobo", "English"])
print(df.head())


No table found on page 1
No table found on page 2
Empty DataFrame
Columns: [Urhobo, English]
Index: []


In [9]:
### A challenge was encountered here. The pdf tables are not being recognized as tables by the code possibly as a result of scanned images, e.t.c
### The next step would be to manually parse the document

In [10]:
pdf_path = "UrhoboDictionary(1).pdf"
entries = []
with pdfplumber.open(pdf_path) as pdf:
    for page_num in [3, 4]:  # pages 4 and 5 (0-based indexing)
        page = pdf.pages[page_num]
        text = page.extract_text()

        if text:
            lines = text.split("\n")
            for line in lines:
                parts = line.strip().split()

                # Only process lines with at least 2 words
                if len(parts) >= 2:
                    urhobo = parts[0]
                    english = " ".join(parts[2:]) if len(parts) > 2 else parts[1]
                    entries.append((urhobo, english))
        else:
            print(f"No text found on page {page_num + 1}")

# Convert to DataFrame
df = pd.DataFrame(entries, columns=["Urhobo", "English"])
print(df.head())


   Urhobo                                            English
0  Urhobo  A.O. Ukere (1986) Web version by Roger Blench ...
1  Urhobo                                      English gloss
2       A                                                  a
3      án  exclamation of disbelief, shock or surprise, a...
4   abaka                                        grasshopper


In [11]:
### Another challenge is that it keeps including the headers of each pages. 
### To solve it, drop headers per page

In [12]:
header_keywords = ["Urhobo Dictionary", "A.O. Ukere (1986)", "Web version by Roger Blench (2005)"]  

def is_header(line):
    """Return True if line is a header or contains known header keyword"""
    return any(keyword.lower() in line.lower() for keyword in header_keywords)

with pdfplumber.open(pdf_path) as pdf:
    for page_num in [3, 4]:  # Pages 4 and 5
        page = pdf.pages[page_num]
        text = page.extract_text()

        if text:
            lines = text.split("\n")
            for line in lines:
                if is_header(line):
                    continue  # Skip known header lines

                parts = line.strip().split()

                if len(parts) >= 2:
                    urhobo = parts[0]
                    english = " ".join(parts[2:]) if len(parts) > 2 else parts[1]
                    entries.append((urhobo, english))
        else:
            print(f"No text found on page {page_num + 1}")

# Convert to DataFrame
df = pd.DataFrame(entries, columns=["Urhobo", "English"])
print(df.head())


   Urhobo                                            English
0  Urhobo  A.O. Ukere (1986) Web version by Roger Blench ...
1  Urhobo                                      English gloss
2       A                                                  a
3      án  exclamation of disbelief, shock or surprise, a...
4   abaka                                        grasshopper


In [13]:
pdf_path = "UrhoboDictionary(1).pdf"

with pdfplumber.open(pdf_path) as pdf:
    page = pdf.pages[3]  # Example: page 4
    text = page.extract_text()
    
    if text:
        lines = text.split("\n")
        print("Raw lines on page 4:")
        for i, line in enumerate(lines):
            print(f"{i}: {line}")

Raw lines on page 4:
0: Urhobo Dictionary A.O. Ukere (1986) Web version by Roger Blench (2005)
1: Urhobo PoS English gloss
2: A a
3: án excl. exclamation of disbelief, shock or surprise, also used for questioning.
4: abaka n. grasshopper
5: abavo a. equal
6: abé n. guilt (in a case or law-suit)
7: abémuó n. wrestling
8: abérha adv. thrice
9: abẹrẹn n. sword
10: abívẹ adv. twice
11: áda1 n. forked stick
12: ada2 n. outing
13: adérha n. three-way junction
14: ádié n. position/part
15: adján n. bat
16: adjaóghẹné n. swallow
17: ádjeghwọghwọ n. camphor
18: adjené n. self acclaimed witch or wizard
19: ádjọkrótẹgba n. pants
20: adjudju n. fan
21: adjugẹ n. wheel
22: áfe n. menstruation
23: áfiéha n. stadium, playground
24: áfiédjọ n. masquerade playground
25: afiotọ n. rabbit
26: afọrhe n. brain
27: agada n. crotch, area between the legs
28: ágógó n. gong, bell
29: agọ n. camp, temporary settlement
30: aguare n. court
31: agbá (ogba) n. oil bean seed
32: agbada n. bridge
33: agbadarizobo n

In [14]:
with pdfplumber.open(pdf_path) as pdf:
    for page_num in [3, 4]:  # Pages 4 and 5
        page = pdf.pages[page_num]
        text = page.extract_text()

        if text:
            lines = text.split("\n")
            
            # ✅ Skip the first line (assumed to be the header)
            for line in lines[1:]:
                parts = line.strip().split()

                if len(parts) >= 2:
                    urhobo = parts[0]
                    english = " ".join(parts[2:]) if len(parts) > 2 else parts[1]
                    entries.append((urhobo, english))
        else:
            print(f"No text found on page {page_num + 1}")

# Convert to DataFrame
df = pd.DataFrame(entries, columns=["Urhobo", "English"])
print(df.head())


   Urhobo                                            English
0  Urhobo  A.O. Ukere (1986) Web version by Roger Blench ...
1  Urhobo                                      English gloss
2       A                                                  a
3      án  exclamation of disbelief, shock or surprise, a...
4   abaka                                        grasshopper


In [15]:
print (df)

      Urhobo                                            English
0     Urhobo  A.O. Ukere (1986) Web version by Roger Blench ...
1     Urhobo                                      English gloss
2          A                                                  a
3         án  exclamation of disbelief, shock or surprise, a...
4      abaka                                        grasshopper
..       ...                                                ...
261   amiédi                                      n. banga soup
262  amióviẹ                                              tears
263    amóno                                        who (coll.)
264  ámrádjẹ              charm for protection against the dead
265     amwá                                              cloth

[266 rows x 2 columns]


In [16]:
df.to_csv("urhobo_english_pages_4_5.csv", index=False)

In [17]:
#### Updating my csv to iclude all pages

In [18]:
pdf_path = "UrhoboDictionary(1).pdf"
entries = []

with pdfplumber.open(pdf_path) as pdf:
    for page_num in range(3, 53):  # pages 4 to 53 inclusive
        page = pdf.pages[page_num]
        text = page.extract_text()

        if text:
            lines = text.split("\n")
            
            # Skip first line (header) on each page
            for line in lines[1:]:
                parts = line.strip().split()

                if len(parts) >= 2:
                    urhobo = parts[0]
                    english = " ".join(parts[2:]) if len(parts) > 2 else parts[1]
                    entries.append((urhobo, english))
        else:
            print(f"No text found on page {page_num + 1}")

# Convert to DataFrame
df = pd.DataFrame(entries, columns=["Urhobo", "English"])
print(df.head())

# Save to CSV
df.to_csv("urhobo_english_pages_4_to_53.csv", index=False)

   Urhobo                                            English
0  Urhobo                                      English gloss
1       A                                                  a
2      án  exclamation of disbelief, shock or surprise, a...
3   abaka                                        grasshopper
4   abavo                                              equal


In [19]:
### Clean the data

In [20]:
df = df.dropna()
df = df[(df["Urhobo"].str.strip() != "") & (df["English"].str.strip() != "")]

In [21]:
import re

def clean_text(text):
    text = text.lower().strip()                      # Lowercase & trim spaces
    text = re.sub(r"[^\w\s'-]", '', text)            # Remove punctuation except apostrophes & hyphens
    text = re.sub(r"\s+", " ", text)                 # Normalize multiple spaces
    return text

df["Urhobo"] = df["Urhobo"].apply(clean_text)
df["English"] = df["English"].apply(clean_text)


In [22]:
df = df.drop_duplicates()

In [23]:
df = df[(df["Urhobo"].str.len() > 1) & (df["English"].str.len() > 1)]
df = df[(df["Urhobo"].str.len() < 100) & (df["English"].str.len() < 100)]


In [24]:
#### To export the file

In [25]:
df["Urhobo"].to_csv("urhobo.txt", index=False, header=False)
df["English"].to_csv("english.txt", index=False, header=False)

In [26]:
df.to_csv("urhobo_english_corpus.csv", index=False)