In [2]:
# One-time setup (uncomment to run; versions match the install log below): %pip install PyPDF2==3.0.1 python-docx==1.1.2

Collecting PyPDF2
  Using cached pypdf2-3.0.1-py3-none-any.whl.metadata (6.8 kB)
Collecting python-docx
  Using cached python_docx-1.1.2-py3-none-any.whl.metadata (2.0 kB)
Using cached pypdf2-3.0.1-py3-none-any.whl (232 kB)
Using cached python_docx-1.1.2-py3-none-any.whl (244 kB)
Installing collected packages: python-docx, PyPDF2
Successfully installed PyPDF2-3.0.1 python-docx-1.1.2


In [4]:
import os
import PyPDF2
from docx import Document

def extract_text_from_document(file_path, txt_encoding='utf-8'):
    """
    Reads a PDF, DOCX, or TXT file and returns its textual content as one string.

    The format is chosen from the file extension (case-insensitive).

    Parameters:
    - file_path (str): Location of the document to read.
    - txt_encoding (str): Encoding used when the file is a TXT (default: 'utf-8').

    Returns:
    - str: Text of the document. PDF pages that yield no text are skipped and
      the remaining pages are joined with newlines; DOCX paragraphs are joined
      with newlines; TXT content is returned as read.

    Raises:
    - FileNotFoundError: If file_path is not an existing file.
    - ValueError: If the extension is not .pdf, .docx, or .txt.
    """
    if not os.path.isfile(file_path):
        raise FileNotFoundError(f"The file '{file_path}' does not exist.")

    suffix = os.path.splitext(file_path)[1].lower()

    if suffix == '.txt':
        with open(file_path, 'r', encoding=txt_encoding) as handle:
            return handle.read()

    if suffix == '.docx':
        return '\n'.join(block.text for block in Document(file_path).paragraphs)

    if suffix == '.pdf':
        # Extraction has to finish while the underlying stream is still open.
        with open(file_path, 'rb') as handle:
            extracted = (page.extract_text() for page in PyPDF2.PdfReader(handle).pages)
            return '\n'.join(chunk for chunk in extracted if chunk)

    raise ValueError(f"Unsupported file format: '{suffix}'")


# Try the extractor on a sample PDF; the path is relative to the kernel's working directory.
# The returned string is displayed as the cell output.
# NOTE(review): the stored output shows a phone number and e-mail address -- clear it before sharing.
extract_text_from_document("Obidokun_Tunji_David.pdf")

'Tunji\n \nObidokun\n \nML/AI\n \nEngineer\n \n📞\n \n+234\n \n813\n \n100\n \n9351\n \n|\n \n✉\n \ntunjiobidokun@gmail.com\n \n|\n \n🔗\n \nLinkedIn:\n \nLinkedIn\n \n \nProfessional\n \nSummary\n \nResults-driven\n \nML/AI\n \nEngineer\n \nwith\n \n3+\n \nyears\n \nof\n \nexperience\n \nin\n \nbuilding\n \nscalable,\n \ndata-driven\n \napplications.\n \nProficient\n \nin\n \nPython,\n \ncloud\n \ncomputing\n \n(Microsoft\n \nAzure),\n \nand\n \ndeploying\n \nmachine\n \nlearning\n \nmodels\n \nfor\n \nfraud\n \ndetection,\n \nrecommendation\n \nsystems,\n \nand\n \npredictive\n \nanalytics.\n \nA\n \ncollaborative\n \nteam\n \nplayer\n \npassionate\n \nabout\n \nleveraging\n \nAI\n \nto\n \nsolve\n \nreal-world\n \nproblems.\n \n \nTechnical\n \nSkills\n \nProgramming:\n \nPython,\n \nSQL\n \nFrameworks\n \n&\n \nLibraries:\n \nNumpy,\n \nPandas,\n \nScikit-learn,\n \nTensorFlow,\n \nPyTorch,\n \nOpenCV,\n \nFastAPI\n \nCloud\n \n&\n \nDevOps:\n \nMicrosoft\n \nAzure,\n \nDocker,\n \nC

In [5]:
import os
import re
from docx import Document
import PyPDF2

def extract_text_to_markdown(file_path, txt_encoding='utf-8'):
    """
    Reads a PDF, DOCX, or TXT file and returns its text with Markdown formatting applied.

    The extension (case-insensitive) selects the extraction helper; the raw
    text is then passed through _convert_to_markdown.

    Args:
        file_path (str): Location of the document to read.
        txt_encoding (str): Encoding used for TXT files (default: 'utf-8').

    Returns:
        str: Markdown-formatted text of the document.

    Raises:
        FileNotFoundError: If file_path is not an existing file.
        ValueError: If the extension is not .pdf, .docx, or .txt.
    """
    if not os.path.isfile(file_path):
        raise FileNotFoundError(f"File not found: {file_path}")

    suffix = os.path.splitext(file_path)[1].lower()

    # Zero-argument callables so only the selected reader ever runs.
    readers = {
        '.pdf': lambda: _extract_text_from_pdf(file_path),
        '.docx': lambda: _extract_text_from_docx(file_path),
        '.txt': lambda: _extract_text_from_txt(file_path, txt_encoding),
    }
    if suffix not in readers:
        raise ValueError(f"Unsupported file format: {suffix}")

    raw_text = readers[suffix]()
    return _convert_to_markdown(raw_text)

def _extract_text_from_pdf(file_path):
    """Extracts raw text from PDF."""
    text = []
    with open(file_path, 'rb') as f:
        reader = PyPDF2.PdfReader(f)
        for page in reader.pages:
            text.append(page.extract_text() or "")
    return "\n".join(text)

def _extract_text_from_docx(file_path):
    """Extracts raw text from DOCX (preserves paragraphs)."""
    doc = Document(file_path)
    return "\n".join([para.text for para in doc.paragraphs])

def _extract_text_from_txt(file_path, encoding):
    """Extracts raw text from TXT."""
    with open(file_path, 'r', encoding=encoding) as f:
        return f.read()

def _convert_to_markdown(text):
    """Converts raw text to Markdown with basic formatting."""
    # Convert bullet points (•, ●, etc.) to Markdown lists
    text = re.sub(r'(?m)^\s*[•●▪]\s*', '- ', text)
    
    # Convert numbered lists (1., 2.) 
    text = re.sub(r'(?m)^\s*(\d+)\.\s*', r'\1. ', text)
    
    # Bold section headers (lines in ALL CAPS or followed by colons)
    text = re.sub(
        r'(?m)^([A-Z][A-Z\s\-]+:?)$', 
        r'**\1**', 
        text
    )
    
    # Preserve multiple newlines (Markdown needs two for a paragraph break)
    text = re.sub(r'\n{3,}', '\n\n', text.strip())
    
    return text

# Run the Markdown pipeline on the sample PDF (path relative to the kernel's working directory).
markdown_text = extract_text_to_markdown("Obidokun_Tunji_David.pdf")
print(markdown_text)  # print() renders the line breaks; the function itself returns the Markdown string

Tunji
 
Obidokun
 
ML/AI
 
Engineer
 
📞
 
+234
 
813
 
100
 
9351
 
|
 
✉
 
tunjiobidokun@gmail.com
 
|
 
🔗
 
LinkedIn:
 
LinkedIn
 
 
Professional
 
Summary
 
Results-driven
 
ML/AI
 
Engineer
 
with
 
3+
 
years
 
of
 
experience
 
in
 
building
 
scalable,
 
data-driven
 
applications.
 
Proficient
 
in
 
Python,
 
cloud
 
computing
 
(Microsoft
 
Azure),
 
and
 
deploying
 
machine
 
learning
 
models
 
for
 
fraud
 
detection,
 
recommendation
 
systems,
 
and
 
predictive
 
analytics.
 
**A
 **
collaborative
 
team
 
player
 
passionate
 
about
 
leveraging
 
**AI
 **
to
 
solve
 
real-world
 
problems.
 
 
Technical
 
Skills
 
Programming:
 
Python,
 
**SQL
 **
Frameworks
 
&
 
Libraries:
 
Numpy,
 
Pandas,
 
Scikit-learn,
 
TensorFlow,
 
PyTorch,
 
OpenCV,
 
FastAPI
 
Cloud
 
&
 
DevOps:
 
Microsoft
 
Azure,
 
Docker,
 
CI/CD
 
Databases:
 
MySQL,
 
PostgreSQL,
 
Oracle,
 
Microsoft
 
**SQL
 **
Machine
 
Learning
 
&
 
**AI:**
 
Supervised
 
&
 
Unsupervised
 
Learning,
 
Deep
