In [1]:
import pdfplumber
from pathlib import Path

In [2]:
# Relative path to the input PDF; used as the default `path` of extract_pdf().
data_path = Path("data/Data Input.pdf")

In [3]:
def extract_pdf(
        path: Path = data_path
):
    """Extract the text of every page of a PDF and return it as one string.

    Args:
        path: Location of the PDF to read. Defaults to the notebook-level
            ``data_path``.

    Returns:
        The text of all pages that yielded any text, joined with newlines.
        Pages whose ``extract_text()`` result is falsy (``None`` or ``""``)
        are skipped, so the result is ``""`` when nothing could be extracted.
    """
    with pdfplumber.open(path) as file:
        # Extract each page exactly once. Previously extract_text() ran twice
        # per page (once to filter, once to collect), doubling the parse cost.
        page_texts = (page.extract_text() for page in file.pages)
        # Join with a newline so the last word of one page is not fused to the
        # first word of the next; lines within a page are already separated
        # by "\n" in the extracted text.
        return "\n".join(page_text for page_text in page_texts if page_text)

In [4]:
# Run the extractor on the default PDF; the cell output is the extracted text.
extract_pdf()

'Vijay Kumar was born on March 15, 1989, in Jaipur, Rajasthan, making him 35 years old as of 2024.\nHis birthdate is formatted as 1989-03-15 in ISO format for easy parsing, while his age serves as a\nkey demographic marker for analytical purposes. Born and raised in the Pink City of India, his\nbirthplace provides valuable regional profiling context, and his O+ blood group is noted for\nemergency contact purposes. As an Indian national, his citizenship status is important for\nunderstanding his work authorization and visa requirements across different employment\nopportunities.\nVijay\'s professional journey began on July 1, 2012, when he joined his first company as a Junior\nDeveloper with an annual salary of 350,000 INR. His career progression shows remarkable growth,\nwith his current role at Resse Analytics beginning on June 15, 2021, where he serves as a Senior\nData Engineer earning 2,800,000 INR annually. Before this position, he worked at LakeCorp\nSolutions from February 1, 20

In [4]:
from data_ext.components.extract import extract_pdf

In [5]:
# Extract the raw text of the input PDF with the packaged extract_pdf helper.
text = extract_pdf(path="data/Data Input.pdf")

In [6]:
# Display the extracted text for a visual check.
text

'Vijay Kumar was born on March 15, 1989, in Jaipur, Rajasthan, making him 35 years old as of 2024.\nHis birthdate is formatted as 1989-03-15 in ISO format for easy parsing, while his age serves as a\nkey demographic marker for analytical purposes. Born and raised in the Pink City of India, his\nbirthplace provides valuable regional profiling context, and his O+ blood group is noted for\nemergency contact purposes. As an Indian national, his citizenship status is important for\nunderstanding his work authorization and visa requirements across different employment\nopportunities.\nVijay\'s professional journey began on July 1, 2012, when he joined his first company as a Junior\nDeveloper with an annual salary of 350,000 INR. His career progression shows remarkable growth,\nwith his current role at Resse Analytics beginning on June 15, 2021, where he serves as a Senior\nData Engineer earning 2,800,000 INR annually. Before this position, he worked at LakeCorp\nSolutions from February 1, 20

In [1]:
from data_ext.utils.common import load_spacy_model

In [2]:
# Shared NLP object that split_sentences() reads as a global
# (presumably a spaCy pipeline, judging by the loader's name -- confirm).
nlp = load_spacy_model()

In [3]:
def split_sentences(text):
    """Split ``text`` into sentences using the notebook-level ``nlp`` object.

    Args:
        text: Raw text to segment.

    Returns:
        A list with one string per item of ``nlp(text).sents``, each stripped
        of leading and trailing whitespace.
    """
    parsed = nlp(text)
    stripped_sentences = []
    for span in parsed.sents:
        stripped_sentences.append(span.text.strip())
    return stripped_sentences

In [8]:
# Segment the extracted text into sentences.
# NOTE(review): `text` is not defined in the cells visible since In [1] of this
# session -- confirm it is still set after Restart Kernel -> Run All.
sentences = split_sentences(text=text)

In [9]:
# Replace the line breaks inside each sentence with spaces, then trim the ends.
cleaned = [
    sentence.replace("\n", " ").strip()
    for sentence in sentences
]

In [10]:
# Display the cleaned sentences for a visual check.
cleaned

['Vijay Kumar was born on March 15, 1989, in Jaipur, Rajasthan, making him 35 years old as of 2024.',
 'His birthdate is formatted as 1989-03-15 in ISO format for easy parsing, while his age serves as a key demographic marker for analytical purposes.',
 'Born and raised in the Pink City of India, his birthplace provides valuable regional profiling context, and his O+ blood group is noted for emergency contact purposes.',
 'As an Indian national, his citizenship status is important for understanding his work authorization and visa requirements across different employment opportunities.',
 "Vijay's professional journey began on July 1, 2012, when he joined his first company as a Junior Developer with an annual salary of 350,000 INR.",
 'His career progression shows remarkable growth, with his current role at Resse Analytics beginning on June 15, 2021, where he serves as a Senior Data Engineer earning 2,800,000 INR annually.',
 'Before this position, he worked at LakeCorp Solutions from F

In [1]:
from data_ext.pipelines.extract_data_pipeline import DataExtract

In [2]:
# Build the extraction pipeline for the input PDF.
pipeline = DataExtract(data_path="data/Data Input.pdf")

In [3]:
# Run the pipeline; its result is passed to extraction() as `sentences`
# (presumably the cleaned sentence list -- confirm against DataExtract.main).
sentences = pipeline.main()

In [4]:
from data_ext.components.extract_info import extraction

In [5]:
# Extract structured fields from the sentences; the displayed result is a list
# of dicts with 'key', 'value', 'confidence' and 'comments' entries.
lst = extraction(sentences=sentences)

In [6]:
# Display the extracted records for a visual check.
lst

[{'key': 'Full Name',
  'value': 'Vijay',
  'confidence': 0.95,
  'comments': 'Extracted from the sentence as the subject.'},
 {'key': 'Date of Birth',
  'value': '1989-03-15',
  'confidence': 0.95,
  'comments': 'Formatted in ISO format for easy parsing.'},
 {'key': 'Place of Birth',
  'value': 'Pink City of India',
  'confidence': 0.95,
  'comments': 'Born and raised in the Pink City of India.'},
 {'key': 'Age',
  'value': 'serves as a key demographic marker for analytical purposes',
  'confidence': 0.95,
  'comments': 'Describes the significance of age in analysis.'},
 {'key': 'Blood Group',
  'value': 'O+',
  'confidence': 0.95,
  'comments': 'His O+ blood group is noted for emergency contact purposes.'},
 {'key': 'Citizenship Status',
  'value': 'Indian national',
  'confidence': 0.95,
  'comments': 'His citizenship status is important for understanding his work authorization and visa requirements.'},
 {'key': 'Start Date',
  'value': 'February 1, 2018',
  'confidence': 0.9,
  'co