In [1]:
import os
# libraries for model
from langchain.chat_models import ChatOpenAI
from langchain.llms import OpenAI
from langchain.chains import LLMChain

# libraries for document loading
from langchain.document_loaders import PyPDFLoader
from langchain.document_loaders import TextLoader
from langchain.document_loaders import Docx2txtLoader

# libraries for pydantic functions
from typing import List
from typing import Optional
from pydantic import BaseModel, Field
from langchain.llms import OpenLLM


def connect_llm_model(openai_key):
    openai_model_name = "gpt-3.5-turbo" # can change model name here
    llm_kwargs = dict(
    model_name=openai_model_name,
    openai_api_key = openai_key,
    temperature = 0.3,
    model_kwargs=dict(
        frequency_penalty=0.1
        ),
    )
    chat_model = ChatOpenAI(**llm_kwargs)
    return chat_model

def file_reader(file_name):
    root, file_extension = os.path.splitext(file_name.lower())
        
    if file_extension == '.pdf':
        loader = PyPDFLoader(file_name)
        pages = loader.load_and_split()
        extracted_text = "\n".join([page.page_content for page in pages])
    elif file_extension == '.docx':
        # Assuming you have a read_text_from_docx function
        loader = Docx2txtLoader(file_name)
        pages = loader.load_and_split()
        extracted_text = "\n".join([page.page_content for page in pages])
    elif file_extension == '.txt':
        loader = TextLoader(file_name)
        pages = loader.load()
        extracted_text = "\n".join([page.page_content for page in pages])
    else:
        raise ValueError(f"Unsupported file type: {file_extension}")

    return extracted_text

# Pydantic class defining the extraction of job-related information for an output parser
class Job_Description(BaseModel):
    """Description of a job posting"""

    company: str = Field(
        ..., description="Name of the company that has the job opening"
    )
    job_title: str = Field(..., description="Job title")
    team: str = Field(
        ...,
        description="Name of the team within the company. Team name should be null if it's not known.",
    )
    job_summary: str = Field(
        ..., description="Brief summary of the job, not exceeding 100 words"
    )
    salary: str = Field(
        ...,
        description="Salary amount or range. Salary should be null if it's not known.",
    )
    duties: List[str] = Field(
        ...,
        description="The role, responsibilities and duties of the job as an itemized list, not exceeding 500 words",
    )
    qualifications: List[str] = Field(
        ...,
        description="The qualifications, skills, and experience required for the job as an itemized list, not exceeding 500 words",
    )

# Pydantic class that defines a list of skills in the job posting
class Job_Skills(BaseModel):
    """Skills from a job posting"""

    technical_skills: List[str] = Field(
        ...,
        description="An itemized list of technical skills, including programming languages, technologies, and tools.",
    )
    non_technical_skills: List[str] = Field(
        ...,
        description="An itemized list of non-technical Soft skills.",
    )

# Pydantic class that defines a list of skills in the resume
class Resume_Skills(BaseModel):
    technical_skills: List[str] = Field(
        ...,
        description="An individual itemized list of technical skills Examples: Python, MS Office etc",
    )
    non_technical_skills: List[str] = Field(
        ...,
        description="An individual itemized list of non-technical skills like soft skills",
    )

# Pydantic class that defines a format of resume parser
class Resume_Format(BaseModel):
    """Format of resume"""
    Basics: str = Field(
        ..., description = " The basics of  the for given user resume input."
    )
    Introduction: str = Field(
        ..., description = " write only 1 line introduction for the introduction section for given user resume input."
    )
    Work_Experiences: str = Field(
        ..., description = " The experiece of the candidates with job, duration and description of work done like xyz company from 09-2022 to 08-2023 performed work on spark and databases"
    )
    Education: str = Field(
        ..., description = " The education of the candidates with university, duration and description of work done like xyx university from 08-2013 to 07-2018 studied this courses"
    )
    Awards: Optional[str] = Field(
        ..., description = " The awards are the achievments and honours of the candidates. If the resume don't have the award no need to add this to resume"
    )
    Projects: str = Field(
        ..., description = " The projects section of contains the project, duration and roles and responsibilities of the candidate like xyz project from 05-2019 to 04-2020 worked on backend etc"
    )
    Skills: List[str] = Field(
        ...,
        description=" An itemized list of technical skills and non-technical Soft skills of the user",
    )

# Pydantic class defining the format of improvements
class Resume_Improvements(BaseModel):
    improvements: List[str] = Field(
        ..., description="List of suggestions for improvement"
    )

In [2]:
openai_key = "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"

In [3]:
model = connect_llm_model(openai_key)

  warn_deprecated(


In [4]:
model

ChatOpenAI(client=<openai.resources.chat.completions.Completions object at 0x7f4212d27730>, async_client=<openai.resources.chat.completions.AsyncCompletions object at 0x7f4212d39a60>, temperature=0.3, model_kwargs={'frequency_penalty': 0.1}, openai_api_key='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', openai_proxy='')

In [5]:
from langchain.document_loaders import Docx2txtLoader

In [8]:
loader = Docx2txtLoader("SS_Resume.docx")

In [11]:
pages = loader.load_and_split()
extracted_text = "\n".join([page.page_content for page in pages])
extracted_text

'SAMVIT SWAMINATHAN\n\nDallas, TX;  +1 (214) 846-5426;  samvit998@gmail.com\n\nwww.linkedin.com/in/samvits \n\n\nEDUCATION\n\n\t\tThe University of Texas at Dallas, Richardson, TX\tGraduating May 2025\n\n\t\tMaster of Science, Business Analytics\n\n\n\t\tVellore Institute of Technology, Vellore, India\tMay 2023\n\n\t\tBachelor of Technology, Computer Science and Engineering\tGPA: 3.53\n\n\t\t\t\n\n\t\t\tSKILLS & CERTIFICATIONS\t\n\n\t\tCertifications:\t\tAWS Introduction to ML Services, Coursera ML With Python (IBM), Udemy Statistics for DS\n              \tand Business Analysis\n\n\t\tProgramming Tools:      Python, SQL, Java Spring Boot, Docker, Kubernetes, Linux, Go Lang, R, HTML, CSS, PHP\n\n\t\tSoftware:\t\t\tMsSQL, Workbench, Anaconda, Postman, R, Jupyter, Pandas, Matplotlib\n\n\n\nWORK EXPERIENCE\t\n\n\t\tAddverb Technologies Private Limited, Noida, India\tJan 2023 – Jun 2023\n\n\t\tSoftware Engineering and Data Science Intern\n\n\t\tCreated a virtual GTP (Goods to Person) stati

In [13]:
from langchain.document_loaders import TextLoader
loader = TextLoader("sample.txt")
pages = loader.load_and_split()

In [14]:
extracted_text = "\n".join([page.page_content for page in pages])
extracted_text

'hello'

In [15]:
from langchain.document_loaders import PyPDFLoader
loader = PyPDFLoader("Samvit_Swaminathan.pdf")
pages = loader.load_and_split()

In [16]:
extracted_text = "\n".join([page.page_content for page in pages])
extracted_text

'SAMVIT SWAMINATHAN  \nDallas, TX;  +1 (214) 846-5426 ;  samvit998 @gmail.com  \nwww.linkedin.com/in/samvits   \n \nEDUCATION  \nThe University of Texas at Dallas , Richardson, TX  Graduating May  2025 \nMaster of Scienc e, Business Analyti cs \n \nVellore Institute of Technology , Vellore , India  May  2023 \nBachelor of Technology , Computer Science and Engineering  GPA: 3.53  \n \nSKILLS  & CERTIFICATIONS   \nCertifications :  AWS Introduction to ML Services , Coursera ML With Python (IBM), Udemy Statistics for DS  \n               and Business Analysis  \nProgramming  Tools :      Python, SQL, Java  Spring  Boot, Docker, Kubernetes, Linux, Go Lang, R, HTML, CSS , PHP  \nSoftware:    MsSQL, Workbench , Anaconda, Postman, R, Jupyter, Pandas, Matplotlib  \n \nWORK EXPERIENCE   \nAddverb Technologies Private Limited , Noida , India  Jan 2023 – Jun 2023  \nSoftware Engineering and Data Science Intern  \n● Created a  virtual GTP (Goods to Person) station which analyzed data of over 1.3 m

In [18]:
resume = file_reader("Samvit_Swaminathan.pdf")
resume

'SAMVIT SWAMINATHAN  \nDallas, TX;  +1 (214) 846-5426 ;  samvit998 @gmail.com  \nwww.linkedin.com/in/samvits   \n \nEDUCATION  \nThe University of Texas at Dallas , Richardson, TX  Graduating May  2025 \nMaster of Scienc e, Business Analyti cs \n \nVellore Institute of Technology , Vellore , India  May  2023 \nBachelor of Technology , Computer Science and Engineering  GPA: 3.53  \n \nSKILLS  & CERTIFICATIONS   \nCertifications :  AWS Introduction to ML Services , Coursera ML With Python (IBM), Udemy Statistics for DS  \n               and Business Analysis  \nProgramming  Tools :      Python, SQL, Java  Spring  Boot, Docker, Kubernetes, Linux, Go Lang, R, HTML, CSS , PHP  \nSoftware:    MsSQL, Workbench , Anaconda, Postman, R, Jupyter, Pandas, Matplotlib  \n \nWORK EXPERIENCE   \nAddverb Technologies Private Limited , Noida , India  Jan 2023 – Jun 2023  \nSoftware Engineering and Data Science Intern  \n● Created a  virtual GTP (Goods to Person) station which analyzed data of over 1.3 m

In [19]:
file_reader("sample.txt")

'hello'

In [20]:
file_reader("SS_Resume.docx")

'SAMVIT SWAMINATHAN\n\nDallas, TX;  +1 (214) 846-5426;  samvit998@gmail.com\n\nwww.linkedin.com/in/samvits \n\n\nEDUCATION\n\n\t\tThe University of Texas at Dallas, Richardson, TX\tGraduating May 2025\n\n\t\tMaster of Science, Business Analytics\n\n\n\t\tVellore Institute of Technology, Vellore, India\tMay 2023\n\n\t\tBachelor of Technology, Computer Science and Engineering\tGPA: 3.53\n\n\t\t\t\n\n\t\t\tSKILLS & CERTIFICATIONS\t\n\n\t\tCertifications:\t\tAWS Introduction to ML Services, Coursera ML With Python (IBM), Udemy Statistics for DS\n              \tand Business Analysis\n\n\t\tProgramming Tools:      Python, SQL, Java Spring Boot, Docker, Kubernetes, Linux, Go Lang, R, HTML, CSS, PHP\n\n\t\tSoftware:\t\t\tMsSQL, Workbench, Anaconda, Postman, R, Jupyter, Pandas, Matplotlib\n\n\n\nWORK EXPERIENCE\t\n\n\t\tAddverb Technologies Private Limited, Noida, India\tJan 2023 – Jun 2023\n\n\t\tSoftware Engineering and Data Science Intern\n\n\t\tCreated a virtual GTP (Goods to Person) stati

In [None]:
# https://www.linkedin.com/jobs/view/3774779142
job_posting = """
Data Engineer
Capital One · McLean, VA  4 hours ago  · 14 applicants
Full-timeMatches your job preferences, job type is Full-time.  Entry level
10,001+ employees · Financial Services
32 company alumni work here · 157 school alumni work here
See how you compare to 14 applicants. Try Premium for $0
Skills: Apache Spark, Big Data, +8 more
View verifications related to this job post.View verifications related to this job post.
Show all

Apply

Save
Save Data Engineer at Capital One
Share
Show more options
About the job
Center 1 (19052), United States of America, McLean, VirginiaData Engineer

Do you love building and pioneering in the technology space? Do you enjoy solving complex business problems in a fast-paced, collaborative, inclusive, and iterative delivery environment? At Capital One, you'll be part of a big group of makers, breakers, doers and disruptors, who solve real problems and meet real customer needs. Capital One’s Finance Tech team is seeking a Senior Associate, Data Engineer who is passionate about marrying data with emerging technologies. As a Capital One Senior Associate, Data Engineer, you’ll have the opportunity to be on the forefront of driving a major transformation within Capital One.

What You’ll Do

 Proactively seeks out opportunities to address customer needs and influences stakeholders so that we are building the best solutions for the most important problems 
 Support the design and development of scalable data architectures and systems that extract, store, and process large amounts of data 
 Build and optimize data pipelines for efficient data ingestion, transformation, and loading from various sources while ensuring data quality and integrity 
 Collaborate with Data Scientists, Machine Learning Engineers, Business Analysts and/or Product Owners to understand their requirements and provide efficient solutions for data exploration, analysis, and modeling 
 Implement testing, validation and pipeline observability to ensure data pipelines are meeting customer SLAs 

Basic Qualifications: 

 Bachelor’s Degree 
 At least 2 years of experience in application development (Internship experience does not apply) 
 At least 1 year of experience in big data technologies 

Preferred Qualifications: 

 3+ years of experience developing data pipelines using Python or Scala 
 2+ years of experience with distributed computing tools (Spark, EMR, Hadoop) 
 2+ years of experience with UNIX/Linux including basic commands and shell scripting 
 1+ years of experience with a public cloud (AWS, Microsoft Azure, Google Cloud) 
 1+ years of data warehousing experience (Redshift or Snowflake) 
 1+ years of experience with Agile engineering practices 

At this time, Capital One will not sponsor a new applicant for employment authorization for this position.

Capital One offers a comprehensive, competitive, and inclusive set of health, financial and other benefits that support your total well-being. Learn more at the Capital One Careers website . Eligibility varies based on full or part-time status, exempt or non-exempt status, and management level.

This role is expected to accept applications for a minimum of 5 business days.No agencies please. Capital One is an equal opportunity employer committed to diversity and inclusion in the workplace. All qualified applicants will receive consideration for employment without regard to sex (including pregnancy, childbirth or related medical conditions), race, color, age, national origin, religion, disability, genetic information, marital status, sexual orientation, gender identity, gender reassignment, citizenship, immigration status, protected veteran status, or any other basis prohibited under applicable federal, state or local law. Capital One promotes a drug-free workplace. Capital One will consider for employment qualified applicants with a criminal history in a manner consistent with the requirements of applicable laws regarding criminal background inquiries, including, to the extent applicable, Article 23-A of the New York Correction Law; San Francisco, California Police Code Article 49, Sections 4901-4920; New York City’s Fair Chance Act; Philadelphia’s Fair Criminal Records Screening Act; and other applicable federal, state, and local laws and regulations regarding criminal background inquiries.

If you have visited our website in search of information on employment opportunities or to apply for a position, and you require an accommodation, please contact Capital One Recruiting at 1-800-304-9102 or via email at RecruitingAccommodation@capitalone.com . All information you provide will be kept confidential and will be used only to the extent required to provide needed reasonable accommodations.

For technical support or questions about Capital One's recruiting process, please send an email to Careers@capitalone.com

Capital One does not provide, endorse nor guarantee and is not liable for third-party products, services, educational tools or other information available through this site.

Capital One Financial is made up of several different entities. Please note that any position posted in Canada is for Capital One Canada, any position posted in the United Kingdom is for Capital One Europe and any position posted in the Philippines is for Capital One Philippines Service Corp. (COPSSC)."""

In [22]:
# https://python.langchain.com/docs/modules/model_io/output_parsers/
from typing import List

from langchain.output_parsers import PydanticOutputParser
from langchain.pydantic_v1 import BaseModel, Field

from langchain.prompts import HumanMessagePromptTemplate
from langchain.schema import HumanMessage, SystemMessage
from langchain.prompts import ChatPromptTemplate

# Set up a parser + inject instructions into the prompt template.
parser = PydanticOutputParser(pydantic_object=Resume_Skills)

SystemMessege = "Extract the information and get the specified skills only mentioned in the resume"
human_message_prompt = HumanMessagePromptTemplate.from_template(template = 'Using format {format_instructions} and resume {resume} to get the itemised list')
chat_prompt = ChatPromptTemplate.from_messages(
    [SystemMessege, human_message_prompt]
    )
output = model(
        chat_prompt.format_prompt(resume=resume,format_instructions = parser.get_format_instructions()).to_messages()
    )

  warn_deprecated(


AuthenticationError: Error code: 401 - {'error': {'message': 'Incorrect API key provided: sk-lIX1G***************************************K1kh. You can find your API key at https://platform.openai.com/account/api-keys.', 'type': 'invalid_request_error', 'param': None, 'code': 'invalid_api_key'}}

In [23]:
print(output.content)

NameError: name 'output' is not defined

In [None]:
# using resume.pdf
from typing import List

from langchain.output_parsers import PydanticOutputParser
from langchain.pydantic_v1 import BaseModel, Field

from langchain.prompts import HumanMessagePromptTemplate
from langchain.schema import HumanMessage, SystemMessage
from langchain.prompts import ChatPromptTemplate

from resume_llm import Job_Skills

# Set up a parser + inject instructions into the prompt template.
parser = PydanticOutputParser(pydantic_object=Job_Skills)

SystemMessege = "Extract and mine the information provided and give directly the list of all the specified skills mentioned in the job posting"
human_message_prompt = HumanMessagePromptTemplate.from_template(template = 'Using format {format_instructions} and job posting {job_posting} to get the itemised list')
chat_prompt = ChatPromptTemplate.from_messages(
    [SystemMessege, human_message_prompt]
    )
output = model(
        chat_prompt.format_prompt(job_posting=job_posting,format_instructions = parser.get_format_instructions()).to_messages()
    )

In [None]:
print(output.content)

In [None]:
# using resume.pdf
from typing import List

from langchain.output_parsers import PydanticOutputParser
from langchain.pydantic_v1 import BaseModel, Field

from langchain.prompts import HumanMessagePromptTemplate
from langchain.schema import HumanMessage, SystemMessage
from langchain.prompts import ChatPromptTemplate

from resume_llm import Job_Description

# Set up a parser + inject instructions into the prompt template.
parser = PydanticOutputParser(pydantic_object=resume_llm.Job_Description)

SystemMessege = "Extract and mine the information provided in the job posting"
human_message_prompt = HumanMessagePromptTemplate.from_template(template = 'Using format {format_instructions} and job posting {job_posting} to get the itemised list')
chat_prompt = ChatPromptTemplate.from_messages(
    [SystemMessege, human_message_prompt]
    )
output = model(
        chat_prompt.format_prompt(job_posting=job_posting,format_instructions = parser.get_format_instructions()).to_messages()
    )

In [None]:
print(output.content)

In [None]:
import json

job_data_dict = json.loads(output.content)
job_data_dict

In [None]:
duties = job_data_dict['duties']
duties

In [None]:
qualifications = job_data_dict['qualifications']
qualifications