In [None]:
!pip install langchain
!pip install langchain-openai

!sudo apt install tesseract-ocr
!pip install pytesseract

import pytesseract
pytesseract.pytesseract.tesseract_cmd = r'/usr/bin/tesseract'

Collecting langchain
  Downloading langchain-0.2.12-py3-none-any.whl.metadata (7.1 kB)
Collecting langchain-core<0.3.0,>=0.2.27 (from langchain)
  Downloading langchain_core-0.2.28-py3-none-any.whl.metadata (6.2 kB)
Collecting langchain-text-splitters<0.3.0,>=0.2.0 (from langchain)
  Downloading langchain_text_splitters-0.2.2-py3-none-any.whl.metadata (2.1 kB)
Collecting langsmith<0.2.0,>=0.1.17 (from langchain)
  Downloading langsmith-0.1.96-py3-none-any.whl.metadata (13 kB)
Collecting tenacity!=8.4.0,<9.0.0,>=8.1.0 (from langchain)
  Downloading tenacity-8.5.0-py3-none-any.whl.metadata (1.2 kB)
Collecting jsonpatch<2.0,>=1.33 (from langchain-core<0.3.0,>=0.2.27->langchain)
  Downloading jsonpatch-1.33-py2.py3-none-any.whl.metadata (3.0 kB)
Collecting orjson<4.0.0,>=3.9.14 (from langsmith<0.2.0,>=0.1.17->langchain)
  Downloading orjson-3.10.6-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (50 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m50.4

In [None]:

import os
import csv
from PIL import Image
import pytesseract
from typing import List, Optional
from langchain_openai import ChatOpenAI
from langchain_core.pydantic_v1 import BaseModel, Field
from langchain_core.prompts import ChatPromptTemplate

# Tell pytesseract where the Tesseract executable lives.
pytesseract.pytesseract.tesseract_cmd = r'/usr/bin/tesseract'

# Read the OpenAI key from the environment so the secret is never stored in the notebook.
api_key = os.environ.get("OPENAI_API_KEY")
if not api_key:
    raise RuntimeError(
        "OPENAI_API_KEY is not set; define it in the environment "
        "(e.g. os.environ['OPENAI_API_KEY'] = ...) before running this cell."
    )

# Chat model used for the extraction step.
llm = ChatOpenAI(model="gpt-4", api_key=api_key, temperature=0)

# System instructions plus a single human message carrying the resume text as {text}.
prompt = ChatPromptTemplate.from_messages(
    [
        (
            "system",
            "You are an expert in extracting detailed information from resumes. "
            "Your task is to identify and extract the following details from the provided resume text: "
            "- Candidate's full name "
            "- Contact information including phone number, email address, and LinkedIn profile "
            "- Educational background (degree, major, institution, and graduation date) "
            "- Professional experience (job title, company, duration, and key responsibilities) "
            "- List of skills (enumerate each skill separately) "
            "- Awards and honors "
            "- Certifications "
            "- Projects "
            "If any information is not available, indicate 'N/A' for that field. "
            "Ensure that you adhere strictly to the specified format and provide no additional commentary or explanations."
        ),
        ("human", "{text}"),
    ]
)

# The docstring and field descriptions become part of the model's schema, which is what
# the structured-output call hands to the LLM; field names, types and required-ness are
# unchanged.
class Education(BaseModel):
    """One education entry listed on a resume."""

    degree: str = Field(..., description="Degree or qualification obtained, or 'N/A' if not stated")
    major: str = Field(..., description="Major or field of study, or 'N/A' if not stated")
    institution: str = Field(..., description="School, college or university name, or 'N/A' if not stated")
    graduation_date: str = Field(..., description="Graduation date as written on the resume, or 'N/A' if not stated")

# The docstring and field descriptions become part of the model's schema, which is what
# the structured-output call hands to the LLM; field names, types and required-ness are
# unchanged.
class WorkExperience(BaseModel):
    """One professional-experience entry listed on a resume."""

    job_title: str = Field(..., description="Job title held, or 'N/A' if not stated")
    company: str = Field(..., description="Employer or organisation name, or 'N/A' if not stated")
    duration: str = Field(..., description="Period of employment as written on the resume, or 'N/A' if not stated")
    responsibilities: str = Field(..., description="Key responsibilities for this role, or 'N/A' if not stated")

# The docstring and field descriptions become part of the model's schema, which is what
# the structured-output call hands to the LLM; field names, types and required-ness are
# unchanged (the Optional fields still default to None).
class ResumeData(BaseModel):
    """Structured information extracted from a single resume."""

    full_name: str = Field(..., description="Candidate's full name, or 'N/A' if not stated")
    phone_number: Optional[str] = Field(None, description="Phone number(s) as written on the resume")
    email: Optional[str] = Field(None, description="Email address")
    linkedin_profile: Optional[str] = Field(None, description="LinkedIn profile URL or handle")
    education: List[Education] = Field(..., description="Education entries in the order they appear")
    work_experience: List[WorkExperience] = Field(..., description="Work-experience entries in the order they appear")
    skills: List[str] = Field(..., description="Skills, one list item per skill")
    awards_and_honors: List[str] = Field(..., description="Awards and honors, one list item each")
    certifications: List[str] = Field(..., description="Certifications, one list item each")
    projects: List[str] = Field(..., description="Projects, one list item each")

# Pipe the prompt into the model, configured to return output structured as ResumeData.
runnable = prompt | llm.with_structured_output(schema=ResumeData)

def perform_ocr(image_path):
    """Run Tesseract OCR on an image file and return the recognised text.

    Args:
        image_path: Path of the image file to open with Pillow.

    Returns:
        The text returned by ``pytesseract.image_to_string`` for that image.
    """
    # The context manager closes the underlying file handle as soon as OCR is done,
    # instead of leaving it open until the Image object is garbage-collected.
    with Image.open(image_path) as image:
        return pytesseract.image_to_string(image)

def extract_structured_data(text):
    """Send resume text through the module-level ``runnable`` chain.

    Args:
        text: Resume text, passed to the chain under the ``"text"`` key.

    Returns:
        Whatever ``runnable.invoke`` returns for that input.
    """
    payload = {"text": text}
    return runnable.invoke(payload)

def process_document(image_path):
    """OCR one document and extract structured resume data from its text.

    Args:
        image_path: Path of the document image to process.

    Returns:
        The result of ``extract_structured_data`` for the OCR text, or ``None``
        when OCR produced no usable text.
    """
    ocr_text = perform_ocr(image_path)
    # OCR of a blank page can yield only whitespace or control characters (for
    # example a lone form feed). Such a string is truthy, so check the stripped
    # text to avoid spending a model call on an empty document.
    if not ocr_text or not ocr_text.strip():
        return None
    return extract_structured_data(ocr_text)

def save_data_to_csv(data, csv_filename="documents.csv"):
    """Write extracted resume records to a CSV file, one row per record.

    Only the first two education entries and the first two work-experience
    entries of each record are exported; missing entries and missing contact
    details are written as ``"N/A"``. List-valued fields are joined with ``"; "``.

    Args:
        data: Iterable of objects exposing ``.dict()`` that returns a mapping with
            the keys ``full_name``, ``phone_number``, ``email``,
            ``linkedin_profile``, ``education``, ``work_experience``, ``skills``,
            ``awards_and_honors``, ``certifications`` and ``projects``.
        csv_filename: Destination path; an existing file is overwritten.
    """
    contact_fields = ("phone_number", "email", "linkedin_profile")
    education_fields = ("degree", "major", "institution", "graduation_date")
    experience_fields = ("job_title", "company", "duration", "responsibilities")
    list_fields = ("skills", "awards_and_honors", "certifications", "projects")
    max_entries = 2  # number of education / work entries that get their own columns

    def numbered(fields):
        # ("degree", "major") -> ["degree1", "major1", "degree2", "major2"]
        return [f"{field}{n}" for n in range(1, max_entries + 1) for field in fields]

    header = [
        "full_name",
        *contact_fields,
        *numbered(education_fields),
        *numbered(experience_fields),
        *list_fields,
    ]

    def or_na(value):
        # The keys are always present in the record dict, so a .get() default
        # would never apply; a missing value shows up as None instead.
        return "N/A" if value is None else value

    def flatten(item_dict):
        row = {"full_name": item_dict["full_name"]}
        for field in contact_fields:
            row[field] = or_na(item_dict.get(field))
        for key, fields in (
            ("education", education_fields),
            ("work_experience", experience_fields),
        ):
            entries = item_dict[key]
            for index in range(max_entries):
                entry = entries[index] if index < len(entries) else None
                for field in fields:
                    column = f"{field}{index + 1}"
                    row[column] = "N/A" if entry is None else or_na(entry[field])
        for field in list_fields:
            row[field] = "; ".join(item_dict.get(field) or [])
        return row

    # An explicit UTF-8 encoding keeps non-ASCII resume text from failing under a
    # non-UTF-8 locale default.
    with open(csv_filename, mode='w', newline='', encoding='utf-8') as file:
        writer = csv.DictWriter(file, fieldnames=header)
        writer.writeheader()
        for item in data:
            writer.writerow(flatten(item.dict()))

def process_multiple_documents(directory_path, csv_filename="documents.csv"):
    """Process every supported document in a directory and save the results as CSV.

    Args:
        directory_path: Directory whose files are scanned (not recursive).
        csv_filename: Path of the CSV file passed to ``save_data_to_csv``.

    Returns:
        List of the non-empty results returned by ``process_document``, in
        file-name order.
    """
    supported_extensions = (".png", ".jpg", ".jpeg", ".pdf")
    results = []
    # os.listdir returns names in arbitrary order; sorting makes the row order reproducible.
    for filename in sorted(os.listdir(directory_path)):
        # Compare case-insensitively so files such as "scan.JPG" are not skipped.
        if not filename.lower().endswith(supported_extensions):
            continue
        image_path = os.path.join(directory_path, filename)
        try:
            data = process_document(image_path)
        except OSError as exc:
            # A file that cannot be opened as an image (corrupt, or a PDF, which
            # Pillow cannot read) must not abort the batch and lose the results
            # gathered so far. Pillow's UnidentifiedImageError is an OSError.
            # Other failures (e.g. model/API errors) still propagate.
            print(f"Skipping {filename}: {exc!r}")
            continue
        if data:
            results.append(data)
    save_data_to_csv(results, csv_filename)
    return results

# Run the pipeline over the sample folder (this also writes documents.csv) and print
# each extracted record.
directory_path = "/content/myresume_v1"
all_data = process_multiple_documents(directory_path)
for data in all_data:
    print(data)


full_name='Raviteja Moolinti Nallakkagari' phone_number='716-936-6677' email='mnoolint@buffalo.edu' linkedin_profile='www.linkedin.com/in/munraviteja' education=[Education(degree='MS in Computer Science and Engineering (AI/ML Track)', major='Computer Science and Engineering', institution='University at Buffalo, The State University of New York', graduation_date='December 2024'), Education(degree='Bachelor of Technology in Computer Science and Engineering', major='Computer Science and Engineering', institution='Sri Venkateswara University', graduation_date='June 2022')] work_experience=[WorkExperience(job_title='Research Assistant', company='University at Buffalo, The State University of New York', duration='January 2023 - Present', responsibilities='Employed Large Language Models (LLMs) with prompt engineering to extract synthesis and activation conditions from 1000+ MOF research papers, achieving 96% accuracy. Implemented predictive models correlating synthesis parameters with materia

In [None]:

import os
import csv
from PIL import Image
import pytesseract
from typing import List, Optional
from langchain_openai import ChatOpenAI
from langchain_core.pydantic_v1 import BaseModel, Field
from langchain_core.prompts import ChatPromptTemplate

# Tell pytesseract where the Tesseract executable lives.
pytesseract.pytesseract.tesseract_cmd = r'/usr/bin/tesseract'

# Read the OpenAI key from the environment so the secret is never stored in the notebook.
api_key = os.environ.get("OPENAI_API_KEY")
if not api_key:
    raise RuntimeError(
        "OPENAI_API_KEY is not set; define it in the environment "
        "(e.g. os.environ['OPENAI_API_KEY'] = ...) before running this cell."
    )

# Chat model used for the extraction step.
llm = ChatOpenAI(model="gpt-4", api_key=api_key, temperature=0)

# System instructions plus a single human message carrying the resume text as {text}.
prompt = ChatPromptTemplate.from_messages(
    [
        (
            "system",
            "You are an expert in extracting detailed information from resumes. "
            "Your task is to identify and extract the following details from the provided resume text: "
            "- Candidate's full name "
            "- Contact information including phone number, email address, and LinkedIn profile "
            "- Educational background (degree, major, institution, and graduation date) "
            "- Professional experience (job title, company, duration, and key responsibilities) "
            "- List of skills (enumerate each skill separately) "
            "- Awards and honors "
            "- Certifications "
            "- Projects "
            "If any information is not available, indicate 'N/A' for that field. "
            "Ensure that you adhere strictly to the specified format and provide no additional commentary or explanations."
        ),
        ("human", "{text}"),
    ]
)

# The docstring and field descriptions become part of the model's schema, which is what
# the structured-output call hands to the LLM; field names, types and required-ness are
# unchanged.
class Education(BaseModel):
    """One education entry listed on a resume."""

    degree: str = Field(..., description="Degree or qualification obtained, or 'N/A' if not stated")
    major: str = Field(..., description="Major or field of study, or 'N/A' if not stated")
    institution: str = Field(..., description="School, college or university name, or 'N/A' if not stated")
    graduation_date: str = Field(..., description="Graduation date as written on the resume, or 'N/A' if not stated")

# The docstring and field descriptions become part of the model's schema, which is what
# the structured-output call hands to the LLM; field names, types and required-ness are
# unchanged.
class WorkExperience(BaseModel):
    """One professional-experience entry listed on a resume."""

    job_title: str = Field(..., description="Job title held, or 'N/A' if not stated")
    company: str = Field(..., description="Employer or organisation name, or 'N/A' if not stated")
    duration: str = Field(..., description="Period of employment as written on the resume, or 'N/A' if not stated")
    responsibilities: str = Field(..., description="Key responsibilities for this role, or 'N/A' if not stated")

# The docstring and field descriptions become part of the model's schema, which is what
# the structured-output call hands to the LLM; field names, types and required-ness are
# unchanged (the Optional fields still default to None).
class ResumeData(BaseModel):
    """Structured information extracted from a single resume."""

    full_name: str = Field(..., description="Candidate's full name, or 'N/A' if not stated")
    phone_number: Optional[str] = Field(None, description="Phone number(s) as written on the resume")
    email: Optional[str] = Field(None, description="Email address")
    linkedin_profile: Optional[str] = Field(None, description="LinkedIn profile URL or handle")
    education: List[Education] = Field(..., description="Education entries in the order they appear")
    work_experience: List[WorkExperience] = Field(..., description="Work-experience entries in the order they appear")
    skills: List[str] = Field(..., description="Skills, one list item per skill")
    awards_and_honors: List[str] = Field(..., description="Awards and honors, one list item each")
    certifications: List[str] = Field(..., description="Certifications, one list item each")
    projects: List[str] = Field(..., description="Projects, one list item each")

# Pipe the prompt into the model, configured to return output structured as ResumeData.
runnable = prompt | llm.with_structured_output(schema=ResumeData)

def perform_ocr(image_path):
    """Run Tesseract OCR on an image file and return the recognised text.

    Args:
        image_path: Path of the image file to open with Pillow.

    Returns:
        The text returned by ``pytesseract.image_to_string`` for that image.
    """
    # The context manager closes the underlying file handle as soon as OCR is done,
    # instead of leaving it open until the Image object is garbage-collected.
    with Image.open(image_path) as image:
        return pytesseract.image_to_string(image)

def extract_structured_data(text):
    """Send resume text through the module-level ``runnable`` chain.

    Args:
        text: Resume text, passed to the chain under the ``"text"`` key.

    Returns:
        Whatever ``runnable.invoke`` returns for that input.
    """
    payload = {"text": text}
    return runnable.invoke(payload)

def process_document(image_path):
    """OCR one document and extract structured resume data from its text.

    Args:
        image_path: Path of the document image to process.

    Returns:
        The result of ``extract_structured_data`` for the OCR text, or ``None``
        when OCR produced no usable text.
    """
    ocr_text = perform_ocr(image_path)
    # OCR of a blank page can yield only whitespace or control characters (for
    # example a lone form feed). Such a string is truthy, so check the stripped
    # text to avoid spending a model call on an empty document.
    if not ocr_text or not ocr_text.strip():
        return None
    return extract_structured_data(ocr_text)

def save_data_to_csv(data, csv_filename="documents.csv"):
    """Write extracted resume records to a CSV file, one row per record.

    Only the first two education entries and the first two work-experience
    entries of each record are exported; missing entries and missing contact
    details are written as ``"N/A"``. List-valued fields are joined with ``"; "``.

    Args:
        data: Iterable of objects exposing ``.dict()`` that returns a mapping with
            the keys ``full_name``, ``phone_number``, ``email``,
            ``linkedin_profile``, ``education``, ``work_experience``, ``skills``,
            ``awards_and_honors``, ``certifications`` and ``projects``.
        csv_filename: Destination path; an existing file is overwritten.
    """
    contact_fields = ("phone_number", "email", "linkedin_profile")
    education_fields = ("degree", "major", "institution", "graduation_date")
    experience_fields = ("job_title", "company", "duration", "responsibilities")
    list_fields = ("skills", "awards_and_honors", "certifications", "projects")
    max_entries = 2  # number of education / work entries that get their own columns

    def numbered(fields):
        # ("degree", "major") -> ["degree1", "major1", "degree2", "major2"]
        return [f"{field}{n}" for n in range(1, max_entries + 1) for field in fields]

    header = [
        "full_name",
        *contact_fields,
        *numbered(education_fields),
        *numbered(experience_fields),
        *list_fields,
    ]

    def or_na(value):
        # The keys are always present in the record dict, so a .get() default
        # would never apply; a missing value shows up as None instead.
        return "N/A" if value is None else value

    def flatten(item_dict):
        row = {"full_name": item_dict["full_name"]}
        for field in contact_fields:
            row[field] = or_na(item_dict.get(field))
        for key, fields in (
            ("education", education_fields),
            ("work_experience", experience_fields),
        ):
            entries = item_dict[key]
            for index in range(max_entries):
                entry = entries[index] if index < len(entries) else None
                for field in fields:
                    column = f"{field}{index + 1}"
                    row[column] = "N/A" if entry is None else or_na(entry[field])
        for field in list_fields:
            row[field] = "; ".join(item_dict.get(field) or [])
        return row

    # An explicit UTF-8 encoding keeps non-ASCII resume text from failing under a
    # non-UTF-8 locale default.
    with open(csv_filename, mode='w', newline='', encoding='utf-8') as file:
        writer = csv.DictWriter(file, fieldnames=header)
        writer.writeheader()
        for item in data:
            writer.writerow(flatten(item.dict()))

def process_multiple_documents(directory_path, csv_filename="documents.csv"):
    """Process every supported document in a directory and save the results as CSV.

    Args:
        directory_path: Directory whose files are scanned (not recursive).
        csv_filename: Path of the CSV file passed to ``save_data_to_csv``.

    Returns:
        List of the non-empty results returned by ``process_document``, in
        file-name order.
    """
    supported_extensions = (".png", ".jpg", ".jpeg", ".pdf")
    results = []
    # os.listdir returns names in arbitrary order; sorting makes the row order reproducible.
    for filename in sorted(os.listdir(directory_path)):
        # Compare case-insensitively so files such as "scan.JPG" are not skipped.
        if not filename.lower().endswith(supported_extensions):
            continue
        image_path = os.path.join(directory_path, filename)
        try:
            data = process_document(image_path)
        except OSError as exc:
            # A file that cannot be opened as an image (corrupt, or a PDF, which
            # Pillow cannot read) must not abort the batch and lose the results
            # gathered so far. Pillow's UnidentifiedImageError is an OSError.
            # Other failures (e.g. model/API errors) still propagate.
            print(f"Skipping {filename}: {exc!r}")
            continue
        if data:
            results.append(data)
    save_data_to_csv(results, csv_filename)
    return results


# Run the pipeline over the full resume folder (this overwrites documents.csv) and print
# each extracted record.
directory_path = "/content/resumes"
all_data = process_multiple_documents(directory_path)
for data in all_data:
    print(data)


full_name='N/A' phone_number='N/A' email='N/A' linkedin_profile='N/A' education=[Education(degree='MBA', major='Business Administration', institution='OR. CV Raman University', graduation_date='N/A'), Education(degree='B.com', major='Commerce', institution='Gulbarga University', graduation_date='N/A')] work_experience=[WorkExperience(job_title='Software Engineer', company='Trucktion', duration='N/A', responsibilities='Designing and developing web pages using Reactjs. Redux, HTMLS, CSS3, Bootstrap, JavaScript and Material.ul. Developed the application for responsive to different screen resolution using Bootstrap and CSS3 media queries. Managing application-level state by using Redux and Redux hooks — use Dispatch, use Selector. Designing and using dynamic and reusable components. Design and develop static or dynamic components based on the requirements from product owner.'), WorkExperience(job_title='Software Engineer', company='Learning Management System (LMS)', duration='N/A', respons

In [None]:
import pandas as pd
# Reload the CSV written by the extraction step for inspection.
# Note: read_csv's default NA handling parses literal "N/A" cells as NaN.
df = pd.read_csv('documents.csv')
df

Unnamed: 0,full_name,phone_number,email,linkedin_profile,degree1,major1,institution1,graduation_date1,degree2,major2,...,duration1,responsibilities1,job_title2,company2,duration2,responsibilities2,skills,awards_and_honors,certifications,projects
0,,,,,MBA,Business Administration,OR. CV Raman University,,B.com,Commerce,...,,Designing and developing web pages using React...,Software Engineer,Learning Management System (LMS),,Learning Management System is designed to trac...,Reactjs; Redux; HTML5; CSS3; Bootstrap; JavaSc...,,,Trucktion; Learning Management System (LMS)
1,Aakash Sotunke,579897022 / 7585765171,aakashsotunkezn9eqamaiiicom,,Bachelor of Engineering,,Pune University,2018,HSC,,...,2021-Present,"Normalizing, Grouping, Cleaning data of pf, ex...",,"Intelligic Software Pvt Ltd, Pune",June 2020 - Dec 2020,Auto Insurance fraud detection,Python; MySQL; ML Model building; Data pre-pro...,,,Auto Insurance fraud detection
2,,,,,,,,,,,...,June 2019 to present,Analyzing & validating the completeness of bus...,Software Test Engineer (Intern),VCITY Technology,January 2019 to June 2019,,Good understanding of Dynamic Xpath and Itrame...,,,
3,,,,,,,,,,,...,Aug 2019 to present,Prepare reusable methods to improve productivi...,,,,,Java language programming; Finding x path loca...,,,Kogan
4,Aniket Bharat Kakade,,,,,,,,,,...,,,,,,,Operating systems; Testing Tools; Databases; M...,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
93,Krishnajagan G,,,,,,,,,,...,,,,,,,JAVA Web Applications; Web Application monitor...,,,
94,,,,,,,,,,,...,,"Worked on enhancement projects, Initiated and ...",,,,,,,,
95,Anjali,,,,eee,,Veer Bahadur Singh Purwanchal University,,Diploma,,...,,,,,,,C#; Xamarin Forms; Web API; MVVM; FCM; Signal;...,,,ZobleProductions : (Mobile App : Android & 105...
96,Ganesh Biradar,+91-7030876676,ganesh3biradar@gmail.com,,,,,,,,...,2.5 years,Manual and Automation testing (Using Selenium ...,,,,,Manual and Automation testing; Selenium with J...,,,


In [None]:
# Write the loaded table out under its final file name, without the pandas index column.
df.to_csv('final_resume_info_extracted.csv',index=False)