In [None]:
!pip install PyPDF2
!pip install python-magic
!pip install python-docx
!pip install textract
!pip install spacy-lookups-data
!pip install -U spacy
!pip install PyMuPDF 



In [6]:
from PyPDF2 import PdfReader

def extract_text_from_pdf(pdf_path):
    """
    Extracts text from a PDF file with error handling.
    :param pdf_path: Path to the PDF file.
    :return: Extracted text as a string or an error message.
    """
    try:
        # Create a PDF reader object
        reader = PdfReader(pdf_path)
        
        # Initialize an empty string to hold the extracted text
        extracted_text = ""

        # Iterate over all pages and extract text
        for page in reader.pages:
            extracted_text += page.extract_text()

        return extracted_text
    
    except FileNotFoundError:
        return f"Error: The file at {pdf_path} was not found."
    except Exception as e:
        return f"Error extracting text from the PDF: {e}"


In [7]:
import textract

def extract_text_from_doc(file_path):
    """
    Extract text from a .doc file using textract.
    :param file_path: Path to the .doc file.
    :return: Extracted text as a string.
    """
    try:
        text = textract.process(file_path).decode("utf-8")
        return text
    except Exception as e:
        return f"Error extracting text from .doc file: {e}"

In [8]:
# Extracting texts from docx file format
from docx import Document

def extract_text_from_docx(file_path):
    """
    Extracts text from a .docx file with error handling.
    :param file_path: Path to the .docx file.
    :return: Extracted text as a string or an error message.
    """
    try:
        # Load the .docx file
        document = Document(file_path)
        
        # Initialize an empty string to store the extracted text
        extracted_text = ""

        # Iterate through each paragraph in the document
        for paragraph in document.paragraphs:
            extracted_text += paragraph.text + "\n"

        return extracted_text

    except FileNotFoundError:
        return f"Error: The file at {file_path} was not found."
    except Exception as e:
        return f"Error extracting text from the .docx file: {e}"

In [9]:
try:
    import magic
except ImportError:
    print("The 'magic' library is not installed. Install it using 'pip install python-magic' or 'pip install python-magic-bin' for Windows.")
    raise

def check_file_type(file_path):
    """
    Checks the file type based on its MIME type with error handling.
    :param file_path: Path to the file to be checked.
    :return: File type ('pdf', 'docx', 'msword', or 'wrong_format') or an error message.
    """
    try:
        # Initialize the magic object
        mime = magic.Magic(mime=True)
        file_type = mime.from_file(file_path)
        
        # Check MIME type and return corresponding file type
        if "application/pdf" in file_type:
            return "pdf"
        elif "application/vnd.openxmlformats-officedocument.wordprocessingml.document" in file_type:
            return "docx"
        elif "application/msword" in file_type:
            return "msword"
        else:
            return "wrong_format"
    except FileNotFoundError:
        return f"Error: The file at {file_path} was not found."
    except magic.MagicException as e:
        return f"Error with the magic library: {e}"
    except Exception as e:
        return f"Error determining file type: {e}"

In [10]:
def getting_final_text(file_path):
    my_file_type = check_file_type(file_path)

    if my_file_type == "pdf":
        text = extract_text_from_pdf(file_path)
        return text
    elif my_file_type == "docx":
        text = extract_text_from_docx(file_path)
        return text
    elif my_file_type == "msword":
        text = extract_text_from_doc(file_path)
        return text
    else :
        return "Use diff format (.pdf, .docx)"


In [17]:
# file_path = "/kaggle/input/resumes/Resumes/data/data/ENGINEERING/10985403.pdf"
# file_path = "/kaggle/input/docx-for-testing/Achyuth Resume_8.docx"
file_path = "/kaggle/input/resumes/Resumes/data/data/ENGINEERING/10030015.pdf"

import re
resume_content = getting_final_text(file_path)
cleaned_text = re.sub(r'\s+', ' ', resume_content.strip())
print(cleaned_text)

ENGINEERING LAB TECHNICIAN Career Focus My main objective in seeking employment with Triumph Actuation Systems Inc. is to work in a professional atmosphere where I can utilize my skills and continue to gain experience in the aerospace industry to advance in my career. Professional Experience Engineering Lab Technician Oct 2016 to Current Company Name ï¼​ City , State Responsible for testing various seat structures to meet specific certification requirements. Â Maintain and calibrate test instruments to ensure testing capabilities are maintained. Ensure data is captured and recorded correctly for certification test reports. Duties also dynamic test set-up and static suite testing. Engineering Lab Technician, Sr. Specialist Apr 2012 to Oct 2016 Company Name ï¼​ City , State Utilized skills learned from LabView Course 1 training to construct and maintain LabView VI programs. Responsible for fabricating and maintaining hydraulic/electrical test equipment to complete development and qualifi

### Code for Model


In [1]:
import pickle
train_data = pickle.load(open('/kaggle/input/train-data/train_data.pkl','rb'))
train_data[0]

('Govardhana K Senior Software Engineer  Bengaluru, Karnataka, Karnataka - Email me on Indeed: indeed.com/r/Govardhana-K/ b2de315d95905b68  Total IT experience 5 Years 6 Months Cloud Lending Solutions INC 4 Month • Salesforce Developer Oracle 5 Years 2 Month • Core Java Developer Languages Core Java, Go Lang Oracle PL-SQL programming, Sales Force Developer with APEX.  Designations & Promotions  Willing to relocate: Anywhere  WORK EXPERIENCE  Senior Software Engineer  Cloud Lending Solutions -  Bangalore, Karnataka -  January 2018 to Present  Present  Senior Consultant  Oracle -  Bangalore, Karnataka -  November 2016 to December 2017  Staff Consultant  Oracle -  Bangalore, Karnataka -  January 2014 to October 2016  Associate Consultant  Oracle -  Bangalore, Karnataka -  November 2012 to December 2013  EDUCATION  B.E in Computer Science Engineering  Adithya Institute of Technology -  Tamil Nadu  September 2008 to June 2012  https://www.indeed.com/r/Govardhana-K/b2de315d95905b68?isid=rex-

In [2]:
import spacy
import random
from spacy.training.example import Example

# Load Pre-trained Model
nlp = spacy.load("en_core_web_sm")

def train_model(train_data):
    # Check if 'ner' exists in the pipeline
    if "ner" not in nlp.pipe_names:
        ner = nlp.add_pipe("ner", last=True)
    else:
        ner = nlp.get_pipe("ner")  # Get existing NER pipeline
    
    # Add labels in the NLP pipeline
    for _, annotation in train_data:
        for ent in annotation.get("entities", []):
            ner.add_label(ent[2])
    
    # Remove other pipelines if they are there
    other_pipes = [pipe for pipe in nlp.pipe_names if pipe != "ner"]
    with nlp.disable_pipes(*other_pipes):  # only train NER
        optimizer = nlp.begin_training()
        for itn in range(20):  # train for 20 iterations
            print(f"Starting iteration {itn}")
            random.shuffle(train_data)
            losses = {}
            for text, annotations in train_data:
                try:
                    # Display the training example
                    # print(f"\nTraining on: {text}")
                    # print(f"Annotations: {annotations}")
                    
                    # Create an example and update the model
                    doc = nlp.make_doc(text)
                    example = Example.from_dict(doc, annotations)
                    nlp.update(
                        [example],  # batch of examples
                        drop=0.1,  # dropout - make it harder to memorise data
                        sgd=optimizer,  # callable to update weights
                        losses=losses,
                    )
                    
                    # Test model predictions on the same text
                    test_doc = nlp(text)
                    print("Predictions:")
                    for ent in test_doc.ents:
                        print(f"  {ent.text} -> {ent.label_}")
                    
                except Exception as e:
                    # print(f"Error during training: {e}")
                    pass
            print(f"Losses: {losses}")



# Start Training Model
train_model(train_data)


[2024-12-12 10:07:33,186] [INFO] Added vocab lookups: lexeme_norm
[2024-12-12 10:07:33,187] [INFO] Created vocabulary
[2024-12-12 10:07:33,189] [INFO] Finished initializing nlp object


Starting iteration 0
Predictions:
  Prakriti Shaurya Senior System Engineer - Infosys Limited  Mangalore, Karnataka - Email me on Indeed: indeed.com/r/Prakriti- Shaurya/5339383f9294887e  Detail-oriented individual with three years of experience as an IT Consultant looking for opportunity to develop my professional skills in a vibrant and stable environment, and to use those skills for the benefits of the organization in best possible way.  Willing to relocate to: Bengaluru, Karnataka - Bangalore Urban, Karnataka  WORK EXPERIENCE  Senior System Engineer  Infosys Limited -  Mangalore, Karnataka -  January 2017 to Present  Working as an IT Consultant under application maintenance and support for METLIFE Insurance company.  System Engineer  Infosys Limited -  Mangalore, Karnataka -  December 2014 to December 2016  Worked as an IT Consultant under application maintenance and support for METLIFE Insurance company.  SOFTWARE  EDUCATION  Bachelor of Technology in Information Technology  Vellor



Predictions:
  Soumya Balan -> Name
  Soumya Balan -> Name
  - BE -> Name
  Computer Science -> Name
  - 3 -> Name
  yr Work -> Name
  Experience at -> Name
  Microsoft Corporation -> Name
  Thiruvananthapuram, -> Name
  Kerala - -> Name
  Email me -> Name
  on Indeed -> Name
  : indeed.com/r/Soumya- -> Name
  Balan/8c7fbb9917935f20   -> Name
  ➢ To -> Name
  work in -> Name
  a progressive -> Name
  organization where -> Name
  I can -> Name
  enhance my skills and -> Name
  learning to -> Name
  contribute to the -> Name
  success of -> Name
  the organization.   -> Name
  Willing to -> Name
  relocate: -> Name
  Anywhere   -> Name
  EXPERIENCE   -> Name
  Technical Support -> Name
  Engineer   -> Name
  Microsoft iGTSC -> Name
  -   -> Name
  Bengaluru, -> Name
  Karnataka - -> Name
  July 2013 -> Name
  to October -> Name
  Position: -> Name
  TECHNICAL SUPPORT -> Name
  ENGINEER   -> Name
  Company: -> Name
  Microsoft Corporation -> Name
  - Microsoft -> Name
  India Global -> Na



Predictions:
  , C.    KEY STRENGTHS • Initiative, Leadership Qualities and Team spirit. • Proficiency in Communication skills, Positive attitude. • Good knowledge about Technology and interest towards new learning. • Responsibility and patience to do work assigned by superiors. -> Companies worked at
Predictions:
Predictions:




Predictions:
Predictions:
Predictions:




Predictions:
Predictions:
Predictions:




Predictions:




Predictions:
Predictions:
Predictions:
Predictions:




Predictions:
Predictions:
Predictions:
Predictions:




Predictions:
Predictions:




Predictions:
Predictions:
Predictions:




Predictions:
Predictions:




Predictions:




Predictions:
Predictions:




Predictions:
Predictions:
Predictions:




Predictions:
Predictions:




Predictions:
Predictions:




Predictions:
Predictions:




Predictions:
Predictions:
Predictions:




Predictions:
Predictions:




Predictions:
Predictions:
Predictions:




Predictions:
Predictions:
Predictions:




Predictions:




Predictions:
Predictions:
Predictions:




Predictions:
Predictions:




Predictions:
Predictions:
Predictions:




Predictions:
Predictions:
Predictions:




Predictions:
Predictions:
Predictions:
Predictions:




Predictions:
Predictions:
Predictions:




Predictions:
Predictions:
Predictions:




Predictions:
Predictions:
Predictions:




Predictions:
Predictions:
Predictions:
Predictions:




Predictions:
  Kiran Kumar -> Name
Predictions:




Predictions:
Predictions:
  Still Studying  Bhagabati Nodal High School,Sarakana  SKILLS  Typewriting, Editing  LINKS  https://plus.google.com/u/0/108623501355423636575  https://www.indeed.com/r/Prabhu-Prasad-Mohapatra/1e4b62ea17458993?isid=rex-download&ikw=download-top&co=IN https://www.indeed.com/r/Prabhu-Prasad-Mohapatra/1e4b62ea17458993?isid=rex-download&ikw=download-top&co=IN https://plus.google.com/u/0/108623501355423636575 -> Designation
Predictions:
  Ashish Indoriya -> Name
  Bachelor of Computer Applications in Computer Applications  C V Raman University Bilaspur -  Raman, Punjab  Secondary School Certificate  Chhattisgarh Board of Secondary Education -  Raipur, Chhattisgarh  SKILLS  Java, SQL, J2EE, Sping, Hibernate, JavaScript, HTML, CSS  ADDITIONAL INFORMATION  Serving notice period -> Designation




Predictions:




Predictions:
  Bengaluru -> Location
  Bangalore -> Location
  BSC in Computer science  Kakatiya University  SKILLS  Testing (10+ years), Program Management (10+ years), Automation Testing (10+ years), Selenium Webdriver (4 years), Project Management (10+ years), Java (10+ years), AWS (10+ years), Cloud Computing (4 years)  ADDITIONAL INFORMATION  Expertise Involved and excelling in enterprise-wide initiatives, complex and time-critical business projects providing distinctive leadership at the following levels: ● Account Management: Understanding inter-related services of the organization; representing organization within an account; selling entire portfolio of enterprise services; strategy to action by suggesting right processes, skills, culture (people), technology and content; effectively maintaining relationships with customers. Understanding key challenges of the customer; financial performance of the client; competitors within the account; technology budget or spend; client spend



Predictions:
  Suman Biswas -> Name
  Bengaluru -> Location
  Software Engineer  Shell -  Bangalore, Karnataka -  March 2016 to Present  SAP UI5 Lead, Native HANA Developer • Technical leader of frontend team (5 team members) for SAP UI5, Fiori, OData, XSJS since 2016. • Successfully delivered seven end to end development projects (CRV, CRD, Prelude, Charon, CTT, GPD, AIF). • Analyse business requirement, develop solution, prepare prototype, security model, application architecture, effort estimation and involve in development delivery.  • Project to expose on-premise HANA box data to outside world via HCP connectivity for mobile application. • Requirement analysis, planning, design and developing the multiple HANA XS application. • Creation of HANA tables, calculation views, used unit and currency conversion.  • Implementing row level and application enable security in applications using Analytic privilege, xsaccess.  • Worked in Agile and DevOps methodology. • Interviewer on SAP UI5 



Predictions:
  Anil Kumar -> Name
  Microsoft Azure (Basic Management)  FTP and TELNET -  2008 to 2008  • Design & implementation of Servers, Exchange Servers and Network according to the customer specific needs. • Designing and implementations of LAN /WAN, protocols used like TCP /IP, UDP, DHCP, HTTP, FTP and TELNET. • Providing remote support for maintaining Pentium, Pentium II, Pentium III, Servers on Windows NT, Windows 2000 Server, MS Windows Server 2003 R2 and MS Windows Server 2008 R2. • Software Office 97 Pro, Office 2000 pro. Office XP Pro, Office 2003, Microsoft Office 2007, Microsoft Office 2010, Microsoft Office 2013, Microsoft Office 2016 & Visual studio installation, MacAfee, Norton Antivirus, Escan Antivirus and other software & support. • Configuration of E-mail Clint software's that is (Outlook Express, Windows mail and Microsoft Outlook. • Installation and maintenance of Intel dual processor servers like HP, IBM, Dell Tower and Rack based server. • Managing Network Ra



Predictions:
  Kandrapu Reddy -> Name
  Senior Travel Operations (Domestic -> Designation
  Bharath International Travels -> Designation
  B.B.A in -> Designation
Predictions:
  Srushti Bhadale -> Name
  Bengaluru -> Location




Predictions:
  Bengaluru -> Location
  B.E in -> Designation
  Bengaluru -> Location
Predictions:
  Karthik Gururaj -> Name
  Bachelor of -> Designation




Predictions:
  Vikas Singh -> Name
  Infosys Limited - -> Designation
  Infosys Limited - -> Designation
  Infosys Limited - -> Designation
  Bachelor of Technology in Electronics and Communication Engineering  GLA Institute of Technology and Management -  Mathura, Uttar Pradesh  September 2009 to June 2013  SKILLS  SECURITY (5 years), INFORMATION SECURITY (5 years), ACTIVE DIRECTORY (3 years), UNIX (Less than 1 year)  ADDITIONAL INFORMATION  TECHNICAL SKILLS  ● Operating Systems: Windows, Solaris ● Languages: Python, Core Java, SQL, Unix ● Software: Sailpoint IIQ, Oracle IAM, Beeline, SAP, Active Directory, Phantom, Quest change auditor, Microsoft Office Suite ● Information Security: Concepts and best practices -> Degree
Predictions:
  Bachelor's  SKILLS  Excel (Less than 1 year), Word (Less than 1 year)  LINKS  https://www.linkedin.com/in/jacob-philip-a52744138  ADDITIONAL INFORMATION  CORECOMPETENCIES: • Meetdead-lineswitheaseandefficiency • Pleasantandeffectivecustomerservice&amp;m



Predictions:
  Microsoft - -> Designation
Predictions:
  Infosys Limited - -> Designation
  Infosys Limited - -> Designation
  Infosys Limited   -> Designation




Predictions:
  Infosys Limited - -> Designation
  Microsoft Order -> Designation
Predictions:
  Pankaj Bhosale -> Name
  Microsoft SQL -> Designation




Predictions:
  Bengaluru -> Location
  Bengaluru -> Location
Predictions:
  Somanath Behera -> Name
  Bengaluru -> Location
  Bengaluru -> Location
Predictions:
  Divesh Singh -> Name




Predictions:
Predictions:
  Khushboo Choudhary -> Name
Predictions:




Predictions:
  Praveen Bhaskar -> Name
Predictions:
  Nikhileshkumar Ikhar -> Name
  Bengaluru -> Location
  Bengaluru -> Location
  Bengaluru -> Location
  Bengaluru -> Location




Predictions:
  Arpit Godha -> Name
Predictions:




Predictions:
  Manisha Bharti -> Name
Predictions:
  Sivaganesh Selvakumar -> Name




Predictions:
  Madas Peddaiah -> Name
Predictions:
  Shrinidhi Selva -> Name
Predictions:
  Sharan Adla -> Name




Predictions:
Predictions:
  Vijayalakshmi Govindarajan -> Name




Predictions:
Predictions:
  Pranay Sathu -> Name
Predictions:




Predictions:
Predictions:
  Harini Komaravelli -> Name




Predictions:
  Bengaluru -> Location
Predictions:
  Hemil Bhavsar -> Name
Predictions:
  Pavithra M -> Name
  Bengaluru -> Location




Predictions:
  Yathishwaran P -> Name
  Maximo Consultant -> Name
Predictions:
  Navas Koya -> Name
  System Engineer -> Name




Predictions:
  Tejasri Gunnam -> Name
  Bengaluru -> Location
  Cisco SystemsIndPvtLtd -> Name
  Bengaluru -> Location
  Bengaluru -> Location
Predictions:




Predictions:
  Karthik GV -> Name
  Hyderabad -> Location
  Architect   -> Name
  Microsoft India -> Name
  Hyderabad -> Location
  Microsoft India -> Name
  Microsoft Azure -> Name
  Microsoft India -> Name
  Hyderabad -> Location
  Microsoft India -> Name
  Hyderabad -> Location
  Microsoft India -> Name
  Hyderabad -> Location
  Program Management -> Name
  Microsoft Technology -> Name
  Scrum Master -> Name
  Microsoft IT -> Name
Predictions:
  Abdul B -> Name
  Karnataka, -> Name
  Arabic Language -> Name
  Bengaluru -> Location
  Hyderabad -> Location
  Microsoft Dynamics -> Name
  Technical Skills -> Name
Predictions:
  Tanmoy Maity -> Name
  HVAC Technician -> Name
  Kolkata -> Location




Predictions:
  Shaik Tazuddin -> Name
  Bengaluru -> Location
  Senior Process -> Name
  Bengaluru -> Location
  Senior Process -> Name
  Infosys BPM -> Name
  S.V University -> Name
  Tirupati -> Location
  Margadarshi Junior -> Name
Losses: {'ner': np.float32(12774.164)}
Starting iteration 1
Predictions:
  Ramesh HP -> Name
  Bangalore -> Location
  CES ASSOCIATE -> Name
  SAP ARIBA -> Name
  MCA in COMPUTER -> Designation
  Dayananda Sagar -> Name
  Bengaluru -> Location
  Hassan -> Location
Predictions:
  Shraddha Achar -> Name
  Mathura -> Location
  indeed.com/r/Shraddha-Achar/ -> Location
  Mangalore -> Location
  Poorna Prajna -> Name
Predictions:
  Vikas Singh -> Name
  Chandigarh -> Location
  Technology Analyst -> Name
  Infosys Limited -> Name
  Chandigarh -> Location
  Infosys Limited -> Name
  Chandigarh -> Location
  Infosys Limited -> Name
  Bachelor of Technology in Electronics -> Designation
  GLA Institute of Technology and Management -   -> Designation
  Mathura -> 

In [3]:
# Saving the model
nlp.to_disk('nlp_ner_model')


In [4]:
#Loading Model
nlp_model = spacy.load('nlp_ner_model')

In [35]:
# Applying the model
doc = nlp_model(tx)

# Initialize dictionary to store entities
entities_dict = {}

# Extract entities and populate the dictionary
for ent in doc.ents:
    if ent.label_ not in entities_dict:
        entities_dict[ent.label_] = []
    entities_dict[ent.label_].append(ent.text)

# Print the dictionary
print(entities_dict)



{'Name': ['HR PERSONNEL ASSISTANT'], 'Skills': ['Administrative Support, Auditing, Clerical, Copy, Customer Service, Data Entry, Delivery, Documentation, Fax, File Management, Letters, Meeting Facilitation, Organizational Skills, Proofreading, Receptionist, Research, Scanning, Scheduling, Secretarial, Telephone Skills, Office Equipment Maintenance, and Inventory Management.'], 'Degree': ["Bachelor's of Arts Degree in Political Science and Law Montclair"]}


In [34]:
import sys, fitz  # PyMuPDF

# Path to the PDF file
fname = '/kaggle/input/resumes/Resumes/data/data/HR/10399912.pdf'

# Open the PDF document
doc = fitz.open(fname)

# Initialize an empty string to store the extracted text
text = ""

# Iterate over the pages of the PDF
for page in doc:
    text += page.get_text()  # Use get_text() instead of getText()

# Remove unnecessary line breaks and join text
tx = " ".join(text.split('\n'))
print(tx)


HR PERSONNEL ASSISTANT Summary I am a U.S. citizen who is authorized to work in the US for any employer. I have worked 8 years as an Office Clerk, 2 years as a Student Intern/Office Assistant, and 4 years as a Contractor. I am applying for the Data Entry Clerk position (Advert ID# 224278 Advert ID# 224278). My skills and experiences include: Administrative Support, Auditing, File Management, Meeting Facilitation, Office Materials Management, & Inventory Management. Highlights COMPUTER SKILLS: Microsoft Word, MS Excel, MS Outlook, MS PowerPoint, PeopleSoft. TYPING SKILLS: 40-60 WPM. ADDITIONAL SKILLS: Administrative Support, Auditing, Clerical, Copy, Customer Service, Data Entry, Delivery, Documentation, Fax, File Management, Letters, Meeting Facilitation, Organizational Skills, Proofreading, Receptionist, Research, Scanning, Scheduling, Secretarial, Telephone Skills, Office Equipment Maintenance, and Inventory Management. Experience Company Name City , State HR Personnel Assistant 03/2