In [1]:
import numpy as np
import easyocr
import re
import os
import pycountry
import requests
import spacy
from sentence_transformers import SentenceTransformer
from pdf2image import convert_from_path
from docx import Document
from urllib.parse import quote
from spacy.matcher import PhraseMatcher
from skillNer.general_params import SKILL_DB
from skillNer.skill_extractor_class import SkillExtractor

nlp = spacy.load("en_core_web_lg")
skill_extractor = SkillExtractor(nlp, SKILL_DB, PhraseMatcher)

loading full_matcher ...
loading abv_matcher ...
loading full_uni_matcher ...
loading low_form_matcher ...
loading token_matcher ...


In [2]:
us_universities = [
    "Harvard University",
    "Stanford University",
    "Massachusetts Institute of Technology",
    "California Institute of Technology",
    "Princeton University",
    "Yale University",
    "University of Chicago",
    "Columbia University",
    "University of Pennsylvania",
    "Cornell University",
    "University of California, Berkeley",
    "University of California, Los Angeles",
    "University of Michigan",
    "Duke University",
    "Northwestern University",
    "Johns Hopkins University",
    "New York University",
    "University of California, San Diego",
    "University of Washington",
    "University of Texas at Austin",
    "Brown University",
    "University of Wisconsin-Madison",
    "University of Illinois Urbana-Champaign",
    "University of Southern California",
    "University of California, Davis",
    "University of North Carolina at Chapel Hill",
    "Carnegie Mellon University",
    "Boston University",
    "Georgia Institute of Technology",
    "Pennsylvania State University"
]

all_universities = []

# Loop through all countries
for country in pycountry.countries:
    country_name = country.name

    # Special case for United States
    if country_name == "United States":
        all_universities.extend(us_universities)
        continue

    encoded_name = quote(country_name)
    url = f"http://universities.hipolabs.com/search?country={encoded_name}"

    try:
        response = requests.get(url, timeout=10)
        if response.status_code == 200:
            universities = response.json()
            if universities:
                uni_names = [uni['name'] for uni in universities]
                all_universities.extend(uni_names)
        else:
            print(f"Failed to fetch data for {country_name}. Status code: {response.status_code}")
    except Exception as e:
        print(f"Request failed for {country_name}: {e}")


print(f"\nTotal universities collected: {len(all_universities)}")
print(all_universities)


Total universities collected: 7441
['Afghan University', 'Aria Institute of Higher Education', 'American University of Afghanistan', 'Al-Birony University', 'Badakhshan University', 'Balkh University', 'Baghlan University', 'Bakhtar University', 'Bost University', 'Bamiyan University', 'Dawat University', 'Dunya Institute of Higher Education', 'Faryab Higher Education Institute', 'Ghazni University', 'Herat University', 'Ibn Sina University', 'Jawzjan University', 'Kaboora Institute of Higher Education', 'Kabul Health Sciences Institute', 'Kandahar University', 'Kardan University', 'Karwan Institute of Higher Education', 'Kateb Institute of Higher Education', 'Khana-e-Noor Institute of Higher Education', 'Kabul Education University', 'Kabul Medical University', 'Kabul University', 'Khurasan University', 'Maryam Institute of Higher Education', 'National Military Academy of Afghanistan', 'Nangarhar University', 'Pamir University', 'Parwan University', 'Polytechnical University of Kabul'

In [73]:

def extract_text_from_pdf(file_path):
    pages = convert_from_path(file_path)
    reader = easyocr.Reader(['en'])
    ocr_text = ''
    for page in pages:
        img_array = np.array(page)
        result = reader.readtext(img_array, detail=0)
        ocr_text += ' '.join(result) + '\n'
    return ocr_text

def extract_text_from_docx(file_path):
    doc = Document(file_path)
    doc_text = '\n'.join([para.text for para in doc.paragraphs])
    return doc_text


def extract_info(ocr_text):
    global final_skills_resume  # Declare we want to modify the global variable
    print(ocr_text)

    # Use regex pattern to find a potential name
    name = None
    pattern = r"(\b[A-Z][A-Za-z]*\b)\s(\b[A-Z][A-Za-z]*\b)"
    match = re.search(pattern, ocr_text)
    if match:
        name = match.group()
    else:
      name = "Not found"

    # Extract Email
    email_match = re.search(r'\b[a-z0-9\.\-+]+@[a-z0-9\.\-+]+\.[a-z]{2,}\b', ocr_text, re.IGNORECASE)
    email = email_match.group(0) if email_match else "Not found"

    # Extract Phone
    phone_match = re.search(r'[\+\(]?[1-9][0-9 .\-\(\)]{8,}[0-9]', ocr_text)
    phone = phone_match.group(0) if phone_match else "Not found"

    # Extract CGPA
    cgpa_match = re.search(r'\b(4\.00|0?[0-3]\.[0-9]{2})\b', ocr_text)
    cgpa = cgpa_match.group(0) if cgpa_match else "Not found"

    # Extract Degree
    degree_match = re.search(r"(?i)\b(?:Bachelor|B\.S\.|B\.A\.|Master|M\.S\.|M\.A\.|Ph\.D\.|MHIL)\s(?:[A-Za-z]+\s){0,2}[A-Za-z]+\b", ocr_text)
    degree = degree_match.group(0) if degree_match else "Not found"

    #Extracting skills
    annotations = skill_extractor.annotate(ocr_text)
    results = annotations.get("results", {})
    skills = []
    for key, matches in results.items():
        for match in matches:
            # Extract the skill text if available and longer than one character
            skill_text = match.get("doc_node_value", "").strip()
            if len(skill_text) > 1:
                skills.append(skill_text)
    # Remove duplicates and sort alphabetically
    final_skill = sorted(set(skills))
    final_skills_resume = ", ".join(final_skill)

    # Extract University/Institute name(s)
    matched_universities = []
    for uni in all_universities:
        if uni.lower() in ocr_text.lower():
            matched_universities.append(uni)
    universities_found = matched_universities if matched_universities else ["Not found"]
    universities_found = ", ".join(universities_found)

    # Results
    print(f"✅ Name: {name}")
    print(f"✅ Email: {email}")
    print(f"✅ Phone: {phone}")
    print(f"✅ CGPA: {cgpa}")
    print(f"✅ Degree: {degree}")
    print(f"✅ Universities: {universities_found}")
    print(f"✅ Skills: {final_skills_resume}")


# File path example
file_path = '/content/SafiResumeTest.pdf'

if file_path.lower().endswith('.pdf'):
    ocr_text = extract_text_from_pdf(file_path)
elif file_path.lower().endswith('.docx'):
    ocr_text = extract_text_from_docx(file_path)
else:
    print("Unsupported file format")

# Process extracted text
extract_info(ocr_text)




Syed Safiullah CONTACT CAREER OBJECTIVE Phone (+92) 311-7639-116 Upcoming Computer Science Graduate who seeks new dynamic tech opportunities. Enthusiastic  about  making new innovative projects, building creative web pages, Email inducing an efficient learning environment: Passionate about web applications and most safiullah.syed01.@gmail.com javascript-related technologies. Obsessed with new tech updates, understand work - LinkedIn ethics, profound problem solver and ready to contribute and add my perspective in the ever growing tech world. https:/IWww linkedin com/in/syed_ safiullahl23 Github https: / githubcom/Shah-Codes_ Better PROFESSIONAL EXPERIENCE EDUCATION University of Central Punjab, Web Development Intern Lahore | Sep 2024 CodeSoft Dec 2022-Feb 2023 B.S. in Computer Science (OOP,DSA,OSAI,CCN,IS) CGPA: 3.99 MORE ABOUT ME The Punjab Group Of Colleges, As a Web Enthusiast, I am keen on pursuing opportunities that will allow me to Lahore 2020 enhance and expand my knowledge in 

In [64]:
model = SentenceTransformer("anass1209/resume-job-matcher-all-MiniLM-L6-v2") # ranking model

job_description = "Bachelor's degree in Computer Science, Software Engineering. Proficiency with Next.js, JavaScript, HTML, CSS, and React. Familiarity with responsive design and UI/UX design concepts. It is advantageous to have backend experience with Flask, Django, or Node.js. Understanding of database administration (MySQL, MongoDB). Knowledge of cloud deployment and Git version control. Design and develop server-side logic using Laravel and PHP and as well as .NET and Restful APIs."

annotations = skill_extractor.annotate(job_description)
results = annotations.get("results", {})
skills = []
for key, matches in results.items():
    for match in matches:
        skill_text = match.get("doc_node_value", "").strip()
        if len(skill_text) > 1:
            skills.append(skill_text)
final_skill__ = sorted(set(skills))
final_skills_job = ", ".join(final_skill__)

job_description_skills = [final_skills_job]
resume_skills = [final_skills_resume]

embeddings_job = model.encode(job_description_skills)
embeddings_resume = model.encode(resume_skills)

similarities = model.similarity(embeddings_job, embeddings_resume)
similarities = np.array(similarities)

print("Job Skills:", final_skills_job)
print("Resume Skills:", final_skills_resume)
print("Similarity Score:", np.mean(similarities))

Job Skills: CSS, HTML, backend, computer science, database administration, design concepts, django, flask, git, javascript, laravel, mongodb, mysql, next js, node js, php, react, responsive design, restful apis, server side, software engineering, version control
Resume Skills: c++, collaborate, com, computer science, critical thinking, customer service, database management system, dispatcher, engineering project, firebase, innovation, kotlin, linux, management system, microsoft office, mysql, pre engineering, problem solve, python, san, teamwork, time management
Similarity Score: 0.7673333


In [1]:
!pip install pdf2image
!pip install spacy
!pip install easyocr
!apt-get install poppler-utils  # Critical for PDF processing
!pip install pdf2image easyocr
!apt-get install -y poppler-utils
!pip install requests
!pip install python-docx
!pip install pycountry
!pip install skillNer
!python -m spacy download en_core_web_lg


Collecting pdf2image
  Downloading pdf2image-1.17.0-py3-none-any.whl.metadata (6.2 kB)
Downloading pdf2image-1.17.0-py3-none-any.whl (11 kB)
Installing collected packages: pdf2image
Successfully installed pdf2image-1.17.0
Collecting easyocr
  Downloading easyocr-1.7.2-py3-none-any.whl.metadata (10 kB)
Collecting python-bidi (from easyocr)
  Downloading python_bidi-0.6.6-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (4.9 kB)
Collecting pyclipper (from easyocr)
  Downloading pyclipper-1.3.0.post6-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (9.0 kB)
Collecting ninja (from easyocr)
  Downloading ninja-1.11.1.4-py3-none-manylinux_2_12_x86_64.manylinux2010_x86_64.whl.metadata (5.0 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch->easyocr)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch->easyocr)
  Downloading nvidia_cuda_runtime_