In [7]:
import spacy
from spacy.training.example import Example
from spacy.matcher import PhraseMatcher
import re
import random
from spacy.util import minibatch, compounding

# -------------------------------
# 1️⃣ Known skills (expanded)
# -------------------------------
# Lower-case skill names fed to the PhraseMatcher and used to split extracted
# skills into "known" and "unknown".
# NOTE(review): "ds" will not match the single token "DSA" that appears in the
# sample CVs — confirm the intended spelling.
known_skills = [
    "react.js", "node.js", "express.js", "mysql", "mongodb",
    "fastapi", "vite", "jwt", "postman", "twilio",
    "aws lambda", "aws cognito", "dynamodb", "python", "spacy",
    "librosa", "bootstrap", "java", "c", "html5",
    "css3", "axios", "agile/scrum", "responsive design", "ds",
    "system design basics", "jsx", "rest api", "git", "github",
]

# -------------------------------
# 2️⃣ Training data for NER
# -------------------------------
# Each item is (text, {"entities": [(start_char, end_char, label), ...]}).
# Offsets are 0-based character positions with an exclusive end, and every
# span must slice out exactly the skill text (text[start:end]); spans that do
# not line up with token boundaries cannot be used as NER training targets.
TRAIN_DATA = [
    # "Node.js", "Express.js", "MySQL"
    ("Built APIs with Node.js, Express.js, and MySQL.", {"entities": [(16, 23, "SKILL"), (25, 35, "SKILL"), (41, 46, "SKILL")]}),
    # "AWS Lambda", "DynamoDB"
    ("Developed cloud functions using AWS Lambda and DynamoDB.", {"entities": [(32, 42, "SKILL"), (47, 55, "SKILL")]}),
    # "React.js", "Vite", "Bootstrap"
    ("Created web apps with React.js, Vite, and Bootstrap.", {"entities": [(22, 30, "SKILL"), (32, 36, "SKILL"), (42, 51, "SKILL")]}),
    # "Python", "spaCy", "Librosa"
    ("Built AI solutions using Python, spaCy, and Librosa.", {"entities": [(25, 31, "SKILL"), (33, 38, "SKILL"), (44, 51, "SKILL")]}),
    # "JWT", "Postman"
    ("Implemented authentication using JWT and Postman.", {"entities": [(33, 36, "SKILL"), (41, 48, "SKILL")]}),
]

# -------------------------------
# 3️⃣ Load SpaCy model & prepare NER
# -------------------------------
# Load the small English pipeline, reuse its NER component when it has one
# (otherwise add a fresh one), and register the custom "SKILL" label.
nlp = spacy.load("en_core_web_sm")
ner = nlp.get_pipe("ner") if "ner" in nlp.pipe_names else nlp.add_pipe("ner")
if "SKILL" not in ner.labels:
    ner.add_label("SKILL")

# -------------------------------
# 4️⃣ Fine-tune NER (minibatch)
# -------------------------------
# Fine-tune only the NER component on TRAIN_DATA for 30 passes, printing the
# accumulated loss after each pass.
if TRAIN_DATA:
    other_pipes = [pipe for pipe in nlp.pipe_names if pipe != "ner"]
    # select_pipes() is the spaCy v3 replacement for the deprecated
    # disable_pipes(); the disabled components are restored on exit.
    with nlp.select_pipes(disable=other_pipes):
        optimizer = nlp.resume_training()
        for i in range(30):
            # Reshuffle in place so batches differ from pass to pass.
            random.shuffle(TRAIN_DATA)
            losses = {}
            batches = minibatch(TRAIN_DATA, size=compounding(2.0, 16.0, 1.5))
            for batch in batches:
                examples = [
                    Example.from_dict(nlp.make_doc(text), ann)
                    for text, ann in batch
                ]
                nlp.update(examples, sgd=optimizer, drop=0.2, losses=losses)
            print(f"Iteration {i+1}, Losses: {losses}")

# -------------------------------
# 5️⃣ PhraseMatcher for known skills
# -------------------------------
# Case-insensitive phrase matcher: every known skill is registered under the
# single match key "SKILL".
matcher = PhraseMatcher(nlp.vocab, attr="LOWER")
patterns = list(map(nlp.make_doc, known_skills))
matcher.add("SKILL", patterns)

# -------------------------------
# 6️⃣ Text cleaning (robust)
# -------------------------------
def clean_text(text):
    """Normalise raw resume text before it is handed to the pipeline.

    Parentheses, pipes and commas become spaces, every run of whitespace
    (including newlines) collapses to one space, and the result is trimmed
    and lower-cased.

    Args:
        text: Raw text.

    Returns:
        The cleaned, lower-case, single-line string.
    """
    text = re.sub(r"[()|,]", " ", text)
    text = re.sub(r"\s+", " ", text)
    # Trim last: the substitutions above can leave a space at either edge
    # (e.g. "(MERN)" -> " MERN ").
    return text.strip().lower()

# -------------------------------
# 7️⃣ Extract skills
# -------------------------------
def extract_skills(text):
    """Return ``(known, unknown)`` skill lists found in *text*.

    The text is cleaned, run through the pipeline, and skills are collected
    from two sources: PhraseMatcher hits and entities labelled "SKILL".
    Both lists are lower-case and sorted; "known" means the string appears
    in ``known_skills``.
    """
    doc = nlp(clean_text(text))

    # Phrase-list hits first ...
    found = {doc[start:end].text.lower() for _, start, end in matcher(doc)}
    # ... then whatever the NER component tagged as a skill.
    found |= {ent.text.lower() for ent in doc.ents if ent.label_ == "SKILL"}

    known, unknown = [], []
    for skill in sorted(found):
        bucket = known if skill in known_skills else unknown
        bucket.append(skill)
    return known, unknown

# -------------------------------
# 8️⃣ Extract personal info
# -------------------------------
def extract_personal_info(text):
    """Pull basic contact details out of raw resume text.

    Args:
        text: The resume as a (possibly multi-line) string.

    Returns:
        A dict with the keys "name", "email", "phone", "linkedin" and
        "github". Each value is the first match found, or "" when nothing
        matched.
    """
    # Name: first non-empty line made up only of letters, whitespace,
    # hyphens and dots.
    name = ""
    for line in text.splitlines():
        line = line.strip()
        if line and re.match(r"^[A-Za-z\s\-\.]+$", line):
            name = line
            break
    email_match = re.search(r"[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}", text)
    email = email_match.group(0) if email_match else ""
    # Optional "+", then at least nine more characters drawn from digits,
    # whitespace and hyphens, starting and ending on a digit.
    phone_match = re.search(r"(\+?\d[\d\s-]{7,}\d)", text)
    phone = phone_match.group(0) if phone_match else ""
    linkedin_match = re.search(r"https?://(www\.)?linkedin\.com/[^\s,]+", text)
    linkedin = linkedin_match.group(0) if linkedin_match else ""
    github_match = re.search(r"https?://(www\.)?github\.com/[^\s,]+", text)
    github = github_match.group(0) if github_match else ""
    return {"name": name, "email": email, "phone": phone, "linkedin": linkedin, "github": github}

# -------------------------------
# 9️⃣ Test on CV
# -------------------------------
# Smoke test: run both extractors on one short CV header and print the result.
cv_text = """
PAVAN CHANDRAPPA HOTTIGOUDRA
Software Engineer | Full-Stack(MERN) | Rest APIs | AWS | AI & Data-Driven System
+91 7483022523 | pavandvh27@gmail.com | https://linkedin.com/in/pavan | https://github.com/pavan
Tech Stack: Node.js, Express.js, AWS Lambda, React.js, MySQL, FastAPI, Vite, JWT, Postman, Twilio
"""

personal_info = extract_personal_info(cv_text)
known_skills_found, unknown_skills_found = extract_skills(cv_text)

print("🧑 Personal Info:")
for field, value in personal_info.items():
    print(f"  {field}: {value}")

# Both skill lists are printed the same way: a heading, then one bullet each.
skill_sections = (
    ("\n✅ Known Skills:", known_skills_found),
    ("\n❓ Unknown Skills:", unknown_skills_found),
)
for heading, skills in skill_sections:
    print(heading)
    for entry in skills:
        print("  -", entry)

Iteration 1, Losses: {'ner': 6.622135419140746}
Iteration 2, Losses: {'ner': 2.925693204918483}
Iteration 3, Losses: {'ner': 3.9386713766969517}
Iteration 4, Losses: {'ner': 3.222151044429803}
Iteration 5, Losses: {'ner': 3.1699709109647514}
Iteration 6, Losses: {'ner': 3.6515146719725036}
Iteration 7, Losses: {'ner': 2.4866343166239124}
Iteration 8, Losses: {'ner': 4.101499932592237}
Iteration 9, Losses: {'ner': 2.518325702643724}
Iteration 10, Losses: {'ner': 2.591015669329465}
Iteration 11, Losses: {'ner': 2.835669827058166}
Iteration 12, Losses: {'ner': 2.904091538493544}
Iteration 13, Losses: {'ner': 2.234317871689236}
Iteration 14, Losses: {'ner': 3.3193737687773686}
Iteration 15, Losses: {'ner': 2.363161404740587}
Iteration 16, Losses: {'ner': 2.20570524669047}
Iteration 17, Losses: {'ner': 1.8927399225282469}
Iteration 18, Losses: {'ner': 2.2691721898493213}
Iteration 19, Losses: {'ner': 7.004281670157811}
Iteration 20, Losses: {'ner': 6.6687433088786605}
Iteration 21, Losses: 

In [8]:
# Sample resumes used to exercise extract_personal_info() and extract_skills().
# Each entry is one raw multi-line string; whitespace inside the strings is
# part of the fixture and is left exactly as-is.
test_resumes = [
    # ---------------- Resume 1 ----------------
    """
RAHUL NARAYAN
Software Developer | Cloud-Native Systems | AI Solutions
+91 9876543210 | rahul.narayan@example.com | https://linkedin.com/in/rahul | https://github.com/rahul
Worked on cloud functions using AWS Lambda and DynamoDB to automate workflows.
Developed web applications with React.js, Node.js, and Express.js for better user experience.
Implemented AI features using Python and spaCy for text analysis.
Created responsive dashboards using Vite and Bootstrap.
""",
    # ---------------- Resume 2 ----------------
    """
SNEHA KUMARI
Full-Stack Engineer | Backend & AI Integration
+91 9123456780 | sneha.k@example.com | https://linkedin.com/in/sneha | https://github.com/sneha
Designed APIs with FastAPI and integrated MySQL and MongoDB databases for data management.
Built real-time notifications using Twilio and JWT authentication.
Developed front-end components using React.js and Bootstrap for mobile-first design.
Utilized Python and Librosa for audio signal processing in AI-powered features.
""",
    # ---------------- Resume 3 ----------------
    """
ADITYA SHARMA
Cloud & Backend Developer
+91 9988776655 | aditya.sharma@example.com | https://linkedin.com/in/aditya | https://github.com/aditya
Implemented serverless functions using AWS Lambda and secured services with Cognito.
Built REST APIs using Node.js and Express.js with MySQL for storing user data.
Created interactive web apps using React.js and Vite with reusable components.
Integrated Postman for API testing and debugging workflows.
""",
    # ---------------- Resume 4 ----------------
    """
PRIYA PATIL
AI & Web Developer
+91 9876501234 | priya.patil@example.com | https://linkedin.com/in/priya | https://github.com/priya
Built AI-powered tools using Python, spaCy, and Librosa to analyze user data.
Developed front-end with React.js, Bootstrap, and Vite for responsive UI.
Created backend services with Node.js, Express.js, and MongoDB.
Implemented authentication and API testing with JWT and Postman.
Deployed serverless architecture using AWS Lambda and DynamoDB.
""",
    # Resume 5: full-length CV with irregular spacing; its contact line has
    # the words "LinkedIn | GitHub" but no profile URLs.
    """
PAVAN CHANDRAPPA HOTTIGOUDRA
Software Engineer | Full-Stack(MERN) | Rest APIs | AWS | AI & Data-Driven System
+91 7483022523 | pavandvh27@gmail.com | LinkedIn | GitHub

Software  Engineer  with  experience  in  full-stack
development,   cloud-native   APIs,   and   AI-driven
applications.   Skilled  in  MERN   stack,   RestAPI,
React.js,  MongoDB,  and  AWS  (Lambda,  Cognito,
DynamoDB) with expertise in secure
authentication,   scalable   deployments,   and   data-
driven  system  design.  Actively  practicing  DSA  on
LeetCode  and  passionate  about  building  robust,
efficient, and impactful software solutions.
WORK EXPERIENCE

Intern – Backend Developer
Gandeevan Technologies, Bengaluru, India
(Hybrid) Jul 2025 – Present
• Built  secure  RESTful  APIs  for  authentication
and    data    management,    integrating    AWS
Cognito for identity and access control.
• Designed  scalable  serverless  workflows  and
automated  deployments  via  CI/CD  pipelines,
improving reliability and release speed.
• Enhanced  API  security  through  unit  testing,
validation, and best practices while
collaborating with cross-functional teams.
Tech   Stack: Node.js,   Express.js,   AWS   (Cognito,
DynamoDB, API Gateway, Lambda), CI/CD, Serverless
Architecture.
PROJECT

Blood Donation Management System
• Developed a scalable SPA using React.js (Vite)
with 15+ reusable components, achieving 95%
performance and mobile responsiveness.
• Implemented   JWT   auth   with   OTP   reset,
securing    all    private    routes    via    Express
middleware.
• Built 10+ RESTful APIs with Node.js/Express,
integrated  MySQL  to  manage  donors,  blood
banks, and requests.
Tech: React.js, Node.js, MySQL, Express, Twilio,
REST APIs
Smart Interview Preparation App
• Built secure RESTful APIs and authentication
systems,  integrating  cloud  services  to  ensure
scalability and data protection.
• Developed   a   smart   interview   preparation
platform with  mock  interviews,  resume-based
question   generation,   and   real-time   speech
feedback.
• Designed  intuitive  dashboards  and  features  to
track performance, progress, and
achievements, focusing on user-centric
experience.
Tech: React.js, FastAPI, MongoDB, Librosa,
Whisper, spaCy.
EDUCTION

Visveshwaraya Technological University
Regional Center Mysuru
Computer Science and Engineering
CGPA : 8.7 (Aug 2022 – Jun 2026)
Kumadvathi Science and Commerce PU College
Shikaripura, Shivamogga, Karnataka
Percentage : 95.5% (Jul 2020 – Apr 2022)
JGSS English Medium School
Haveri, Karnataka
Percentage: 83% (Jun 2011 – Mar 2020)
SKILLS

Java, AWS, Python,  C,  HTML5,  CSS3,  MySQL,
React.js,   Vite,   JSX,   Bootstrap,   Axios,   Node.js,
Express.js, REST API, JWT, Git, GitHub, Postman,
Agile/Scrum,  Responsive  Design,  DSA,  System
Design Basics
CERTIFICATIONS

Inceptrix   Hackathon   2025 – Jain   University
Solved    real-world    problems    using    intelligent
innovation and rapid prototyping.
HACK-2-INTERN – VTU, CPGS Mysuru
Built real-time software in a team setting; improved
debugging and development skills.
NPTEL – The  Joy  of  Computing  using  Python
Learned   Python,   algorithms,   file   handling,   and
problem-solving fundamentals.
Green  Skills  &  AI  Foundation  Course – VTU,
AICTE,  Shell India Studied  AI  for  sustainability,
smart energy, and ethical tech innovation.
    """
]
# Print one report per sample resume, numbered from 1.
for position, resume_text in enumerate(test_resumes, start=1):
    print(f"\n--- Resume {position} ---")
    personal_info = extract_personal_info(resume_text)
    skills_pair = extract_skills(resume_text)
    known_skills_found, unknown_skills_found = skills_pair
    print("🧑 Personal Info:", personal_info)
    print("✅ Known Skills:", known_skills_found)
    print("❓ Unknown Skills:", unknown_skills_found)



--- Resume 1 ---
🧑 Personal Info: {'name': 'RAHUL NARAYAN', 'email': 'rahul.narayan@example.com', 'phone': '+91 9876543210', 'linkedin': 'https://linkedin.com/in/rahul', 'github': 'https://github.com/rahul'}
✅ Known Skills: ['aws lambda', 'bootstrap', 'dynamodb', 'express.js', 'node.js', 'python', 'react.js', 'spacy', 'vite']
❓ Unknown Skills: []

--- Resume 2 ---
🧑 Personal Info: {'name': 'SNEHA KUMARI', 'email': 'sneha.k@example.com', 'phone': '+91 9123456780', 'linkedin': 'https://linkedin.com/in/sneha', 'github': 'https://github.com/sneha'}
✅ Known Skills: ['bootstrap', 'fastapi', 'jwt', 'librosa', 'mongodb', 'mysql', 'python', 'react.js', 'twilio']
❓ Unknown Skills: []

--- Resume 3 ---
🧑 Personal Info: {'name': 'ADITYA SHARMA', 'email': 'aditya.sharma@example.com', 'phone': '+91 9988776655', 'linkedin': 'https://linkedin.com/in/aditya', 'github': 'https://github.com/aditya'}
✅ Known Skills: ['aws lambda', 'express.js', 'mysql', 'node.js', 'postman', 'react.js', 'vite']
❓ Unknown

In [10]:
import spacy
from spacy.training.example import Example
from spacy.matcher import PhraseMatcher
from spacy.util import minibatch, compounding
import random
import re

# -------------------------------
# 1️⃣ Skill variations
# -------------------------------
# Maps each canonical skill name (lower-case key) to the surface forms that
# should be recognised as that skill. The keys double as PhraseMatcher match
# ids and as the set of "known" skills in extract_skills().
# NOTE(review): the PhraseMatcher is built with attr="LOWER" and the regex
# helpers lower-case / ignore case, so entries that differ only in
# capitalisation look redundant — confirm before pruning.
skill_variations = {
    "node.js": ["node.js", "nodejs", "nodeJs", "NODE.JS"],
    "react.js": ["react.js", "reactjs", "reactJs", "React"],
    "express.js": ["express.js", "express", "Express.js"],
    "mysql": ["mysql", "MySQL"],
    "mongodb": ["mongodb", "mongoDB", "MongoDb"],
    "vite": ["vite", "VITE"],
    "jwt": ["jwt", "JWT"],
    "postman": ["postman", "POSTMAN"],
    "aws lambda": ["aws lambda", "AWS Lambda", "Lambda"],
    "aws cognito": ["aws cognito", "Cognito"],
    "dynamodb": ["dynamodb", "DynamoDB"],
    "python": ["python", "Python"],
    "spacy": ["spacy", "spaCy"],
    "librosa": ["librosa", "Librosa"],
    "bootstrap": ["bootstrap", "Bootstrap"],
    "jsx": ["jsx", "JSX"],
    "rest api": ["rest api", "REST API"],
}

# -------------------------------
# 2️⃣ Load SpaCy and add NER
# -------------------------------
# Load the small English pipeline, reuse its NER component when it has one
# (otherwise add a fresh one), and register the custom "SKILL" label.
nlp = spacy.load("en_core_web_sm")
ner = nlp.get_pipe("ner") if "ner" in nlp.pipe_names else nlp.add_pipe("ner")

if "SKILL" not in ner.labels:
    ner.add_label("SKILL")

# -------------------------------
# 3️⃣ Text cleaning
# -------------------------------
def clean_text(text):
    """Normalise raw resume text before it is handed to the pipeline.

    Parentheses, pipes and commas become spaces, every run of whitespace
    (including newlines) collapses to one space, and the result is trimmed
    and lower-cased.

    Args:
        text: Raw text.

    Returns:
        The cleaned, lower-case, single-line string.
    """
    text = re.sub(r"[()|,]", " ", text)
    text = re.sub(r"\s+", " ", text)
    # Trim last: the substitutions above can leave a space at either edge
    # (e.g. "(MERN)" -> " MERN ").
    return text.strip().lower()

# -------------------------------
# 4️⃣ Auto-generate TRAIN_DATA from resumes
# -------------------------------
def generate_train_data(resumes, skill_variations):
    """Auto-label resumes with "SKILL" spans for NER training.

    Every surface form in *skill_variations* is searched for as a whole
    word, case-insensitively, in each resume.

    Args:
        resumes: Iterable of raw resume strings.
        skill_variations: Mapping of canonical skill name to a list of
            surface forms.

    Returns:
        A list of ``(text, {"entities": [(start, end, "SKILL"), ...]})``
        tuples, one per resume. Offsets index into the original text, are
        sorted by position, and never repeat or overlap.
    """
    # One compiled pattern per distinct case-folded surface form, so case-only
    # variants ("node.js" / "NODE.JS") are not searched twice.
    patterns = {}
    for variations in skill_variations.values():
        for var in variations:
            key = var.lower()
            if key and key not in patterns:
                patterns[key] = re.compile(
                    r'\b' + re.escape(var) + r'\b', re.IGNORECASE
                )

    train_data = []
    for resume in resumes:
        # A set drops identical spans produced by different patterns.
        candidates = {
            match.span()
            for pattern in patterns.values()
            for match in pattern.finditer(resume)
        }
        entities = []
        # Longest span first, so "express.js" wins over the nested "express"
        # and "aws lambda" over "lambda"; overlapping leftovers are dropped
        # because NER training data must not contain overlapping entities.
        for start, end in sorted(candidates, key=lambda span: (span[0] - span[1], span[0])):
            if all(end <= kept_start or start >= kept_end
                   for kept_start, kept_end, _ in entities):
                entities.append((start, end, "SKILL"))
        entities.sort()
        train_data.append((resume, {"entities": entities}))
    return train_data

# -------------------------------
# 5️⃣ PhraseMatcher for known skills
# -------------------------------
# Case-insensitive phrase matcher with one match key per canonical skill, so
# a hit on any surface form reports the canonical name.
matcher = PhraseMatcher(nlp.vocab, attr="LOWER")
for canonical_name, surface_forms in skill_variations.items():
    matcher.add(canonical_name, list(map(nlp.make_doc, surface_forms)))

# -------------------------------
# 6️⃣ Skill extraction
# -------------------------------
def extract_skills(text):
    """Return ``(known, unknown)`` skill lists found in *text*.

    Skills come from two sources: PhraseMatcher hits (reported under their
    canonical name) and entities the NER component labelled "SKILL". NER
    hits whose text is a listed surface form are folded into the canonical
    name too, so "nodejs" is reported once as "node.js" instead of also
    showing up as an unknown skill.

    Args:
        text: Raw resume text.

    Returns:
        Two sorted lists of lower-case strings: skills that are keys of
        ``skill_variations`` and everything else.
    """
    doc = nlp(clean_text(text))

    # Lower-cased surface form -> canonical skill name.
    canonical = {
        variant.lower(): skill
        for skill, variations in skill_variations.items()
        for variant in variations
    }

    matched_skills = {
        nlp.vocab.strings[match_id].lower() for match_id, _, _ in matcher(doc)
    }
    ner_texts = (ent.text.lower() for ent in doc.ents if ent.label_ == "SKILL")
    ner_skills = {canonical.get(surface, surface) for surface in ner_texts}
    all_skills = matched_skills | ner_skills

    known = sorted(s for s in all_skills if s in skill_variations)
    unknown = sorted(s for s in all_skills if s not in skill_variations)
    return known, unknown

# -------------------------------
# 7️⃣ Personal info extraction
# -------------------------------
def extract_personal_info(text):
    """Extract name, email, phone, LinkedIn URL and GitHub URL from *text*.

    The name is the first non-empty line made up only of letters,
    whitespace, hyphens and dots; the other fields are the first regex hit
    anywhere in the text. Fields that are not found come back as "".
    """
    def first_hit(pattern):
        # First occurrence of `pattern` in the whole text, or "".
        found = re.search(pattern, text)
        return found.group(0) if found else ""

    stripped_lines = (raw.strip() for raw in text.splitlines())
    name = next(
        (
            candidate
            for candidate in stripped_lines
            if candidate and re.match(r"^[A-Za-z\s\-\.]+$", candidate)
        ),
        "",
    )

    info = {"name": name}
    info["email"] = first_hit(r"[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}")
    info["phone"] = first_hit(r"(\+?\d[\d\s-]{7,}\d)")
    info["linkedin"] = first_hit(r"https?://(www\.)?linkedin\.com/[^\s,]+")
    info["github"] = first_hit(r"https?://(www\.)?github\.com/[^\s,]+")
    return info

# -------------------------------
# 8️⃣ Validation function to check overfitting/underfitting
# -------------------------------
def validate_ner(resumes):
    """Score the NER component against regex-derived skill labels.

    For each resume the expected skills are the canonical names whose
    surface forms occur as whole words (case-insensitive). Predicted
    entities are mapped to the same canonical names before comparison, so a
    prediction of "nodejs" counts as a hit for "node.js".

    Args:
        resumes: Iterable of raw resume strings.

    Returns:
        ``(precision, recall, f1)``; each is 0 when its denominator is 0.
        The same figures are printed as percentages.
    """
    # Lower-cased surface form (and the canonical name itself) -> canonical name.
    canonical = {}
    for skill, variations in skill_variations.items():
        canonical.setdefault(skill.lower(), skill)
        for var in variations:
            canonical.setdefault(var.lower(), skill)

    correct, predicted_total, actual_total = 0, 0, 0
    for resume in resumes:
        doc = nlp(resume)
        predicted = set()
        for ent in doc.ents:
            if ent.label_ == "SKILL":
                surface = ent.text.lower()
                predicted.add(canonical.get(surface, surface))
        actual = {
            skill
            for skill, variations in skill_variations.items()
            if any(
                re.search(r'\b' + re.escape(var) + r'\b', resume, re.IGNORECASE)
                for var in variations
            )
        }
        correct += len(predicted & actual)
        predicted_total += len(predicted)
        actual_total += len(actual)
    precision = correct / predicted_total if predicted_total else 0
    recall = correct / actual_total if actual_total else 0
    f1 = 2 * precision * recall / (precision + recall) if (precision + recall) else 0
    print(f"Precision: {precision:.2%}, Recall: {recall:.2%}, F1-score: {f1:.2%}")
    return precision, recall, f1

# -------------------------------
# 9️⃣ Train NER
# -------------------------------
def train_ner(resumes, skill_variations, n_epochs=30):
    """Fine-tune the global pipeline's NER on auto-labelled resumes and save it.

    Args:
        resumes: Iterable of raw resume strings used as training texts.
        skill_variations: Mapping of canonical skill name to surface forms,
            passed to ``generate_train_data`` to label the resumes.
        n_epochs: Number of passes over the training data.

    Side effects:
        Updates the module-level ``nlp`` in place, prints the loss after
        every epoch and writes the pipeline to the ``ner_model`` directory.
    """
    train_data = generate_train_data(resumes, skill_variations)
    other_pipes = [pipe for pipe in nlp.pipe_names if pipe != "ner"]
    # select_pipes() is the spaCy v3 replacement for the deprecated
    # disable_pipes(); the disabled components are restored on exit.
    with nlp.select_pipes(disable=other_pipes):
        optimizer = nlp.resume_training()
        for epoch in range(n_epochs):
            random.shuffle(train_data)
            losses = {}
            batches = minibatch(train_data, size=compounding(2.0, 16.0, 1.5))
            for batch in batches:
                examples = [Example.from_dict(nlp.make_doc(t), a) for t, a in batch]
                nlp.update(examples, sgd=optimizer, drop=0.2, losses=losses)
            print(f"Epoch {epoch+1}/{n_epochs} — Losses: {losses}")

    # Saved after the with-block so the full pipeline is written to disk.
    nlp.to_disk("ner_model")
    print("Model saved to ner_model/")

# -------------------------------
#  🔟 Example usage
# -------------------------------
# Training texts as they appear in this notebook: each string is a name
# followed by a literal "...".
train_resumes = [
    "ALICE JOHNSON ...",
    "BOB SMITH ...",
    "CAROL LEE ...",
    "DAVID KUMAR ...",
    "EVELYN CHEN ...",
    "FRANK WU ...",
]

# Train for 40 epochs, then score the model on the very same texts.
train_ner(train_resumes, skill_variations, n_epochs=40)
validate_ner(train_resumes)


Epoch 1/40 — Losses: {'ner': 2.793452283635041}
Epoch 2/40 — Losses: {'ner': 1.4955650032058458}
Epoch 3/40 — Losses: {'ner': 0.00016181467963910338}
Epoch 4/40 — Losses: {'ner': 3.6381758325934236e-05}
Epoch 5/40 — Losses: {'ner': 8.342458645778399e-09}
Epoch 6/40 — Losses: {'ner': 1.0167777211994722e-08}
Epoch 7/40 — Losses: {'ner': 1.058194665469815e-05}
Epoch 8/40 — Losses: {'ner': 3.101190213313754e-06}
Epoch 9/40 — Losses: {'ner': 5.486041816539672e-09}
Epoch 10/40 — Losses: {'ner': 9.128216009161061e-11}
Epoch 11/40 — Losses: {'ner': 2.7164852514602936e-11}
Epoch 12/40 — Losses: {'ner': 6.693852608417624e-11}
Epoch 13/40 — Losses: {'ner': 5.778224264175894e-10}
Epoch 14/40 — Losses: {'ner': 1.9530703547548156e-11}
Epoch 15/40 — Losses: {'ner': 4.339097283356816e-11}
Epoch 16/40 — Losses: {'ner': 3.550857759234886e-12}
Epoch 17/40 — Losses: {'ner': 5.1336386891847157e-14}
Epoch 18/40 — Losses: {'ner': 2.333377875870449e-11}
Epoch 19/40 — Losses: {'ner': 4.942122774962182e-12}
Epo

(0, 0, 0)

In [12]:
!python -m spacy download en_core_web_lg

Collecting en-core-web-lg==3.8.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_lg-3.8.0/en_core_web_lg-3.8.0-py3-none-any.whl (400.7 MB)
     ---------------------------------------- 0.0/400.7 MB ? eta -:--:--
     ---------------------------------------- 0.3/400.7 MB ? eta -:--:--
     ---------------------------------------- 1.0/400.7 MB 3.0 MB/s eta 0:02:16
     ---------------------------------------- 1.6/400.7 MB 3.1 MB/s eta 0:02:09
     ---------------------------------------- 2.9/400.7 MB 3.8 MB/s eta 0:01:45
     ---------------------------------------- 3.9/400.7 MB 4.1 MB/s eta 0:01:37
     ---------------------------------------- 4.7/400.7 MB 4.1 MB/s eta 0:01:38
      --------------------------------------- 5.8/400.7 MB 4.2 MB/s eta 0:01:35
      --------------------------------------- 6.8/400.7 MB 4.4 MB/s eta 0:01:31
      --------------------------------------- 7.9/400.7 MB 4.5 MB/s eta 0:01:29
      -------------------------------

In [13]:
import spacy
# Check that the freshly downloaded large English pipeline can be loaded.
nlp = spacy.load(name="en_core_web_lg")
print("Model loaded successfully!")

Model loaded successfully!


In [15]:
import spacy
from spacy.training.example import Example
from spacy.matcher import PhraseMatcher
from spacy.util import minibatch, compounding
import random
import re

# -------------------------------
# 1️⃣ Skill variations (expanded)
# -------------------------------
# Maps each canonical skill name (lower-case key) to the surface forms that
# should be recognised as that skill. The keys double as PhraseMatcher match
# ids and as the set of "known" skills in extract_skills().
# NOTE(review): the PhraseMatcher is built with attr="LOWER", so entries that
# differ only in capitalisation look redundant — confirm before pruning.
# NOTE(review): "React native" is folded into "react.js" here — confirm that
# is intended.
skill_variations = {
    "node.js": ["node.js", "nodejs", "nodeJs", "NODE.JS"],
    "react.js": ["react.js", "reactjs", "reactJs", "React", "REACTJS", "React native"],
    "express.js": ["express.js", "express", "Express.js", "Express.JS"],
    "mysql": ["mysql", "MySQL"],
    "mongodb": ["mongodb", "mongoDB", "MongoDb"],
    "vite": ["vite", "VITE"],
    "jwt": ["jwt", "JWT"],
    "postman": ["postman", "POSTMAN"],
    "aws lambda": ["aws lambda", "AWS Lambda", "Lambda"],
    "aws cognito": ["aws cognito", "Cognito"],
    "dynamodb": ["dynamodb", "DynamoDB"],
    "python": ["python", "Python"],
    "spacy": ["spacy", "spaCy"],
    "librosa": ["librosa", "Librosa"],
    "bootstrap": ["bootstrap", "Bootstrap"],
    "jsx": ["jsx", "JSX"],
    "rest api": ["rest api", "REST API", "REST APIs"],
    "redis": ["redis", "Redis"],
    "css3": ["css3", "CSS3"]
}

# -------------------------------
# 2️⃣ Load the large spaCy model (en_core_web_lg)
# -------------------------------
# en_core_web_lg is spaCy's large English pipeline with static word vectors;
# it is not a transformer model (that would be en_core_web_trf).
nlp = spacy.load("en_core_web_lg")
if "ner" not in nlp.pipe_names:
    ner = nlp.add_pipe("ner")
else:
    ner = nlp.get_pipe("ner")
# Every skill shares the single "SKILL" label, so it is registered once;
# checking per-skill names against ner.labels never matched anything.
if "SKILL" not in ner.labels:
    ner.add_label("SKILL")

# -------------------------------
# 3️⃣ Expanded TRAIN_DATA
# -------------------------------
# Each item is (text, {"entities": [(start_char, end_char, label), ...]}).
# Offsets are 0-based character positions with an exclusive end, and every
# span must slice out exactly the skill text (text[start:end]); spans that do
# not line up with token boundaries cannot be used as NER training targets.
TRAIN_DATA = [
    # "Node.js", "Express.js", "MySQL"
    ("Built APIs with Node.js, Express.js, and MySQL.", {"entities": [(16, 23, "SKILL"), (25, 35, "SKILL"), (41, 46, "SKILL")]}),
    # "AWS Lambda", "DynamoDB"
    ("Developed cloud functions using AWS Lambda and DynamoDB.", {"entities": [(32, 42, "SKILL"), (47, 55, "SKILL")]}),
    # "React.js", "Vite", "Bootstrap"
    ("Created web apps with React.js, Vite, and Bootstrap.", {"entities": [(22, 30, "SKILL"), (32, 36, "SKILL"), (42, 51, "SKILL")]}),
    # "JWT", "Postman"
    ("Implemented authentication using JWT and tested APIs via Postman.", {"entities": [(33, 36, "SKILL"), (57, 64, "SKILL")]}),
    # "ReactJS", "JSX", "Bootstrap"
    ("Front-end work with ReactJS, JSX, and Bootstrap.", {"entities": [(20, 27, "SKILL"), (29, 32, "SKILL"), (38, 47, "SKILL")]}),
    # "Nodejs", "Express", "REST APIs", "MongoDB", "MySQL"
    ("Worked on Nodejs, Express, REST APIs, MongoDB, and MySQL.", {"entities": [(10, 16, "SKILL"), (18, 25, "SKILL"), (27, 36, "SKILL"), (38, 45, "SKILL"), (51, 56, "SKILL")]}),
    # "Python", "spaCy", "Librosa"
    ("Built machine learning pipelines in Python with spaCy and Librosa.", {"entities": [(36, 42, "SKILL"), (48, 53, "SKILL"), (58, 65, "SKILL")]}),
    # "MySQL", "MongoDB", "Redis"
    ("Database work with MySQL, MongoDB, and Redis caching.", {"entities": [(19, 24, "SKILL"), (26, 33, "SKILL"), (39, 44, "SKILL")]}),
]

# -------------------------------
# 4️⃣ Fine-tune NER
# -------------------------------
# Fine-tune only the NER component on TRAIN_DATA, printing the accumulated
# loss after each epoch.
if TRAIN_DATA:
    other_pipes = [pipe for pipe in nlp.pipe_names if pipe != "ner"]
    # select_pipes() is the spaCy v3 replacement for the deprecated
    # disable_pipes(); the disabled components are restored on exit.
    with nlp.select_pipes(disable=other_pipes):
        optimizer = nlp.resume_training()
        n_epochs = 15  # number of passes over TRAIN_DATA
        for epoch in range(n_epochs):
            # Reshuffle in place so batches differ from epoch to epoch.
            random.shuffle(TRAIN_DATA)
            losses = {}
            batches = minibatch(TRAIN_DATA, size=compounding(2.0, 16.0, 1.5))
            for batch in batches:
                examples = [Example.from_dict(nlp.make_doc(t), a) for t, a in batch]
                nlp.update(examples, sgd=optimizer, drop=0.2, losses=losses)
            print(f"Epoch {epoch+1}/{n_epochs} — Losses: {losses}")

# -------------------------------
# 5️⃣ PhraseMatcher for skill variations
# -------------------------------
# Case-insensitive phrase matcher with one match key per canonical skill, so
# a hit on any surface form reports the canonical name.
matcher = PhraseMatcher(nlp.vocab, attr="LOWER")
for canonical_name, surface_forms in skill_variations.items():
    matcher.add(canonical_name, list(map(nlp.make_doc, surface_forms)))

# -------------------------------
# 6️⃣ Text cleaning
# -------------------------------
def clean_text(text):
    """Normalise raw resume text before it is handed to the pipeline.

    Parentheses, pipes and commas become spaces, every run of whitespace
    (including newlines) collapses to one space, and the result is trimmed
    and lower-cased.

    Args:
        text: Raw text.

    Returns:
        The cleaned, lower-case, single-line string.
    """
    text = re.sub(r"[()|,]", " ", text)  # (, ), | and , become spaces
    text = re.sub(r"\s+", " ", text)
    # Trim last: the substitutions above can leave a space at either edge
    # (e.g. "(MERN)" -> " MERN ").
    return text.strip().lower()

# -------------------------------
# 7️⃣ Skill extraction
# -------------------------------
def extract_skills(text):
    """Return ``(known, unknown)`` skill lists found in *text*.

    Skills come from two sources: PhraseMatcher hits (reported under their
    canonical name) and entities the NER component labelled "SKILL". NER
    hits whose text is a listed surface form are folded into the canonical
    name too, so "nodejs" is reported once as "node.js" instead of also
    showing up as an unknown skill.

    Args:
        text: Raw resume text.

    Returns:
        Two sorted lists of lower-case strings: skills that are keys of
        ``skill_variations`` and everything else.
    """
    doc = nlp(clean_text(text))

    # Lower-cased surface form -> canonical skill name.
    canonical = {
        variant.lower(): skill
        for skill, variations in skill_variations.items()
        for variant in variations
    }

    matched_skills = {
        nlp.vocab.strings[match_id].lower() for match_id, _, _ in matcher(doc)
    }
    ner_texts = (ent.text.lower() for ent in doc.ents if ent.label_ == "SKILL")
    ner_skills = {canonical.get(surface, surface) for surface in ner_texts}
    all_skills = matched_skills | ner_skills

    known = sorted(s for s in all_skills if s in skill_variations)
    unknown = sorted(s for s in all_skills if s not in skill_variations)
    return known, unknown

# -------------------------------
# 8️⃣ Personal info extraction
# -------------------------------
def extract_personal_info(text):
    """Extract name, email, phone, LinkedIn URL and GitHub URL from *text*.

    The name is the first non-empty line made up only of letters,
    whitespace, hyphens and dots; the other fields are the first regex hit
    anywhere in the text. Fields that are not found come back as "".
    """
    # Field name -> pattern, in the order the fields appear in the result.
    field_patterns = (
        ("email", r"[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}"),
        ("phone", r"(\+?\d[\d\s-]{7,}\d)"),
        ("linkedin", r"https?://(www\.)?linkedin\.com/[^\s,]+"),
        ("github", r"https?://(www\.)?github\.com/[^\s,]+"),
    )

    result = {"name": ""}
    for raw_line in text.splitlines():
        candidate = raw_line.strip()
        if not candidate:
            continue
        if re.match(r"^[A-Za-z\s\-\.]+$", candidate):
            result["name"] = candidate
            break

    for field, pattern in field_patterns:
        hit = re.search(pattern, text)
        result[field] = "" if hit is None else hit.group(0)
    return result

# -------------------------------
# 9️⃣ Test resumes
# -------------------------------
# Single sample text; the source comment marks it as "same resumes as before".
test_resumes = ["ALICE JOHNSON ..."]

# Print one report per resume, numbered from 1.
for position, resume_text in enumerate(test_resumes, start=1):
    print(f"\n--- Resume {position} ---")
    personal_info = extract_personal_info(resume_text)
    skills_pair = extract_skills(resume_text)
    known_skills_found, unknown_skills_found = skills_pair
    print("🧑 Personal Info:", personal_info)
    print("✅ Known Skills:", known_skills_found)
    print("❓ Unknown Skills:", unknown_skills_found)




Epoch 1/15 — Losses: {'ner': 4.419294722876495}
Epoch 2/15 — Losses: {'ner': 3.4025006853103212}
Epoch 3/15 — Losses: {'ner': 3.649885867547811}
Epoch 4/15 — Losses: {'ner': 3.42407873951669}
Epoch 5/15 — Losses: {'ner': 3.791662938986466}
Epoch 6/15 — Losses: {'ner': 3.5082755368671643}
Epoch 7/15 — Losses: {'ner': 3.236031970954714}
Epoch 8/15 — Losses: {'ner': 2.581859796709697}
Epoch 9/15 — Losses: {'ner': 2.440496381751922}
Epoch 10/15 — Losses: {'ner': 2.205741116776898}
Epoch 11/15 — Losses: {'ner': 2.8437795902272622}
Epoch 12/15 — Losses: {'ner': 2.590886679332111}
Epoch 13/15 — Losses: {'ner': 2.107688010760314}
Epoch 14/15 — Losses: {'ner': 2.341847686619233}
Epoch 15/15 — Losses: {'ner': 2.0234971129639083}

--- Resume 1 ---
🧑 Personal Info: {'name': 'ALICE JOHNSON ...', 'email': '', 'phone': '', 'linkedin': '', 'github': ''}
✅ Known Skills: []
❓ Unknown Skills: []
