In [6]:
import torch
from transformers import AutoTokenizer, AutoModelForTokenClassification
from transformers import pipeline
import pandas as pd
import re
from collections import defaultdict

In [2]:
# Path to your saved model
MODEL_PATH = "./ner_skills_model"  # Update this to your model path

In [3]:
# 1. Load your fine-tuned model and tokenizer
tokenizer = AutoTokenizer.from_pretrained(MODEL_PATH)
model = AutoModelForTokenClassification.from_pretrained(MODEL_PATH)

In [4]:
# 2. Create NER pipeline
ner_pipeline = pipeline(
    "ner",
    model=model,
    tokenizer=tokenizer,
    aggregation_strategy="simple"  # Merge entities with same type
)


In [5]:
# 2. Create NER pipeline
ner_pipeline = pipeline(
    "ner",
    model=model,
    tokenizer=tokenizer,
    aggregation_strategy="simple"  # Merge entities with same type
)


In [7]:
# 3. Function to extract and organize entities from resume text
def extract_resume_entities(resume_text):
    print("Processing resume text...")
    # Split text into manageable chunks if it's very long
    # This helps avoid potential issues with maximum sequence length
    max_length = 512
    chunks = []
    
    if len(resume_text) > max_length:
        # Simple chunking by sentences
        sentences = re.split(r'(?<=[.!?])\s+', resume_text)
        current_chunk = ""
        
        for sentence in sentences:
            if len(current_chunk) + len(sentence) < max_length:
                current_chunk += sentence + " "
            else:
                chunks.append(current_chunk.strip())
                current_chunk = sentence + " "
        
        if current_chunk:
            chunks.append(current_chunk.strip())
    else:
        chunks = [resume_text]
    
    # Process each chunk
    all_entities = []
    for i, chunk in enumerate(chunks):
        print(f"Processing chunk {i+1}/{len(chunks)}...")
        entities = ner_pipeline(chunk)
        all_entities.extend(entities)
    
    # Organize entities by category
    organized_entities = defaultdict(list)
    
    for entity in all_entities:
        entity_text = entity["word"]
        entity_type = entity["entity_group"]
        score = round(entity["score"], 3)
        
        # Add to appropriate category with confidence score
        organized_entities[entity_type].append({
            "text": entity_text,
            "confidence": score
        })
    
    return organized_entities


In [12]:
# 4. Test with your resume
print("Ready to process your resume!")

# Option 1: Read from a file
resume_file_path = "test.txt"  # Update this path
try:
    with open(resume_file_path, 'r', encoding='utf-8') as file:
        resume_text = file.read()
    print(f"Successfully read resume from {resume_file_path}")
except:
    print(f"Could not read from {resume_file_path}")
    # Fallback to manual input
    resume_text = """
    # Paste your resume text here if reading from file fails
    """

Ready to process your resume!
Successfully read resume from test.txt


In [13]:
# Process the resume
entities = extract_resume_entities(resume_text)

# Display results
print("\n=== EXTRACTED INFORMATION FROM YOUR RESUME ===\n")

if not entities:
    print("No entities were detected. This might indicate:")
    print("1. The model needs more training")
    print("2. The resume format is very different from training data")
    print("3. There might be an issue with how the text is being processed")
else:
    # Print each entity type and its found entities
    for entity_type, entity_list in entities.items():
        print(f"\n{entity_type.upper()}:")
        # Sort by confidence score
        sorted_entities = sorted(entity_list, key=lambda x: x["confidence"], reverse=True)
        # Remove duplicates while preserving order
        seen = set()
        unique_entities = []
        for entity in sorted_entities:
            if entity["text"].lower() not in seen:
                seen.add(entity["text"].lower())
                unique_entities.append(entity)
        
        # Print unique entities with their confidence scores
        for entity in unique_entities:
            print(f"  - {entity['text']} (confidence: {entity['confidence']})")

print("\n=== ANALYSIS COMPLETE ===")


Processing resume text...
Processing chunk 1/13...
Processing chunk 2/13...
Processing chunk 3/13...
Processing chunk 4/13...
Processing chunk 5/13...
Processing chunk 6/13...
Processing chunk 7/13...
Processing chunk 8/13...
Processing chunk 9/13...
Processing chunk 10/13...
Processing chunk 11/13...
Processing chunk 12/13...
Processing chunk 13/13...

=== EXTRACTED INFORMATION FROM YOUR RESUME ===


MISC:
  - Git (confidence: 0.8650000095367432)
  - ##S (confidence: 0.7459999918937683)
  - ##ven (confidence: 0.7170000076293945)
  - S3 (confidence: 0.6980000138282776)
  - CI (confidence: 0.6890000104904175)
  - CSV (confidence: 0.6710000038146973)
  - ESLint (confidence: 0.6669999957084656)
  - AMQP (confidence: 0.652999997138977)
  - ##Z (confidence: 0.6470000147819519)
  - WAF (confidence: 0.6150000095367432)
  - NextJS (confidence: 0.5899999737739563)
  - ##DS (confidence: 0.5879999995231628)
  - ##au (confidence: 0.5479999780654907)
  - EC2 (confidence: 0.5479999780654907)
  - Git

In [16]:
entities['DB']

[{'text': 'Kafka', 'confidence': 0.358},
 {'text': 'Ka', 'confidence': 0.271},
 {'text': '##ka', 'confidence': 0.249},
 {'text': 'AWS', 'confidence': 0.509},
 {'text': 'PostgreSQL', 'confidence': 0.736},
 {'text': 'AWS', 'confidence': 0.541},
 {'text': 'AWS', 'confidence': 0.376},
 {'text': 'A', 'confidence': 0.434},
 {'text': 'AWS', 'confidence': 0.418},
 {'text': 'AWS', 'confidence': 0.386},
 {'text': 'AWS', 'confidence': 0.35},
 {'text': 'SQL', 'confidence': 0.924},
 {'text': 'AWS', 'confidence': 0.398},
 {'text': 'PostgreSQL', 'confidence': 0.712},
 {'text': 'MySQL', 'confidence': 0.933}]