In [1]:
import pandas as pd

# Location of the resume dataset, relative to the notebook's working directory
file_location = "data/Resume.csv"
resume_data = pd.read_csv(file_location)

# Preview the top five rows to check the columns were parsed
resume_data.head(n=5)



Unnamed: 0,ID,Resume_str,Resume_html,Category
0,16852973,HR ADMINISTRATOR/MARKETING ASSOCIATE\...,"<div class=""fontsize fontface vmargins hmargin...",HR
1,22323967,"HR SPECIALIST, US HR OPERATIONS ...","<div class=""fontsize fontface vmargins hmargin...",HR
2,33176873,HR DIRECTOR Summary Over 2...,"<div class=""fontsize fontface vmargins hmargin...",HR
3,27018550,HR SPECIALIST Summary Dedica...,"<div class=""fontsize fontface vmargins hmargin...",HR
4,17812897,HR MANAGER Skill Highlights ...,"<div class=""fontsize fontface vmargins hmargin...",HR


In [None]:

import requests
from io import StringIO

# URL of the JSON Lines file holding the skill patterns
json_url = 'https://raw.githubusercontent.com/mystery2life/NLP-Project/main/temp_patterns.jsonl'

# Fetch the file. The timeout (seconds) stops an unresponsive server from
# hanging the kernel forever; requests applies no timeout by default.
response = requests.get(json_url, timeout=30)
response.raise_for_status()  # Raises an exception for HTTP errors

# Parse the JSON Lines payload (one JSON object per line) into a DataFrame
skills_data = pd.read_json(StringIO(response.text), lines=True)


In [None]:

# Download the spaCy model
#!python -m spacy download en_core_web_lg

#!python -m spacy download en_core_web_trf
# !pip install spacy-transformers

# !python -m spacy download en_core_web_md



In [None]:
import spacy

# Load the en_core_web_md pipeline; the matcher below shares its vocabulary
nlp = spacy.load("en_core_web_md")

from spacy.matcher import Matcher

# Token-pattern matcher bound to the pipeline's vocab
skill_matcher = Matcher(nlp.vocab)

# Register each row's pattern under the single "SKILL" key
# (each pattern is presumably a list of token-attribute dicts — TODO confirm)
for skill_pattern in skills_data['pattern']:
    skill_matcher.add("SKILL", [skill_pattern])



In [None]:
# Install tqdm for progress tracking
!pip install tqdm



In [None]:
from tqdm import tqdm

def extract_skills_from_resumes(resume_texts):
    """
    Build (text, annotations) pairs by running the skill matcher over each resume.

    Args:
        resume_texts (iterable): Resume texts to process with ``nlp``.

    Returns:
        list: One ``(text, {"entities": [(start_char, end_char, "SKILL"), ...]})``
        tuple per input text, in input order.
    """
    def _annotate(text):
        parsed = nlp(text)
        offsets = []
        for _match_id, token_start, token_end in skill_matcher(parsed):
            matched = parsed[token_start:token_end]
            # Ignore matches whose text is only whitespace
            if matched.text.strip():
                offsets.append((matched.start_char, matched.end_char, "SKILL"))
        return text, {"entities": offsets}

    return [_annotate(resume) for resume in tqdm(resume_texts, desc="Extracting skills")]

# Collect every resume text from the 'Resume_str' column.
# NOTE(review): despite the variable name, no sampling or limit is applied here;
# the full column is processed. Slice the list if a small sample is intended.
sample_resumes = resume_data['Resume_str'].tolist()

# Run the skill matcher over the resumes to build (text, annotations) pairs
sample_training_data = extract_skills_from_resumes(sample_resumes)

# Print the item at index 1 (the second entry) of the training data
print(sample_training_data[1])


In [None]:
import os
import pickle

# Cache the matcher output so the slow extraction step need not be repeated.
# If the cache file exists, load it; otherwise create it from the
# sample_training_data computed earlier. This keeps the cell working on a
# fresh checkout where the file has not been written yet.
# NOTE: pickle.load can execute arbitrary code. Only load a cache file that
# this notebook produced itself.
if os.path.exists('sample_training_data.pkl'):
    with open('sample_training_data.pkl', 'rb') as f:
        sample_training_data = pickle.load(f)
else:
    with open('sample_training_data.pkl', 'wb') as f:
        pickle.dump(sample_training_data, f)

In [None]:

# Install scikit-learn for model training
#!pip install scikit-learn
!pip install spacy-lookups-data


In [None]:

from sklearn.model_selection import train_test_split

# Separate the (text, annotation) pairs into two parallel sequences
texts, annotations = zip(*sample_training_data)

# Hold out 20% of the pairs for validation; the fixed seed keeps the split reproducible
(train_texts, val_texts,
 train_annotations, val_annotations) = train_test_split(
    texts,
    annotations,
    test_size=0.2,
    random_state=42,
)

# Pair each text back up with its annotation dict
train_data = [(text, annotation) for text, annotation in zip(train_texts, train_annotations)]
val_data = [(text, annotation) for text, annotation in zip(val_texts, val_annotations)]





In [None]:
# Show only the first two examples; echoing the whole list would dump every
# training resume into the cell output.
train_data[:2]

In [None]:
import spacy
import random
from tqdm import tqdm
from spacy.tokens import Span
from spacy.training import Example
from spacy.lookups import load_lookups
from spacy.matcher import Matcher
import pandas as pd
import os
import pickle



# Handle to the pipeline's "ner" component; the label registration below operates on it
ner = nlp.get_pipe("ner")

# Function to remove overlapping entities
def remove_overlapping_entities(entities):
    """
    Remove overlapping entities from the list.

    Entities are scanned left to right and one is kept only if it starts at or
    after the end of the previously kept entity. When several entities share
    the same start offset, the longest one is kept, so the result no longer
    depends on the order of the input.

    Args:
        entities (list): List of (start, end, label) tuples. ``None`` is
            treated as an empty list.

    Returns:
        list: Non-overlapping (start, end, label) tuples sorted by start.
    """
    # Sort by start; for equal starts put the longer span (larger end) first
    ordered = sorted(entities or [], key=lambda ent: (ent[0], -ent[1]))
    non_overlapping_entities = []
    last_end = -1
    for start, end, label in ordered:
        if start >= last_end:  # Does not overlap the last kept span
            non_overlapping_entities.append((start, end, label))
            last_end = end
    return non_overlapping_entities

# Register the 'SKILL' label on the NER component unless it already has it
if 'SKILL' not in ner.labels:
    ner.add_label("SKILL")

# Function to add custom entity labels from training data
def add_custom_entity_labels(train_data, ner_pipe=None):
    """
    Add custom entity labels from training data to the NER model.

    Args:
        train_data (list): List of (text, annotations) tuples. Each annotations
            dict may hold an "entities" list of (start, end, label) tuples; a
            missing or ``None`` "entities" value is treated as no entities.
        ner_pipe: Object exposing ``labels`` and ``add_label(label)``. Defaults
            to the notebook-level ``ner`` component when omitted.
    """
    # Fall back to the global component only when the caller passes nothing
    target = ner if ner_pipe is None else ner_pipe
    for _, annotations in train_data:
        for ent in annotations.get("entities") or []:
            if ent[2] not in target.labels:
                target.add_label(ent[2])

# Register every label that appears in the training annotations
add_custom_entity_labels(train_data)

# Names of all pipeline components except 'ner'; these are disabled during
# training so that only the NER component is updated
unaffected_pipes = [pipe for pipe in nlp.pipe_names if pipe != 'ner']

# Function to preprocess training data to remove overlapping entities
def preprocess_training_data(train_data):
    """
    Rebuild the training pairs with overlapping entity spans filtered out.

    Args:
        train_data (list): (text, annotations) tuples whose annotations dict
            carries an "entities" list.

    Returns:
        list: New (text, {"entities": [...]}) tuples in the same order, where
        each entity list has been passed through remove_overlapping_entities.
    """
    return [
        (text, {"entities": remove_overlapping_entities(annotations.get("entities"))})
        for text, annotations in train_data
    ]

# Replace train_data with a copy whose entity lists contain no overlapping spans
train_data = preprocess_training_data(train_data)

# Function to train NER model
def train_ner_model(train_data, n_iter=10, drop=0.5):
    """
    Train the NER model with the given training data.

    Every pipeline component listed in ``unaffected_pipes`` is disabled for the
    duration of training. Examples are fed to ``nlp.update`` one at a time and
    the accumulated losses are printed after each iteration.

    Args:
        train_data (list): List of (text, annotations) tuples. The list itself
            is not modified; a private copy is shuffled each iteration.
        n_iter (int): Number of passes over the training data.
        drop (float): Dropout rate passed to ``nlp.update``.
    """
    # Shuffle a copy so the caller's list keeps its original order
    shuffled = list(train_data)
    with nlp.disable_pipes(*unaffected_pipes):  # Only train NER
        # NOTE(review): nlp.initialize() sets a pipeline up for training; on an
        # already-trained pipeline it presumably resets the enabled component's
        # weights. If the intent is to fine-tune the pre-trained NER,
        # nlp.resume_training() is the documented alternative — confirm.
        optimizer = nlp.initialize()
        for itn in range(n_iter):
            random.shuffle(shuffled)
            losses = {}
            for text, annotations in tqdm(shuffled, desc=f"Iteration {itn+1}"):
                example = Example.from_dict(nlp.make_doc(text), annotations)
                nlp.update([example], sgd=optimizer, drop=drop, losses=losses)
            print(f"Iteration {itn+1} Losses: {losses}")

# Run training on the preprocessed data with the function's default settings
train_ner_model(train_data)

In [None]:
import os
import spacy
from spacy.tokens import Span
from spacy.training import Example
from spacy.scorer import Scorer

# Directory the trained pipeline is saved to. It is relative to the notebook's
# working directory (like data/Resume.csv above) rather than a user-specific
# absolute path, and it matches the "data/model" value used by the upload and
# download cells further down.
model_dir = "data/model"

# Create the directory if needed; exist_ok=True makes the cell safe to re-run
os.makedirs(model_dir, exist_ok=True)

# Save the trained model to disk
def save_model(nlp, model_dir):
    """
    Write a spaCy pipeline to a directory on disk.

    Args:
        nlp (Language): Pipeline to serialize.
        model_dir (str): Destination directory handed to ``nlp.to_disk``.
    """
    destination = model_dir
    nlp.to_disk(destination)

# Persist the pipeline held in `nlp` to model_dir
save_model(nlp, model_dir)

# Load the model from the saved directory into a new variable
def load_model(model_dir):
    """
    Read a spaCy pipeline back from a directory.

    Args:
        model_dir (str): Directory passed to ``spacy.load``.

    Returns:
        Language: The pipeline object returned by ``spacy.load``.
    """
    loaded_pipeline = spacy.load(model_dir)
    return loaded_pipeline

# Reload the saved pipeline into a separate variable
newnlp = load_model(model_dir)


# Process a new text through the loaded model
def print_entities(ner_model, text):
    """
    Run ``text`` through ``ner_model`` and print each entity with its label.

    One line is printed per entity in ``doc.ents``: the entity text, a space,
    then the entity label.

    Args:
        ner_model (Language): Callable spaCy pipeline producing a doc with ``ents``.
        text (str): The text to be processed.
    """
    for entity in ner_model(text).ents:
        print(entity.text, entity.label_)

# Example text to process
test_text = "Abid Ali Awan Data Scientist I am a certified data scientist professional, who loves building machine learning models and blogs about the latest AI technologies. I am currently testing AI Products at PEC-PITC, which later gets approved for human trials. abidaliawan@tutamail.com +923456855126 Islamabad, Pakistan abidaliawan.me WORK EXPERIENCE Data Scientist Pakistan Innovation and Testing Center - PEC 04/2021 - Present, Islamabad, Pakistan Redesigned data of engineers that were mostly scattered and unavailable. Designed dashboard and data analysis report to help higher management make better decisions. Accessibility of key information has created a new culture of making data-driven decisions. Contact: Ali Raza Asif - darkslayerraza10@gmail.com Data Scientist Freelancing/Kaggle 11/2020 - Present, Islamabad, Pakistan Engineered a healthcare system. Used machine learning to detect some of the common decisions. The project has paved the way for others to use new techniques to get better results. Participated in Kaggle machine learning competitions. Learned new techniques to get a better score and finally got to 1 percent rank. Researcher / Event Organizer CREDIT 02/2017 - 07/2017, Kuala Lumpur, Malaysia Marketing for newly build research lab. Organized technical events and successfully invited the multiple company's CEO for talks. Reduced the gap between industries and educational institutes. Research on new development in the IoT sector. Created research proposal for funding. Investigated the new communication protocol for IoT devices. Contact: Dr. Tan Chye Cheah - dr.chyecheah.t@apu.edu.my EDUCATION MSc in Technology Management Staffordshire University 11/2015 - 04/2017, Postgraduate with Distinction Challenges in Implementing IoT-enabled Smart cities in Malaysia. 
Bachelors Electrical Telecommunication Engineering COMSATS Institute of Information Technology, Islamabad 08/2010 - 01/2014, CGPA: 3.09 Networking Satellite communications Programming/ Matlab Telecommunication Engineering SKILLS Designing Leadership Media/Marketing R/Python SQL Tableau NLP Data Analysis Machine learning Deep learning Webapp/Cloud Feature Engineering Ensembling Time Series Technology Management ACHIEVEMENTS 98th Hungry Geese Simulation Competition (08/2021) 2nd in Covid-19 vaccinations around the world (07/2021) 8th in Automatic Speech Recognition in WOLOF (06/2021) Top 10 in WiDS Datathon. (03/2021) 40th / 622 in MagNet: Model the Geomagnetic Field Hosted by NOAA (02/2021) 18th in Rock, Paper, Scissors/Designing AI Agent Competition. (02/2021) PROJECTS Goodreads Profile Analysis WebApp (09/2021) Data Analysis Web Scraping XLM Interactive Visualization Contributed in orchest.io (08/2021) Testing and Debuging Technical Article Proposing new was to Improve ML pipelines World Vaccine Update System (06/2021) Used sqlite3 for database Automated system for daily update the Kaggle DB and Analysis Interactive dashboard mRNA-Vaccine-Degradation-Prediction (06/2021) Explore our dataset and then preprocessed sequence, structure, and predicted loop type features Train deep learning GRU model Trip Advisor Data Analysis/ML (04/2021) Preprocessing Data, Exploratory Data analysis, Word clouds. Feature Engineering, Text processing. BiLSTM Model for predicting rating, evaluation, model performance. Jane Street Market Prediction (03/2021) EDA, Feature Engineering, experimenting with hyperparameters. Ensembling: Resnet, NN Embeddings, TF Simple NN model. Using simple MLP pytorch model. Achievements/Tasks Achievements/Tasks Achievements/Tasks Thesis Courses"
# Run the reloaded pipeline on the sample resume text and print each entity with its label
print_entities(newnlp, test_text)


In [4]:
import os
import spacy
from spacy.tokens import Span
from spacy.training import Example
from spacy.scorer import Scorer

def load_model(model_dir):
    """
    Read a spaCy pipeline back from a directory.

    Args:
        model_dir (str): Directory passed to ``spacy.load``.

    Returns:
        Language: The pipeline object returned by ``spacy.load``.
    """
    loaded_pipeline = spacy.load(model_dir)
    return loaded_pipeline
# Load the pipeline stored in model_dir (model_dir must already be defined by an earlier cell)
newnlp = load_model(model_dir)


# Process a new text through the loaded model
def print_entities(ner_model, text):
    """
    Run ``text`` through ``ner_model`` and print each entity with its label.

    One line is printed per entity in ``doc.ents``: the entity text, a space,
    then the entity label.

    Args:
        ner_model (Language): Callable spaCy pipeline producing a doc with ``ents``.
        text (str): The text to be processed.
    """
    processed = ner_model(text)
    for entity in processed.ents:
        print(entity.text, entity.label_)

# Example text to process
test_text = "Abid Ali Awan Data Scientist I am a certified data scientist professional, who loves building machine learning models and blogs about the latest AI technologies. I am currently testing AI Products at PEC-PITC, which later gets approved for human trials. abidaliawan@tutamail.com +923456855126 Islamabad, Pakistan abidaliawan.me WORK EXPERIENCE Data Scientist Pakistan Innovation and Testing Center - PEC 04/2021 - Present, Islamabad, Pakistan Redesigned data of engineers that were mostly scattered and unavailable. Designed dashboard and data analysis report to help higher management make better decisions. Accessibility of key information has created a new culture of making data-driven decisions. Contact: Ali Raza Asif - darkslayerraza10@gmail.com Data Scientist Freelancing/Kaggle 11/2020 - Present, Islamabad, Pakistan Engineered a healthcare system. Used machine learning to detect some of the common decisions. The project has paved the way for others to use new techniques to get better results. Participated in Kaggle machine learning competitions. Learned new techniques to get a better score and finally got to 1 percent rank. Researcher / Event Organizer CREDIT 02/2017 - 07/2017, Kuala Lumpur, Malaysia Marketing for newly build research lab. Organized technical events and successfully invited the multiple company's CEO for talks. Reduced the gap between industries and educational institutes. Research on new development in the IoT sector. Created research proposal for funding. Investigated the new communication protocol for IoT devices. Contact: Dr. Tan Chye Cheah - dr.chyecheah.t@apu.edu.my EDUCATION MSc in Technology Management Staffordshire University 11/2015 - 04/2017, Postgraduate with Distinction Challenges in Implementing IoT-enabled Smart cities in Malaysia. 
Bachelors Electrical Telecommunication Engineering COMSATS Institute of Information Technology, Islamabad 08/2010 - 01/2014, CGPA: 3.09 Networking Satellite communications Programming/ Matlab Telecommunication Engineering SKILLS Designing Leadership Media/Marketing R/Python SQL Tableau NLP Data Analysis Machine learning Deep learning Webapp/Cloud Feature Engineering Ensembling Time Series Technology Management ACHIEVEMENTS 98th Hungry Geese Simulation Competition (08/2021) 2nd in Covid-19 vaccinations around the world (07/2021) 8th in Automatic Speech Recognition in WOLOF (06/2021) Top 10 in WiDS Datathon. (03/2021) 40th / 622 in MagNet: Model the Geomagnetic Field Hosted by NOAA (02/2021) 18th in Rock, Paper, Scissors/Designing AI Agent Competition. (02/2021) PROJECTS Goodreads Profile Analysis WebApp (09/2021) Data Analysis Web Scraping XLM Interactive Visualization Contributed in orchest.io (08/2021) Testing and Debuging Technical Article Proposing new was to Improve ML pipelines World Vaccine Update System (06/2021) Used sqlite3 for database Automated system for daily update the Kaggle DB and Analysis Interactive dashboard mRNA-Vaccine-Degradation-Prediction (06/2021) Explore our dataset and then preprocessed sequence, structure, and predicted loop type features Train deep learning GRU model Trip Advisor Data Analysis/ML (04/2021) Preprocessing Data, Exploratory Data analysis, Word clouds. Feature Engineering, Text processing. BiLSTM Model for predicting rating, evaluation, model performance. Jane Street Market Prediction (03/2021) EDA, Feature Engineering, experimenting with hyperparameters. Ensembling: Resnet, NN Embeddings, TF Simple NN model. Using simple MLP pytorch model. Achievements/Tasks Achievements/Tasks Achievements/Tasks Thesis Courses"
# Run the reloaded pipeline on the sample resume text and print each entity with its label
print_entities(newnlp, test_text)

AI SKILL
testing SKILL
AI SKILL
Testing SKILL
data analysis SKILL
machine learning SKILL
Marketing SKILL
Engineering SKILL
communications SKILL
Engineering SKILL
Marketing SKILL
R SKILL
Python SKILL
SQL SKILL
Tableau SKILL
NLP SKILL
Data Analysis SKILL
Engineering SKILL
Time Series SKILL
Simulation SKILL
AI SKILL
Data Analysis SKILL
Visualization SKILL
Testing SKILL
ML SKILL
database SKILL
Data Analysis SKILL
ML SKILL
Data analysis SKILL
Engineering SKILL
Engineering SKILL


In [10]:
import shutil
from huggingface_hub import Repository, login

def save_model_to_huggingface(nlp, model_dir, repo_id, token):
    """
    Save the trained spaCy model locally and push a copy to the Hugging Face Model Hub.

    The model is written to ``model_dir``, the Hub repository is cloned into a
    sibling ``<model_dir>_temp`` directory, the model files are copied into the
    clone, and the clone is committed and pushed. ``model_dir`` is never
    emptied, so a failed clone or push leaves the saved model intact. The
    temporary clone is removed whether or not the push succeeds.

    Args:
        nlp (Language): The spaCy model to be saved.
        model_dir (str): The directory path where the model will be saved.
        repo_id (str): The repository ID on Hugging Face (e.g., 'Ranjithjames/SPACY_NER').
        token (str): Your Hugging Face API token.
    """
    # Save the spaCy model to the specified directory
    nlp.to_disk(model_dir)

    # Authenticate with the Hugging Face Hub
    login(token=token)

    # Temporary directory that holds the git clone of the Hub repository
    temp_model_dir = model_dir + "_temp"
    repo_url = f"https://huggingface.co/{repo_id}"

    # Entries that must not be copied into the clone: a stray .git directory
    # would clobber the clone's own metadata, and .DS_Store is Finder noise
    skipped = {".git", ".DS_Store"}

    try:
        repo = Repository(local_dir=temp_model_dir, clone_from=repo_url, use_auth_token=token)

        # Copy (not move) the model files into the clone. This temporarily
        # doubles the disk usage, but keeps model_dir usable at all times.
        for item in os.listdir(model_dir):
            if item in skipped:
                continue
            src = os.path.join(model_dir, item)
            dst = os.path.join(temp_model_dir, item)
            if os.path.isdir(src):
                # Replace a directory left by an earlier push instead of
                # nesting the new one inside it
                if os.path.isdir(dst):
                    shutil.rmtree(dst)
                shutil.copytree(src, dst)
            else:
                shutil.copy2(src, dst)

        # Add and push the model files to the repository
        repo.git_add(auto_lfs_track=True)
        repo.git_commit("Update spaCy model")
        repo.git_push()
    finally:
        # Always discard the clone, including after a failed clone or push
        shutil.rmtree(temp_model_dir, ignore_errors=True)

import os
from getpass import getpass

# Example usage
model_dir = "data/model"
repo_id = "Ranjithjames/SPACY_NER"
# Never store an access token in the notebook. Read it from the HF_TOKEN
# environment variable, or prompt for it interactively when that is unset.
# NOTE: a token that has ever been committed in plain text should be revoked.
token = os.environ.get("HF_TOKEN") or getpass("Hugging Face token: ")

save_model_to_huggingface(newnlp, model_dir, repo_id, token)


For more details, please read https://huggingface.co/docs/huggingface_hub/concepts/git_vs_http.
Cloning https://huggingface.co/Ranjithjames/SPACY_NER into local empty directory.


The token has not been saved to the git credentials helper. Pass `add_to_git_credential=True` in this function directly or `--add-to-git-credential` if using via `huggingface-cli` if you want to set the git credential as well.
Token is valid (permission: write).
Your token has been saved to /Users/RanjithJames/.cache/huggingface/token
Login successful


Adding files tracked by Git LFS: ['vocab/strings.json', 'vocab/vectors', '.DS_Store', 'attribute_ruler/patterns', 'ner/model', 'ner/moves', 'parser/model', 'parser/moves', 'senter/model', 'tagger/model', 'tok2vec/model', 'tokenizer', 'vocab/key2row']. This may take a bit of time if the files are large.
Upload file vocab/vectors:   0%|          | 1.00/588M [00:00<?, ?B/s]
[A

[A[A


[A[A[A



[A[A[A[A




[A[A[A[A[A





[A[A[A[A[A[A






[A[A[A[A[A[A[A







[A[A[A[A[A[A[A[A








[A[A[A[A[A[A[A[A[A









[A[A[A[A[A[A[A[A[A[A









[A[A[A[A[A[A[A[A[A[A










[A[A[A[A[A[A[A[A[A[A[A











[A[A[A[A[A[A[A[A[A[A[A[A







[A[A[A[A[A[A[A[A












[A[A[A[A[A[A[A[A[A[A[A[A[A
[A



[A[A[A[A


Upload file vocab/vectors:   8%|▊         | 45.8M/588M [00:18<03:39, 2.59MB/s]









[A[A[A[A[A[A[A[A[A[A
[A


[A[A[A



[A[A[A[A







Upload fil

In [11]:
from huggingface_hub import hf_hub_download
import os
import shutil

def download_model_from_huggingface(repo_id, model_dir, token):
    """
    Download the model from Hugging Face Model Hub and store it in the specified directory.

    The whole repository snapshot is fetched instead of a fixed list of file
    names. The repository pushed by this notebook holds spaCy's directory
    layout (for example ``ner/model`` and ``vocab/vectors``), not
    transformer-style files such as ``config.json`` or ``model.bin``.
    Downloading straight into ``model_dir`` also avoids moving files out of the
    Hugging Face cache.

    Args:
        repo_id (str): The repository ID on Hugging Face (e.g., 'Ranjithjames/SPACY_NER').
        model_dir (str): The directory path where the model will be stored.
        token (str): Your Hugging Face API token.
    """
    # Imported locally so this function does not depend on the cell's import list
    from huggingface_hub import snapshot_download

    # Create the model directory if it doesn't exist
    os.makedirs(model_dir, exist_ok=True)

    # Fetch every file in the repository, keeping its folder structure
    snapshot_download(repo_id=repo_id, local_dir=model_dir, token=token)
        
import os
from getpass import getpass

# Example usage
model_dir = "data/model"
repo_id = "Ranjithjames/SPACY_NER"
# Never store an access token in the notebook. Read it from the HF_TOKEN
# environment variable, or prompt for it interactively when that is unset.
# NOTE: a token that has ever been committed in plain text should be revoked.
token = os.environ.get("HF_TOKEN") or getpass("Hugging Face token: ")

download_model_from_huggingface(repo_id, model_dir, token)
