In [None]:
import pandas as pd

# Path to the resume dataset CSV, relative to the notebook's working directory
file_location = R"data/Resume.csv"
# Load the CSV into a DataFrame; a later cell reads its 'Resume_str' column
resume_data = pd.read_csv(file_location)

# Display the first few rows of the dataset (last expression -> rich cell output)
resume_data.head()



In [None]:

import requests
from io import StringIO

# URL for the JSON Lines file holding one skill pattern per line
json_url = 'https://raw.githubusercontent.com/RanjithJames/NLP_RESUME_FINAL_PROJECT/main/Skill_patterns.jsonl'

# Retrieve the data from the URL. requests has no default timeout, so without
# one a stalled connection would block the kernel indefinitely.
response = requests.get(json_url, timeout=30)
response.raise_for_status()  # Raises an exception for HTTP errors

# Convert JSON Lines string into a pandas DataFrame (one row per pattern)
skills_data = pd.read_json(StringIO(response.text), lines=True)


In [None]:

# Download the spaCy model
#!python -m spacy download en_core_web_lg

#!python -m spacy download en_core_web_trf
# !pip install spacy-transformers

#!python -m spacy download en_core_web_md

#!pip install spacy[cuda]

In [None]:
import spacy
from spacy.matcher import Matcher

# Fail fast when no GPU is available, then load the pre-trained pipeline
spacy.require_gpu()
nlp = spacy.load("en_core_web_md")

# Token matcher that shares the pipeline's vocabulary
skill_matcher = Matcher(nlp.vocab)

# Register every pattern from the skills table under the single "SKILL" key
# (each entry of the 'pattern' column is assumed to be a list of token dicts)
for skill_pattern in skills_data['pattern']:
    skill_matcher.add("SKILL", [skill_pattern])



In [None]:
# Install tqdm for progress tracking
#!pip install tqdm

In [None]:
from tqdm import tqdm

def extract_skills_from_resumes(resume_texts):
    """
    Label every resume with the character spans matched by ``skill_matcher``.

    Args:
        resume_texts (iterable): Resume texts to run through ``nlp``.

    Returns:
        list: One ``(text, {"entities": [(start_char, end_char, "SKILL"), ...]})``
        tuple per input text, in input order.
    """
    labelled = []
    for resume in tqdm(resume_texts, desc="Extracting skills"):
        doc = nlp(resume)
        entities = []
        for _match_id, start, end in skill_matcher(doc):
            span = doc[start:end]
            # Skip matches that consist of whitespace only
            if span.text.strip():
                entities.append((span.start_char, span.end_char, "SKILL"))
        labelled.append((resume, {"entities": entities}))
    return labelled

# Use every resume in the 'Resume_str' column (no row limit is applied here)
sample_resumes = resume_data['Resume_str'].tolist()

# Run the matcher-based labelling over those resumes
sample_training_data = extract_skills_from_resumes(sample_resumes)

# Print the item at index 1 (the second resume) as a spot check
print(sample_training_data[1])


In [None]:
import os
import pickle

# On-disk cache of the labelled resumes
training_data_cache = 'sample_training_data.pkl'

if os.path.exists(training_data_cache):
    # Load sample_training_data from the cache file.
    # NOTE: pickle.load can execute arbitrary code; only load a cache file that
    # this notebook wrote itself.
    with open(training_data_cache, 'rb') as f:
        sample_training_data = pickle.load(f)
else:
    # No cache yet: persist the freshly extracted data instead of failing with
    # FileNotFoundError, so the notebook runs top to bottom on a clean checkout.
    with open(training_data_cache, 'wb') as f:
        pickle.dump(sample_training_data, f)

In [None]:

# Install scikit-learn for model training
#!pip install scikit-learn
#!pip install spacy-lookups-data


In [None]:

from sklearn.model_selection import train_test_split

# Separate the (text, annotation) pairs into two parallel tuples
texts, annotations = zip(*sample_training_data)

# Hold out 20% of the pairs for validation; the fixed seed keeps the split stable
train_texts, val_texts, train_annotations, val_annotations = train_test_split(
    texts,
    annotations,
    random_state=42,
    test_size=0.2,
)

# Pair each text with its annotation again for both subsets
train_data = [pair for pair in zip(train_texts, train_annotations)]
val_data = [pair for pair in zip(val_texts, val_annotations)]

In [None]:
import os


def append_to_path(directory, env=None):
    """
    Append ``directory`` to the PATH entry of ``env`` unless it is already listed.

    Args:
        directory (str): Directory to add to PATH.
        env (MutableMapping, optional): Environment mapping to update.
            Defaults to ``os.environ``.

    Returns:
        str: The resulting PATH value.
    """
    if env is None:
        env = os.environ
    current = env.get('PATH', '')
    # Wrap both sides in separators so only whole entries match; this also works
    # when ``directory`` itself contains the separator character.
    if os.pathsep + directory + os.pathsep not in os.pathsep + current + os.pathsep:
        current = current + os.pathsep + directory if current else directory
        env['PATH'] = current
    return current


# Add the directory to the PATH environment variable. Re-running this cell no
# longer appends a duplicate entry.
# NOTE(review): r'G:\bin' is a machine-specific absolute path — confirm it exists
# on the machine running the notebook.
append_to_path(r'G:\bin')

# Verify the PATH has been updated
print(os.environ['PATH'])


In [None]:
#!pip install spacy[cuda]
#!pip install cupy-cuda12x

In [None]:
import spacy
import random
from tqdm import tqdm
from spacy.tokens import Span
from spacy.training import Example
from spacy.lookups import load_lookups
from spacy.matcher import Matcher
import pandas as pd
import os
import pickle



# Grab the NER component of the loaded pipeline; the label-registration code in
# this cell calls add_label on it.
ner = nlp.get_pipe("ner")

# Function to remove overlapping entities
def remove_overlapping_entities(entities):
    """
    Remove overlapping entities from the list.

    Entities are scanned in order of start offset and an entity is kept only
    when it begins at or after the end of the previously kept one. When several
    entities share a start offset the longest is preferred, so a short match
    (e.g. "machine") cannot shadow a longer one ("machine learning").

    Args:
        entities (list): List of (start, end, label) tuples.

    Returns:
        list: Non-overlapping (start, end, label) tuples sorted by start position.
    """
    # Sort by start; on ties put the longest span first so it wins the greedy pass
    ordered = sorted(entities, key=lambda ent: (ent[0], -ent[1]))
    non_overlapping_entities = []
    last_end = -1
    for start, end, label in ordered:
        if start >= last_end:  # No overlap with the last kept entity
            non_overlapping_entities.append((start, end, label))
            last_end = end
    return non_overlapping_entities

# Register 'SKILL' — the label attached to every matcher-derived annotation —
# on the NER component unless it is already among its labels
if 'SKILL' not in ner.labels:
    ner.add_label("SKILL")

# Function to add custom entity labels from training data
def add_custom_entity_labels(train_data):
    """
    Add custom entity labels from training data to the NER model.

    Every distinct label found in the annotations is registered on the
    module-level ``ner`` component unless the component already knows it.
    Annotations without an "entities" key (or with ``None``) are skipped.

    Args:
        train_data (list): List of (text, annotations) tuples, where
            ``annotations["entities"]`` holds (start, end, label) tuples.
    """
    # Collect each distinct label once, preserving first-seen order
    seen_labels = {}
    for _, annotations in train_data:
        for ent in annotations.get("entities") or ():
            seen_labels.setdefault(ent[2], None)

    # Read the component's labels once instead of once per entity
    known_labels = set(ner.labels)
    for label in seen_labels:
        if label not in known_labels:
            ner.add_label(label)

# Register every label that occurs in the training annotations on the NER component
add_custom_entity_labels(train_data)

# Names of every pipeline component except 'ner'. Building this list does not
# disable anything by itself; it only records which components are not NER.
unaffected_pipes = [pipe for pipe in nlp.pipe_names if pipe != 'ner']

# Function to preprocess training data to remove overlapping entities
def preprocess_training_data(train_data):
    """
    Preprocess the training data to remove overlapping entities.

    Args:
        train_data (list): List of (text, annotations) tuples.

    Returns:
        list: New list of (text, {"entities": [...]}) tuples whose entity spans
        do not overlap. Annotations lacking an "entities" key (or holding
        ``None``) yield an empty entity list instead of raising. Only the
        "entities" key is carried over into the output annotations.
    """
    preprocessed_data = []
    for text, annotations in train_data:
        # .get() returns None for a missing key; fall back to "no entities"
        entities = annotations.get("entities") or []
        preprocessed_data.append(
            (text, {"entities": remove_overlapping_entities(entities)})
        )
    return preprocessed_data

# Rebind train_data to its overlap-free version. Re-running is harmless:
# spans that are already non-overlapping pass through the filter unchanged.
train_data = preprocess_training_data(train_data)

# Function to train NER model
def train_ner_model(train_data, n_iter=1, drop=0.5):
    """
    Fine-tune the NER component of the module-level ``nlp`` pipeline.

    Args:
        train_data (list): List of (text, annotations) tuples.
        n_iter (int): Number of passes over the training data.
        drop (float): Dropout rate passed to ``nlp.update``.

    Notes:
        * The input list is copied before shuffling, so the caller's list keeps
          its order.
        * ``nlp.resume_training()`` is used instead of ``nlp.initialize()``:
          initialising a loaded pipeline re-initialises the component weights,
          which would discard what the pretrained NER model already learned.
        * Shuffling uses the global ``random`` state; seed it beforehand for
          reproducible runs.
    """
    examples = list(train_data)  # private copy: never reorder the caller's data
    # Only the NER component is enabled (and therefore updated) inside this block
    with nlp.select_pipes(enable="ner"):
        optimizer = nlp.resume_training()
        for itn in range(n_iter):
            random.shuffle(examples)
            losses = {}
            for text, annotations in tqdm(examples, desc=f"Iteration {itn+1}"):
                example = Example.from_dict(nlp.make_doc(text), annotations)
                nlp.update([example], sgd=optimizer, drop=drop, losses=losses)
            print(f"Iteration {itn+1} Losses: {losses}")

# Train the NER model on a small subset: the slice [1:100] passes at most 99
# examples and skips index 0.
# NOTE(review): skipping the first example looks unintentional — confirm whether
# [:100] was meant.
train_ner_model(train_data[1:100])

In [None]:
import os
import spacy
from spacy.tokens import Span
from spacy.training import Example
from spacy.scorer import Scorer

# Directory the trained pipeline is written to (relative to the working directory)
model_dir = r"data/model"

# Create the directory, including missing parents. exist_ok=True replaces the
# separate exists() check, which could race with another process creating it.
os.makedirs(model_dir, exist_ok=True)

# Persist a pipeline to disk
def save_model(nlp, model_dir):
    """
    Write a spaCy pipeline to a directory via its ``to_disk`` method.

    Args:
        nlp (Language): Pipeline object to serialize.
        model_dir (str): Destination directory handed to ``to_disk``.
    """
    destination = model_dir
    nlp.to_disk(destination)

# Write the in-memory pipeline `nlp` to model_dir
save_model(nlp, model_dir)



In [None]:
# Load the model from the saved directory into a new variable


def load_model(model_dir):
    """
    Read a serialized spaCy pipeline back from disk.

    Args:
        model_dir (str): Directory that holds the saved pipeline.

    Returns:
        Language: The pipeline object produced by ``spacy.load``.
    """
    pipeline = spacy.load(model_dir)
    return pipeline

# Reload the pipeline from 'data/model' into a separate variable, leaving `nlp` untouched
newnlp = load_model('data/model')


# Run a text through a pipeline and list what it recognizes
def print_entities(ner_model, text):
    """
    Print every entity the model finds in ``text``, one per line as
    "<entity text> <label>".

    Args:
        ner_model (Language): Callable spaCy pipeline with an NER component.
        text (str): Text to analyse.
    """
    for entity in ner_model(text).ents:
        print(entity.text, entity.label_)

# Example text to process
test_text = """### Data Scientist Job Description

**Job Title:** Data Scientist

**Location:** [City, State]

**Company:** [Company Name]

**Job Type:** Full-time

**About Us:**
[Company Name] is a leading [industry] company, dedicated to [mission statement or company goal]. We leverage cutting-edge technologies and data-driven strategies to drive innovation and make informed decisions that impact our business and customers. We are seeking a passionate and analytical Data Scientist to join our dynamic team.

**Job Summary:**
As a Data Scientist at [Company Name], you will play a key role in driving data-driven decision-making by developing, deploying, and maintaining machine learning models and advanced analytics solutions. You will work closely with cross-functional teams to analyze large datasets, uncover insights, and solve complex business problems. This role requires a deep understanding of statistical methods, machine learning algorithms, and data manipulation techniques.

**Key Responsibilities:**
- **Data Analysis & Exploration:**
  - Collect, process, and analyze large datasets from various sources to identify trends, patterns, and anomalies.
  - Perform exploratory data analysis (EDA) to generate insights and support business decisions.
  
- **Model Development:**
  - Develop, test, and deploy predictive models and machine learning algorithms to solve business problems.
  - Optimize models for performance, scalability, and interpretability.
  
- **Collaboration & Communication:**
  - Work closely with business stakeholders to understand their needs and translate them into analytical solutions.
  - Communicate complex analytical concepts and insights to non-technical stakeholders in a clear and concise manner.

- **Data Visualization:**
  - Create interactive dashboards and visualizations to present data insights and trends.
  - Provide recommendations based on data-driven insights to support decision-making.

- **Continuous Improvement:**
  - Stay up-to-date with the latest advancements in data science, machine learning, and AI technologies.
  - Identify opportunities to improve data processes and contribute to the development of data-driven strategies.

**Qualifications:**
- Bachelor’s degree in Computer Science, Statistics, Mathematics, or a related field; Master’s or PhD preferred.
- 2+ years of experience in data science, machine learning, or a related field.
- Proficiency in programming languages such as Python, R, or SQL.
- Experience with machine learning frameworks and libraries such as TensorFlow, scikit-learn, or PyTorch.
- Strong knowledge of statistical methods, data mining, and predictive modeling techniques.
- Experience with data visualization tools like Tableau, Power BI, or matplotlib.
- Familiarity with big data tools and platforms such as Hadoop, Spark, or AWS.
- Excellent problem-solving skills and attention to detail.
- Strong communication skills and ability to work collaboratively in a team environment.

**Preferred Skills:**
- Experience with natural language processing (NLP) or deep learning.
- Knowledge of cloud computing and data engineering.
- Experience with A/B testing and experiment design.

**Benefits:**
- Competitive salary and performance-based bonuses.
- Health, dental, and vision insurance.
- 401(k) plan with company match.
- Generous paid time off and holiday schedule.
- Opportunities for professional development and career growth.

**How to Apply:**
Interested candidates should submit their resume, cover letter, and portfolio of relevant work to [email address] with the subject line “Data Scientist Application – [Your Name]”.

[Company Name] is an equal opportunity employer. We celebrate diversity and are committed to creating an inclusive environment for all employees."""
# Run the reloaded pipeline on the sample job description and print each
# recognized entity with its label
print_entities(newnlp, test_text)


In [None]:
import shutil
from huggingface_hub import Repository, login

def save_model_to_huggingface(nlp, model_dir, repo_id, token):
    """
    Save the trained spaCy model locally and upload it to the Hugging Face Hub.

    The model directory is uploaded in place with ``HfApi.upload_folder``.
    Nothing is moved out of ``model_dir``, so a failed upload cannot leave the
    local model half-moved, and no ``.git`` data from a clone ends up inside
    the model directory. Files already in the repository that are not part of
    ``model_dir`` are left as they are.

    Args:
        nlp (Language): The spaCy model to be saved.
        model_dir (str): The directory path where the model will be saved.
        repo_id (str): The repository ID on Hugging Face (e.g., 'Ranjithjames/SPACY_NER').
        token (str): Your Hugging Face API token.
    """
    # Function-scope import: the surrounding cell imports only Repository and login
    from huggingface_hub import HfApi

    # Save the spaCy model to the specified directory
    nlp.to_disk(model_dir)

    # Authenticate the Hugging Face Hub (kept so later Hub calls in the session
    # remain authenticated, as before)
    login(token=token)

    # Upload the saved files as a single commit
    HfApi().upload_folder(
        folder_path=model_dir,
        repo_id=repo_id,
        repo_type="model",
        token=token,
        commit_message="Update spaCy model",
    )

import os

# Example usage
model_dir = "data/model"
repo_id = "Ranjithjames/SPACY_NER"
# Never hard-code access tokens in a notebook: read the token from the
# environment instead. Any token that was previously committed in this cell
# should be revoked on the Hugging Face settings page.
token = os.environ.get("HF_TOKEN")
if not token:
    raise RuntimeError(
        "Set the HF_TOKEN environment variable to a Hugging Face access token "
        "before running this cell."
    )

save_model_to_huggingface(newnlp, model_dir, repo_id, token)


In [None]:
from huggingface_hub import HfApi, hf_hub_download
import os
import shutil

def download_all_files_from_huggingface(repo_id, model_dir, token):
    """
    Download all files from the Hugging Face Model Hub repository and store them in the specified directory.

    Files are copied out of the local Hugging Face cache rather than moved.
    Cache entries can be symlinks with relative targets, so moving them would
    leave broken links in ``model_dir`` and a damaged cache.

    Args:
        repo_id (str): The repository ID on Hugging Face (e.g., 'Ranjithjames/SPACY_NER').
        model_dir (str): The directory path where the model will be stored.
        token (str): Your Hugging Face API token.
    """
    # Create the model directory if it doesn't exist
    os.makedirs(model_dir, exist_ok=True)

    # Initialize the Hugging Face API
    api = HfApi()

    # List all files in the repository (`token` replaces the deprecated `use_auth_token`)
    repo_files = api.list_repo_files(repo_id=repo_id, token=token)

    for file_name in repo_files:
        # Download the file into the local cache and get its cached path
        cached_path = hf_hub_download(repo_id=repo_id, filename=file_name, token=token)

        # Mirror the repository's relative layout below model_dir
        destination_path = os.path.join(model_dir, file_name)
        os.makedirs(os.path.dirname(destination_path), exist_ok=True)

        # copy2 follows symlinks, so the destination is a regular file
        shutil.copy2(cached_path, destination_path)

# Example usage
# NOTE(review): "data/modell" differs from the "data/model" directory the other
# cells save to and load from — confirm a separate download directory is intended.
model_dir = "data/modell"
repo_id = "Ranjithjames/SPACY_NER"
# Never hard-code access tokens in a notebook: read the token from the
# environment instead. Any token that was previously committed in this cell
# should be revoked on the Hugging Face settings page.
token = os.environ.get("HF_TOKEN")
if not token:
    raise RuntimeError(
        "Set the HF_TOKEN environment variable to a Hugging Face access token "
        "before running this cell."
    )

download_all_files_from_huggingface(repo_id, model_dir, token)


## Evaluate

In [None]:
import spacy
from sklearn.metrics import precision_recall_fscore_support, confusion_matrix, classification_report
from tqdm import tqdm
from spacy.training import offsets_to_biluo_tags, biluo_tags_to_offsets
import numpy as np

# Loader for a saved pipeline (same definition as in the earlier load cell;
# presumably repeated so this cell can be run on its own)
def load_model(model_dir):
    """
    Read a serialized spaCy pipeline back from disk.

    Args:
        model_dir (str): Directory that holds the saved pipeline.

    Returns:
        Language: The pipeline object produced by ``spacy.load``.
    """
    pipeline = spacy.load(model_dir)
    return pipeline

# Load the pipeline stored under 'data/model' for evaluation
newnlp = load_model('data/model')

def non_overlapping_entities(entities):
    """
    Ensure entities are non-overlapping by sorting and filtering.

    Entities are scanned in order of start offset and an entity is kept only
    when it begins at or after the end of the previously kept one. When several
    entities share a start offset the longest is preferred, so a short match
    cannot shadow a longer one that starts at the same position.

    Args:
        entities (list of tuples): List of entities (start, end, label).

    Returns:
        list of tuples: Filtered non-overlapping entities, sorted by start.
    """
    # Sort by start; on ties put the longest span first so it wins the greedy pass
    sorted_entities = sorted(entities, key=lambda ent: (ent[0], -ent[1]))
    non_overlapping = []

    for ent in sorted_entities:
        if not non_overlapping or ent[0] >= non_overlapping[-1][1]:
            non_overlapping.append(ent)
    return non_overlapping

# Function to evaluate the model with progress tracking
def evaluate_model(ner_model, test_data):
    """
    Evaluate the NER model on SKILL entities at token level.

    For every text the gold and predicted entities are converted to BILUO tags
    over the same tokenisation, so the two sequences have one tag per token and
    are compared position by position. Tags that do not belong to a SKILL
    entity (other labels, 'O', and the '-' misalignment marker) are collapsed
    to 'O'. Precision, recall and F1 are computed over the SKILL tags only, so
    the many 'O' tokens do not inflate them, while SKILL-vs-'O' confusions
    (missed and spurious skills) still count as errors.

    Args:
        ner_model (Language): The spaCy NER model.
        test_data (list of tuples): A list where each tuple is of the form (text, {'entities': [(start, end, label), ...]})

    Returns:
        dict: 'precision', 'recall', 'f1' (support-weighted over SKILL tags),
        'support' (number of gold SKILL-tagged tokens), 'confusion_matrix'
        (rows/columns ordered as 'labels'), 'labels' (sorted tag names,
        including 'O') and 'classification_report' (text, SKILL tags only).
    """
    def _skill_or_outside(tag):
        # Same case-insensitive SKILL test the notebook used; everything else is 'O'
        return tag if 'skill' in tag.lower() else 'O'

    true_entities = []
    pred_entities = []

    for text, annotations in tqdm(test_data, desc="Evaluating"):
        doc = ner_model(text)
        true_ents = non_overlapping_entities(annotations['entities'])
        pred_ents = [(ent.start_char, ent.end_char, ent.label_) for ent in doc.ents]

        # Tokenise once and tag both entity sets against the same tokens so the
        # two tag sequences stay aligned
        reference_doc = ner_model.make_doc(text)
        true_tags = offsets_to_biluo_tags(reference_doc, true_ents)
        pred_tags = offsets_to_biluo_tags(reference_doc, pred_ents)

        true_entities.extend(_skill_or_outside(tag) for tag in true_tags)
        pred_entities.extend(_skill_or_outside(tag) for tag in pred_tags)

    # Sorted for a deterministic confusion-matrix layout
    labels = sorted(set(true_entities) | set(pred_entities))
    skill_labels = [label for label in labels if label != 'O']

    if skill_labels:
        # Score the SKILL tags only; zero_division=0 avoids warnings for tags
        # that were never predicted or never occur in the gold data
        precision, recall, f1, _ = precision_recall_fscore_support(
            true_entities, pred_entities,
            labels=skill_labels, average='weighted', zero_division=0,
        )
        class_report = classification_report(
            true_entities, pred_entities, labels=skill_labels, zero_division=0
        )
    else:
        # No SKILL tag in gold or prediction (e.g. empty test data)
        precision = recall = f1 = 0.0
        class_report = ""

    # average='weighted' makes sklearn return support=None, so count it directly
    support = sum(1 for tag in true_entities if tag != 'O')

    if labels:
        conf_matrix = confusion_matrix(true_entities, pred_entities, labels=labels)
    else:
        conf_matrix = np.zeros((0, 0), dtype=int)

    return {
        'precision': precision,
        'recall': recall,
        'f1': f1,
        'support': support,
        'confusion_matrix': conf_matrix,
        'labels': labels,
        'classification_report': class_report
    }


# Evaluate the reloaded pipeline on val_data, the 20% hold-out produced by the
# train/validation split earlier in the notebook
evaluation_results = evaluate_model(newnlp, val_data)

# Report the summary scores with four decimals, then the matrix and the
# per-tag report ('support' is returned too but not printed here)
print("Precision: {:.4f}".format(evaluation_results['precision']))
print("Recall: {:.4f}".format(evaluation_results['recall']))
print("F1 Score: {:.4f}".format(evaluation_results['f1']))
print("Confusion Matrix:\n{}".format(evaluation_results['confusion_matrix']))
print("Classification Report:\n{}".format(evaluation_results['classification_report']))
