# BERT Model

In [4]:
from transformers import BertTokenizer, BertModel
import torch

In [108]:
resume_text = "Hello i am a junior undergrad student."
application_text = "I am a experienced designer"
opening_description = "Need a really experienced and great full stack web developer"

In [109]:
tokenizer = BertTokenizer.from_pretrained('bert-large-uncased')
model = BertModel.from_pretrained('bert-large-uncased')

In [110]:
def get_embeddings(text):
    input_ids = tokenizer(text, return_tensors='pt', padding=True, truncation=True)['input_ids']
    with torch.no_grad():
        embeddings = model(input_ids).last_hidden_state.mean(dim=1)
    return embeddings

In [111]:
resume_embeddings = get_embeddings(resume_text)
application_embeddings = get_embeddings(application_text)
description_embeddings = get_embeddings(opening_description)

In [95]:
from sklearn.metrics.pairwise import cosine_similarity

similarity_resume_app = cosine_similarity(resume_embeddings, description_embeddings)
similarity_application_description = cosine_similarity(application_embeddings, description_embeddings)

In [96]:
similarity_resume_app

array([[0.7279686]], dtype=float32)

In [97]:
similarity_application_description

array([[0.7867838]], dtype=float32)

In [98]:
match_percentage = (similarity_resume_app + similarity_application_description) / 2
print("Match Percentage:", match_percentage[0][0])

Match Percentage: 0.7573762


# TFIDF Model

In [241]:
resume_text = "Hello i am a junior undergrad student."
application_text = "I am pathetic at web developement"
opening_description = "Need a really experienced and great full stack web developer"

In [242]:
from nltk.stem.porter import PorterStemmer
from nltk.corpus import stopwords
from nltk import pos_tag, word_tokenize
import string

ps=PorterStemmer()

custom_stopwords = ["need", "want", "this", "that", "fast"]

def stem(x):
    L = []
    tagged_tokens = pos_tag(x)
    for token, pos in tagged_tokens:
        stemmed_token = ps.stem(token.lower())
        if stemmed_token not in L and stemmed_token not in stopwords.words("english") and stemmed_token not in string.punctuation:
            L.append(stemmed_token)
    return " ".join(L)

In [243]:
resume_text = stem(resume_text.split())
application_text = stem(application_text.split())
opening_description = stem(opening_description.split())

In [244]:
application_text

'pathet web develop'

In [245]:
from sklearn.feature_extraction.text import TfidfVectorizer

tfidf_vectorizer = TfidfVectorizer()

combined_text = [resume_text, application_text]
tfidf_matrix = tfidf_vectorizer.fit_transform(combined_text)

opening_tfidf = tfidf_vectorizer.transform([opening_description])
similarity_scores = cosine_similarity(opening_tfidf, tfidf_matrix)

In [246]:
print("Cosine Similarity (Resume vs. Opening Description):", similarity_scores[0][0])
print("Cosine Similarity (Application vs. Opening Description):", similarity_scores[0][1])

Cosine Similarity (Resume vs. Opening Description): 0.0
Cosine Similarity (Application vs. Opening Description): 0.816496580927726


# MPNET Model

In [19]:
from transformers import AutoTokenizer, AutoModel
import torch
import torch.nn.functional as F
from transformers import pipeline

# Mean Pooling - Take attention mask into account for correct averaging
def mean_pooling(model_output, attention_mask):
    token_embeddings = model_output[0]  # First element of model_output contains all token embeddings
    input_mask_expanded = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
    return torch.sum(token_embeddings * input_mask_expanded, 1) / torch.clamp(input_mask_expanded.sum(1), min=1e-9)

# Load sentiment analysis model
sentiment_analysis = pipeline("sentiment-analysis")

# Load BERT-based model
tokenizer = AutoTokenizer.from_pretrained('sentence-transformers/all-mpnet-base-v2')
model = AutoModel.from_pretrained('sentence-transformers/all-mpnet-base-v2')

def get_embeddings(sentences):
    encoded_input = tokenizer(sentences, padding=True, truncation=True, return_tensors='pt')
    with torch.no_grad():
        model_output = model(**encoded_input)
    # Perform pooling
    sentence_embeddings = mean_pooling(model_output, encoded_input['attention_mask'])

    # Normalize embeddings
    sentence_embeddings = F.normalize(sentence_embeddings, p=2, dim=1)
    return sentence_embeddings

def get_sentiment_score(text):
    sentiment = sentiment_analysis(text)[0]
    if sentiment['label']=='POSITIVE':
        return sentiment['score']
    else:
        return sentiment['score']*-1

No model was supplied, defaulted to distilbert-base-uncased-finetuned-sst-2-english and revision af0f99b (https://huggingface.co/distilbert-base-uncased-finetuned-sst-2-english).
Using a pipeline without specifying a model name and revision in production is not recommended.


In [194]:
def get_similarity_score(application_text, opening_description):
    application_embeddings = get_embeddings([application_text])
    description_embeddings = get_embeddings([opening_description])

    similarity_score = cosine_similarity(application_embeddings, description_embeddings)[0][0]

    application_sentiment_score = get_sentiment_score(application_text)
    final_score = similarity_score+similarity_score*(abs(application_sentiment_score)**0.33)*(application_sentiment_score/abs(application_sentiment_score))

    return round((final_score/2)*100,5)

In [195]:
get_similarity_score(
    "I am an expert at full stack web development and have many years of experience and strong fundamentals",
    "Need a good full stack web developer which sufficient knowledge and good experience"
)

79.80434

In [204]:
get_similarity_score(
    "I am great at web development and have years of experience",
    "Need a good full stack web developer which sufficient knowledge and good experience"
)

71.33491

In [197]:
get_similarity_score(
    "I am good at web development and little experience in it",
    "Need a good full stack web developer which sufficient knowledge and good experience"
)

62.64155

In [198]:
get_similarity_score(
    "I am bad at web development and have no experience",
    "Need a good full stack web developer which sufficient knowledge and experience"
)

0.00212

In [199]:
get_similarity_score(
    "I am pathetic at web development and have no experience",
    "Need a good full stack web developer which sufficient knowledge and experience"
)

0.00225

In [200]:
get_similarity_score(
    "I am okay okay at web development and have just started learning it",
    "Need a good full stack web developer which sufficient knowledge and experience"
)

59.28756

In [201]:
get_similarity_score(
    "I am good at machine learning",
    "Need a good full stack web developer which sufficient knowledge and good experience"
)

25.83193

In [205]:
get_similarity_score(
    "I am good at web development but i dont know machine learning",
    "Need a good full stack web developer which sufficient knowledge and good experience"
)

0.2198

# Using Roberta with TFIDF

In [9]:
from transformers import pipeline

classifier = pipeline(task="text-classification", model="SamLowe/roberta-base-go_emotions", top_k=None)

def get_sentence_sentiment(sentence):
    model_outputs = classifier([sentence])
    output_obj={}
    
    for obj in model_outputs[0]:
        output_obj[obj['label']]=obj['score']
        
    return output_obj

In [10]:
sentiment_weights = {
    "admiration": 0.9,
    "amusement": 0.3,
    "anger": -0.4,
    "annoyance": -0.3,
    "approval": 0.3,
    "caring": 0.05,
    "confusion": -0.05,
    "curiosity": 0.3,
    "desire": 0.4,
    "disappointment": -0.3,
    "disgust": -0.2,
    "disapproval": -0.1,
    "embarrassment": 0.0,
    "excitement": 0.3,
    "fear": 0.0,
    "gratitude": 0.2,
    "grief": 0.0,
    "joy": 0.0,
    "love": 0.0,
    "nervousness": 0.0,
    "optimism": 0.05,
    "pride": -0.05,
    "realization": 0.0,
    "relief": 0.0,
    "remorse": 0.0,
    "sadness": -0.4,
    "surprise": 0.0,
    "neutral": 0.4,
}

In [11]:
def get_sentiment_score(sentence):
    sentiment_obj = get_sentence_sentiment(sentence)

    sentiment_labels = sentiment_obj.keys()
    sentiment_scores = sentiment_obj.values()
    
    # Normalize scores to a common scale (0 to 1)
    normalized_scores = [score / max(sentiment_scores) for score in sentiment_scores]
    
    # Calculate the weighted sentiment score
    weighted_sentiment_score = sum(normalized_scores[i] * sentiment_weights[label] for i, label in enumerate(sentiment_labels))

    return weighted_sentiment_score

In [12]:
sentence = "I am learning web development but i know a lot and i a few years of experience in this field"

sentiment_score = get_sentiment_score(sentence)

print(f"Weighted Sentiment Score: {sentiment_score:.2f}")

Weighted Sentiment Score: 0.64


In [395]:
from nltk.stem.porter import PorterStemmer
from nltk.corpus import stopwords
from nltk import pos_tag, word_tokenize
import string

ps=PorterStemmer()

custom_stopwords = ["need", "want", "this", "that", "fast"]

def stem(x):
    L = []
    tagged_tokens = pos_tag(x)
    for token, pos in tagged_tokens:
        token=token.lower()
        if pos != 'JJ' and pos != 'JJR' and pos != 'JJS' and token not in custom_stopwords:  # Remove adjectives
            stemmed_token = ps.stem(token)
            if stemmed_token not in L and stemmed_token not in stopwords.words("english") and stemmed_token not in string.punctuation:
                L.append(stemmed_token)
    return " ".join(L)

In [261]:
application_text = "I am pathetic at web development"
opening_description = "Need a really experienced and great full stack web developer"

application_text_stem = stem(application_text.split())
opening_description_stem = stem(opening_description.split())

In [262]:
from sklearn.feature_extraction.text import TfidfVectorizer

tfidf_vectorizer = TfidfVectorizer()

application_tfidf = tfidf_vectorizer.fit_transform([application_text_stem])

opening_tfidf = tfidf_vectorizer.transform([opening_description_stem])
similarity_score = cosine_similarity(opening_tfidf, application_tfidf)[0][0]

In [263]:
print("Cosine Similarity (Application vs. Opening Description):", similarity_score)

Cosine Similarity (Application vs. Opening Description): 0.8164965809277261


In [265]:
application_sentiment_score=get_sentiment_score(application_text)

In [266]:
print("Final Similarity Score (Application vs. Opening Description):", similarity_score*application_sentiment_score)

Final Similarity Score (Application vs. Opening Description): -0.3517101187232982


In [13]:
import numpy as np

def sigmoid(x):
    return 2 / (1 + np.exp(-x)) - 1

In [14]:
def get_similarity_score(application_text, opening_description):
    application_text_stem = stem(application_text.split())
    opening_description_stem = stem(opening_description.split())

    tfidf_vectorizer = TfidfVectorizer()

    application_tfidf = tfidf_vectorizer.fit_transform([application_text_stem])
    
    opening_tfidf = tfidf_vectorizer.transform([opening_description_stem])
    similarity_score = cosine_similarity(opening_tfidf, application_tfidf)[0][0]

    application_sentiment_score=get_sentiment_score(application_text)

    return sigmoid(similarity_score*application_sentiment_score)

In [434]:
get_similarity_score(
    "I am great at web development and have years of experience",
    "Need a good full stack web developer which sufficient knowledge and good experience"
)

0.3854615085547146

In [435]:
get_similarity_score(
    "I am good at web development and little experience in it",
    "Need a good full stack web developer which sufficient knowledge and good experience"
)

0.46706004643938615

In [436]:
get_similarity_score(
    "I am bad at web development and have no experience",
    "Need a good full stack web developer which sufficient knowledge and experience"
)

-0.22346232265652322

In [437]:
get_similarity_score(
    "I am pathetic at web development and have no experience",
    "Need a good full stack web developer which sufficient knowledge and experience"
)

-0.22948165796224218

In [438]:
get_similarity_score(
    "I am okay okay at web development and have just started learning it",
    "Need a good full stack web developer which sufficient knowledge and experience"
)

0.1876354931413573

In [439]:
get_similarity_score(
    "I am good at machine learning",
    "Need a good full stack web developer which sufficient knowledge and good experience"
)

0.0

## Using weight function

In [454]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

# Function to calculate the weighted score
def get_weighted_score(similarity_score, application_sentiment_score):
    # Function to determine the weight based on the similarity score
    def weight_function(similarity_score):
        weight = 1.0 + 0.5 * similarity_score**2
        return weight

    weight = weight_function(similarity_score)

    weighted_score = similarity_score * (1 + weight) * application_sentiment_score

    return weighted_score

def sigmoid(x):
    return 2 / (1 + np.exp(-x)) - 1

def get_similarity_score(application_text, opening_description):
    application_text_stem = stem(application_text.split())
    opening_description_stem = stem(opening_description.split())

    tfidf_vectorizer = TfidfVectorizer()

    application_tfidf = tfidf_vectorizer.fit_transform([application_text_stem])
    
    opening_tfidf = tfidf_vectorizer.transform([opening_description_stem])
    similarity_score = cosine_similarity(opening_tfidf, application_tfidf)[0][0]

    application_sentiment_score = get_sentiment_score(application_text)

    weighted_score = get_weighted_score(similarity_score, application_sentiment_score)

    return sigmoid(weighted_score)

In [455]:
get_similarity_score(
    "I am great at web development and have years of experience",
    "Need a good full stack web developer which sufficient knowledge and good experience"
)

0.7466488715297739

In [456]:
get_similarity_score(
    "I am good at web development and little experience in it",
    "Need a good full stack web developer which sufficient knowledge and good experience"
)

0.8526439606668026

In [457]:
get_similarity_score(
    "I am bad at web development and have no experience",
    "Need a good full stack web developer which sufficient knowledge and experience"
)

-0.5140682427714358

In [458]:
get_similarity_score(
    "I am pathetic at web development and have no experience",
    "Need a good full stack web developer which sufficient knowledge and experience"
)

-0.525643062152912

In [459]:
get_similarity_score(
    "I am okay okay at web development and have just started learning it",
    "Need a good full stack web developer which sufficient knowledge and experience"
)

0.39503115927524224

In [460]:
get_similarity_score(
    "I am good at machine learning",
    "Need a good full stack web developer which sufficient knowledge and good experience"
)

0.0

# Using Roberta with BERT

In [402]:
from transformers import BertTokenizer, BertModel
import torch
from sklearn.metrics.pairwise import cosine_similarity

tokenizer = BertTokenizer.from_pretrained('bert-large-uncased')
model = BertModel.from_pretrained('bert-large-uncased')

def get_embeddings(text):
    input_ids = tokenizer(text, return_tensors='pt', padding=True, truncation=True)['input_ids']
    with torch.no_grad():
        embeddings = model(input_ids).last_hidden_state.mean(dim=1)
    return embeddings

def sigmoid(x):
    return 2 / (1 + np.exp(-x)) - 1

def get_similarity_score(application_text, opening_description):
    application_embeddings = get_embeddings(application_text)
    description_embeddings = get_embeddings(opening_description)

    similarity_score = cosine_similarity(application_embeddings, description_embeddings)[0][0]

    application_sentiment_score=get_sentiment_score(application_text)
    return sigmoid(similarity_score*application_sentiment_score)

In [403]:
get_similarity_score(
    "I am great at web development and have years of experience",
    "Need a good full stack web developer which sufficient knowledge and good experience"
)

0.8220010637976313

In [404]:
get_similarity_score(
    "I am good at web development and little experience in it",
    "Need a good full stack web developer which sufficient knowledge and good experience"
)

0.8320659263853195

In [405]:
get_similarity_score(
    "I am bad at web development and have no experience",
    "Need a good full stack web developer which sufficient knowledge and experience"
)

-0.38609173546733483

In [406]:
get_similarity_score(
    "I am pathetic at web development and have no experience",
    "Need a good full stack web developer which sufficient knowledge and experience"
)

-0.396097810538605

In [407]:
get_similarity_score(
    "I am okay okay at web development and have just started learning it",
    "Need a good full stack web developer which sufficient knowledge and experience"
)

0.47414601236326276

In [408]:
get_similarity_score(
    "I am good at machine learning",
    "Need a good full stack web developer which sufficient knowledge and good experience"
)

0.7120702413987053

In [15]:
from transformers import BertTokenizer, BertModel
import torch
from sklearn.metrics.pairwise import cosine_similarity

tokenizer = BertTokenizer.from_pretrained('bert-large-uncased')
model = BertModel.from_pretrained('bert-large-uncased')

def get_embeddings(text):
    input_ids = tokenizer(text, return_tensors='pt', padding=True, truncation=True)['input_ids']
    with torch.no_grad():
        embeddings = model(input_ids).last_hidden_state.mean(dim=1)
    return embeddings

from nltk.stem.porter import PorterStemmer
from nltk.corpus import stopwords
from nltk import pos_tag, word_tokenize
import string

ps=PorterStemmer()

custom_stopwords = ["need", "want", "this", "that", "fast"]

def remove_adj(x):
    L = []
    tagged_tokens = pos_tag(x.split())
    for token, pos in tagged_tokens:
        if pos != 'JJ' and pos != 'JJR' and pos != 'JJS' and token not in custom_stopwords:  # Remove adjectives
            L.append(token)
    print(L)
    return " ".join(L)

def sigmoid(x):
    return 2 / (1 + np.exp(-x)) - 1

def get_similarity_score(application_text, opening_description):
    application_sentiment_score=get_sentiment_score(application_text)

    application_embeddings = get_embeddings(remove_adj(application_text))
    description_embeddings = get_embeddings(remove_adj(opening_description))

    similarity_score = cosine_similarity(application_embeddings, description_embeddings)[0][0]

    return sigmoid(similarity_score*application_sentiment_score)

In [16]:
get_similarity_score(
    "I am great at web development and have years of experience",
    "Need a good full stack web developer which sufficient knowledge and good experience"
)

['I', 'am', 'at', 'web', 'development', 'and', 'have', 'years', 'of', 'experience']
['Need', 'a', 'stack', 'web', 'developer', 'which', 'knowledge', 'and', 'experience']


0.385356292268237

In [206]:
from transformers import BertTokenizer, BertForTokenClassification
import torch

# Define the text and the target phrase to extract
text = "I am good at web development but I don't know machine learning."
target_phrase = "I am good at web development"

# Initialize the BERT tokenizer and model
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
model = BertForTokenClassification.from_pretrained("bert-base-uncased")

# Tokenize the input text
input_ids = tokenizer.encode(text, add_special_tokens=True)
input_ids = torch.tensor(input_ids).unsqueeze(0)  # Add batch dimension

# Predict the tokens and their labels
with torch.no_grad():
    outputs = model(input_ids)

# Retrieve the predicted labels
predicted_labels = outputs.logits.argmax(dim=2).squeeze(0).tolist()
predicted_tokens = [tokenizer.convert_ids_to_tokens(ids) for ids in predicted_labels][0]

# Combine tokens to reconstruct the extracted phrase
extracted_phrase = " ".join(predicted_tokens).replace(" ##", "").strip()

print("Extracted Phrase:", extracted_phrase)

# Check if the extracted phrase matches the target phrase
if target_phrase in extracted_phrase:
    print("Target phrase found in the text.")


Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Extracted Phrase: [ P A D ]
