# Import libraries

In [1]:
import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
import pandas as pd
from sentence_transformers import SentenceTransformer, util

  from .autonotebook import tqdm as notebook_tqdm


# Preprocessing

In [2]:
# Download required NLTK data files
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')

[nltk_data] Downloading package stopwords to C:\Users\Subhayan
[nltk_data]     Das\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to C:\Users\Subhayan
[nltk_data]     Das\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to C:\Users\Subhayan
[nltk_data]     Das\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [3]:
# Initialize tools
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

In [4]:
def preprocess_sentence(sentence):
    """
    Preprocess a single sentence: remove special characters, stop words, and apply lemmatization.
    """
    # 1. Remove special characters and numbers
    sentence = re.sub(r"[^a-zA-Z\s]", "", sentence)
    
    # 2. Tokenize the sentence
    words = word_tokenize(sentence.lower())  # Convert to lowercase and tokenize
    
    # 3. Remove stop words
#     filtered_words = [word for word in words if word not in stop_words]
    
#     # 4. Lemmatize words
#     lemmatized_words = [lemmatizer.lemmatize(word) for word in filtered_words]
    
    # 4. Lemmatize words
    lemmatized_words = [lemmatizer.lemmatize(word) for word in words]
    
    # 5. Reconstruct the sentence
    return " ".join(lemmatized_words)

In [5]:
# Preprocess the entire dataset
def preprocess_dataset(dataset):
    """
    Preprocess a dataset of sentences.
    """
    return [preprocess_sentence(sentence) for sentence in dataset]

In [6]:
train_df = pd.read_excel('./Dataset.xlsx', sheet_name='Train')
display(train_df.head())
display(train_df.shape)

Unnamed: 0,Questions
0,Tell me about yourself
1,Can you introduce yourself briefly?
2,What are your career goals?
3,Where do you see yourself in five years?
4,Why did you choose your field of study?


(100, 1)

In [7]:
raw_dataset = list(train_df['Questions'])
len(raw_dataset)

100

In [8]:
# Preprocess the dataset
preprocessed_dataset = preprocess_dataset(raw_dataset)
preprocessed_dataset

['tell me about yourself',
 'can you introduce yourself briefly',
 'what are your career goal',
 'where do you see yourself in five year',
 'why did you choose your field of study',
 'what motivates you in life',
 'how would you describe your personality',
 'what are your longterm aspiration',
 'what inspired you to pursue this career',
 'how do you define success',
 'what are your greatest strength',
 'what are your weakness',
 'how do you stay updated in your field',
 'can you share a technical skill youve mastered',
 'what new skill have you learned recently',
 'what certification do you hold',
 'how do you keep improving yourself',
 'are you familiar with specific toolsoftware',
 'how do you approach problemsolving',
 'what unique skill make you stand out',
 'why do you want this job',
 'what do you know about our company',
 'what interest you most about this role',
 'why are you the best candidate for this position',
 'how do your past experience align with this job',
 'what chall

# Unsupervised modeling with encodings using Sentence Bert

In [9]:
# Step 1: Load Sentence-BERT model
model = SentenceTransformer('all-MiniLM-L6-v2')

In [10]:
# Step 2: Generate Embeddings for the Preprocessed Dataset
dataset_embeddings = model.encode(preprocessed_dataset, convert_to_tensor=True)

In [11]:
def sentence_matching(unknown_sentence, dataset_embeddings, raw_dataset):
    
    # Step 3: Process an Unknown Sentence
    preprocessed_unknown_sentence = preprocess_sentence(unknown_sentence)
    unknown_embedding = model.encode(preprocessed_unknown_sentence, convert_to_tensor=True)
    
    # Step 4: Find the Most Similar Sentence
    similarity_scores = util.cos_sim(unknown_embedding, dataset_embeddings)
    most_similar_idx = similarity_scores.argmax().item()
    
    return (raw_dataset[most_similar_idx], preprocessed_unknown_sentence)

In [12]:
unknown_sentence = 'What do you consider your strongest skills?'
matched_sentence, preprocessed_unknown_sentence = sentence_matching(unknown_sentence, dataset_embeddings, raw_dataset)

# Output the results
print("Input Sentence:", unknown_sentence)
print("Preprocessed Input Sentence:", preprocessed_unknown_sentence)
print("Most Similar Sentence:", matched_sentence)

Input Sentence: What do you consider your strongest skills?
Preprocessed Input Sentence: what do you consider your strongest skill
Most Similar Sentence: What are your greatest strengths?


# Testing

In [13]:
test_df = pd.read_excel('./Dataset.xlsx', sheet_name='Test')
display(test_df.head())
display(test_df.shape)

Unnamed: 0,Questions
0,Can you share something about yourself?
1,Could you give a brief overview of who you are?
2,What are your short-term and long-term career ...
3,What position do you see yourself in five year...
4,Why did you decide to study in your chosen field?


(100, 1)

In [14]:
test_df['Ground Truth'] = train_df['Questions']
display(test_df.head())
display(test_df.shape)

Unnamed: 0,Questions,Ground Truth
0,Can you share something about yourself?,Tell me about yourself
1,Could you give a brief overview of who you are?,Can you introduce yourself briefly?
2,What are your short-term and long-term career ...,What are your career goals?
3,What position do you see yourself in five year...,Where do you see yourself in five years?
4,Why did you decide to study in your chosen field?,Why did you choose your field of study?


(100, 2)

In [15]:
# Classifying the test sentences to the training dataset 
test_df['Matched Sentence'] = test_df.apply(lambda x: sentence_matching(x['Questions'], dataset_embeddings, raw_dataset)[0], axis = 1)

In [16]:
display(test_df.head())
display(test_df.shape)

Unnamed: 0,Questions,Ground Truth,Matched Sentence
0,Can you share something about yourself?,Tell me about yourself,Tell me about yourself
1,Could you give a brief overview of who you are?,Can you introduce yourself briefly?,Tell me about yourself
2,What are your short-term and long-term career ...,What are your career goals?,What are your career goals?
3,What position do you see yourself in five year...,Where do you see yourself in five years?,Where do you see yourself in five years?
4,Why did you decide to study in your chosen field?,Why did you choose your field of study?,Why did you choose your field of study?


(100, 3)

In [19]:
# Calculating the accuracy
test_df['Match_or_not'] = test_df.apply(lambda x: 1 if x['Ground Truth'] == x['Matched Sentence'] else 0, axis=1)
accuracy = test_df['Match_or_not'].sum() / test_df.shape[0]
print("Accuracy: {:.2f}%".format(accuracy * 100))

Accuracy: 78.00%


In [18]:
# The ones which are not correctly classified
test_df[test_df['Match_or_not'] == 0].to_excel('Test_cosine.xlsx')
test_df[test_df['Match_or_not'] == 0]

Unnamed: 0,Questions,Ground Truth,Matched Sentence,Match_or_not
1,Could you give a brief overview of who you are?,Can you introduce yourself briefly?,Tell me about yourself,0
5,What inspires you to perform well?,What motivates you in life?,What unique skills make you stand out?,0
7,Where do you see yourself professionally in th...,What are your long-term aspirations?,Where do you see yourself in five years?,0
11,Are there any areas where you struggle and are...,What are your weaknesses?,How do you keep improving yourself?,0
12,How do you keep your knowledge and skills up-t...,How do you stay updated in your field?,What new skills have you learned recently?,0
16,What steps do you take for personal and profes...,How do you keep improving yourself?,What is your greatest professional achievement?,0
18,What is your typical approach to solving tough...,How do you approach problem-solving?,How do you deal with unforeseen challenges?,0
20,Why are you interested in this particular posi...,Why do you want this job?,What interests you most about this role?,0
27,What would you aim to accomplish early in this...,What do you expect to accomplish in the first ...,What interests you most about this role?,0
30,What’s the biggest success you’ve had in your ...,What is your greatest professional achievement?,What inspired you to pursue this career?,0
