# Import libraries

In [1]:
import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
import pandas as pd
# from sentence_transformers import SentenceTransformer, util
import Levenshtein

# Preprocessing

In [2]:
# Download required NLTK data files
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')

[nltk_data] Downloading package stopwords to C:\Users\Subhayan
[nltk_data]     Das\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to C:\Users\Subhayan
[nltk_data]     Das\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to C:\Users\Subhayan
[nltk_data]     Das\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [3]:
# Initialize tools
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

In [4]:
def preprocess_sentence(sentence):
    """
    Preprocess a single sentence: remove special characters, stop words, and apply lemmatization.
    """
    # 1. Remove special characters and numbers
    sentence = re.sub(r"[^a-zA-Z\s]", "", sentence)
    
    # 2. Tokenize the sentence
    words = word_tokenize(sentence.lower())  # Convert to lowercase and tokenize
    
    # 3. Remove stop words
#     filtered_words = [word for word in words if word not in stop_words]
    # 4. Lemmatize words
#     lemmatized_words = [lemmatizer.lemmatize(word) for word in filtered_words]
    
    # 4. Lemmatize words
    lemmatized_words = [lemmatizer.lemmatize(word) for word in words]
    
    # 5. Reconstruct the sentence
    return " ".join(lemmatized_words)

In [5]:
# Preprocess the entire dataset
def preprocess_dataset(dataset):
    """
    Preprocess a dataset of sentences.
    """
    return [preprocess_sentence(sentence) for sentence in dataset]

In [6]:
train_df = pd.read_excel('./Dataset.xlsx', sheet_name='Train')
display(train_df.head())
display(train_df.shape)

Unnamed: 0,Questions
0,Tell me about yourself
1,Can you introduce yourself briefly?
2,What are your career goals?
3,Where do you see yourself in five years?
4,Why did you choose your field of study?


(100, 1)

In [7]:
raw_dataset = list(train_df['Questions'])
len(raw_dataset)

100

In [8]:
# Preprocess the dataset
preprocessed_dataset = preprocess_dataset(raw_dataset)
preprocessed_dataset

['tell me about yourself',
 'can you introduce yourself briefly',
 'what are your career goal',
 'where do you see yourself in five year',
 'why did you choose your field of study',
 'what motivates you in life',
 'how would you describe your personality',
 'what are your longterm aspiration',
 'what inspired you to pursue this career',
 'how do you define success',
 'what are your greatest strength',
 'what are your weakness',
 'how do you stay updated in your field',
 'can you share a technical skill youve mastered',
 'what new skill have you learned recently',
 'what certification do you hold',
 'how do you keep improving yourself',
 'are you familiar with specific toolsoftware',
 'how do you approach problemsolving',
 'what unique skill make you stand out',
 'why do you want this job',
 'what do you know about our company',
 'what interest you most about this role',
 'why are you the best candidate for this position',
 'how do your past experience align with this job',
 'what chall

# Unsupervised modeling using Levenshtein distance

In [9]:
# Step 5: Compute Levenshtein Distance and Find Most Similar Sentence
def compute_levenshtein_distance(sentence1, sentence2):
    return Levenshtein.distance(sentence1, sentence2)


In [10]:
def sentence_matching(unknown_sentence, preprocessed_dataset, raw_dataset):
    
    # Process an Unknown Sentence
    preprocessed_unknown_sentence = preprocess_sentence(unknown_sentence)
    
    # Compute the Levenshtein distances for the unknown sentence against each in the dataset
    distances = [compute_levenshtein_distance(preprocessed_unknown_sentence, sentence) for sentence in preprocessed_dataset]

    # Find the index of the sentence with the smallest distance (most similar)
    most_similar_idx = distances.index(min(distances))
    
    return raw_dataset[most_similar_idx], preprocessed_unknown_sentence

In [11]:
# # Step 4: Process an Unknown Sentence
# unknown_sentence = "Can you share something about yourself?"
# preprocessed_unknown_sentence = preprocess_sentence(unknown_sentence)


In [12]:

# # Compute the Levenshtein distances for the unknown sentence against each in the dataset
# distances = [compute_levenshtein_distance(preprocessed_unknown_sentence, sentence) for sentence in preprocessed_dataset]

# # Find the index of the sentence with the smallest distance (most similar)
# most_similar_idx = distances.index(min(distances))


In [14]:
unknown_sentence = 'Can you share something about yourself?'
matched_sentence, preprocessed_unknown_sentence = sentence_matching(unknown_sentence, preprocessed_dataset, raw_dataset)

# Output the results
print("Input Sentence:", unknown_sentence)
print("Preprocessed Input Sentence:", preprocessed_unknown_sentence)
print("Most Similar Sentence:", matched_sentence)

Input Sentence: Can you share something about yourself?
Preprocessed Input Sentence: can you share something about yourself
Most Similar Sentence: Tell me about yourself


# Testing

In [15]:
test_df = pd.read_excel('./Dataset.xlsx', sheet_name='Test')
display(test_df.head())
display(test_df.shape)

Unnamed: 0,Questions
0,Can you share something about yourself?
1,Could you give a brief overview of who you are?
2,What are your short-term and long-term career ...
3,What position do you see yourself in five year...
4,Why did you decide to study in your chosen field?


(100, 1)

In [16]:
test_df['Ground Truth'] = train_df['Questions']
display(test_df.head())
display(test_df.shape)

Unnamed: 0,Questions,Ground Truth
0,Can you share something about yourself?,Tell me about yourself
1,Could you give a brief overview of who you are?,Can you introduce yourself briefly?
2,What are your short-term and long-term career ...,What are your career goals?
3,What position do you see yourself in five year...,Where do you see yourself in five years?
4,Why did you decide to study in your chosen field?,Why did you choose your field of study?


(100, 2)

In [18]:
# Classifying the test sentences to the training dataset 
test_df['Matched Sentence'] = test_df.apply(lambda x: sentence_matching(x['Questions'], preprocessed_dataset, raw_dataset)[0], axis = 1)

In [19]:
display(test_df.head())
display(test_df.shape)

Unnamed: 0,Questions,Ground Truth,Matched Sentence
0,Can you share something about yourself?,Tell me about yourself,Tell me about yourself
1,Could you give a brief overview of who you are?,Can you introduce yourself briefly?,Have you ever trained or mentored someone?
2,What are your short-term and long-term career ...,What are your career goals?,What are your long-term aspirations?
3,What position do you see yourself in five year...,Where do you see yourself in five years?,Where do you see yourself in five years?
4,Why did you decide to study in your chosen field?,Why did you choose your field of study?,How do you stay updated in your field?


(100, 3)

In [22]:
# Calculating the accuracy
test_df['Match_or_not'] = test_df.apply(lambda x: 1 if x['Ground Truth'] == x['Matched Sentence'] else 0, axis=1)
accuracy = test_df['Match_or_not'].sum() / test_df.shape[0]
print("Accuracy: {:.2f}%".format(accuracy * 100))

Accuracy: 47.00%


In [21]:
# The ones which are not correctly classified
test_df[test_df['Match_or_not'] == 0].to_excel('Test_lev.xlsx')
test_df[test_df['Match_or_not'] == 0]

Unnamed: 0,Questions,Ground Truth,Matched Sentence,Match_or_not
1,Could you give a brief overview of who you are?,Can you introduce yourself briefly?,Have you ever trained or mentored someone?,0
2,What are your short-term and long-term career ...,What are your career goals?,What are your long-term aspirations?,0
4,Why did you decide to study in your chosen field?,Why did you choose your field of study?,How do you stay updated in your field?,0
5,What inspires you to perform well?,What motivates you in life?,What inspired you to pursue this career?,0
7,Where do you see yourself professionally in th...,What are your long-term aspirations?,Where do you see yourself in five years?,0
8,Who or what influenced your career choice the ...,What inspired you to pursue this career?,What are your career goals?,0
9,What does success mean to you?,How do you define success?,What are your career goals?,0
10,What do you consider your strongest skills?,What are your greatest strengths?,Why did you choose your field of study?,0
11,Are there any areas where you struggle and are...,What are your weaknesses?,Is there anything else you’d like us to know?,0
12,How do you keep your knowledge and skills up-t...,How do you stay updated in your field?,How do you keep improving yourself?,0
