# Import libraries

In [1]:
# Import libraries
import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
import pandas as pd
from sentence_transformers import SentenceTransformer, util
from sklearn.model_selection import train_test_split
from parrot import Parrot
import torch

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
def random_state(seed):
    """Seed PyTorch's random number generation with ``seed``.

    Calls ``torch.manual_seed`` and, when CUDA is available, also
    ``torch.cuda.manual_seed_all``. No other library is seeded here.

    Args:
        seed: Integer seed value.
    """
    torch.manual_seed(seed)
    if not torch.cuda.is_available():
        # Nothing CUDA-specific to seed on a CPU-only machine.
        return
    torch.cuda.manual_seed_all(seed)
# Report which device PyTorch can use: the GPU when one is visible, otherwise the CPU.
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'

# Seed PyTorch before any later cell draws random numbers.
random_state(1234)
print(device)

cuda


# Preprocessing

In [3]:
# Download the NLTK data files this notebook relies on:
#   - 'stopwords': the corpus read by stopwords.words('english').
#   - 'wordnet': presumably the data behind WordNetLemmatizer -- TODO confirm.
#   - 'punkt' / 'punkt_tab': presumably the tokenizer data needed by word_tokenize;
#     which of the two is required likely depends on the installed NLTK version -- TODO confirm.
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('punkt_tab')

[nltk_data] Downloading package stopwords to C:\Users\Subhayan
[nltk_data]     Das\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to C:\Users\Subhayan
[nltk_data]     Das\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to C:\Users\Subhayan
[nltk_data]     Das\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt_tab to C:\Users\Subhayan
[nltk_data]     Das\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


True

In [4]:
# Notebook-level NLTK helpers: a WordNet lemmatizer and the English stop-word set.
lemmatizer = WordNetLemmatizer()
stop_words = {*stopwords.words('english')}

In [5]:
def preprocess_sentence(sentence, remove_stopwords=False):
    """
    Normalize a single sentence before it is embedded.

    Steps:
      1. Delete every character that is not an ASCII letter or whitespace
         (digits and punctuation are removed outright, not replaced by a space).
      2. Lowercase the text and tokenize it with ``word_tokenize``.
      3. Optionally drop tokens that appear in the notebook-level ``stop_words`` set.
      4. Lemmatize each remaining token with the notebook-level ``lemmatizer``.
      5. Join the tokens back together with single spaces.

    Args:
        sentence (str): Raw input sentence.
        remove_stopwords (bool): When True, tokens found in ``stop_words`` are
            filtered out before lemmatization. Defaults to False, which keeps
            every token and matches the previous behavior (the stop-word
            filter used to be commented out).

    Returns:
        str: The cleaned, lemmatized sentence.
    """
    # 1. Remove special characters and numbers
    letters_only = re.sub(r"[^a-zA-Z\s]", "", sentence)

    # 2. Lowercase and tokenize
    words = word_tokenize(letters_only.lower())

    # 3. Remove stop words (opt-in)
    if remove_stopwords:
        words = [word for word in words if word not in stop_words]

    # 4. Lemmatize and 5. rebuild the sentence
    return " ".join(lemmatizer.lemmatize(word) for word in words)

In [6]:
def preprocess_augmented_dataset(augmented_dataset):
    """
    Run ``preprocess_sentence`` over every sentence of every group.

    Args:
        augmented_dataset: Iterable of sentence groups; each group is an
            iterable of sentences (original followed by its paraphrases).

    Returns:
        list: One list per input group, holding the preprocessed sentences
        in their original order.
    """
    cleaned_groups = []
    for sentence_group in augmented_dataset:
        cleaned_sentences = []
        for sentence in sentence_group:
            cleaned_sentences.append(preprocess_sentence(sentence))
        cleaned_groups.append(cleaned_sentences)
    return cleaned_groups

In [7]:

# Read the 'Train' sheet and preview its first rows and its shape.
train_df = pd.read_excel('./Dataset.xlsx', sheet_name='Train')
display(train_df.head(), train_df.shape)

Unnamed: 0,Questions
0,Tell me about yourself
1,Can you introduce yourself briefly?
2,What are your career goals?
3,Where do you see yourself in five years?
4,Why did you choose your field of study?


(100, 1)

In [8]:
# Plain Python list of the training questions, in sheet order.
raw_dataset = train_df['Questions'].tolist()
len(raw_dataset)

100

In [9]:
# Data Augmentation (Using Parrot for paraphrasing)
# Only request the GPU when one is actually available, so this cell also runs
# on CPU-only machines instead of failing on a hard-coded use_gpu=True.
use_gpu = torch.cuda.is_available()
parrot = Parrot(model_tag="prithivida/parrot_paraphraser_on_T5", use_gpu=use_gpu)

augmented_dataset = []
for sentence in raw_dataset:
    # augment() takes its own use_gpu argument (shown as use_gpu=False in the
    # Parrot README), so the flag is passed explicitly here as well.
    paraphrases = parrot.augment(input_phrase=sentence, use_gpu=use_gpu)

    # augment() may return None when it produces no paraphrase.
    if paraphrases is None:
        print(f"Warning: No paraphrases generated for sentence: {sentence}")
        paraphrases = []

    # Each group is the original sentence followed by its paraphrases. Every
    # entry returned by augment() is a pair whose second element is ignored.
    sentence_group = [sentence] + [paraphrase for paraphrase, _ in paraphrases]
    augmented_dataset.append(sentence_group)


You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565




In [10]:
# Clean every sentence of every group; the group structure is preserved.
augmented_preprocessed_dataset = preprocess_augmented_dataset(
    augmented_dataset=augmented_dataset
)
len(augmented_preprocessed_dataset)

100

# Unsupervised modeling with Sentence-BERT encodings

In [11]:
# Step 1: Load the pretrained Sentence-BERT encoder ('all-MiniLM-L6-v2').
# `model` is a notebook-level global: sentence_matching() reads it directly
# rather than receiving it as an argument.
model = SentenceTransformer('all-MiniLM-L6-v2')

In [12]:
# Merge each group (original + paraphrases) into one space-separated string,
# so that every training question is represented by a single text.
augmented_concatenated_dataset = list(map(" ".join, augmented_preprocessed_dataset))
len(augmented_concatenated_dataset)

100

In [13]:
# Step 2: Generate embeddings for the augmented dataset, one per concatenated group.
# Entry i is built from the group of raw_dataset[i]; sentence_matching() relies on
# that ordering to map the best-scoring position back to a training question.
dataset_embeddings = model.encode(augmented_concatenated_dataset, convert_to_tensor=True)

In [14]:
def sentence_matching(unknown_sentence, dataset_embeddings, raw_dataset):
    """
    Find the dataset sentence whose embedding is closest to an unseen sentence.

    The input is cleaned with ``preprocess_sentence``, encoded with the
    notebook-level ``model``, and compared against ``dataset_embeddings``
    using cosine similarity.

    Args:
        unknown_sentence: Raw query sentence.
        dataset_embeddings: Embeddings to search; position i is expected to
            correspond to ``raw_dataset[i]``.
        raw_dataset: Sequence of original sentences, indexed with the
            best-scoring position.

    Returns:
        tuple: ``(most_similar_sentence, preprocessed_unknown_sentence)``.
    """
    # Step 3: clean and encode the query
    cleaned_query = preprocess_sentence(unknown_sentence)
    query_embedding = model.encode(cleaned_query, convert_to_tensor=True)

    # Step 4: position of the highest cosine similarity
    best_idx = util.cos_sim(query_embedding, dataset_embeddings).argmax().item()

    return raw_dataset[best_idx], cleaned_query

In [15]:
# Try the matcher on a single hand-written question.
unknown_sentence = 'What do you consider your strongest skills?'
match_result = sentence_matching(unknown_sentence, dataset_embeddings, raw_dataset)
matched_sentence, preprocessed_unknown_sentence = match_result

# Show the raw input, its preprocessed form, and the training question it matched.
for label, value in (
    ("Input Sentence:", unknown_sentence),
    ("Preprocessed Input Sentence:", preprocessed_unknown_sentence),
    ("Most Similar Sentence:", matched_sentence),
):
    print(label, value)

Input Sentence: What do you consider your strongest skills?
Preprocessed Input Sentence: what do you consider your strongest skill
Most Similar Sentence: What are your greatest strengths?


# Testing

In [16]:
# Read the 'Test' sheet and preview its first rows and its shape.
test_df = pd.read_excel('./Dataset.xlsx', sheet_name='Test')
display(test_df.head(), test_df.shape)

Unnamed: 0,Questions
0,Can you share something about yourself?
1,Could you give a brief overview of who you are?
2,What are your short-term and long-term career ...
3,What position do you see yourself in five year...
4,Why did you decide to study in your chosen field?


(100, 1)

In [17]:
# Attach the training question as the expected match for each test row.
# NOTE(review): the column is aligned on the DataFrame index, so this presumes
# test row i is a paraphrase of train row i -- confirm against the workbook.
test_df = test_df.assign(**{'Ground Truth': train_df['Questions']})
display(test_df.head(), test_df.shape)

Unnamed: 0,Questions,Ground Truth
0,Can you share something about yourself?,Tell me about yourself
1,Could you give a brief overview of who you are?,Can you introduce yourself briefly?
2,What are your short-term and long-term career ...,What are your career goals?
3,What position do you see yourself in five year...,Where do you see yourself in five years?
4,Why did you decide to study in your chosen field?,Why did you choose your field of study?


(100, 2)

In [18]:
# Match every test question against the training set, keeping only the
# matched training sentence (element 0 of sentence_matching's result).
test_df['Matched Sentence'] = test_df['Questions'].map(
    lambda question: sentence_matching(question, dataset_embeddings, raw_dataset)[0]
)

display(test_df.head(), test_df.shape)

Unnamed: 0,Questions,Ground Truth,Matched Sentence
0,Can you share something about yourself?,Tell me about yourself,Tell me about yourself
1,Could you give a brief overview of who you are?,Can you introduce yourself briefly?,Tell me about yourself
2,What are your short-term and long-term career ...,What are your career goals?,What are your long-term aspirations?
3,What position do you see yourself in five year...,Where do you see yourself in five years?,Where do you see yourself in five years?
4,Why did you decide to study in your chosen field?,Why did you choose your field of study?,Why did you choose your field of study?


(100, 3)

In [19]:
# Accuracy = share of test rows whose matched sentence equals the ground truth exactly.
is_exact_match = test_df['Ground Truth'] == test_df['Matched Sentence']
test_df['Match_or_not'] = is_exact_match.astype('int64')
accuracy = test_df['Match_or_not'].sum() / len(test_df)
print("Accuracy: {:.2f}%".format(accuracy * 100))

Accuracy: 83.00%


In [20]:
# Rows that were not matched to their ground truth: save them to Excel, then display them.
misclassified = test_df.loc[test_df['Match_or_not'] == 0]
misclassified.to_excel('Test_cosine.xlsx')
misclassified

Unnamed: 0,Questions,Ground Truth,Matched Sentence,Match_or_not
1,Could you give a brief overview of who you are?,Can you introduce yourself briefly?,Tell me about yourself,0
2,What are your short-term and long-term career ...,What are your career goals?,What are your long-term aspirations?,0
5,What inspires you to perform well?,What motivates you in life?,What are your greatest strengths?,0
7,Where do you see yourself professionally in th...,What are your long-term aspirations?,Where do you see yourself in five years?,0
11,Are there any areas where you struggle and are...,What are your weaknesses?,How do you keep improving yourself?,0
18,What is your typical approach to solving tough...,How do you approach problem-solving?,How do you deal with unforeseen challenges?,0
20,Why are you interested in this particular posi...,Why do you want this job?,What interests you most about this role?,0
27,What would you aim to accomplish early in this...,What do you expect to accomplish in the first ...,What challenges are you expecting in this posi...,0
37,What is the professional achievement you’re mo...,What is your proudest accomplishment?,What is your greatest professional achievement?,0
38,Have you ever been a mentor or coach to someone?,Have you ever trained or mentored someone?,Have you ever mentored someone at work?,0
