In [None]:
## Import packages and Libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
from nltk.tokenize import word_tokenize, TreebankWordTokenizer
from nltk.corpus import stopwords
from nltk import PorterStemmer
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import CountVectorizer

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

import string

import io
import re
import nltk

import spacy
# Download the stopwords corpus if not already downloaded
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('omw-1.4')


# Load the spaCy English language model
nlp = spacy.load("en_core_web_sm")

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...


In [None]:
## load the dataset
# Read the raw question/answer sheet from its CSV export into a DataFrame.
# NOTE(review): the absolute '/content/...' path looks like a Google Colab
# upload location — confirm, and adjust it when running anywhere else.
df = pd.read_csv('/content/Health Care Service - Sheet1.csv')

In [None]:
df.head()

Unnamed: 0,S.No.,Topic,Job Role,Collaborator's Name,Question Type(if any),Questions,Answers,Source link
0,1.0,Nursing Assistants,Nurse,Sneha G S,,Why did you decide on a career as a nurse?,I come from a long line of nurses. Both my gra...,https://theinterviewguys.com/nursing-interview...
1,2.0,Nursing Assistants,Nurse,Sneha G S,,What do you find rewarding about this job?,"I truly love helping people, and when those pe...",
2,3.0,Nursing Assistants,Nurse,Sneha G S,,How do you deal with someone who isn’t satisfi...,While I constantly strive to do everything I c...,
3,4.0,Nursing Assistants,Nurse,Sneha G S,,Tell me what you feel your greatest skill as a...,I’m very proud of my ability to really listen ...,
4,5.0,Nursing Assistants,Nurse,Sneha G S,,How do you handle the stress of the job?,I find the best way to handle the stress of th...,


In [None]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1055 entries, 0 to 1054
Data columns (total 8 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   S.No.                  1049 non-null   float64
 1   Topic                  1055 non-null   object 
 2   Job Role               1055 non-null   object 
 3   Collaborator's Name    1055 non-null   object 
 4   Question Type(if any)  68 non-null     object 
 5   Questions              1054 non-null   object 
 6   Answers                1051 non-null   object 
 7   Source link            74 non-null     object 
dtypes: float64(1), object(7)
memory usage: 66.1+ KB


In [None]:
df.shape

(1055, 8)

In [None]:
df.isna().sum()

S.No.                      6
Topic                      0
Job Role                   0
Collaborator's Name        0
Question Type(if any)    987
Questions                  1
Answers                    4
Source link              981
dtype: int64

In [None]:
## Remove the bookkeeping columns that are not needed for the analysis
df = df.drop(
    labels=['S.No.', "Collaborator's Name", "Question Type(if any)", "Source link"],
    axis=1,
)

In [None]:
df.isna().sum()

Topic        0
Job Role     0
Questions    1
Answers      4
dtype: int64

In [None]:
## Keep only the rows in which every remaining column has a value
df = df.dropna(axis=0, how='any')

In [None]:
df.isna().sum()

Topic        0
Job Role     0
Questions    0
Answers      0
dtype: int64

In [None]:
df.shape

(1050, 4)

In [None]:
df.head(5)

Unnamed: 0,Topic,Job Role,Questions,Answers
0,Nursing Assistants,Nurse,Why did you decide on a career as a nurse?,I come from a long line of nurses. Both my gra...
1,Nursing Assistants,Nurse,What do you find rewarding about this job?,"I truly love helping people, and when those pe..."
2,Nursing Assistants,Nurse,How do you deal with someone who isn’t satisfi...,While I constantly strive to do everything I c...
3,Nursing Assistants,Nurse,Tell me what you feel your greatest skill as a...,I’m very proud of my ability to really listen ...
4,Nursing Assistants,Nurse,How do you handle the stress of the job?,I find the best way to handle the stress of th...


### Text cleaning and preprocessing

In [None]:
## Preprocess the dataset: remove punctuation and convert the text to lowercase

def preprocess_text_column(df, column_name):
    """Return a copy of ``df`` whose ``column_name`` text has been normalised.

    Each value in the column has its punctuation removed, is lower-cased, is
    split into tokens with ``word_tokenize`` and is re-joined with single
    spaces. Stop-word removal and stemming are not applied.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the text column. It is left unmodified.
    column_name : str
        Name of the column to clean. Every value in it must be a string.

    Returns
    -------
    pandas.DataFrame
        A copy of ``df`` with ``column_name`` replaced by the cleaned text.

    Raises
    ------
    KeyError
        If ``column_name`` is not a column of ``df``.
    """
    # string.punctuation only covers ASCII, so typographic quotes, dashes and
    # the ellipsis would otherwise survive (e.g. "isn’t" became "isn ’ t").
    typographic_punctuation = "‘’‚“”„–—…"
    # Build the translation table once instead of once per row.
    removal_table = str.maketrans("", "", string.punctuation + typographic_punctuation)

    def preprocess_text(text):
        """Strip punctuation, lower-case, tokenize and re-join one string."""
        text = text.translate(removal_table).lower()
        words = word_tokenize(text)
        # Join the processed words back into a single string
        return " ".join(words)

    # Work on a copy so the caller's frame keeps its original text; assigning
    # into ``df`` directly would silently rewrite the input as well.
    cleaned = df.copy()
    cleaned[column_name] = cleaned[column_name].apply(preprocess_text)

    return cleaned

In [None]:
cleaned_df = preprocess_text_column(df, 'Questions')

In [None]:
cleaned_df.head()

Unnamed: 0,Topic,Job Role,Questions,Answers
0,Nursing Assistants,Nurse,why did you decide on a career as a nurse,I come from a long line of nurses. Both my gra...
1,Nursing Assistants,Nurse,what do you find rewarding about this job,"I truly love helping people, and when those pe..."
2,Nursing Assistants,Nurse,how do you deal with someone who isn ’ t satis...,While I constantly strive to do everything I c...
3,Nursing Assistants,Nurse,tell me what you feel your greatest skill as a...,I’m very proud of my ability to really listen ...
4,Nursing Assistants,Nurse,how do you handle the stress of the job,I find the best way to handle the stress of th...


In [None]:
cleaned_df.Questions.unique()

array(['why did you decide on a career as a nurse',
       'what do you find rewarding about this job',
       'how do you deal with someone who isn ’ t satisfied with your patient care',
       'tell me what you feel your greatest skill as a nurse is',
       'how do you handle the stress of the job',
       'are you comfortable working with other doctors and nurses',
       'what do you find is the hardest part about being a nurse',
       'why are you the best nursing candidate for this position',
       'tell me about yourself',
       'why are you leaving your current position',
       'why do you want to work here',
       'would you say you ’ re a team player',
       'describe a time you had to deal with a difficult patient and how you handled that',
       'why should we hire you',
       'if you disagree with a physician ’ s approach to treating a patient how do you handle it',
       'how do you explain complex medical terms and procedures to patients who don ’ t have a heal

In [None]:
## Reset the index to avoid indexing errors: the earlier dropna left gaps in
## the row labels, while later cells look rows up with .loc using 0..n-1 positions.
## drop=True discards the old index instead of keeping it as a column.
cleaned_df.reset_index(inplace = True, drop = True)

### Grouping questions according to similarity scores using a 0.5 similarity score threshold

In [None]:
## Group questions whose TF-IDF cosine similarity reaches the threshold.
## Questions with no neighbour at or above the threshold stay ungrouped.
# cleaned_df is a pandas dataframe containing questions and answers

# Extract TF-IDF features from the questions
tfidf_vectorizer = TfidfVectorizer()
tfidf_matrix = tfidf_vectorizer.fit_transform(cleaned_df['Questions'])

# Calculate the pairwise cosine similarity between questions
cosine_similarities = cosine_similarity(tfidf_matrix)

# Define a threshold for grouping questions
threshold = 0.5

# Groups as lists of row positions; grouped_questions[k] holds group number k + 1
grouped_questions = []

# Row position -> group number for every grouped question (seeds included)
question_to_group = {}

# Counter for group numbering
group_number = 1

# Rows of the similarity matrix follow the row order of cleaned_df, so the
# texts are looked up by position rather than by index label.
question_texts = cleaned_df['Questions'].tolist()

for i in range(len(question_texts)):
    # A question that already belongs to a group never seeds a new one
    if i in question_to_group:
        continue

    # Every other question that is similar enough to question i
    neighbours = [
        j for j in range(len(question_texts))
        if j != i and cosine_similarities[i, j] >= threshold
    ]
    if not neighbours:
        # Nothing crosses the threshold: the question stays ungrouped
        continue

    # Only questions that are still free may join a new group; without this
    # check a question could land in several groups and its label would be
    # overwritten by whichever group was built last.
    unassigned = [j for j in neighbours if j not in question_to_group]
    if unassigned:
        current_group = [i] + unassigned
        grouped_questions.append(current_group)
        for member in current_group:
            question_to_group[member] = group_number
        # Increment the group number
        group_number += 1
    else:
        # All similar questions are already grouped: join the group of the
        # most similar one instead of pulling members out of existing groups.
        closest = max(neighbours, key=lambda j: cosine_similarities[i, j])
        existing_group = question_to_group[closest]
        grouped_questions[existing_group - 1].append(i)
        question_to_group[i] = existing_group

# Print the grouped questions with group numbers (a separate loop variable
# keeps the group_number counter intact)
for number, group in enumerate(grouped_questions, start=1):
    print(f"Group {number}:")
    for i in group:
        print(question_texts[i])


Group 1:
why did you decide on a career as a nurse
why did you decide to be a nurse
why did you decide on a career as a nurse
Group 2:
what do you find rewarding about this job
what do you find most rewarding about being a nurse
what do you find most rewarding about being a nurse
what do you find rewarding about this job
Group 3:
how do you deal with someone who isn ’ t satisfied with your patient care
how do you deal with someone who isn ’ t satisfied with your patient care
Group 4:
tell me what you feel your greatest skill as a nurse is
what ’ s your strongest skill as a nurse
tell me what you feel your greatest skill as a nurse is
what ’ s your strongest skill as a nurse
Group 5:
how do you handle the stress of the job
how do you handle workplace stress
how do you handle workplace stress
how do you handle stress
how do you handle stress
do you want the job
how do you handle the stress of the job
Group 6:
are you comfortable working with other doctors and nurses
how well do you work 

In [None]:
# Start every row without a label; rows that belong to no group keep None
cleaned_df['labels'] = None

# Groups are numbered from 1 in the order they appear in grouped_questions
for label, members in zip(range(1, len(grouped_questions) + 1), grouped_questions):
    cleaned_df.loc[members, 'labels'] = label

# Display the labelled DataFrame
cleaned_df

Unnamed: 0,Topic,Job Role,Questions,Answers,labels
0,Nursing Assistants,Nurse,why did you decide on a career as a nurse,I come from a long line of nurses. Both my gra...,1
1,Nursing Assistants,Nurse,what do you find rewarding about this job,"I truly love helping people, and when those pe...",2
2,Nursing Assistants,Nurse,how do you deal with someone who isn ’ t satis...,While I constantly strive to do everything I c...,3
3,Nursing Assistants,Nurse,tell me what you feel your greatest skill as a...,I’m very proud of my ability to really listen ...,4
4,Nursing Assistants,Nurse,how do you handle the stress of the job,I find the best way to handle the stress of th...,5
...,...,...,...,...,...
1045,Medical Assistant,Medical Assistant,tell me about your previous experience as a me...,I have worked in a hospital setting for the pa...,
1046,Medical Assistant,Medical Assistant,what do you think makes a good medical assistant,"From my experience, empathy is one of the key ...",226
1047,Medical Assistant,Medical Assistant,describe a time when you solved a problem invo...,The most challenging situation I experienced a...,
1048,Medical Assistant,Medical Assistant,what experience do you have with medical techn...,When I first entered the medical assistant fie...,114


In [None]:
cleaned_df.labels.nunique()

226

In [None]:
## Sort the dataframe by group number so the members of a group sit together.
## The sort is done in place; rows without a label end up at the bottom.
cleaned_df.sort_values(by='labels', inplace=True)
cleaned_df

Unnamed: 0,Topic,Job Role,Questions,Answers,labels
0,Nursing Assistants,Nurse,why did you decide on a career as a nurse,I come from a long line of nurses. Both my gra...,1
1009,Nursing Assistants,Nurse,why did you decide on a career as a nurse,I come from a long line of nurses. Both my gra...,1
1,Nursing Assistants,Nurse,what do you find rewarding about this job,"I truly love helping people, and when those pe...",2
1010,Nursing Assistants,Nurse,what do you find rewarding about this job,"I truly love helping people, and when those pe...",2
2,Nursing Assistants,Nurse,how do you deal with someone who isn ’ t satis...,While I constantly strive to do everything I c...,3
...,...,...,...,...,...
1042,Nursing Assistants,Nurse,how do you use organizational skills in your d...,"Being as a nurse, I’m enrolled with various re...",
1043,Nursing Assistants,Nurse,which factors give out as a motivation for you...,"The admiration from my patients, seniors and t...",
1045,Medical Assistant,Medical Assistant,tell me about your previous experience as a me...,I have worked in a hospital setting for the pa...,
1047,Medical Assistant,Medical Assistant,describe a time when you solved a problem invo...,The most challenging situation I experienced a...,


In [None]:
# Save the labelled questions to CSV in the working directory;
# index=False keeps the row index out of the file.
cleaned_df.to_csv('healthcare_labelled_dataset.csv', index=False)

In [None]:
# Reload the labelled dataset from the same relative path it was written to
# by the preceding to_csv call, instead of assuming the working directory
# is /content.
grouped_df = pd.read_csv('healthcare_labelled_dataset.csv')

In [None]:
grouped_df

Unnamed: 0,Topic,Job Role,Questions,Answers,labels
0,Nursing Assistants,Nurse,why did you decide on a career as a nurse,I come from a long line of nurses. Both my gra...,1.0
1,Nursing Assistants,Nurse,why did you decide on a career as a nurse,I come from a long line of nurses. Both my gra...,1.0
2,Nursing Assistants,Nurse,what do you find rewarding about this job,"I truly love helping people, and when those pe...",2.0
3,Nursing Assistants,Nurse,what do you find rewarding about this job,"I truly love helping people, and when those pe...",2.0
4,Nursing Assistants,Nurse,how do you deal with someone who isn ’ t satis...,While I constantly strive to do everything I c...,3.0
...,...,...,...,...,...
1045,Nursing Assistants,Nurse,how do you use organizational skills in your d...,"Being as a nurse, I’m enrolled with various re...",
1046,Nursing Assistants,Nurse,which factors give out as a motivation for you...,"The admiration from my patients, seniors and t...",
1047,Medical Assistant,Medical Assistant,tell me about your previous experience as a me...,I have worked in a hospital setting for the pa...,
1048,Medical Assistant,Medical Assistant,describe a time when you solved a problem invo...,The most challenging situation I experienced a...,


In [None]:
grouped_df.columns

Index(['Topic', 'Job Role', 'Questions', 'Answers', 'labels'], dtype='object')

In [None]:
grouped_df['Job Role'].unique()

array(['Nurse', 'Nursing', 'Home Health Aide', 'Medical Assistant', 'All',
       'Midwife', 'Personal care Assistant', 'Medical assistant',
       'Nurse and Midwife'], dtype=object)

In [None]:
grouped_df['Topic'].unique()

array(['Nursing Assistants', 'Nursing assistant', 'Home Health Aide',
       'Problem-solving skills', 'Behavioral Question',
       'Medical Assistant', 'General', 'nursing assistant',
       'Nursing, Situational Questions',
       'Home Health and Personal Care Aides',
       'Nursing, Behavioral Question', 'Expertise question',
       'Nursing Assistant', 'Medical Assistants',
       'Personality and Character Nursing Interview Question',
       'Educational Background Nursing Interview Question',
       'Questions About Work Experience',
       'Choice About Being a Nurse - Nursing Interview Question',
       'Questions To Assess Critical Thinking',
       'Diversity and Cultural Competency'], dtype=object)