# Importing Necessary Packages

In [1]:
import pandas as pd
import numpy as np
import json
import matplotlib
import seaborn as sns
import nltk
import string
from nltk.corpus import stopwords
from sklearn.metrics import jaccard_score
from nltk.metrics import jaccard_distance
from nltk.tokenize import RegexpTokenizer
from nltk.stem import WordNetLemmatizer
from nltk.stem.porter import PorterStemmer
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from gensim.models import Word2Vec
import gensim
from sentence_transformers import SentenceTransformer

# Performing EDA: Exploratory Data Analysis

In [2]:
#cur_dir = Path().resolve()
#print(cur_dir)
#path_to_file = Path.joinpath(cur_dir, 'potential_talents_OG.xlsx')
#join_path = cur_dir / 'potential_talents_OG.xlsx'
#print("the joined path =", join_path)

data = pd.read_excel("potential_talents_OG.xlsx")

In [3]:
# data = pd.read_excel(path_to_file)
# print(data)
print(data)
data.head()

      id                                          job_title  \
0      1  2019 C.T. Bauer College of Business Graduate (...   
1      2  Native English Teacher at EPIK (English Progra...   
2      3              Aspiring Human Resources Professional   
3      4             People Development Coordinator at Ryan   
4      5    Advisory Board Member at Celal Bayar University   
..   ...                                                ...   
99   100  Aspiring Human Resources Manager | Graduating ...   
100  101              Human Resources Generalist at Loparex   
101  102   Business Intelligence and Analytics at Travelers   
102  103                     Always set them up for Success   
103  104   Director Of Administration at Excellence Logging   

                                location connection  fit  
0                         Houston, Texas         85  NaN  
1                                 Kanada      500+   NaN  
2    Raleigh-Durham, North Carolina Area         44  NaN  
3      

Unnamed: 0,id,job_title,location,connection,fit
0,1,2019 C.T. Bauer College of Business Graduate (...,"Houston, Texas",85,
1,2,Native English Teacher at EPIK (English Progra...,Kanada,500+,
2,3,Aspiring Human Resources Professional,"Raleigh-Durham, North Carolina Area",44,
3,4,People Development Coordinator at Ryan,"Denton, Texas",500+,
4,5,Advisory Board Member at Celal Bayar University,"İzmir, Türkiye",500+,


In [4]:
data.describe() # the stats of the data, mean, mediun, max and min, and std value of the data

Unnamed: 0,id,fit
count,104.0,0.0
mean,52.5,
std,30.166206,
min,1.0,
25%,26.75,
50%,52.5,
75%,78.25,
max,104.0,


In [5]:
data.shape # checking the shape of the data

# balance data

(104, 5)

In [6]:
data['job_title']

0      2019 C.T. Bauer College of Business Graduate (...
1      Native English Teacher at EPIK (English Progra...
2                  Aspiring Human Resources Professional
3                 People Development Coordinator at Ryan
4        Advisory Board Member at Celal Bayar University
                             ...                        
99     Aspiring Human Resources Manager | Graduating ...
100                Human Resources Generalist at Loparex
101     Business Intelligence and Analytics at Travelers
102                       Always set them up for Success
103     Director Of Administration at Excellence Logging
Name: job_title, Length: 104, dtype: object

In [7]:
data['location']

0                           Houston, Texas
1                                   Kanada
2      Raleigh-Durham, North Carolina Area
3                            Denton, Texas
4                           İzmir, Türkiye
                      ...                 
99                Cape Girardeau, Missouri
100    Raleigh-Durham, North Carolina Area
101             Greater New York City Area
102               Greater Los Angeles Area
103                            Katy, Texas
Name: location, Length: 104, dtype: object

# Pre-processing Text and Applying Jaccard Similarity

In [17]:
#To lemmatize and stem an Excel file, remove stop words, remove punctuation, and tokenize the 
#text using RegexpTokenizer, you can follow these steps:

tokenizer = RegexpTokenizer(r'\w+')
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()
stemmer = PorterStemmer()

def preprocess_text(text):
    # Tokenize the text
    tokens = tokenizer.tokenize(text) if isinstance(text, str) else text
    
    # Remove punctuation
    tokens = [token for token in tokens if token.isalpha()]

    # Remove stop words
    tokens = [token for token in tokens if token.lower() not in stop_words]

    # Lemmatize
    tokens = [lemmatizer.lemmatize(token) for token in tokens]

    # Stem
    tokens = [stemmer.stem(token) for token in tokens]

    return tokens

In [18]:
input_text = input("Insert your free text: ")
preprocessed_input = preprocess_text(input_text) # Replace with the actual column names to process
preprocessed_data_excel = data['job_title'].apply(preprocess_text)

print(preprocessed_data_excel.to_excel('preprocessed_file.xlsx', index=False))
print(preprocessed_data_excel) #printing out the processed columns
print(preprocessed_input) # printing out the processed input text 

Insert your free text: r
None
0      [c, bauer, colleg, busi, graduat, magna, cum, ...
1      [nativ, english, teacher, epik, english, progr...
2                    [aspir, human, resourc, profession]
3                        [peopl, develop, coordin, ryan]
4       [advisori, board, member, celal, bayar, univers]
                             ...                        
99     [aspir, human, resourc, manag, graduat, may, s...
100                [human, resourc, generalist, loparex]
101                     [busi, intellig, analyt, travel]
102                                [alway, set, success]
103                    [director, administr, excel, log]
Name: job_title, Length: 104, dtype: object
['r']


In [9]:
df = pd.read_excel('preprocessed_file.xlsx')
df['preprocessed_column'] = df['job_title'].apply(preprocess_text)

df['fit_score'] = df['preprocessed_column'].apply(lambda x: len(set(x).intersection(set(preprocessed_input))) / len(set(x).union(set(preprocessed_input))))
df.to_excel('output_file.xlsx', index=False)
print(df)

                                             job_title  \
0    ['c', 'bauer', 'colleg', 'busi', 'graduat', 'm...   
1    ['nativ', 'english', 'teacher', 'epik', 'engli...   
2          ['aspir', 'human', 'resourc', 'profession']   
3              ['peopl', 'develop', 'coordin', 'ryan']   
4    ['advisori', 'board', 'member', 'celal', 'baya...   
..                                                 ...   
99   ['aspir', 'human', 'resourc', 'manag', 'gradua...   
100      ['human', 'resourc', 'generalist', 'loparex']   
101           ['busi', 'intellig', 'analyt', 'travel']   
102                        ['alway', 'set', 'success']   
103          ['director', 'administr', 'excel', 'log']   

                                   preprocessed_column  fit_score  
0    [c, bauer, colleg, busi, graduat, magna, cum, ...        0.0  
1    [nativ, english, teacher, epik, english, progr...        0.0  
2                     [aspir, human, resourc, profess]        0.0  
3                      [peopl, 

# Applying Cosine Similarity

### Sentence Transformer + Calculating Similarity

In [10]:
# Load a pre-trained SentenceTransformer model
model = SentenceTransformer('all-MiniLM-L12-v2')

In [11]:
sentences = data['job_title'].tolist() # data = the original dataset 

In [12]:
#Sentences are encoded by calling model.encode()
embeddings = model.encode(sentences)

In [13]:
input_text_CS = input("Insert your free text: ")
input_embedding = model.encode([input_text_CS])[0]

Insert your free text: I want to work as an engineer who is interested in art and wants to be very rich in the future


In [14]:
#Print the embeddings
for sentence, embedding in zip(sentences, embeddings):
    print("Sentence:", sentence)
    print("Embedding:", embedding)
    print()

Sentence: 2019 C.T. Bauer College of Business Graduate (Magna Cum Laude) and aspiring Human Resources professional
Embedding: [-3.94034619e-03  3.55660170e-02  1.11462660e-02  2.62162667e-02
  6.14758879e-02 -1.99232083e-02 -1.63670480e-02  3.64676793e-03
 -8.17630813e-02  2.49527656e-02 -3.00650764e-02 -5.20287454e-02
 -5.18384092e-02 -1.90392062e-02  6.32134452e-02  6.98571876e-02
  3.23789269e-02 -2.19207220e-02  1.22361742e-01  1.16313091e-02
 -4.73656990e-02 -4.10423093e-02  5.49963862e-03 -6.62717223e-02
  7.75792077e-02  1.29174786e-02  1.63841993e-02 -6.76391944e-02
 -1.77451745e-02 -3.83062363e-02 -5.73774278e-02  4.78714297e-04
 -1.76203474e-02  1.32092848e-01  6.25960436e-03  8.01007450e-02
  1.39185619e-02  2.16158591e-02  5.64356074e-02  5.24338223e-02
 -5.56738041e-02 -7.35350326e-02  2.62083989e-02  3.77269946e-02
 -5.33966534e-02  6.64172275e-03  6.92184409e-03 -6.36435300e-02
 -7.09201694e-02  5.22959828e-02 -7.64673427e-02 -1.33154526e-01
 -5.11847902e-03  1.95868481e

In [20]:
#Print the embeddings
for sentence, embedding in zip([input_text_CS], [input_embedding]):
    print("Sentence:", sentence)
    print("Embedding:", embedding)
    print()

Sentence: I want to work as an engineer who is interested in art and wants to be very rich in the future
Embedding: [ 4.82598767e-02  5.43124415e-02  7.36079216e-02  4.10336852e-02
 -3.19703594e-02 -1.04314797e-02 -1.41701959e-02  4.66357404e-03
 -1.37537951e-02 -7.78096020e-02 -1.65516231e-02 -7.64056668e-02
 -4.49598692e-02  3.63136902e-02 -1.79152861e-02 -3.78757752e-02
  3.99207398e-02  2.85960417e-02  2.24051438e-02 -6.09364174e-02
  5.49349971e-02  1.21171772e-03 -1.88202374e-02 -5.72125539e-02
  1.19711235e-01 -1.53537886e-02  6.74358197e-03  3.78816412e-03
  5.02490886e-02 -3.13289976e-03  6.96682483e-02 -5.29672243e-02
 -2.42786296e-02  8.14430136e-03 -3.62842418e-02  8.11887756e-02
  2.51573324e-03 -9.23719071e-03  6.82749525e-02  1.91550981e-03
 -2.19240971e-02 -1.42780542e-02 -3.70978750e-02  9.00119171e-03
  3.43447104e-02  5.07343262e-02  7.32168257e-02  1.45177059e-02
  3.29349339e-02  1.97167210e-02 -4.47144061e-02 -7.49495206e-03
 -4.80817594e-02  1.09532578e-02  1.262

In [16]:
# Calculate cosine similarity
similarity_scores = cosine_similarity([input_embedding], embeddings)
# Find the index of the most similar entry
most_similar_index = similarity_scores.argmax()

# Get the most similar sentence
most_similar_sentence = sentences[most_similar_index]

# Print the most similar sentence and its similarity score
print('Most similar sentence:', most_similar_sentence)
print('Similarity score:', similarity_scores[0, most_similar_index])

Most similar sentence: Liberal Arts Major. Aspiring Human Resources Analyst.
Similarity score: 0.43878564
