In [18]:
import pandas as pd
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Load the data
# Read the job postings and the candidate resumes, in that order, from their CSV files.
jobs, candidates = [
    pd.read_csv(csv_path)
    for csv_path in (
        "C:/Users/sneha/Downloads/Candidate matching/job_data_merged_1.csv",
        "C:/Users/sneha/Downloads/Candidate matching/gpt_dataset.csv",
    )
]

# Display the first few rows of the data
# A single print with a newline separator emits the same text as two prints.
print(jobs.head(), candidates.head(), sep="\n")

   Unnamed: 0          Category Workplace  \
0           0  Business Analyst    Remote   
1           1  Business Analyst    Remote   
2           2  Business Analyst   On-site   
3           3  Business Analyst   On-site   
4           4  Business Analyst    Remote   

                                       Location             Department  \
0                                United Kingdom             Operations   
1             Makati, Metro Manila, Philippines                 Aux HQ   
2  Al-Dajeej, Al Farwaniyah Governorate, Kuwait       PWC Technologies   
3               London, England, United Kingdom  Consultants, Advisory   
4                                United Kingdom             Operations   

        Type  
0  Full time  
1  Full time  
2  Full time  
3  Full time  
4  Full time  
             Category                                             Resume
0  Frontend Developer  As a seasoned Frontend Developer, I have a pro...
1   Backend Developer  With a solid background i

In [19]:
# Handle missing values
# Every NaN becomes an empty string; both frames are modified in place.
for frame in (jobs, candidates):
    frame.fillna('', inplace=True)

# Download NLTK data
# Fetch the stop-word list first, then WordNet.
for resource in ('stopwords', 'wordnet'):
    nltk.download(resource)

# Function to preprocess text
# Lazily-built default resources, shared by every call to preprocess_text so
# the NLTK stop-word file is read once instead of once per token.
_PREPROCESS_DEFAULTS = {}


def preprocess_text(text, stop_words=None, lemmatizer=None):
    """Normalise free text into a space-separated string of lemmatised tokens.

    Non-word characters are replaced by spaces, the text is lower-cased and
    split on whitespace, stop words are dropped, and each remaining token is
    lemmatised.

    Parameters
    ----------
    text : str
        Raw text to clean.
    stop_words : container of str, optional
        Tokens to discard (checked with ``in``). Defaults to NLTK's English
        stop-word list, converted to a frozenset and cached on first use.
    lemmatizer : object with a ``lemmatize(word)`` method, optional
        Defaults to a cached ``WordNetLemmatizer`` instance.

    Returns
    -------
    str
        The cleaned tokens joined by single spaces ('' if nothing remains).
    """
    if stop_words is None:
        if 'stop_words' not in _PREPROCESS_DEFAULTS:
            # frozenset gives O(1) membership tests; the original rebuilt and
            # linearly scanned the full list for every single token.
            _PREPROCESS_DEFAULTS['stop_words'] = frozenset(stopwords.words('english'))
        stop_words = _PREPROCESS_DEFAULTS['stop_words']
    if lemmatizer is None:
        if 'lemmatizer' not in _PREPROCESS_DEFAULTS:
            _PREPROCESS_DEFAULTS['lemmatizer'] = WordNetLemmatizer()
        lemmatizer = _PREPROCESS_DEFAULTS['lemmatizer']

    # Punctuation/symbols become spaces *before* lower-casing (same order as
    # before). split() with no argument already collapses whitespace runs and
    # ignores leading/trailing blanks, so no extra \s+ substitution is needed.
    tokens = re.sub(r'\W', ' ', text).lower().split()

    # Stop words are filtered on the raw token, then survivors are lemmatised.
    return ' '.join(
        lemmatizer.lemmatize(token) for token in tokens if token not in stop_words
    )

# Apply preprocessing to relevant columns
# The five job attribute columns are cleaned in this order, then the resumes.
job_text_columns = ['Category', 'Workplace', 'Location', 'Department', 'Type']
for column in job_text_columns:
    jobs[column] = jobs[column].apply(preprocess_text)
candidates['Resume'] = candidates['Resume'].apply(preprocess_text)

# Combine job attributes into one text field
# Start from the first column and append each remaining one after a single space.
combined = jobs[job_text_columns[0]]
for column in job_text_columns[1:]:
    combined = combined + ' ' + jobs[column]
jobs['combined'] = combined

# Display the first few rows to verify
print(jobs.head(), candidates.head(), sep="\n")

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\sneha\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\sneha\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


   Unnamed: 0          Category Workplace  \
0           0  business analyst    remote   
1           1  business analyst    remote   
2           2  business analyst      site   
3           3  business analyst      site   
4           4  business analyst    remote   

                                     Location           Department       Type  \
0                              united kingdom            operation  full time   
1              makati metro manila philippine               aux hq  full time   
2  al dajeej al farwaniyah governorate kuwait       pwc technology  full time   
3               london england united kingdom  consultant advisory  full time   
4                              united kingdom            operation  full time   

                                            combined  
0  business analyst remote united kingdom operati...  
1  business analyst remote makati metro manila ph...  
2  business analyst site al dajeej al farwaniyah ...  
3  business analyst si

In [21]:
# Vectorize the combined job descriptions and candidate resumes
# The TF-IDF vocabulary and weights are learned from the job text only; the
# job text and the resumes are then both projected onto that fitted vocabulary.
vectorizer = TfidfVectorizer(max_features=5000)
vectorizer.fit(jobs['combined'])
job_features, candidate_features = (
    vectorizer.transform(corpus).toarray()
    for corpus in (jobs['combined'], candidates['Resume'])
)

# Print shape of feature matrices to debug
for label, matrix in (
    ("Job features shape:", job_features),
    ("Candidate features shape:", candidate_features),
):
    print(label, matrix.shape)

Job features shape: (1095, 705)
Candidate features shape: (400, 705)


In [22]:
# Calculate cosine similarity between candidates and jobs
# Candidates are passed as X and jobs as Y, so each row of the result belongs
# to one candidate and each column to one job.
similarity_matrix = cosine_similarity(
    X=candidate_features,
    Y=job_features,
)

# Print similarity matrix to debug
print("Similarity matrix:", similarity_matrix)

Similarity matrix: [[0.00864636 0.0048747  0.00366521 ... 0.06303141 0.14519842 0.08105911]
 [0.05114574 0.02883526 0.05086385 ... 0.00588803 0.00349973 0.06807685]
 [0.         0.         0.03073701 ... 0.         0.         0.06372656]
 ...
 [0.00740323 0.00417384 0.07696924 ... 0.00505942 0.00300722 0.15957914]
 [0.0208195  0.01173775 0.04375471 ... 0.01422821 0.00845696 0.09071597]
 [0.00576158 0.0032483  0.04074844 ... 0.00393751 0.00234038 0.08448311]]


In [23]:
# Find the best job match for each candidate
# argmax over axis=1 returns, for every row of similarity_matrix, the column
# holding the highest score. The value is a zero-based column position.
best_matches = similarity_matrix.argmax(axis=1)
candidates['best_job_id'] = best_matches

# Print the matched job for each candidate to debug
# Walk the positional array directly instead of candidates['best_job_id'][i]:
# that lookup goes through the index labels and raises KeyError as soon as the
# frame's index is anything other than 0..n-1 (e.g. after filtering rows).
# Candidates are numbered from 1 in the message; job positions stay zero-based.
for candidate_number, job_position in enumerate(best_matches, start=1):
    print(f"Candidate {candidate_number} is best matched with Job {job_position}")

Candidate 1 is best matched with Job 1061
Candidate 2 is best matched with Job 135
Candidate 3 is best matched with Job 515
Candidate 4 is best matched with Job 427
Candidate 5 is best matched with Job 1061
Candidate 6 is best matched with Job 135
Candidate 7 is best matched with Job 911
Candidate 8 is best matched with Job 135
Candidate 9 is best matched with Job 135
Candidate 10 is best matched with Job 135
Candidate 11 is best matched with Job 427
Candidate 12 is best matched with Job 427
Candidate 13 is best matched with Job 1061
Candidate 14 is best matched with Job 135
Candidate 15 is best matched with Job 135
Candidate 16 is best matched with Job 427
Candidate 17 is best matched with Job 427
Candidate 18 is best matched with Job 135
Candidate 19 is best matched with Job 135
Candidate 20 is best matched with Job 135
Candidate 21 is best matched with Job 427
Candidate 22 is best matched with Job 427
Candidate 23 is best matched with Job 135
Candidate 24 is best matched with Job 13