In [1]:
import pandas as pd
import json
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Load the CSV files
candidates_data = pd.read_csv("C:/Users/hsahn/OneDrive/Desktop/all_resumes_data.csv")
job_details_data = pd.read_csv("C:/Users/hsahn/Downloads/job_details_with_predictions.csv")

# List of columns with JSON data in candidate data
json_columns = ['Experiences', 'Projects', 'Achievements', 'Certifications', 'HardSkills', 'SoftSkills', 'RecommendedJobDomains']

# Parse JSON columns in candidate data
for column in json_columns:
    candidates_data[column] = candidates_data[column].apply(json.loads)

# Combine relevant text fields into a single text string for each candidate
def combine_skills(row):
    hard_skills = ' '.join(row['HardSkills'])
    soft_skills = ' '.join(row['SoftSkills'])
    combined_text = f"{hard_skills} {soft_skills}"
    return combined_text

candidates_data['combined_skills'] = candidates_data.apply(combine_skills, axis=1)

# Combine relevant text fields for job roles
job_details_data['combined_job_text'] = job_details_data.apply(lambda row: f"{row['role_description']} {row['requirement']}", axis=1)

# Initialize the TF-IDF Vectorizer
tfidf_vectorizer = TfidfVectorizer()

# Combine all texts for fitting the TF-IDF vectorizer
all_texts = candidates_data['combined_skills'].tolist() + job_details_data['combined_job_text'].tolist()

# Fit and transform the text data
tfidf_matrix = tfidf_vectorizer.fit_transform(all_texts)

# Split the TF-IDF matrix into candidates' skills and job roles
candidates_matrix = tfidf_matrix[:len(candidates_data)]
jobs_matrix = tfidf_matrix[len(candidates_data):]

# Create a DataFrame to store relevancy scores
relevancy_scores = pd.DataFrame(index=candidates_data.index)

# Compute cosine similarity between each candidate's skills and each job role
for job_index in job_details_data.index:
    job_vector = jobs_matrix[job_index]
    similarity_scores = cosine_similarity(candidates_matrix, job_vector)
    relevancy_scores[job_index] = similarity_scores.flatten()

# Add relevancy scores to the job details DataFrame
job_details_data['relevancy_scores'] = relevancy_scores.apply(lambda col: col.nlargest(len(candidates_data)).index.tolist(), axis=0)
job_details_data['relevancy_candidates'] = relevancy_scores.apply(lambda col: col.nlargest(len(candidates_data)).values.tolist(), axis=0)

# Save the updated job details with relevancy scores to a new CSV file
job_details_data.to_csv('job_relevancy_scores.csv', index=False)


TypeError: sequence item 0: expected str instance, dict found

In [3]:
import pandas as pd
import json
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Load the CSV files
candidates_data = pd.read_csv("C:/Users/hsahn/OneDrive/Desktop/all_resumes_data.csv")
job_details_data = pd.read_csv("C:/Users/hsahn/Downloads/job_details_with_predictions.csv")

# List of columns with JSON data in candidate data
json_columns = ['Experiences', 'Projects', 'Achievements', 'Certifications', 'HardSkills', 'SoftSkills', 'RecommendedJobDomains']

# Parse JSON columns in candidate data
for column in json_columns:
    candidates_data[column] = candidates_data[column].apply(json.loads)

# Combine relevant text fields into a single text string for each candidate
def combine_skills(row):
    hard_skills = ' '.join(row['HardSkills'])
    soft_skills = ' '.join(row['SoftSkills'])
    experiences = ' '.join([item['description'] for item in row['Experiences']])
    projects = ' '.join([item['description'] for item in row['Projects']])
    achievements = ' '.join([item['description'] for item in row['Achievements']])
    certifications = ' '.join([item['name'] for item in row['Certifications']])
    combined_text = f"{hard_skills} {soft_skills} {experiences} {projects} {achievements} {certifications}"
    return combined_text

candidates_data['combined_skills'] = candidates_data.apply(combine_skills, axis=1)

# Combine relevant text fields for job roles
job_details_data['combined_job_text'] = job_details_data.apply(lambda row: f"{row['role_description']} {row['requirement']}", axis=1)

# Initialize the TF-IDF Vectorizer
tfidf_vectorizer = TfidfVectorizer()

# Combine all texts for fitting the TF-IDF vectorizer
all_texts = candidates_data['combined_skills'].tolist() + job_details_data['combined_job_text'].tolist()

# Fit and transform the text data
tfidf_matrix = tfidf_vectorizer.fit_transform(all_texts)

# Split the TF-IDF matrix into candidates' skills and job roles
candidates_matrix = tfidf_matrix[:len(candidates_data)]
jobs_matrix = tfidf_matrix[len(candidates_data):]

# Create a DataFrame to store relevancy scores
relevancy_scores = pd.DataFrame(index=candidates_data.index)

# Compute cosine similarity between each candidate's skills and each job role
for job_index in job_details_data.index:
    job_vector = jobs_matrix[job_index]
    similarity_scores = cosine_similarity(candidates_matrix, job_vector)
    relevancy_scores[job_index] = similarity_scores.flatten()

# Convert relevancy scores to lists of candidate indexes and scores
job_details_data['relevancy_scores'] = relevancy_scores.apply(lambda col: col.nlargest(len(candidates_data)).tolist(), axis=0)
job_details_data['relevancy_candidates'] = relevancy_scores.apply(lambda col: col.nlargest(len(candidates_data)).index.tolist(), axis=0)

# Save the updated job details with relevancy scores to a new CSV file
job_details_data.to_csv('job_relevancy_scores.csv', index=False)

# Flatten the relevancy_scores and relevancy_candidates for each job to create a detailed CSV
detailed_rows = []
for job_index, job_row in job_details_data.iterrows():
    for candidate_index, score in zip(job_row['relevancy_candidates'], job_row['relevancy_scores']):
        detailed_rows.append({
            'job_id': job_row['job_id'],
            'job_title': job_row['role_title'],
            'candidate_id': candidates_data.iloc[candidate_index]['Name'],
            'candidate_email': candidates_data.iloc[candidate_index]['Email'],
            'relevancy_score': score
        })

detailed_df = pd.DataFrame(detailed_rows)
detailed_df.to_csv('detailed_job_relevancy_scores.csv', index=False)


TypeError: sequence item 0: expected str instance, dict found

In [4]:
import pandas as pd
import json
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Load the CSV files
candidates_data = pd.read_csv("C:/Users/hsahn/OneDrive/Desktop/all_resumes_data.csv")
job_details_data = pd.read_csv("C:/Users/hsahn/Downloads/job_details_with_predictions.csv")

# List of columns with JSON data in candidate data
json_columns = ['Experiences', 'Projects', 'Achievements', 'Certifications', 'HardSkills', 'SoftSkills', 'RecommendedJobDomains']

# Parse JSON columns in candidate data
for column in json_columns:
    candidates_data[column] = candidates_data[column].apply(json.loads)

# Combine relevant text fields into a single text string for each candidate
def combine_skills(row):
    hard_skills = ' '.join([skill['skill'] for skill in row['HardSkills']])
    soft_skills = ' '.join(row['SoftSkills'])
    experiences = ' '.join([item['description'] for item in row['Experiences']])
    projects = ' '.join([item['description'] for item in row['Projects']])
    achievements = ' '.join([item['achievement'] for item in row['Achievements']])
    certifications = ' '.join([item['name'] for item in row['Certifications']])
    combined_text = f"{hard_skills} {soft_skills} {experiences} {projects} {achievements} {certifications}"
    return combined_text

candidates_data['combined_skills'] = candidates_data.apply(combine_skills, axis=1)

# Combine relevant text fields for job roles
job_details_data['combined_job_text'] = job_details_data.apply(lambda row: f"{row['role_description']} {row['requirement']}", axis=1)

# Initialize the TF-IDF Vectorizer
tfidf_vectorizer = TfidfVectorizer()

# Combine all texts for fitting the TF-IDF vectorizer
all_texts = candidates_data['combined_skills'].tolist() + job_details_data['combined_job_text'].tolist()

# Fit and transform the text data
tfidf_matrix = tfidf_vectorizer.fit_transform(all_texts)

# Split the TF-IDF matrix into candidates' skills and job roles
candidates_matrix = tfidf_matrix[:len(candidates_data)]
jobs_matrix = tfidf_matrix[len(candidates_data):]

# Create a DataFrame to store relevancy scores
relevancy_scores = pd.DataFrame(index=candidates_data.index)

# Compute cosine similarity between each candidate's skills and each job role
for job_index in job_details_data.index:
    job_vector = jobs_matrix[job_index]
    similarity_scores = cosine_similarity(candidates_matrix, job_vector)
    relevancy_scores[job_index] = similarity_scores.flatten()

# Convert relevancy scores to lists of candidate indexes and scores
job_details_data['relevancy_scores'] = relevancy_scores.apply(lambda col: col.nlargest(len(candidates_data)).tolist(), axis=0)
job_details_data['relevancy_candidates'] = relevancy_scores.apply(lambda col: col.nlargest(len(candidates_data)).index.tolist(), axis=0)

# Save the updated job details with relevancy scores to a new CSV file
job_details_data.to_csv('job_relevancy_scores.csv', index=False)

# Flatten the relevancy_scores and relevancy_candidates for each job to create a detailed CSV
detailed_rows = []
for job_index, job_row in job_details_data.iterrows():
    for candidate_index, score in zip(job_row['relevancy_candidates'], job_row['relevancy_scores']):
        detailed_rows.append({
            'job_id': job_row['job_id'],
            'job_title': job_row['role_title'],
            'candidate_id': candidates_data.iloc[candidate_index]['Name'],
            'candidate_email': candidates_data.iloc[candidate_index]['Email'],
            'relevancy_score': score
        })

detailed_df = pd.DataFrame(detailed_rows)
detailed_df.to_csv('detailed_job_relevancy_scores.csv', index=False)


TypeError: sequence item 0: expected str instance, dict found

In [6]:
import pandas as pd
import json
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Load the CSV files
candidates_data = pd.read_csv("C:/Users/hsahn/OneDrive/Desktop/all_resumes_data.csv")
job_details_data = pd.read_csv("C:/Users/hsahn/Downloads/job_details_with_predictions.csv")

# List of columns with JSON data in candidate data
json_columns = ['Experiences', 'Projects', 'Achievements', 'Certifications', 'HardSkills', 'SoftSkills', 'RecommendedJobDomains']

# Parse JSON columns in candidate data
for column in json_columns:
    candidates_data[column] = candidates_data[column].apply(json.loads)

# Helper function to safely extract text from JSON fields
def extract_text_from_json(json_field, key):
    if isinstance(json_field, list):
        return ' '.join([item[key] for item in json_field if key in item])
    return ''

# Combine relevant text fields into a single text string for each candidate
def combine_skills(row):
    hard_skills = ' '.join([skill['skill'] for skill in row['HardSkills']])
    soft_skills = ' '.join(row['SoftSkills'])
    experiences = extract_text_from_json(row['Experiences'], 'description')
    projects = extract_text_from_json(row['Projects'], 'description')
    achievements = extract_text_from_json(row['Achievements'], 'achievement')
    certifications = extract_text_from_json(row['Certifications'], 'name')
    combined_text = f"{hard_skills} {soft_skills} {experiences} {projects} {achievements} {certifications}"
    return combined_text

candidates_data['combined_skills'] = candidates_data.apply(combine_skills, axis=1)

# Combine relevant text fields for job roles
job_details_data['combined_job_text'] = job_details_data.apply(lambda row: f"{row['role_description']} {row['requirement']}", axis=1)

# Initialize the TF-IDF Vectorizer
tfidf_vectorizer = TfidfVectorizer()

# Combine all texts for fitting the TF-IDF vectorizer
all_texts = candidates_data['combined_skills'].tolist() + job_details_data['combined_job_text'].tolist()

# Fit and transform the text data
tfidf_matrix = tfidf_vectorizer.fit_transform(all_texts)

# Split the TF-IDF matrix into candidates' skills and job roles
candidates_matrix = tfidf_matrix[:len(candidates_data)]
jobs_matrix = tfidf_matrix[len(candidates_data):]

# Create a DataFrame to store relevancy scores
relevancy_scores = pd.DataFrame(index=candidates_data.index)

# Compute cosine similarity between each candidate's skills and each job role
for job_index in job_details_data.index:
    job_vector = jobs_matrix[job_index]
    similarity_scores = cosine_similarity(candidates_matrix, job_vector)
    relevancy_scores[job_index] = similarity_scores.flatten()

# Convert relevancy scores to lists of candidate indexes and scores
job_details_data['relevancy_scores'] = relevancy_scores.apply(lambda col: col.nlargest(len(candidates_data)).tolist(), axis=0)
job_details_data['relevancy_candidates'] = relevancy_scores.apply(lambda col: col.nlargest(len(candidates_data)).index.tolist(), axis=0)

# Save the updated job details with relevancy scores to a new CSV file
job_details_data.to_csv('job_relevancy_scores.csv', index=False)

# Flatten the relevancy_scores and relevancy_candidates for each job to create a detailed CSV
detailed_rows = []
for job_index, job_row in job_details_data.iterrows():
    for candidate_index, score in zip(job_row['relevancy_candidates'], job_row['relevancy_scores']):
        detailed_rows.append({
            'job_id': job_row['job_id'],
            'job_title': job_row['role_title'],
            'candidate_name': candidates_data.iloc[candidate_index]['Name'],
            'candidate_email': candidates_data.iloc[candidate_index]['Email'],
            'relevancy_score': score
        })

detailed_df = pd.DataFrame(detailed_rows)
detailed_df.to_csv('detailed_job_relevancy_scores.csv', index=False)


TypeError: sequence item 0: expected str instance, dict found

In [7]:
import pandas as pd
import json
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Load the CSV files
candidates_data = pd.read_csv("C:/Users/hsahn/OneDrive/Desktop/all_resumes_data.csv")
job_details_data = pd.read_csv("C:/Users/hsahn/Downloads/job_details_with_predictions.csv")

# List of columns with JSON data in candidate data
json_columns = ['Experiences', 'Projects', 'Achievements', 'Certifications', 'HardSkills', 'SoftSkills', 'RecommendedJobDomains']

# Parse JSON columns in candidate data
for column in json_columns:
    candidates_data[column] = candidates_data[column].apply(json.loads)

# Helper function to safely extract text from JSON fields
def extract_text_from_json(json_field, key):
    if isinstance(json_field, list):
        return ' '.join([str(item[key]) for item in json_field if key in item])
    return ''

# Combine relevant text fields into a single text string for each candidate
def combine_skills(row):
    hard_skills = ' '.join([skill['skill'] for skill in row['HardSkills']])
    soft_skills = ' '.join([str(skill['skill']) if isinstance(skill, dict) else str(skill) for skill in row['SoftSkills']])
    experiences = extract_text_from_json(row['Experiences'], 'description')
    projects = extract_text_from_json(row['Projects'], 'description')
    achievements = extract_text_from_json(row['Achievements'], 'achievement')
    certifications = extract_text_from_json(row['Certifications'], 'name')
    combined_text = f"{hard_skills} {soft_skills} {experiences} {projects} {achievements} {certifications}"
    return combined_text

candidates_data['combined_skills'] = candidates_data.apply(combine_skills, axis=1)

# Combine relevant text fields for job roles
job_details_data['combined_job_text'] = job_details_data.apply(lambda row: f"{row['role_description']} {row['requirement']}", axis=1)

# Initialize the TF-IDF Vectorizer
tfidf_vectorizer = TfidfVectorizer()

# Combine all texts for fitting the TF-IDF vectorizer
all_texts = candidates_data['combined_skills'].tolist() + job_details_data['combined_job_text'].tolist()

# Fit and transform the text data
tfidf_matrix = tfidf_vectorizer.fit_transform(all_texts)

# Split the TF-IDF matrix into candidates' skills and job roles
candidates_matrix = tfidf_matrix[:len(candidates_data)]
jobs_matrix = tfidf_matrix[len(candidates_data):]

# Create a DataFrame to store relevancy scores
relevancy_scores = pd.DataFrame(index=candidates_data.index)

# Compute cosine similarity between each candidate's skills and each job role
for job_index in job_details_data.index:
    job_vector = jobs_matrix[job_index]
    similarity_scores = cosine_similarity(candidates_matrix, job_vector)
    relevancy_scores[job_index] = similarity_scores.flatten()

# Convert relevancy scores to lists of candidate indexes and scores
job_details_data['relevancy_scores'] = relevancy_scores.apply(lambda col: col.nlargest(len(candidates_data)).tolist(), axis=0)
job_details_data['relevancy_candidates'] = relevancy_scores.apply(lambda col: col.nlargest(len(candidates_data)).index.tolist(), axis=0)

# Save the updated job details with relevancy scores to a new CSV file
job_details_data.to_csv('job_relevancy_scores.csv', index=False)

# Flatten the relevancy_scores and relevancy_candidates for each job to create a detailed CSV
detailed_rows = []
for job_index, job_row in job_details_data.iterrows():
    for candidate_index, score in zip(job_row['relevancy_candidates'], job_row['relevancy_scores']):
        detailed_rows.append({
            'job_id': job_row['job_id'],
            'job_title': job_row['role_title'],
            'candidate_name': candidates_data.iloc[candidate_index]['Name'],
            'candidate_email': candidates_data.iloc[candidate_index]['Email'],
            'relevancy_score': score
        })

detailed_df = pd.DataFrame(detailed_rows)
detailed_df.to_csv('detailed_job_relevancy_scores.csv', index=False)


  relevancy_scores[job_index] = similarity_scores.flatten()
  relevancy_scores[job_index] = similarity_scores.flatten()
  relevancy_scores[job_index] = similarity_scores.flatten()
  relevancy_scores[job_index] = similarity_scores.flatten()
  relevancy_scores[job_index] = similarity_scores.flatten()
  relevancy_scores[job_index] = similarity_scores.flatten()
  relevancy_scores[job_index] = similarity_scores.flatten()
  relevancy_scores[job_index] = similarity_scores.flatten()
  relevancy_scores[job_index] = similarity_scores.flatten()
  relevancy_scores[job_index] = similarity_scores.flatten()
  relevancy_scores[job_index] = similarity_scores.flatten()
  relevancy_scores[job_index] = similarity_scores.flatten()
  relevancy_scores[job_index] = similarity_scores.flatten()
  relevancy_scores[job_index] = similarity_scores.flatten()
  relevancy_scores[job_index] = similarity_scores.flatten()
  relevancy_scores[job_index] = similarity_scores.flatten()
  relevancy_scores[job_index] = similari

ValueError: Cannot set a DataFrame with multiple columns to the single column relevancy_scores

In [8]:
import pandas as pd
import json
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Load the CSV files
candidates_data = pd.read_csv("C:/Users/hsahn/OneDrive/Desktop/all_resumes_data.csv")
job_details_data = pd.read_csv("C:/Users/hsahn/Downloads/job_details_with_predictions.csv")

# List of columns with JSON data in candidate data
json_columns = ['Experiences', 'Projects', 'Achievements', 'Certifications', 'HardSkills', 'SoftSkills', 'RecommendedJobDomains']

# Parse JSON columns in candidate data
for column in json_columns:
    candidates_data[column] = candidates_data[column].apply(json.loads)

# Helper function to safely extract text from JSON fields
def extract_text_from_json(json_field, key):
    if isinstance(json_field, list):
        return ' '.join([str(item[key]) for item in json_field if key in item])
    return ''

# Combine relevant text fields into a single text string for each candidate
def combine_skills(row):
    hard_skills = ' '.join([skill['skill'] for skill in row['HardSkills']])
    soft_skills = ' '.join([str(skill['skill']) if isinstance(skill, dict) else str(skill) for skill in row['SoftSkills']])
    experiences = extract_text_from_json(row['Experiences'], 'description')
    projects = extract_text_from_json(row['Projects'], 'description')
    achievements = extract_text_from_json(row['Achievements'], 'achievement')
    certifications = extract_text_from_json(row['Certifications'], 'name')
    combined_text = f"{hard_skills} {soft_skills} {experiences} {projects} {achievements} {certifications}"
    return combined_text

candidates_data['combined_skills'] = candidates_data.apply(combine_skills, axis=1)

# Combine relevant text fields for job roles
job_details_data['combined_job_text'] = job_details_data.apply(lambda row: f"{row['role_description']} {row['requirement']}", axis=1)

# Initialize the TF-IDF Vectorizer
tfidf_vectorizer = TfidfVectorizer()

# Combine all texts for fitting the TF-IDF vectorizer
all_texts = candidates_data['combined_skills'].tolist() + job_details_data['combined_job_text'].tolist()

# Fit and transform the text data
tfidf_matrix = tfidf_vectorizer.fit_transform(all_texts)

# Split the TF-IDF matrix into candidates' skills and job roles
candidates_matrix = tfidf_matrix[:len(candidates_data)]
jobs_matrix = tfidf_matrix[len(candidates_data):]

# Create a DataFrame to store relevancy scores
relevancy_scores = pd.DataFrame(index=candidates_data.index)

# Compute cosine similarity between each candidate's skills and each job role
for job_index in job_details_data.index:
    job_vector = jobs_matrix[job_index]
    similarity_scores = cosine_similarity(candidates_matrix, job_vector)
    relevancy_scores[job_index] = similarity_scores.flatten()

# Extract the top candidate indexes and scores for each job
job_details_data['relevancy_candidates'] = relevancy_scores.apply(lambda col: col.nlargest(len(candidates_data)).index.tolist(), axis=0)
job_details_data['relevancy_scores'] = relevancy_scores.apply(lambda col: col.nlargest(len(candidates_data)).tolist(), axis=0)

# Save the updated job details with relevancy scores to a new CSV file
job_details_data.to_csv('job_relevancy_scores.csv', index=False)

# Flatten the relevancy_scores and relevancy_candidates for each job to create a detailed CSV
detailed_rows = []
for job_index, job_row in job_details_data.iterrows():
    for candidate_index, score in zip(job_row['relevancy_candidates'], job_row['relevancy_scores']):
        detailed_rows.append({
            'job_id': job_row['job_id'],
            'job_title': job_row['role_title'],
            'candidate_name': candidates_data.iloc[candidate_index]['Name'],
            'candidate_email': candidates_data.iloc[candidate_index]['Email'],
            'relevancy_score': score
        })

detailed_df = pd.DataFrame(detailed_rows)
detailed_df.to_csv('detailed_job_relevancy_scores.csv', index=False)


  relevancy_scores[job_index] = similarity_scores.flatten()
  relevancy_scores[job_index] = similarity_scores.flatten()
  relevancy_scores[job_index] = similarity_scores.flatten()
  relevancy_scores[job_index] = similarity_scores.flatten()
  relevancy_scores[job_index] = similarity_scores.flatten()
  relevancy_scores[job_index] = similarity_scores.flatten()
  relevancy_scores[job_index] = similarity_scores.flatten()
  relevancy_scores[job_index] = similarity_scores.flatten()
  relevancy_scores[job_index] = similarity_scores.flatten()
  relevancy_scores[job_index] = similarity_scores.flatten()
  relevancy_scores[job_index] = similarity_scores.flatten()
  relevancy_scores[job_index] = similarity_scores.flatten()
  relevancy_scores[job_index] = similarity_scores.flatten()
  relevancy_scores[job_index] = similarity_scores.flatten()
  relevancy_scores[job_index] = similarity_scores.flatten()
  relevancy_scores[job_index] = similarity_scores.flatten()
  relevancy_scores[job_index] = similari

ValueError: Cannot set a DataFrame with multiple columns to the single column relevancy_candidates

In [9]:
import pandas as pd
import json
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Load the CSV files
candidates_data = pd.read_csv("C:/Users/hsahn/OneDrive/Desktop/all_resumes_data.csv")
job_details_data = pd.read_csv("C:/Users/hsahn/Downloads/job_details_with_predictions.csv")

# List of columns with JSON data in candidate data
json_columns = ['Experiences', 'Projects', 'Achievements', 'Certifications', 'HardSkills', 'SoftSkills', 'RecommendedJobDomains']

# Parse JSON columns in candidate data
for column in json_columns:
    candidates_data[column] = candidates_data[column].apply(json.loads)

# Helper function to safely extract text from JSON fields
def extract_text_from_json(json_field, key):
    if isinstance(json_field, list):
        return ' '.join([str(item[key]) for item in json_field if key in item])
    return ''

# Combine relevant text fields into a single text string for each candidate
def combine_skills(row):
    hard_skills = ' '.join([skill['skill'] for skill in row['HardSkills']])
    soft_skills = ' '.join([str(skill) for skill in row['SoftSkills']])
    experiences = extract_text_from_json(row['Experiences'], 'description')
    projects = extract_text_from_json(row['Projects'], 'description')
    achievements = extract_text_from_json(row['Achievements'], 'achievement')
    certifications = extract_text_from_json(row['Certifications'], 'name')
    combined_text = f"{hard_skills} {soft_skills} {experiences} {projects} {achievements} {certifications}"
    return combined_text

candidates_data['combined_skills'] = candidates_data.apply(combine_skills, axis=1)

# Combine relevant text fields for job roles
job_details_data['combined_job_text'] = job_details_data.apply(lambda row: f"{row['role_description']} {row['requirement']}", axis=1)

# Initialize the TF-IDF Vectorizer
tfidf_vectorizer = TfidfVectorizer()

# Combine all texts for fitting the TF-IDF vectorizer
all_texts = candidates_data['combined_skills'].tolist() + job_details_data['combined_job_text'].tolist()

# Fit and transform the text data
tfidf_matrix = tfidf_vectorizer.fit_transform(all_texts)

# Split the TF-IDF matrix into candidates' skills and job roles
candidates_matrix = tfidf_matrix[:len(candidates_data)]
jobs_matrix = tfidf_matrix[len(candidates_data):]

# Create a DataFrame to store relevancy scores
relevancy_scores = pd.DataFrame(index=candidates_data.index)

# Compute cosine similarity between each candidate's skills and each job role
for job_index in job_details_data.index:
    job_vector = jobs_matrix[job_index]
    similarity_scores = cosine_similarity(candidates_matrix, job_vector)
    relevancy_scores[job_index] = similarity_scores.flatten()

# Extract the top candidate indexes and scores for each job
job_details_data['relevancy_candidates'] = relevancy_scores.apply(lambda col: col.nlargest(len(candidates_data)).index.tolist(), axis=0)
job_details_data['relevancy_scores'] = relevancy_scores.apply(lambda col: col.nlargest(len(candidates_data)).tolist(), axis=0)

# Save the updated job details with relevancy scores to a new CSV file
job_details_data.to_csv('job_relevancy_scores.csv', index=False)

# Flatten the relevancy_scores and relevancy_candidates for each job to create a detailed CSV
detailed_rows = []
for job_index, job_row in job_details_data.iterrows():
    for candidate_index, score in zip(job_row['relevancy_candidates'], job_row['relevancy_scores']):
        detailed_rows.append({
            'job_id': job_row['job_id'],
            'job_title': job_row['role_title'],
            'candidate_name': candidates_data.iloc[candidate_index]['Name'],
            'candidate_email': candidates_data.iloc[candidate_index]['Email'],
            'relevancy_score': score
        })

detailed_df = pd.DataFrame(detailed_rows)
detailed_df.to_csv('detailed_job_relevancy_scores.csv', index=False)


  relevancy_scores[job_index] = similarity_scores.flatten()
  relevancy_scores[job_index] = similarity_scores.flatten()
  relevancy_scores[job_index] = similarity_scores.flatten()
  relevancy_scores[job_index] = similarity_scores.flatten()
  relevancy_scores[job_index] = similarity_scores.flatten()
  relevancy_scores[job_index] = similarity_scores.flatten()
  relevancy_scores[job_index] = similarity_scores.flatten()
  relevancy_scores[job_index] = similarity_scores.flatten()
  relevancy_scores[job_index] = similarity_scores.flatten()
  relevancy_scores[job_index] = similarity_scores.flatten()
  relevancy_scores[job_index] = similarity_scores.flatten()
  relevancy_scores[job_index] = similarity_scores.flatten()
  relevancy_scores[job_index] = similarity_scores.flatten()
  relevancy_scores[job_index] = similarity_scores.flatten()
  relevancy_scores[job_index] = similarity_scores.flatten()
  relevancy_scores[job_index] = similarity_scores.flatten()
  relevancy_scores[job_index] = similari

ValueError: Cannot set a DataFrame with multiple columns to the single column relevancy_candidates

In [11]:
import pandas as pd
import json
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Load the CSV files
candidates_data = pd.read_csv("C:/Users/hsahn/OneDrive/Desktop/all_resumes_data.csv")
job_details_data = pd.read_csv("C:/Users/hsahn/Downloads/job_details_with_predictions.csv")

# List of columns with JSON data in candidate data
json_columns = ['Experiences', 'Projects', 'Achievements', 'Certifications', 'HardSkills', 'SoftSkills', 'RecommendedJobDomains']

# Parse JSON columns in candidate data
for column in json_columns:
    candidates_data[column] = candidates_data[column].apply(json.loads)

# Helper function to safely extract text from JSON fields
def extract_text_from_json(json_field, key):
    if isinstance(json_field, list):
        return ' '.join([str(item[key]) for item in json_field if key in item])
    return ''

# Combine relevant text fields into a single text string for each candidate
def combine_skills(row):
    hard_skills = ' '.join([skill['skill'] for skill in row['HardSkills']])
    soft_skills = ' '.join([str(skill) for skill in row['SoftSkills']])
    experiences = extract_text_from_json(row['Experiences'], 'description')
    projects = extract_text_from_json(row['Projects'], 'description')
    achievements = extract_text_from_json(row['Achievements'], 'achievement')
    certifications = extract_text_from_json(row['Certifications'], 'name')
    combined_text = f"{hard_skills} {soft_skills} {experiences} {projects} {achievements} {certifications}"
    return combined_text

candidates_data['combined_skills'] = candidates_data.apply(combine_skills, axis=1)

# Combine relevant text fields for job roles
job_details_data['combined_job_text'] = job_details_data.apply(lambda row: f"{row['role_description']} {row['requirement']}", axis=1)

# Initialize the TF-IDF Vectorizer
tfidf_vectorizer = TfidfVectorizer()

# Combine all texts for fitting the TF-IDF vectorizer
all_texts = candidates_data['combined_skills'].tolist() + job_details_data['combined_job_text'].tolist()

# Fit and transform the text data
tfidf_matrix = tfidf_vectorizer.fit_transform(all_texts)

# Split the TF-IDF matrix into candidates' skills and job roles
candidates_matrix = tfidf_matrix[:len(candidates_data)]
jobs_matrix = tfidf_matrix[len(candidates_data):]

# Create a DataFrame to store relevancy scores
relevancy_scores = pd.DataFrame(index=candidates_data.index)

# Compute cosine similarity between each candidate's skills and each job role
for job_index in job_details_data.index:
    job_vector = jobs_matrix[job_index]
    similarity_scores = cosine_similarity(candidates_matrix, job_vector)
    relevancy_scores[job_index] = similarity_scores.flatten()

# Initialize an empty list to store data for detailed CSV
detailed_rows = []

# Iterate through each job index in job_details_data
for job_index, job_row in job_details_data.iterrows():
    # Extract the top candidate indexes and scores for this job
    top_candidates = relevancy_scores[job_index].nlargest(len(candidates_data))
    
    # Iterate through the top candidates and store detailed information
    for candidate_index, score in zip(top_candidates.index, top_candidates):
        detailed_rows.append({
            'job_id': job_row['job_id'],
            'job_title': job_row['role_title'],
            'candidate_name': candidates_data.iloc[candidate_index]['Name'],
            'candidate_email': candidates_data.iloc[candidate_index]['Email'],
            'relevancy_score': score
        })

# Create a DataFrame from detailed_rows
detailed_df = pd.DataFrame(detailed_rows)

# Save the detailed DataFrame to CSV
detailed_df.to_csv('C:/Users/hsahn/Downloads/detailed_job_relevancy_scores.csv', index=False)

# Optionally, print a message indicating successful CSV creation
print("Detailed job relevancy scores saved to 'detailed_job_relevancy_scores.csv'")


  relevancy_scores[job_index] = similarity_scores.flatten()
  relevancy_scores[job_index] = similarity_scores.flatten()
  relevancy_scores[job_index] = similarity_scores.flatten()
  relevancy_scores[job_index] = similarity_scores.flatten()
  relevancy_scores[job_index] = similarity_scores.flatten()
  relevancy_scores[job_index] = similarity_scores.flatten()
  relevancy_scores[job_index] = similarity_scores.flatten()
  relevancy_scores[job_index] = similarity_scores.flatten()
  relevancy_scores[job_index] = similarity_scores.flatten()
  relevancy_scores[job_index] = similarity_scores.flatten()
  relevancy_scores[job_index] = similarity_scores.flatten()
  relevancy_scores[job_index] = similarity_scores.flatten()
  relevancy_scores[job_index] = similarity_scores.flatten()
  relevancy_scores[job_index] = similarity_scores.flatten()
  relevancy_scores[job_index] = similarity_scores.flatten()
  relevancy_scores[job_index] = similarity_scores.flatten()
  relevancy_scores[job_index] = similari

Detailed job relevancy scores saved to 'detailed_job_relevancy_scores.csv'


# Relevancy Score Calculation 
In the provided code, the relevance score between candidates and job roles is calculated using **cosine similarity**. Here’s a breakdown of how cosine similarity is applied in this context:

### Cosine Similarity Calculation

1. **TF-IDF Vectorization**:
   - The combined texts from candidates' skills and job descriptions are vectorized using `TfidfVectorizer`. This converts text data into numerical vectors based on the Term Frequency-Inverse Document Frequency (TF-IDF) values.
   - `fit_transform(all_texts)` is used to fit the vectorizer on all combined texts (`all_texts`) and transform them into TF-IDF weighted vectors.

2. **Matrix Splitting**:
   - After vectorization, the TF-IDF matrix (`tfidf_matrix`) is split into two parts:
     - `candidates_matrix`: Contains TF-IDF vectors for candidates' combined skills.
     - `jobs_matrix`: Contains TF-IDF vectors for job descriptions.

3. **Cosine Similarity Computation**:
   - For each job role (`job_index`), the corresponding vector from `jobs_matrix` is selected (`job_vector`).
   - Cosine similarity between the `job_vector` and all vectors in `candidates_matrix` is computed using `cosine_similarity(candidates_matrix, job_vector)`.
   - This results in a similarity score for each candidate with respect to the job role.

4. **Relevance Scores Storage**:
   - The computed similarity scores are stored in the `relevancy_scores` DataFrame, where each column corresponds to a job role (`job_index`) and each row corresponds to a candidate.

### Interpretation

- **Cosine Similarity**: Measures the cosine of the angle between two non-zero vectors in an n-dimensional space. In this case, it quantifies the similarity between the TF-IDF vectors of candidates' skills and job requirements.
- **Higher Scores**: Indicate greater similarity between a candidate’s skills profile and the requirements of a job role.
- **Implementation in Code**: The use of `cosine_similarity` from `sklearn.metrics.pairwise` allows efficient calculation across multiple candidates and job roles.

### Conclusion

This method leverages TF-IDF vectorization and cosine similarity to provide a numerical measure of how well candidates match each job role based on their skills and the job requirements. Adjustments to the vectorization parameters or similarity metrics can further tailor the relevance scoring based on specific needs or domain knowledge.

In [12]:
import pandas as pd
import json
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

candidates_data = pd.read_csv("C:/Users/hsahn/OneDrive/Desktop/all_resumes_data.csv")
job_details_data = pd.read_csv("C:/Users/hsahn/Downloads/job_details_with_predictions.csv")

json_columns = ['Experiences', 'Projects', 'Achievements', 'Certifications', 'HardSkills', 'SoftSkills', 'RecommendedJobDomains']

for column in json_columns:
    candidates_data[column] = candidates_data[column].apply(json.loads)

def extract_text_from_json(json_field, key):
    if isinstance(json_field, list):
        return ' '.join([str(item[key]) for item in json_field if key in item])
    return ''

def combine_skills(row):
    hard_skills = ' '.join([skill['skill'] for skill in row['HardSkills']])
    soft_skills = ' '.join([str(skill) for skill in row['SoftSkills']])
    experiences = extract_text_from_json(row['Experiences'], 'description')
    projects = extract_text_from_json(row['Projects'], 'description')
    achievements = extract_text_from_json(row['Achievements'], 'achievement')
    certifications = extract_text_from_json(row['Certifications'], 'name')
    combined_text = f"{hard_skills} {soft_skills} {experiences} {projects} {achievements} {certifications}"
    return combined_text

candidates_data['combined_skills'] = candidates_data.apply(combine_skills, axis=1)

job_details_data['combined_job_text'] = job_details_data.apply(lambda row: f"{row['role_description']} {row['requirement']}", axis=1)

tfidf_vectorizer = TfidfVectorizer()

all_texts = candidates_data['combined_skills'].tolist() + job_details_data['combined_job_text'].tolist()

tfidf_matrix = tfidf_vectorizer.fit_transform(all_texts)

candidates_matrix = tfidf_matrix[:len(candidates_data)]
jobs_matrix = tfidf_matrix[len(candidates_data):]

relevancy_scores = pd.DataFrame(index=candidates_data.index)

for job_index in job_details_data.index:
    job_vector = jobs_matrix[job_index]
    similarity_scores = cosine_similarity(candidates_matrix, job_vector)
    relevancy_scores[job_index] = similarity_scores.flatten()

detailed_rows = []

for job_index, job_row in job_details_data.iterrows():
    top_candidates = relevancy_scores[job_index].nlargest(len(candidates_data))
    
    candidates_list = []
    
    for candidate_index, score in zip(top_candidates.index, top_candidates):
        candidate_details = {
            'candidate_name': candidates_data.iloc[candidate_index]['Name'],
            'candidate_email': candidates_data.iloc[candidate_index]['Email'],
            'relevancy_score': score
        }
        candidates_list.append(candidate_details)
    
    detailed_rows.append({
        'job_id': job_row['job_id'],
        'job_title': job_row['role_title'],
        'candidates': json.dumps(candidates_list)
    })

detailed_df = pd.DataFrame(detailed_rows)

detailed_df.to_csv('C:/Users/hsahn/Downloads/detailed_job_relevancy_scores.json.csv', index=False)

print("Detailed job relevancy scores saved to 'detailed_job_relevancy_scores.json.csv'")


  relevancy_scores[job_index] = similarity_scores.flatten()
  relevancy_scores[job_index] = similarity_scores.flatten()
  relevancy_scores[job_index] = similarity_scores.flatten()
  relevancy_scores[job_index] = similarity_scores.flatten()
  relevancy_scores[job_index] = similarity_scores.flatten()
  relevancy_scores[job_index] = similarity_scores.flatten()
  relevancy_scores[job_index] = similarity_scores.flatten()
  relevancy_scores[job_index] = similarity_scores.flatten()
  relevancy_scores[job_index] = similarity_scores.flatten()
  relevancy_scores[job_index] = similarity_scores.flatten()
  relevancy_scores[job_index] = similarity_scores.flatten()
  relevancy_scores[job_index] = similarity_scores.flatten()
  relevancy_scores[job_index] = similarity_scores.flatten()
  relevancy_scores[job_index] = similarity_scores.flatten()
  relevancy_scores[job_index] = similarity_scores.flatten()
  relevancy_scores[job_index] = similarity_scores.flatten()
  relevancy_scores[job_index] = similari

Detailed job relevancy scores saved to 'detailed_job_relevancy_scores.json.csv'
