In [None]:
import pandas as pd
import re
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.corpus import stopwords
from sklearn.metrics.pairwise import cosine_similarity
from sentence_transformers import SentenceTransformer, util

  from tqdm.autonotebook import tqdm, trange


In [None]:
file_path = '/content/nlpfinaldataset.csv'
job_data = pd.read_csv('/content/nlpfinaldataset.csv')

In [None]:
# Modify the clean_text function to handle non-string values
def clean_text(text):
    if isinstance(text, str):  # Check if the input is a string
        text = re.sub(r'[^A-Za-z0-9\s]', '', text)  # Remove special characters
        text = text.lower()  # Convert to lowercase
        text = re.sub(r'\s+', ' ', text).strip()  # Remove extra whitespaces
    else:
        text = ''  # If it's not a string, return an empty string
    return text


In [None]:
# Apply text cleaning to job titles and descriptions
job_data['cleaned_jobdescription'] = job_data['jobdescription'].apply(clean_text)
job_data['cleaned_skills'] = job_data['skills'].apply(clean_text)

In [None]:
# Create a combined feature from job title, description, and skills
job_data['combined_features'] = job_data['jobtitle'] + ' ' + job_data['cleaned_jobdescription'] + ' ' + job_data['cleaned_skills']
# Display cleaned dataset
job_data[['jobtitle', 'cleaned_jobdescription', 'cleaned_skills']].head()

Unnamed: 0,jobtitle,cleaned_jobdescription,cleaned_skills
0,AUTOMATION TEST ENGINEER,looking for selenium engineersmust have solid ...,see below
1,Information Security Engineer,the university of chicago has a rapidly growin...,linuxunix network monitoring incident response...
2,Business Solutions Architect,galaxesolutionsevery day our solutions affect ...,enterprise solutions architecture business int...
3,"Java Developer (mid level)- FT- GREAT culture,...",java developerfulltimedirecthirebolingbrook il...,please see job description
4,DevOps Engineer,midtown based high tech firm has an immediate ...,configuration management developer linux manag...


####Model Selection

In [None]:
from sklearn.metrics.pairwise import cosine_similarity

# Extract features using TF-IDF
tfidf = TfidfVectorizer(stop_words='english')
tfidf_matrix = tfidf.fit_transform(job_data['combined_features'])

In [None]:
# Function to recommend jobs based on the input job title
def recommend_jobs(job_title, job_data, tfidf_matrix):
    idx = job_data[job_data['jobtitle'].str.contains(job_title, case=False)].index[0]

In [None]:
# Example: Get the index of a job title input by the user
job_title_input = 'Data Scientist'  # Replace with the actual user input

# Find the index of the job that matches the user's input
idx = job_data[job_data['jobtitle'].str.contains(job_title_input, case=False)].index[0]

In [None]:
# Calculate cosine similarity between the input job and all other jobs
cosine_sim = cosine_similarity(tfidf_matrix[idx], tfidf_matrix).flatten()

In [None]:
# Get top 5 similar jobs based on cosine similarity
similar_indices = cosine_sim.argsort()[-6:][::-1]  # No indentation issue here

In [None]:
# Display recommended jobs
recommended_jobs = job_data[['jobtitle', 'joblocation_address']].iloc[similar_indices]
print(recommended_jobs)

                           jobtitle joblocation_address
143          Lead/Sr Data Scientist   San Francisco, CA
2402           Data Scientist - NYC        New York, NY
4883       Data Scientist - Houston         Houston, TX
941    Data Scientist in waltham MA         Waltham, MA
19743                Data Scientist         Chicago, IL
4478       Senior Applied Scientist         Seattle, WA


####Bert for Semantic Similarity

In [1]:
pip install sentence-transformers




In [None]:
from sentence_transformers import SentenceTransformer, util
import pandas as pd

In [None]:
# Load BERT model
model = SentenceTransformer('paraphrase-MiniLM-L6-v2')

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/229 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/122 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/3.73k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/629 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/314 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]



1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

In [None]:
# Example user profile input
user_profile = "Data Scientist with Python and machine learning skills"

In [None]:
# Create the user embedding
user_embedding = model.encode(user_profile, convert_to_tensor=True)

In [None]:
# Encode job descriptions
job_embeddings = model.encode(job_data['combined_features'].tolist(), convert_to_tensor=True)

In [None]:
# Calculate cosine similarity between user profile and job descriptions
similarities = util.pytorch_cos_sim(user_embedding, job_embeddings)

In [None]:
# Function to recommend jobs using BERT embeddings
def recommend_jobs_bert(user_profile, job_data, job_embeddings):
    user_embedding = model.encode(user_profile, convert_to_tensor=True)

In [None]:
# Calculate cosine similarity between user profile and job descriptions
similarities = util.pytorch_cos_sim(user_embedding, job_embeddings)

In [None]:
def some_function():
    # Get top 5 similar jobs
    similar_indices = similarities.argsort(descending=True).tolist()[0][:5]

    # Ensure the return statement is correctly indented within the function
    return job_data[['jobtitle', 'joblocation_address']].iloc[similar_indices]


In [None]:
def recommend_jobs_bert(user_profile, job_data, job_embeddings):
    # Existing logic for job recommendation

    # Ensure you have the logic to get recommended jobs
    recommended_jobs = job_data[['jobtitle', 'joblocation_address']]  # Example

    # Make sure to return the recommended jobs, not None
    return recommended_jobs


In [None]:
print(job_data.head())
print(job_embeddings.shape)

                                       advertiserurl  \
0  https://www.dice.com/jobs/detail/AUTOMATION-TE...   
1  https://www.dice.com/jobs/detail/Information-S...   
2  https://www.dice.com/jobs/detail/Business-Solu...   
3  https://www.dice.com/jobs/detail/Java-Develope...   
4  https://www.dice.com/jobs/detail/DevOps-Engine...   

                             company  \
0  Digital Intelligence Systems, LLC   
1  University of Chicago/IT Services   
2               Galaxy Systems, Inc.   
3                      TransTech LLC   
4                   Matrix Resources   

                            employmenttype_jobstatus  \
0  C2H Corp-To-Corp, C2H Independent, C2H W2, 3 M...   
1                                          Full Time   
2                                          Full Time   
3                                          Full Time   
4                                          Full Time   

                                      jobdescription               jobid  \
0  Lookin

In [None]:
user_profile = "Data Scientist - Houston"
recommended_jobs = recommend_jobs_bert(user_profile, job_data, job_embeddings)

# Print out the recommendations
print(recommended_jobs)

                                                jobtitle joblocation_address
0                               AUTOMATION TEST ENGINEER         Atlanta, GA
1                          Information Security Engineer         Chicago, IL
2                           Business Solutions Architect      Schaumburg, IL
3      Java Developer (mid level)- FT- GREAT culture,...     Bolingbrook, IL
4                                        DevOps Engineer         Atlanta, GA
...                                                  ...                 ...
21995                                       Web Designer         Oakland, CA
21996  Senior Front End Web Developer - Full Time at ...   San Francisco, CA
21997                                         QA Analyst   San Francisco, CA
21998                               Tech Lead-Full Stack         Oakland, CA
21999                                   C/C++ Programmer     Santa Clara, CA

[22000 rows x 2 columns]


In [None]:
def recommend_jobs(similarities, job_data, location=None):
    # Get top 5 similar jobs
    similar_indices = similarities.argsort(descending=True).tolist()[0][:5]

    # Get recommended jobs
    recommended_jobs = job_data[['jobtitle', 'joblocation_address']].iloc[similar_indices]

    # If a location is provided, filter the recommended jobs by that location
    if location:
        recommended_jobs = recommended_jobs[recommended_jobs['joblocation_address'].str.contains(location, case=False, na=False)]

    return recommended_jobs

In [None]:
# Call the function without location filtering
top_jobs = recommend_jobs(similarities, job_data)

# Call the function with location filtering
top_jobs_in_ny = recommend_jobs(similarities, job_data, location='New York')