In [1]:
import numpy as np
import pandas as pd
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import pickle
from bs4 import BeautifulSoup
import html

In [2]:
# Load NLTK data
nltk.download('stopwords')
nltk.download('punkt')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\mbkmu\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\mbkmu\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [3]:
# Load the full dataset
job_df = pd.read_csv(".Combined_Jobs_Final.csv")

In [None]:
# Keep relevant columns
job_df = job_df[['Status', 'Title', 'Position', 'Company', 'Job.Description']]

In [4]:
# Define text cleaning function
ps = PorterStemmer()

In [5]:
def cleaning(txt):
    # Unescape HTML entities
    txt = html.unescape(txt)
    # Remove HTML tags
    txt = BeautifulSoup(txt, "html.parser").get_text()
    # Remove non-alphanumeric characters
    txt = re.sub(r'[^a-zA-Z0-9\s]', '', txt)
    # Tokenize and stem
    tokens = nltk.word_tokenize(txt.lower())
    stemming = [ps.stem(w) for w in tokens if w not in stopwords.words('english')]
    return " ".join(stemming)

In [6]:
# Apply text cleaning
job_df['clean_text'] = job_df['Job.Description'].astype(str).apply(lambda x: cleaning(x))

  txt = BeautifulSoup(txt, "html.parser").get_text()


In [None]:
job_df['clean_title'] = job_df['Title'].astype(str).apply(lambda x: cleaning(x))

In [None]:
job_df['clean_position'] = job_df['Position'].astype(str).apply(lambda x: cleaning(x))

In [None]:
# Combine cleaned text columns
job_df['clean_text'] = job_df['clean_text'] + " " + job_df['clean_title'] + " " + job_df['clean_position']

In [None]:
# Vectorize the cleaned text data
tfidf = TfidfVectorizer(stop_words='english')
matrix = tfidf.fit_transform(job_df['clean_text'])

In [None]:
# Compute cosine similarity
similarity = cosine_similarity(matrix)

In [None]:
# Recommendation function
def recommend(title, num_recommendations=20):
    best_match = process.extractOne(title, job_df['clean_title'].values)
    if best_match and best_match[1] > 60:  # Adjust the threshold as needed
        indx = job_df[job_df['clean_title'] == best_match[0]].index[0]
        indx = job_df.index.get_loc(indx)
        distances = sorted(list(enumerate(similarity[indx])), key=lambda x: x[1], reverse=True)[1:num_recommendations + 1]
        
        # Get the recommended job details
        jobs = [{
            'Title': job_df.iloc[i[0]].Title,
            'Company': job_df.iloc[i[0]].Company,
            'Job Description': job_df.iloc[i[0]]['Job.Description']
        } for i in distances]
        return jobs
    else:
        return None

In [None]:
# Save the dataset and similarity matrix
pickle.dump(job_df, open('job_df.pkl', 'wb'))
pickle.dump(similarity, open('similarity.pkl', 'wb'))