In [92]:
import pandas as pd 
import warnings
warnings.filterwarnings('ignore')

In [93]:
jobs_df = pd.read_csv("../../data/raw/job_postings.csv")
resumes_df = pd.read_csv("../../data/raw/resume.csv")

In [94]:
# jobs_df.tail(20)

In [95]:
jobs_df = jobs_df.drop_duplicates() #remove dulicate values
jobs_df.isna().sum()  #checking for any missing values

job_id                     0
title                      0
location                 346
department             11547
salary_range           15012
company_profile         3308
description                1
requirements            2696
benefits                7212
telecommuting              0
has_company_logo           0
has_questions              0
employment_type         3471
required_experience     7050
required_education      8105
industry                4903
function                6455
fraudulent                 0
dtype: int64

## handlind missing values

In [96]:

cat_columns = [
    'employment_type',
    'required_experience',
    'required_education',
    'industry',
    'function'
]

for col in cat_columns:
    jobs_df[col].fillna(jobs_df[col].mode()[0], inplace=True)


In [97]:
jobs_df['location'].fillna('Not Specified', inplace=True)



0            US, NY, New York
1              NZ, , Auckland
2               US, IA, Wever
3          US, DC, Washington
4          US, FL, Fort Worth
                 ...         
17875         CA, ON, Toronto
17876    US, PA, Philadelphia
17877         US, TX, Houston
17878           NG, LA, Lagos
17879       NZ, N, Wellington
Name: location, Length: 17880, dtype: str

In [98]:
jobs_df['salary_range'].fillna('Not Disclosed', inplace=True)


0        Not Disclosed
1        Not Disclosed
2        Not Disclosed
3        Not Disclosed
4        Not Disclosed
             ...      
17875    Not Disclosed
17876    Not Disclosed
17877    Not Disclosed
17878    Not Disclosed
17879    Not Disclosed
Name: salary_range, Length: 17880, dtype: str

In [99]:
jobs_df['department'].fillna('General', inplace=True)


0          Marketing
1            Success
2            General
3              Sales
4            General
            ...     
17875          Sales
17876     Accounting
17877        General
17878        General
17879    Engineering
Name: department, Length: 17880, dtype: str

In [100]:
jobs_df['benefits'].fillna('No benefits information provided', inplace=True)
jobs_df['company_profile'].fillna('Company information not available', inplace=True)


0        We're Food52, and we've created a groundbreaki...
1        90 Seconds, the worlds Cloud Video Production ...
2        Valor Services provides Workforce Solutions th...
3        Our passion for improving quality of life thro...
4        SpotSource Solutions LLC is a Global Human Cap...
                               ...                        
17875    Vend is looking for some awesome new talent to...
17876    WebLinc is the e-commerce platform and service...
17877    We Provide Full Time Permanent Positions for m...
17878                    Company information not available
17879    Vend is looking for some awesome new talent to...
Name: company_profile, Length: 17880, dtype: str

In [101]:
job_text_cols = [
    'title',
    'company_profile',
    'description',
    'requirements',
    'benefits',
    'location',
    'employment_type',
    'salary_range',
    'industry'

]

# Keep only existing columns
job_text_cols = [col for col in job_text_cols if col in jobs_df.columns]

jobs_df[job_text_cols] = jobs_df[job_text_cols].fillna('')


In [102]:
resume_text_cols = [
    'skills',
    'experience',
    'education',
    'summary'
]

resume_text_cols = [col for col in resume_text_cols if col in resumes_df.columns]
resumes_df[resume_text_cols] = resumes_df[resume_text_cols].fillna('')


In [103]:
# Combine job text
jobs_df['job_text'] = jobs_df[job_text_cols].apply(lambda x: ' '.join(x), axis=1)
jobs_df['job_text'] = jobs_df['job_text'].str.lower()


# Combine resume text
resumes_df['resume_text'] = resumes_df[resume_text_cols].apply(lambda x: ' '.join(x), axis=1)


## Data Preprocessing 

In [104]:
import re
import string

def clean_text(text):
    text = text.lower()
    text = re.sub(f"[{string.punctuation}]", " ", text)
    text = re.sub(r"\d+", "", text)
    text = re.sub(r"\s+", " ", text)
    return text.strip()


In [105]:
jobs_df['job_text'] = jobs_df['job_text'].apply(clean_text)
resumes_df['resume_text'] = resumes_df['resume_text'].apply(clean_text)


## Train Test Split

In [106]:
X = jobs_df['job_text']
y = jobs_df['fraudulent']


In [107]:
y.value_counts()


fraudulent
0    17014
1      866
Name: count, dtype: int64

In [108]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y
)


In [109]:
print("Training samples:", len(X_train))
print("Testing samples:", len(X_test))


Training samples: 14304
Testing samples: 3576


In [110]:
print("Train distribution:\n", y_train.value_counts(normalize=True))
print("\nTest distribution:\n", y_test.value_counts(normalize=True))


Train distribution:
 fraudulent
0    0.951552
1    0.048448
Name: proportion, dtype: float64

Test distribution:
 fraudulent
0    0.951622
1    0.048378
Name: proportion, dtype: float64


## TF-IDF

In [111]:
from sklearn.feature_extraction.text import TfidfVectorizer

tfidf = TfidfVectorizer(
    stop_words='english',      
    max_features=5000,         
    ngram_range=(1,2)          
)

X_train_tfidf = tfidf.fit_transform(X_train)

X_test_tfidf = tfidf.transform(X_test)

In [112]:
print("Training shape:", X_train_tfidf.shape)
print("Testing shape:", X_test_tfidf.shape)


Training shape: (14304, 5000)
Testing shape: (3576, 5000)


## Logistic Regression Model

In [113]:
from sklearn.linear_model import LogisticRegression

model = LogisticRegression(
    class_weight='balanced',   
    max_iter=1000,
    random_state=42
)


In [114]:
model.fit(X_train_tfidf, y_train)

y_pred = model.predict(X_test_tfidf)

In [115]:
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

print("Accuracy:", accuracy_score(y_test, y_pred))


Accuracy: 0.9711968680089486


In [116]:
print(classification_report(y_test, y_pred))


              precision    recall  f1-score   support

           0       1.00      0.97      0.98      3403
           1       0.64      0.91      0.75       173

    accuracy                           0.97      3576
   macro avg       0.82      0.94      0.87      3576
weighted avg       0.98      0.97      0.97      3576



In [117]:
print(confusion_matrix(y_test, y_pred))


[[3315   88]
 [  15  158]]


In [128]:
legit_jobs = jobs_df[jobs_df['fraudulent'] == 0].copy()
legit_jobs.columns

Index(['job_id', 'title', 'location', 'department', 'salary_range',
       'company_profile', 'description', 'requirements', 'benefits',
       'telecommuting', 'has_company_logo', 'has_questions', 'employment_type',
       'required_experience', 'required_education', 'industry', 'function',
       'fraudulent', 'job_text'],
      dtype='str')

In [130]:
print("Before:", legit_jobs.shape)

legit_jobs = legit_jobs[legit_jobs['company_profile'].notna() & (legit_jobs['company_profile'].str.strip() != "")]

print("After:", legit_jobs.shape)


Before: (17014, 19)
After: (14293, 19)


In [131]:
legit_jobs['job_match_text'] = (
    legit_jobs['title'].fillna('') * 3 + " " +
    legit_jobs['description'].fillna('') + " " +
    legit_jobs['company_profile'].fillna('')
)


In [132]:
import re

def clean_text(text):
    text = text.lower()
    text = re.sub(r'[^a-zA-Z0-9 ]', ' ', text)
    return text

legit_jobs['job_match_text'] = legit_jobs['job_match_text'].apply(clean_text)


In [133]:
from sklearn.feature_extraction.text import TfidfVectorizer

tfidf_recommend = TfidfVectorizer(stop_words='english', ngram_range=(1,2) ,max_features=5000)

job_vectors = tfidf_recommend.fit_transform(legit_jobs['job_match_text'])


In [134]:
from sklearn.metrics.pairwise import cosine_similarity

In [135]:
def recommend_jobs(user_input, top_n=5):
    
    user_input = user_input.lower()
    user_vector = tfidf_recommend.transform([user_input])
    similarity_scores = cosine_similarity(user_vector, job_vectors)
    top_indices = similarity_scores.argsort()[0][-top_n:][::-1]
    recommended_jobs = legit_jobs.iloc[top_indices][
        ['title', 'company_profile', 'location']
    ].copy()
    recommended_jobs['similarity_score'] = (similarity_scores.flatten()[top_indices] * 100).round(2).astype(str) + "%"

    recommended_jobs.insert(0, 'Rank', range(1, len(recommended_jobs) + 1))

    return recommended_jobs



In [136]:
recommend_jobs("python ")


Unnamed: 0,Rank,title,company_profile,location,similarity_score
8225,1,Junior Python Developer,Playfair Capital is an early stage technology ...,"GB, , London",58.7%
3943,2,Python Developer,We are a small gaming startup based in Timisoa...,"RO, TM, Timisoara",48.0%
204,3,Junior Python Developer,Playfair Capital is an early stage technology ...,,45.08%
7325,4,DSP Research Engineer - C++ Python Linux - Fra...,We Provide Full Time Permanent Positions for m...,"US, CA, San Francisco",38.51%
12129,5,Python Software Engineer,We are a young tech company specialising in lo...,"GR, I, Athens",38.22%


In [125]:
# legit_jobs.to_csv("../../data/processed/clean_jobs.csv", index=False)
# resumes_df.to_csv("../../data/processed/clean_resumes.csv", index=False)