In [14]:
import pandas as pd
import numpy as np
import re
import string
import warnings
warnings.filterwarnings("ignore")

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix,roc_auc_score
from sklearn.metrics.pairwise import cosine_similarity


### Cleaning function

In [2]:
def clean_text(text):
    text = str(text).lower()
    text = re.sub(f"[{string.punctuation}]", " ", text)
    text = re.sub(r"\d+", " ", text)
    text = re.sub(r"\s+", " ", text)
    return text.strip()


### Load & Preprocess Data

In [5]:
# Load
jobs_df = pd.read_csv("../../data/raw/job_postings.csv")
resumes_df = pd.read_csv("../../data/raw/resume.csv")

# Remove duplicates
jobs_df.drop_duplicates(inplace=True)

# Fill categorical columns
cat_columns = [
    'employment_type', 'required_experience',
    'required_education', 'industry', 'function'
]

for col in cat_columns:
    if col in jobs_df.columns:
        jobs_df[col].fillna(jobs_df[col].mode()[0], inplace=True)

# Fill other columns
jobs_df.fillna({
    'location': 'Not Specified',
    'salary_range': 'Not Disclosed',
    'department': 'General',
    'benefits': 'No benefits information provided',
    'company_profile': 'Company information not available'
}, inplace=True).head()


Unnamed: 0,job_id,title,location,department,salary_range,company_profile,description,requirements,benefits,telecommuting,has_company_logo,has_questions,employment_type,required_experience,required_education,industry,function,fraudulent
0,1,Marketing Intern,"US, NY, New York",Marketing,Not Disclosed,"We're Food52, and we've created a groundbreaki...","Food52, a fast-growing, James Beard Award-winn...",Experience with content management systems a m...,No benefits information provided,0,1,0,Other,Internship,,,Marketing,0
1,2,Customer Service - Cloud Video Production,"NZ, , Auckland",Success,Not Disclosed,"90 Seconds, the worlds Cloud Video Production ...",Organised - Focused - Vibrant - Awesome!Do you...,What we expect from you:Your key responsibilit...,What you will get from usThrough being part of...,0,1,0,Full-time,Not Applicable,,Marketing and Advertising,Customer Service,0
2,3,Commissioning Machinery Assistant (CMA),"US, IA, Wever",General,Not Disclosed,Valor Services provides Workforce Solutions th...,"Our client, located in Houston, is actively se...",Implement pre-commissioning and commissioning ...,No benefits information provided,0,1,0,,,,,,0
3,4,Account Executive - Washington DC,"US, DC, Washington",Sales,Not Disclosed,Our passion for improving quality of life thro...,THE COMPANY: ESRI – Environmental Systems Rese...,"EDUCATION: Bachelor’s or Master’s in GIS, busi...",Our culture is anything but corporate—we have ...,0,1,0,Full-time,Mid-Senior level,Bachelor's Degree,Computer Software,Sales,0
4,5,Bill Review Manager,"US, FL, Fort Worth",General,Not Disclosed,SpotSource Solutions LLC is a Global Human Cap...,JOB TITLE: Itemization Review ManagerLOCATION:...,QUALIFICATIONS:RN license in the State of Texa...,Full Benefits Offered,0,1,1,Full-time,Mid-Senior level,Bachelor's Degree,Hospital & Health Care,Health Care Provider,0


### Create Combined Text Fields

In [6]:
job_text_cols = [
    'title', 'company_profile', 'description',
    'requirements', 'benefits', 'industry'
]

job_text_cols = [col for col in job_text_cols if col in jobs_df.columns]

jobs_df['job_text'] = (
    jobs_df[job_text_cols]
    .fillna('')
    .astype(str)
    .agg(' '.join, axis=1)
)

jobs_df['job_text'] = jobs_df['job_text'].apply(clean_text)


## FRAUD DETECTION MODEL

In [15]:


# Features and target
X = jobs_df['job_text']
y = jobs_df['fraudulent']

# Train-test split (stratified)
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.2,
    random_state=42,
    stratify=y
)

# Improved pipeline
fraud_pipeline = Pipeline([
    ('tfidf', TfidfVectorizer(
        stop_words='english',
        max_features=8000,      # Increased from 5000
        ngram_range=(1,2),
        min_df=3                # Ignore very rare words
    )),
    ('clf', LogisticRegression(
        class_weight='balanced',
        max_iter=2000,          # Increased for convergence
        solver='liblinear',     # Good for small/medium sparse data
        C=0.7,                  # Slight regularization tuning
        random_state=42
    ))
])

# Train
fraud_pipeline.fit(X_train, y_train)

# Predict probabilities
y_probs = fraud_pipeline.predict_proba(X_test)[:, 1]

# Custom threshold (try 0.6 for better precision)
threshold = 0.6
y_pred = (y_probs > threshold).astype(int)

# Evaluation
print("Accuracy:", accuracy_score(y_test, y_pred))
print("ROC-AUC:", roc_auc_score(y_test, y_probs))
print(classification_report(y_test, y_pred))
print(confusion_matrix(y_test, y_pred))


Accuracy: 0.9753914988814317
ROC-AUC: 0.9896283286253713
              precision    recall  f1-score   support

           0       0.99      0.98      0.99      3403
           1       0.70      0.85      0.77       173

    accuracy                           0.98      3576
   macro avg       0.85      0.92      0.88      3576
weighted avg       0.98      0.98      0.98      3576

[[3341   62]
 [  26  147]]


## JOB RECOMMENDATION SYSTEM

### Filter Only Legit Jobs

In [8]:
legit_jobs = jobs_df[jobs_df['fraudulent'] == 0].copy()

legit_jobs['job_match_text'] = (
    legit_jobs['title'].fillna('') * 3 + " " +
    legit_jobs['description'].fillna('') + " " +
    legit_jobs['company_profile'].fillna('')
)

legit_jobs['job_match_text'] = legit_jobs['job_match_text'].apply(clean_text)


### Vectorization

In [9]:
tfidf_recommend = TfidfVectorizer(
    stop_words='english',
    ngram_range=(1,2),
    max_features=5000
)

job_vectors = tfidf_recommend.fit_transform(legit_jobs['job_match_text'])


### Recommendation Function

In [10]:
def recommend_jobs(user_input, top_n=5):
    
    user_input = clean_text(user_input)
    user_vector = tfidf_recommend.transform([user_input])

    similarity_scores = cosine_similarity(user_vector, job_vectors).flatten()
    
    top_indices = similarity_scores.argsort()[-top_n:][::-1]
    
    recommended = legit_jobs.iloc[top_indices][
        ['title', 'company_profile', 'location']
    ].copy()

    recommended['similarity_score'] = (
        similarity_scores[top_indices] * 100
    ).round(2)

    recommended.insert(0, 'Rank', range(1, len(recommended)+1))

    return recommended


In [11]:
recommend_jobs("python developer machine learning")



Unnamed: 0,Rank,title,company_profile,location,similarity_score
2290,1,Machine Learning Scientist,We combine advanced machine learning and state...,"GB, ,",57.75
11157,2,Senior Computer Vision/Machine Learning Progra...,Company information not available,"US, TX, Austin",55.43
13196,3,Junior/Intermediate Python Dev,Company information not available,"CA, ON, Toronto",47.53
7128,4,"Data Scientist (Big Data, Machine Learning)",Merchenta’s behavioural advertising platform i...,"PL, , krakow",43.35
11204,5,Senior Python Dev,Company information not available,"CA, ON, Toronto",42.44
