In [1]:
import pandas as pd
df = pd.read_csv("fake_job_postings.csv")

In [2]:
df.columns

Index(['job_id', 'title', 'location', 'department', 'salary_range',
       'company_profile', 'description', 'requirements', 'benefits',
       'telecommuting', 'has_company_logo', 'has_questions', 'employment_type',
       'required_experience', 'required_education', 'industry', 'function',
       'fraudulent'],
      dtype='object')

In [3]:
df.isnull().sum()

job_id                     0
title                      0
location                 346
department             11547
salary_range           15012
company_profile         3308
description                1
requirements            2696
benefits                7212
telecommuting              0
has_company_logo           0
has_questions              0
employment_type         3471
required_experience     7050
required_education      8105
industry                4903
function                6455
fraudulent                 0
dtype: int64

In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 17880 entries, 0 to 17879
Data columns (total 18 columns):
 #   Column               Non-Null Count  Dtype 
---  ------               --------------  ----- 
 0   job_id               17880 non-null  int64 
 1   title                17880 non-null  object
 2   location             17534 non-null  object
 3   department           6333 non-null   object
 4   salary_range         2868 non-null   object
 5   company_profile      14572 non-null  object
 6   description          17879 non-null  object
 7   requirements         15184 non-null  object
 8   benefits             10668 non-null  object
 9   telecommuting        17880 non-null  int64 
 10  has_company_logo     17880 non-null  int64 
 11  has_questions        17880 non-null  int64 
 12  employment_type      14409 non-null  object
 13  required_experience  10830 non-null  object
 14  required_education   9775 non-null   object
 15  industry             12977 non-null  object
 16  func

In [5]:
df.describe()

Unnamed: 0,job_id,telecommuting,has_company_logo,has_questions,fraudulent
count,17880.0,17880.0,17880.0,17880.0,17880.0
mean,8940.5,0.042897,0.795302,0.491723,0.048434
std,5161.655742,0.202631,0.403492,0.499945,0.214688
min,1.0,0.0,0.0,0.0,0.0
25%,4470.75,0.0,1.0,0.0,0.0
50%,8940.5,0.0,1.0,0.0,0.0
75%,13410.25,0.0,1.0,1.0,0.0
max,17880.0,1.0,1.0,1.0,1.0


In [None]:
train = pd.read_csv("/home/prateek/Projects/Fake_Job_Post_Detection/artifact/07_14_2025_14_26_57/data_ingestion/ingested/train.csv")
test = pd.read_csv("/home/prateek/Projects/Fake_Job_Post_Detection/artifact/07_14_2025_14_26_57/data_ingestion/ingested/test.csv")

In [7]:
print(len(train))
train.isnull().sum()

14304


job_id                     0
title                      0
location                 282
department              9213
salary_range           12013
company_profile         2650
description                1
requirements            2137
benefits                5766
telecommuting              0
has_company_logo           0
has_questions              0
employment_type         2786
required_experience     5660
required_education      6510
industry                3930
function                5179
fraudulent                 0
dtype: int64

In [8]:
print(9209/14304)

0.6438059284116331


In [9]:
print(len(test))
test.isnull().sum()

3576


job_id                    0
title                     0
location                 64
department             2334
salary_range           2999
company_profile         658
description               0
requirements            559
benefits               1446
telecommuting             0
has_company_logo          0
has_questions             0
employment_type         685
required_experience    1390
required_education     1595
industry                973
function               1276
fraudulent                0
dtype: int64

In [10]:
print(2338/3576)

0.6538031319910514


In [11]:
print(train.groupby("fraudulent")["department"].value_counts(normalize=True))
print(train.groupby("fraudulent")["salary_range"].value_counts(normalize=True))

fraudulent  department  
0           Sales           0.091342
            Engineering     0.073529
            Marketing       0.064830
            Operations      0.043289
            IT              0.037904
                              ...   
1           Work at home    0.003802
            biotech         0.003802
            hr              0.003802
            medical         0.003802
            reception       0.003802
Name: proportion, Length: 1201, dtype: float64
fraudulent  salary_range
0           0-0             0.052805
            40000-50000     0.025931
            30000-40000     0.020745
            45000-67000     0.016502
            25000-30000     0.013673
                              ...   
1           80000-119000    0.005882
            80000-200000    0.005882
            80000-85000     0.005882
            90000-100000    0.005882
            Dec-25          0.005882
Name: proportion, Length: 807, dtype: float64


In [12]:
# Drop low-value columns
train.drop(columns=['department', 'salary_range'], inplace=True)

# Fill 'Unknown' for categoricals
unknown_fill_cols = ['location', 'employment_type', 'required_experience',
                     'required_education', 'industry', 'function']
for col in unknown_fill_cols:
    train[col] = train[col].fillna('Unknown')

# Fill empty strings for text
text_fill_cols = ['company_profile', 'requirements', 'benefits']
for col in text_fill_cols:
    train[col] = train[col].fillna('')

# Drop row with missing description
train.dropna(subset=['description'], inplace=True)


In [13]:
# Drop low-value columns
test.drop(columns=['department', 'salary_range'], inplace=True)

# Fill 'Unknown' for categoricals
unknown_fill_cols = ['location', 'employment_type', 'required_experience',
                     'required_education', 'industry', 'function']
for col in unknown_fill_cols:
    test[col] = test[col].fillna('Unknown')

# Fill empty strings for text
text_fill_cols = ['company_profile', 'requirements', 'benefits']
for col in text_fill_cols:
    test[col] = test[col].fillna('')

In [14]:
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split


# Split Features and Target

X_train = train.drop(columns=["fraudulent"])
y_train = train["fraudulent"]


# Define Column Groups

text_cols = ['title', 'company_profile', 'description', 'requirements', 'benefits']
cat_cols = ['location', 'employment_type', 'required_experience',
            'required_education', 'industry', 'function']
num_cols = ['telecommuting', 'has_company_logo', 'has_questions']


# Define Pipelines for Each Column Type

# One pipeline per text column (for better separation and parallel vectorization)
text_transformers = [
    (f"text_{col}", TfidfVectorizer(stop_words='english', max_features=300), col)
    for col in text_cols
]

# Categorical pipeline
cat_pipeline = Pipeline([
    ('onehot', OneHotEncoder(handle_unknown='ignore', sparse_output=True))
])

# Combine all in ColumnTransformer
preprocessor = ColumnTransformer(
    transformers=[
        *text_transformers,                     # Each text column
        ('cat', cat_pipeline, cat_cols),        # Categorical columns
        ('num', 'passthrough', num_cols)        # Numeric columns
    ],
    verbose=True
)


# Fit and Transform the Training Data

X_train_transformed = preprocessor.fit_transform(X_train)

print(f"Transformed X_train shape: {X_train_transformed.shape}")


[ColumnTransformer] .... (1 of 7) Processing text_title, total=   0.1s
[ColumnTransformer]  (2 of 7) Processing text_company_profile, total=   0.7s
[ColumnTransformer]  (3 of 7) Processing text_description, total=   1.5s
[ColumnTransformer]  (4 of 7) Processing text_requirements, total=   0.7s
[ColumnTransformer] . (5 of 7) Processing text_benefits, total=   0.3s
[ColumnTransformer] ........... (6 of 7) Processing cat, total=   0.0s
[ColumnTransformer] ........... (7 of 7) Processing num, total=   0.0s
Transformed X_train shape: (14303, 4427)


In [15]:
# Split Features and Target

X_test = test.drop(columns=["fraudulent"])
y_test = test["fraudulent"]


X_test_transformed = preprocessor.transform(X_test)

print(f"Transformed X_test shape: {X_test_transformed.shape}")


Transformed X_test shape: (3576, 4427)


In [16]:
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.svm import SVC

model_configs = {
    "LogisticRegression": LogisticRegression(max_iter=1000, class_weight='balanced'),
    
    "RandomForest": RandomForestClassifier(
        n_estimators=100, max_depth=10, class_weight='balanced', random_state=42
    ),

    "XGBoost": XGBClassifier(
        use_label_encoder=False, eval_metric='logloss', max_depth=6, learning_rate=0.1
    ),

    "SVC": SVC(
        kernel='rbf', probability=True, class_weight='balanced'
    )
}

In [17]:
from sklearn.metrics import classification_report, roc_auc_score

def evaluate_model(name, model, X_train, y_train, X_test, y_test):
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    y_proba = model.predict_proba(X_test)[:, 1]

    print(f"\n====== {name} ======")
    print(classification_report(y_test, y_pred))
    print(f"ROC AUC Score: {roc_auc_score(y_test, y_proba):.4f}")
    
    return {
        "model": model,
        "roc_auc": roc_auc_score(y_test, y_proba)
    }


In [18]:
results = {}

for name, model in model_configs.items():
    result = evaluate_model(name, model, X_train_transformed, y_train, X_test_transformed, y_test)
    results[name] = result



              precision    recall  f1-score   support

           0       1.00      0.98      0.99      3394
           1       0.68      0.93      0.79       182

    accuracy                           0.97      3576
   macro avg       0.84      0.95      0.89      3576
weighted avg       0.98      0.97      0.98      3576

ROC AUC Score: 0.9893

              precision    recall  f1-score   support

           0       1.00      0.87      0.93      3394
           1       0.29      0.97      0.44       182

    accuracy                           0.88      3576
   macro avg       0.64      0.92      0.69      3576
weighted avg       0.96      0.88      0.91      3576

ROC AUC Score: 0.9727


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)



              precision    recall  f1-score   support

           0       0.98      1.00      0.99      3394
           1       0.98      0.66      0.79       182

    accuracy                           0.98      3576
   macro avg       0.98      0.83      0.89      3576
weighted avg       0.98      0.98      0.98      3576

ROC AUC Score: 0.9886

              precision    recall  f1-score   support

           0       0.99      1.00      0.99      3394
           1       0.93      0.86      0.89       182

    accuracy                           0.99      3576
   macro avg       0.96      0.93      0.94      3576
weighted avg       0.99      0.99      0.99      3576

ROC AUC Score: 0.9934


In [20]:
best_model_name = max(results, key=lambda x: results[x]['roc_auc'])
best_model = results[best_model_name]['model']

print(f"\n✅ Best model: {best_model_name}")



✅ Best model: SVC
