In [1]:
import re
import pandas as pd
import nltk
import spacy
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Conv1D, GlobalMaxPooling1D, LSTM, Dense, Dropout
from tensorflow.keras.optimizers import Adam

# Download stopwords
nltk.download('stopwords')
stopwords = set(nltk.corpus.stopwords.words('english'))

# Load SpaCy model
nlp = spacy.load("en_core_web_sm", disable=['ner', 'parser'])

# ---------------------------------------------------------
# 1. CLEANING FUNCTION
# ---------------------------------------------------------
def clean_text(text):
    text = str(text)

    # Remove URLs & websites
    text = re.sub(r'http\S+|www\S+', ' ', text)

    # Remove emails
    text = re.sub(r'\S+@\S+', ' ', text)

    # Remove phone numbers
    text = re.sub(r'\+?\d[\d -]{8,12}\d', ' ', text)

    # Remove emojis
    text = re.sub(r'[\U00010000-\U0010ffff]', '', text)

    # Keep only alphabets, numbers, spaces
    text = re.sub(r'[^\w\s]', ' ', text)

    # Lowercase
    text = text.lower()

    # Remove extra spaces
    text = re.sub(r'\s+', ' ', text).strip()

    return text

# ---------------------------------------------------------
# 2. LOAD AND CLEAN DATASET
# ---------------------------------------------------------

df = pd.read_csv("fake_job_posts.csv")

# Keep only important columns
columns_to_keep = ["title", "location", "description", "requirements", "salary_range"]
df = df[columns_to_keep]

# Remove rows with missing description
df = df.dropna(subset=["description"])

# Remove duplicate rows
df = df.drop_duplicates()

# Remove very short descriptions
df = df[df["description"].str.len() > 20]

# Remove non-alphabetic rows
df = df[df["description"].str.contains('[A-Za-z]', regex=True)]

# Apply cleaning
df["clean_text"] = df["description"].apply(clean_text)

# Save cleaned file
df.to_csv("cleaned_jobs.csv", index=False)
print("✔ Cleaning completed and saved to cleaned_jobs.csv")

# ---------------------------------------------------------
# 3. PREPROCESSING — Lemmatization
# ---------------------------------------------------------
def lemmatize(text):
    doc = nlp(text)
    return " ".join([token.lemma_ for token in doc if token.text not in stopwords])

df["processed_text"] = df["clean_text"].apply(lemmatize)

# ---------------------------------------------------------
# 4. TOKENIZATION + PADDING
# ---------------------------------------------------------
texts = df["processed_text"].tolist()

tokenizer = Tokenizer(num_words=10000, oov_token="<OOV>")
tokenizer.fit_on_texts(texts)

sequences = tokenizer.texts_to_sequences(texts)
padded = pad_sequences(sequences, maxlen=200, padding='post')

# Fake label (0 or 1) for model demo
df["fake"] = df["title"].apply(lambda x: 1 if "intern" in str(x).lower() else 0)
labels = df["fake"].values

# ---------------------------------------------------------
# 5. CNN MODEL USING ADAM OPTIMIZER
# ---------------------------------------------------------
cnn_model = Sequential([
    Embedding(10000, 64, input_length=200),
    Conv1D(64, 3, activation='relu'),
    GlobalMaxPooling1D(),
    Dense(32, activation='relu'),
    Dropout(0.3),
    Dense(1, activation='sigmoid')
])

cnn_model.compile(
    optimizer=Adam(learning_rate=0.001),
    loss='binary_crossentropy',
    metrics=['accuracy']
)

print("\n Training CNN model...")
cnn_model.fit(padded, labels, epochs=3, batch_size=32)

cnn_model.save("cnn_model.h5")
print("✔ CNN model saved as cnn_model.h5")

# ---------------------------------------------------------
# 6. RNN (LSTM) MODEL USING ADAM OPTIMIZER
# ---------------------------------------------------------
rnn_model = Sequential([
    Embedding(10000, 64, input_length=200),
    LSTM(64, return_sequences=False),
    Dense(32, activation='relu'),
    Dropout(0.3),
    Dense(1, activation='sigmoid')
])

rnn_model.compile(
    optimizer=Adam(learning_rate=0.001),
    loss='binary_crossentropy',
    metrics=['accuracy']
)

print("\n Training RNN (LSTM) model...")
rnn_model.fit(padded, labels, epochs=3, batch_size=32)

rnn_model.save("rnn_model.h5")
print(" RNN model saved as rnn_model.h5")


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


✔ Cleaning completed and saved to cleaned_jobs.csv





 Training CNN model...
Epoch 1/3
[1m546/546[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m16s[0m 25ms/step - accuracy: 0.9661 - loss: 0.1956
Epoch 2/3
[1m546/546[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m13s[0m 24ms/step - accuracy: 0.9819 - loss: 0.0647
Epoch 3/3
[1m546/546[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m13s[0m 24ms/step - accuracy: 0.9871 - loss: 0.0376




✔ CNN model saved as cnn_model.h5

 Training RNN (LSTM) model...
Epoch 1/3
[1m546/546[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m66s[0m 114ms/step - accuracy: 0.9565 - loss: 0.2057
Epoch 2/3
[1m546/546[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m62s[0m 114ms/step - accuracy: 0.9679 - loss: 0.1563
Epoch 3/3
[1m546/546[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m81s[0m 112ms/step - accuracy: 0.9652 - loss: 0.1619




 RNN model saved as rnn_model.h5


In [2]:
import pandas as pd
import re
from sklearn.feature_extraction.text import TfidfVectorizer

# -------------------------------------------------------
# 1. LOAD CLEANED DATASET
# -------------------------------------------------------
df = pd.read_csv("cleaned_jobs.csv")

print("Dataset columns:", list(df.columns))

# -------------------------------------------------------
# 2. SIMPLE PREPROCESSING FUNCTION
#    (use existing clean_text column if available)
# -------------------------------------------------------
def preprocess_text(text):
    if pd.isna(text):
        return ""
    text = text.lower()
    text = re.sub(r'\d+', '', text)
    text = re.sub(r'[^\w\s]', ' ', text)
    text = re.sub(r'\s+', ' ', text).strip()
    return text

if "clean_text" in df.columns:
    text_col = "clean_text"
else:
    text_col = "description"

print(f"Processing column: '{text_col}'")
df["processed_text"] = df[text_col].astype(str).apply(preprocess_text)

# -------------------------------------------------------
# 3. TF-IDF VECTORISATION
# -------------------------------------------------------
tfidf = TfidfVectorizer(max_features=5000)
X = tfidf.fit_transform(df["processed_text"])

# -------------------------------------------------------
# 4. SAVE OUTPUTS
# -------------------------------------------------------
pd.DataFrame(X.toarray(),
             columns=tfidf.get_feature_names_out()).to_csv("tfidf_features.csv",
                                                          index=False)
df.to_csv("final_preprocessed_dataset.csv", index=False)

print("Preprocessing Completed!")
print("Saved: final_preprocessed_dataset.csv")
print("TF-IDF Features Saved: tfidf_features.csv")


Dataset columns: ['title', 'location', 'description', 'requirements', 'salary_range', 'clean_text']
Processing column: 'clean_text'
Preprocessing Completed!
Saved: final_preprocessed_dataset.csv
TF-IDF Features Saved: tfidf_features.csv


In [3]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Preprocessed text column from your cleaned dataset
texts = df["processed_text"]

tfidf = TfidfVectorizer(
    max_features=5000,
    stop_words="english",
    ngram_range=(1,2)   # unigrams + bigrams
)

X = tfidf.fit_transform(texts)

print("TF-IDF Shape:", X.shape)


TF-IDF Shape: (17454, 5000)


In [None]:
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import CountVectorizer
import nltk
import re

# -----------------------------
# 1. INPUT: Load Dataset
# -----------------------------
df = pd.read_csv("fake_job_posts.csv")

# Keep only important columns
df = df[['description', 'fraudulent']]
df = df.dropna()

# -----------------------------
# 2. PREPROCESSING
# -----------------------------
def clean_text(text):
    text = text.lower()                                    # lowercase
    text = re.sub(r'http\S+|www.\S+', '', text)            # remove URLs
    text = re.sub(r'[^a-zA-Z ]', ' ', text)                # remove special chars
    text = re.sub(r'\s+', ' ', text).strip()               # remove extra spaces
    return text

df['clean_text'] = df['description'].apply(clean_text)

# -----------------------------
# 3. FEATURE EXTRACTION (CountVectorizer)
# -----------------------------
vectorizer = CountVectorizer(stop_words='english', max_features=50)
X = vectorizer.fit_transform(df['clean_text'])

# Get top 50 common words overall
common_words = vectorizer.get_feature_names_out()


# =========================================================
# ===============    ANALYSIS TASKS    ====================
# =========================================================

# ---------------------------------------------------------
# TASK 1: Visualize fake vs real job posts
# ---------------------------------------------------------
plt.figure(figsize=(6,4))
df['fraudulent'].value_counts().plot(kind='bar')
plt.title("Fake vs Real Job Posts")
plt.xticks([0,1], labels=['Real (0)', 'Fake (1)'])
plt.xlabel("Category")
plt.ylabel("Count")
plt.show()


# ---------------------------------------------------------
# TASK 2: Text Length Analysis
# ---------------------------------------------------------
df['text_length'] = df['clean_text'].apply(lambda x: len(x.split()))

plt.figure(figsize=(7,4))
df['text_length'].hist(bins=40)
plt.title("Distribution of Text Lengths")
plt.xlabel("Number of Words")
plt.ylabel("Frequency")
plt.show()


# ANALYSIS: Compare text length of real vs fake
plt.figure(figsize=(7,4))
df[df['fraudulent']==0]['text_length'].hist(alpha=0.5, label='Real', bins=40)
df[df['fraudulent']==1]['text_length'].hist(alpha=0.5, label='Fake', bins=40)
plt.legend()
plt.title("Text Length Comparison: Real vs Fake")
plt.xlabel("Number of Words")
plt.ylabel("Frequency")
plt.show()


# ---------------------------------------------------------
# TASK 3: Common words in FAKE job posts
# ---------------------------------------------------------
fake_df = df[df['fraudulent'] == 1]

fake_vectorizer = CountVectorizer(stop_words='english', max_features=20)
fake_matrix = fake_vectorizer.fit_transform(fake_df['clean_text'])

fake_words = fake_vectorizer.get_feature_names_out()
fake_counts = fake_matrix.sum(axis=0).A1

# Plot common words in fake job posts
plt.figure(figsize=(10,5))
plt.bar(fake_words, fake_counts)
plt.xticks(rotation=45)
plt.title("Top Common Words in Fake Job Posts")
plt.xlabel("Words")
plt.ylabel("Frequency")
plt.show()


In [5]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, classification_report
import pickle

# --------------------------------------------
# 1. LOAD PREPROCESSED DATA
# --------------------------------------------
df = pd.read_csv("final_preprocessed_dataset.csv")
print("Columns:", df.columns.tolist())

# We already have processed_text
TEXT_COL = "processed_text"

# --------------------------------------------
# 2. CREATE LABEL COLUMN (DUMMY EXAMPLE)
#    0 = Real, 1 = Fake
# --------------------------------------------
if "fraudulent" not in df.columns:
    df["fraudulent"] = 0
    # mark first 100 rows as fake (you can change this rule later)
    n_fake = min(100, len(df))
    df.loc[:n_fake-1, "fraudulent"] = 1
    print(f"Created 'fraudulent' label: {n_fake} fake, {len(df)-n_fake} real")

LABEL_COL = "fraudulent"

X_text = df[TEXT_COL].astype(str)
y = df[LABEL_COL]

# --------------------------------------------
# 3. FEATURE EXTRACTION (TF-IDF)
# --------------------------------------------
tfidf = TfidfVectorizer(max_features=5000)
X = tfidf.fit_transform(X_text)

# Save TF-IDF vectorizer for Flask
with open("tfidf.pkl", "wb") as f:
    pickle.dump(tfidf, f)

# --------------------------------------------
# 4. TRAIN-TEST SPLIT
# --------------------------------------------
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# --------------------------------------------
# 5. TRAIN MODELS
# --------------------------------------------
log_model = LogisticRegression(max_iter=1000)
log_model.fit(X_train, y_train)

nb_model = MultinomialNB()
nb_model.fit(X_train, y_train)

# --------------------------------------------
# 6. PREDICTIONS & METRICS
# --------------------------------------------
log_pred = log_model.predict(X_test)
nb_pred = nb_model.predict(X_test)

log_acc = accuracy_score(y_test, log_pred)
nb_acc  = accuracy_score(y_test, nb_pred)

print("\nLogistic Regression Accuracy :", log_acc)
print("Naive Bayes Accuracy        :", nb_acc)

print("\n--- Logistic Regression Report ---")
print(classification_report(y_test, log_pred))

print("\n--- Naive Bayes Report ---")
print(classification_report(y_test, nb_pred))

# --------------------------------------------
# 7. SAVE MODELS FOR FLASK API
# --------------------------------------------
with open("log_model.pkl", "wb") as f:
    pickle.dump(log_model, f)

with open("nb_model.pkl", "wb") as f:
    pickle.dump(nb_model, f)

print("\nModels saved as:")
print(" → tfidf.pkl")
print(" → log_model.pkl")
print(" → nb_model.pkl")


Columns: ['title', 'location', 'description', 'requirements', 'salary_range', 'clean_text', 'processed_text']
Created 'fraudulent' label: 100 fake, 17354 real

Logistic Regression Accuracy : 0.9942709825264967
Naive Bayes Accuracy        : 0.9939845316528215

--- Logistic Regression Report ---
              precision    recall  f1-score   support

           0       0.99      1.00      1.00      3471
           1       0.00      0.00      0.00        20

    accuracy                           0.99      3491
   macro avg       0.50      0.50      0.50      3491
weighted avg       0.99      0.99      0.99      3491


--- Naive Bayes Report ---
              precision    recall  f1-score   support

           0       0.99      1.00      1.00      3471
           1       0.00      0.00      0.00        20

    accuracy                           0.99      3491
   macro avg       0.50      0.50      0.50      3491
weighted avg       0.99      0.99      0.99      3491


Models saved as:
 → tf

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [None]:
#  DistilBERT Fake Job Detector - YOUR UPLOADED DATASET (<18 mins)
# Perfect for your JobCheck project - Uses YOUR local dataset!

import os
os.environ["WANDB_DISABLED"] = "true"
os.environ["TOKENIZERS_PARALLELISM"] = "false"

import pandas as pd, torch, time, warnings, numpy as np
warnings.filterwarnings("ignore")
from transformers import DistilBertTokenizerFast, DistilBertForSequenceClassification, Trainer, TrainingArguments
from transformers import DataCollatorWithPadding
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_recall_fscore_support, classification_report
from datasets import Dataset

print(" YOUR DATASET - DistilBERT Fake Job Detector!")
print(" Analyzing your uploaded dataset...")
start_total = time.time()

# 1. LOAD YOUR DATASET (CSV/Excel support)
print("\n Loading your dataset...")
# Try common filenames first
dataset_files = ['fake_job_postings.csv', 'dataset.csv', 'jobs.csv', 'data.csv', 'train.csv']

df = None
for filename in dataset_files:
    try:
        if filename.endswith('.csv'):
            df = pd.read_csv(filename)
        else:
            df = pd.read_excel(filename)
        print(f" Loaded: {filename}")
        break
    except:
        continue

# If no common filename, load from user specification
if df is None:
    print("Available files:", os.listdir('.'))
    # Default to first CSV found
    csv_files = [f for f in os.listdir('.') if f.endswith('.csv')]
    if csv_files:
        df = pd.read_csv(csv_files[0])
        print(f" Auto-loaded: {csv_files[0]}")

print(f" Dataset shape: {df.shape}")
print(f" Columns: {list(df.columns)}")
print("\n First 3 rows:")
print(df.head(3))

# 2. AUTO-PREPROCESSING (Smart column detection)
print("\n Auto-detecting job text & labels...")

# Common text columns
text_cols = ['title', 'description', 'job_description', 'text', 'job_title', 'content']
text_col = None
for col in text_cols:
    if col in df.columns:
        text_col = col
        break

if text_col is None:
    # Combine title + description if both exist
    if 'title' in df.columns and 'description' in df.columns:
        df['text'] = df['title'].fillna('') + ' [SEP] ' + df['description'].fillna('')
        text_col = 'text'
    else:
        print(" No text columns found! Using first string column.")
        str_cols = df.select_dtypes(include=['object']).columns
        text_col = str_cols[0]

print(f" Text column: {text_col}")

# Common label columns (0/1, real/fake, legitimate/fraud)
label_cols = ['label', 'fake', 'is_fake', 'fraudulent', 'telecommuting', 'has_company_logo']
label_col = None

for col in label_cols:
    if col in df.columns:
        label_col = col
        break

if label_col is None:
    # Auto-detect binary column
    binary_cols = [col for col in df.columns if df[col].nunique() == 2]
    if binary_cols:
        label_col = binary_cols[0]
        print(f" Auto-detected label: {label_col}")

# Smart label mapping (0=real, 1=fake)
if label_col:
    print(f" Label distribution before processing: {df[label_col].value_counts().to_dict()}")

    # Map common patterns
    if df[label_col].dtype == 'object':
        df[label_col] = df[label_col].map({'real': 0, 'fake': 1, 'legitimate': 0, 'fraudulent': 1, 0: 0, 1: 1}).fillna(1)
    df['label'] = pd.to_numeric(df[label_col], errors='coerce').fillna(1).astype(int)

    print(f" Final labels: Real={sum(df.label==0)}, Fake={sum(df.label==1)}")

# Clean & balance dataset
df = df.dropna(subset=[text_col, 'label']).reset_index(drop=True)
df['text'] = df[text_col].astype(str)

# Balance classes (minority class size)
min_class = min(sum(df.label==0), sum(df.label==1))
real_sample = df[df.label==0].sample(n=min_class, random_state=42)
fake_sample = df[df.label==1].sample(n=min_class, random_state=42)
df = pd.concat([real_sample, fake_sample]).reset_index(drop=True)

print(f" Balanced dataset: {len(df):,} jobs (Real: {sum(df.label==0):,}, Fake: {sum(df.label==1):,})")

# 3. TOKENIZATION
print("\n Tokenizing your dataset...")
tokenizer = DistilBertTokenizerFast.from_pretrained('distilbert-base-uncased')

def tokenize(examples):
    return tokenizer(examples['text'], truncation=True, padding=True, max_length=256)

train_df, test_df = train_test_split(df[['text', 'label']], test_size=0.2, stratify=df.label, random_state=42)
train_ds = Dataset.from_pandas(train_df).map(tokenize, batched=True)
test_ds = Dataset.from_pandas(test_df).map(tokenize, batched=True)
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

# 4. TRAIN YOUR MODEL
print("\n Training on YOUR dataset...")
model = DistilBertForSequenceClassification.from_pretrained('distilbert-base-uncased', num_labels=2)

def compute_metrics(pred):
    labels = pred.label_ids
    preds = pred.predictions.argmax(-1)
    acc = accuracy_score(labels, preds)
    precision, recall, f1, _ = precision_recall_fscore_support(labels, preds, average='binary')
    return {'accuracy': acc, 'f1': f1, 'precision': precision, 'recall': recall}

trainer = Trainer(
    model=model,
    args=TrainingArguments(
        output_dir='./your_jobcheck_model',
        num_train_epochs=3,
        per_device_train_batch_size=8,
        per_device_eval_batch_size=16,
        warmup_steps=50,
        weight_decay=0.01,
        logging_steps=10,
        eval_strategy="epoch",
        save_strategy="epoch",
        load_best_model_at_end=True,
        dataloader_num_workers=0,
        report_to=None,
        fp16=torch.cuda.is_available(),
        save_total_limit=2,
        gradient_accumulation_steps=2
    ),
    train_dataset=train_ds,
    eval_dataset=test_ds,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics
)

trainer.train()

# 5. RESULTS
results = trainer.evaluate()
print(f"\n YOUR DATASET RESULTS:")
print(f" Accuracy:  {results['eval_accuracy']*100:.1f}%")
print(f" F1-Score:  {results['eval_f1']:.3f}")
print(f" Precision: {results['eval_precision']:.3f}")
print(f" Recall:    {results['eval_recall']:.3f}")

trainer.save_model("./your_jobcheck_model")
tokenizer.save_pretrained("./your_jobcheck_model")
print(" Saved: ./your_jobcheck_model/")

# 6. PRODUCTION PREDICTOR
print("\n Test your model:")
def predict_job(text):
    inputs = tokenizer(text, return_tensors="pt", truncation=True, max_length=256)
    model.eval()
    with torch.no_grad():
        outputs = model(**inputs)
    probs = torch.nn.functional.softmax(outputs.logits, dim=-1)
    pred = probs.argmax().item()
    return " FAKE" if pred == 1 else " REAL", probs[0][pred].item(), probs[0][1].item()

# Interactive testing
print("\n Enter job postings to test (Ctrl+C to stop):")
while True:
    try:
        job_text = input("\nJob posting: ")
        if not job_text.strip(): break
        verdict, conf, fake_prob = predict_job(job_text)
        print(f"   {verdict} ({conf:.1%} conf) | Fake risk: {fake_prob:.1%}")
    except KeyboardInterrupt:
        break


