In [None]:
# ==========================================
# 1. IMPORT LIBRARIES
# ==========================================
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Fetch the NLTK resources used further down: the English stopword list,
# WordNet (needed by WordNetLemmatizer) and omw-1.4 (Open Multilingual
# Wordnet data). NLTK reports packages that are already installed as
# up-to-date, so re-running this cell is harmless.
for resource in ('stopwords', 'wordnet', 'omw-1.4'):
    nltk.download(resource)

# The status marker is the check-mark emoji; it had been stored as mis-decoded
# bytes and is restored here.
print("✅ Libraries Imported Successfully")

# ==========================================
# 2. LOAD THE DATASET
# ==========================================
# Single source of truth for the CSV location, so the encoding fallback below
# retries the SAME file instead of silently switching to a different path.
DATA_PATH = '/content/fake_job_postings.csv'

try:
    df = pd.read_csv(DATA_PATH, encoding='utf-8')
except UnicodeDecodeError:
    # The file was found but is not valid UTF-8. latin-1 assigns a character
    # to every possible byte, so this retry cannot fail on decoding.
    df = pd.read_csv(DATA_PATH, encoding='latin-1')

print(f"\n✅ Dataset Loaded. Shape: {df.shape}")

# Show columns to ensure we select the right ones
print("Columns:", df.columns.tolist())

# ==========================================
# 3. DATA CLEANING & PREPARATION
# ==========================================
# Give the model one free-text field per posting made of
# title + description + requirements.
# Missing values are replaced by empty strings first: concatenating a NaN
# would otherwise make the whole combined text NaN.
df = df.fillna('')

df['text'] = df['title'].str.cat(
    [df['description'], df['requirements']],
    sep=' ',
)

# Postings whose combined text is identical are collapsed to the first one
initial_count = len(df)
df.drop_duplicates(subset=['text'], inplace=True)
print(f"\nRemoved {initial_count - len(df)} duplicates.")

# Target column: 'fraudulent' (1 = Fake, 0 = Real)
y = df['fraudulent']

# How many postings fall into each class?
print("\n--- Class Balance ---")
print(y.value_counts())
# Note: expect VERY few 1s (Fake) compared to 0s (Real)

# ==========================================
# 4. TEXT PREPROCESSING
# ==========================================
# Shared resources, built once so that preprocess_text stays cheap when it is
# applied to every row of the dataset.
lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words('english'))
# One or more characters that are neither a lowercase letter nor whitespace.
_NON_LETTER_RE = re.compile(r'[^a-z\s]+')


def preprocess_text(text):
    """Normalise raw posting text into a space-separated string of lemmas.

    Steps:
      1. Lowercase the text.
      2. Replace every run of non-letter characters (punctuation, digits,
         symbols) with a single space. Replacing rather than deleting keeps
         words apart that were only separated by punctuation, e.g.
         "work-from-home" -> "work from home" instead of "workfromhome".
      3. Split on whitespace.
      4. Drop English stopwords and lemmatize what remains. The lemmatizer is
         called without a part-of-speech tag, so tokens are treated as nouns
         (e.g. "companies" -> "company").

    Args:
        text: The raw text to clean. Anything that is not a ``str``
            (for example ``None`` or a NaN float) is treated as empty.

    Returns:
        The cleaned tokens joined by single spaces; ``''`` when nothing
        survives the filtering.
    """
    if not isinstance(text, str):
        return ''

    letters_only = _NON_LETTER_RE.sub(' ', text.lower())

    return ' '.join(
        lemmatizer.lemmatize(token)
        for token in letters_only.split()
        if token not in stop_words
    )

# Clean the combined text of every posting. The result goes into a new
# 'clean_text' column, leaving the raw 'text' column untouched.
# The emoji in the two status messages had been stored as mis-decoded bytes
# and are restored here.
print("\n🔄 Preprocessing text... (This might take 10-20 seconds)")
df['clean_text'] = df['text'].apply(preprocess_text)
print("✅ Preprocessing Complete.")

# ==========================================
# 5. TRAIN-TEST SPLIT
# ==========================================
# The cleaned TEXT is split before any vectorization so that the TF-IDF
# vocabulary and IDF weights can be learned from the training rows only.
# Fitting the vectorizer on the full corpus would leak information about the
# test postings into the features and make the evaluation too optimistic.
# stratify=y keeps the same ratio of Fake/Real in both train and test sets.
train_text, test_text, y_train, y_test = train_test_split(
    df['clean_text'], y, test_size=0.2, random_state=42, stratify=y
)

# ==========================================
# 6. VECTORIZATION (TF-IDF)
# ==========================================
# Convert text to numbers.
# max_features=5000 caps the vocabulary at the 5,000 most frequent terms.
# ngram_range=(1,2) looks at single words AND pairs (e.g., "wire transfer").
tfidf = TfidfVectorizer(max_features=5000, ngram_range=(1, 2))

X_train = tfidf.fit_transform(train_text)  # learn vocabulary + IDF here only
X_test = tfidf.transform(test_text)        # reuse the fitted vocabulary

# Matrix for every posting in original row order, kept so that code which
# still refers to `X` continues to work.
X = tfidf.transform(df['clean_text'])

print(f"\n✅ Text Vectorized. Shape: {X.shape}")

# ==========================================
# 7. MODEL TRAINING
# ==========================================
# CRITICAL: class_weight='balanced'
# Fake postings are rare, so each class is weighted inversely to its
# frequency; otherwise the model could score well by always predicting "Real".
# max_iter is raised to 1000 to give the solver more iterations to converge.
model = LogisticRegression(class_weight='balanced', max_iter=1000)
model.fit(X_train, y_train)

# The status marker is the check-mark emoji; it had been stored as mis-decoded
# bytes and is restored here.
print("\n✅ Model Trained Successfully")

# ==========================================
# 8. EVALUATION
# ==========================================
# Predict the held-out postings and summarise how well the labels match.
y_pred = model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)

print("\n--- Model Performance ---")
print(f"Accuracy: {accuracy:.4f}")
print("\nClassification Report:")
print(classification_report(y_test, y_pred))

# Confusion matrix as an annotated heatmap (rows = actual, columns = predicted)
class_names = ['Real', 'Fake']
fig, ax = plt.subplots(figsize=(6, 4))
cm = confusion_matrix(y_test, y_pred)
sns.heatmap(
    cm,
    annot=True,
    fmt='d',
    cmap='Blues',
    xticklabels=class_names,
    yticklabels=class_names,
    ax=ax,
)
ax.set_title('Confusion Matrix')
ax.set_xlabel('Predicted')
ax.set_ylabel('Actual')
plt.show()

# ==========================================
# 9. MOST INFORMATIVE FEATURES
# ==========================================
# Which terms push the model towards FAKE, and which towards REAL?
# coef_[0] holds one weight per TF-IDF feature, in the same order as the
# vectorizer's feature names.
feature_names = tfidf.get_feature_names_out()
coefficients = model.coef_[0]

# Feature indices ordered from most negative to most positive coefficient
sorted_idx = np.argsort(coefficients)

print("\n--- Top 10 Words Indicating a FAKE Job ---")
# Largest positive coefficients -> Fake. The order is reversed so the
# strongest indicator is printed first, matching the REAL list below.
for i in sorted_idx[::-1][:10]:
    print(f"{feature_names[i]} ({coefficients[i]:.2f})")

print("\n--- Top 10 Words Indicating a REAL Job ---")
# Largest negative coefficients -> Real (strongest indicator first)
for i in sorted_idx[:10]:
    print(f"{feature_names[i]} ({coefficients[i]:.2f})")

# ==========================================
# 10. PREDICT ON CUSTOM INPUT
# ==========================================
def detect_fake_job(job_text):
    """Classify one job posting and print the verdict.

    The text goes through the same ``preprocess_text`` cleaning and the
    already-fitted ``tfidf`` vectorizer as the training data, and is then
    scored by the trained ``model``.

    Args:
        job_text: Raw text of the posting to check.

    Returns:
        None. Two lines are printed: a preview of the input (at most 60
        characters, followed by "..." only when the text was actually cut)
        and the predicted label with the fraud probability.
    """
    cleaned = preprocess_text(job_text)
    vectorized = tfidf.transform([cleaned])
    prediction = model.predict(vectorized)[0]
    # Column 1 of predict_proba is the second class, i.e. label 1 (Fake)
    # for the 0/1 target used in training.
    confidence = model.predict_proba(vectorized)[0][1]

    label = "🚨 FAKE" if prediction == 1 else "✅ REAL"
    preview = job_text[:60]
    suffix = "..." if len(job_text) > 60 else ""
    print(f"\nInput: {preview}{suffix}")
    print(f"Result: {label} (Fraud Probability: {confidence:.2%})")

# Smoke-test the detector on two hand-written postings: one with typical scam
# wording and one that reads like an ordinary engineering role.
for sample_posting in (
    "Data Entry Clerk needed. Work from home. Earn $5000 a week. No interview required.",
    "Senior Software Engineer. Experience with Python, SQL, and AWS required. On-site in New York.",
):
    detect_fake_job(sample_posting)

# ==========================================
# 11. END OF PROJECT SUMMARY
# ==========================================

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


✅ Libraries Imported Successfully

✅ Dataset Loaded. Shape: (17880, 18)
Columns: ['job_id', 'title', 'location', 'department', 'salary_range', 'company_profile', 'description', 'requirements', 'benefits', 'telecommuting', 'has_company_logo', 'has_questions', 'employment_type', 'required_experience', 'required_education', 'industry', 'function', 'fraudulent']

Removed 1932 duplicates.

--- Class Balance ---
fraudulent
0    15260
1      688
Name: count, dtype: int64

🔄 Preprocessing text... (This might take 10-20 seconds)
