# Setup & Imports

In [2]:
import os
import re
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from torch.nn.utils.rnn import pad_sequence
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score
from torchtext.vocab import build_vocab_from_iterator
from torchtext.data.utils import get_tokenizer
import nltk
from nltk.corpus import stopwords
import pickle

# --- Download NLTK data (if needed) ---
try:
    stop_words = set(stopwords.words('english'))
except LookupError:
    nltk.download('stopwords')
    stop_words = set(stopwords.words('english'))



In [3]:

print("Torch version:", torch.__version__)
print("CUDA version:", torch.version.cuda)
print("cuDNN version:", torch.backends.cudnn.version())
print("Is CUDA available?:", torch.cuda.is_available())
print("Device count:", torch.cuda.device_count())
if torch.cuda.is_available():
    print("Current device:", torch.cuda.current_device())
    print("Device name:", torch.cuda.get_device_name(0))


Torch version: 2.3.0+cu121
CUDA version: 12.1
cuDNN version: 8902
Is CUDA available?: True
Device count: 1
Current device: 0
Device name: NVIDIA GeForce RTX 3050 Laptop GPU


In [17]:
import os
import re
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from torch.nn.utils.rnn import pad_sequence
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score
from torchtext.vocab import build_vocab_from_iterator
from torchtext.data.utils import get_tokenizer
import nltk
from nltk.corpus import stopwords
import pickle

# --- Download NLTK data (if needed) ---
try:
    stop_words = set(stopwords.words('english'))
except LookupError:
    nltk.download('stopwords')
    stop_words = set(stopwords.words('english'))

# ==============================================================================
# 1. Configuration & File Paths
# ==============================================================================
FILE_PATH = '/home/vikhil/GROUP_1-INFOSYS/Member_Vikhil/Datasets/fake_job_cleaned_dataset.csv'
GLOVE_PATH = '/home/vikhil/GROUP_1-INFOSYS/Member_Vikhil/Models/glove.6B.100d.txt'
MODEL_SAVE_PATH = '/home/vikhil/GROUP_1-INFOSYS/Member_Vikhil/Models/hybrid_model_glove_v2.pth'
VOCAB_SAVE_PATH = '/home/vikhil/GROUP_1-INFOSYS/Member_Vikhil/Models/vocab_glove_v2.pth'
COLS_SAVE_PATH = '/home/vikhil/GROUP_1-INFOSYS/Member_Vikhil/Models/train_cols_glove_v2.pkl'

# --- Hyperparameters ---
VOCAB_SIZE = 20000
EMBEDDING_DIM = 100
MAX_LEN = 512
HIDDEN_DIM = 256
OUTPUT_DIM = 1
N_LAYERS = 2
BIDIRECTIONAL = True
DROPOUT = 0.5
EPOCHS = 5
BATCH_SIZE = 32
LEARNING_RATE = 5e-5

# ==============================================================================
# 2. Reusable Functions & Classes
# ==============================================================================
def clean_text(text):
    text = str(text).lower()
    text = re.sub(r'<.*?>', ' ', text)
    text = re.sub(r'[^a-z\s]', '', text)
    text = ' '.join([word for word in text.split() if word not in stop_words])
    return text

def create_features(df):
    df['text'] = (
        df['title'].fillna('') + ' ' + df['location'].fillna('') + ' ' +
        df['description'].fillna('') + ' ' + df['requirements'].fillna('') + ' ' +
        df['benefits'].fillna('') + ' ' + df['employment_type'].fillna('')
    )
    df['text'] = df['text'].apply(clean_text)
    for col in ['telecommuting', 'has_company_logo', 'has_questions']:
        df[col] = df[col].astype(float)
    categorical_cols = ['employment_type', 'required_experience', 'required_education', 'industry', 'function']
    return pd.get_dummies(df, columns=categorical_cols, dummy_na=True, drop_first=True)

class JobDataset(Dataset):
    def __init__(self, texts, tabular, labels, text_pipeline):
        self.texts = texts
        self.tabular = tabular
        self.labels = labels
        self.text_pipeline = text_pipeline
    def __len__(self):
        return len(self.texts)
    def __getitem__(self, idx):
        text = self.texts.iloc[idx]
        tabular_data = self.tabular[idx]
        label = self.labels[idx]
        processed_text = torch.tensor(self.text_pipeline(text), dtype=torch.long)
        return processed_text, torch.tensor(tabular_data, dtype=torch.float32), torch.tensor(label, dtype=torch.float32)

def collate_batch(batch):
    text_list, tabular_list, label_list = [], [], []
    for (_text, _tabular, _label) in batch:
        text_list.append(_text)
        tabular_list.append(_tabular)
        label_list.append(_label)
    padded_texts = pad_sequence(text_list, batch_first=True, padding_value=0)
    return padded_texts, torch.stack(tabular_list), torch.stack(label_list)
    
class HybridRNNModel(nn.Module):
    def __init__(self, vocab_size, embedding_dim, hidden_dim, output_dim, n_layers, bidirectional, dropout, tabular_feature_count, padding_idx, pretrained_weights):
        super().__init__()
        self.embedding = nn.Embedding.from_pretrained(pretrained_weights, freeze=False, padding_idx=padding_idx)
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, num_layers=n_layers, bidirectional=bidirectional, dropout=dropout, batch_first=True)
        self.tabular_fc = nn.Linear(tabular_feature_count, 32)
        self.relu = nn.ReLU()
        lstm_output_size = hidden_dim * 2
        self.fc_combined = nn.Linear(lstm_output_size + 32, output_dim)
        self.dropout = nn.Dropout(dropout)

    def forward(self, text, tabular_features):
        embedded = self.embedding(text)
        _, (hidden, cell) = self.lstm(embedded)
        hidden = self.dropout(torch.cat((hidden[-2,:,:], hidden[-1,:,:]), dim=1))
        tabular_out = self.relu(self.tabular_fc(tabular_features))
        combined = torch.cat((hidden, tabular_out), dim=1)
        return self.fc_combined(combined)

def get_glove_embedding_matrix(vocab, glove_file_path, embedding_dim):
    print("Loading GloVe embeddings...")
    embeddings_index = {}
    with open(glove_file_path, 'r', encoding='utf-8') as f:
        for line in f:
            values = line.split()
            word = values[0]
            coefs = np.asarray(values[1:], dtype='float32')
            embeddings_index[word] = coefs
    embedding_matrix = np.zeros((len(vocab), embedding_dim))
    for i, word in enumerate(vocab.get_itos()):
        embedding_vector = embeddings_index.get(word)
        if embedding_vector is not None:
            embedding_matrix[i] = embedding_vector
    print(f"GloVe embeddings loaded for {len(embedding_matrix)} words.")
    return torch.FloatTensor(embedding_matrix)

# ==============================================================================
# 3. Main Training & Evaluation Script
# ==============================================================================
if __name__ == '__main__':
    if not os.path.exists(FILE_PATH):
        print(f"Error: Dataset file '{FILE_PATH}' not found.")
    elif not os.path.exists(GLOVE_PATH):
        print(f"Error: GloVe file '{GLOVE_PATH}' not found. Please download it first.")
    else:
        print("--- Loading and Preparing Data ---")
        df = pd.read_csv(FILE_PATH)
        df_processed = create_features(df)
        
        y = df_processed['fraudulent'].values
        X_text = df_processed['text']
        
        numeric_cols = df_processed.select_dtypes(include=np.number).columns.tolist()
        if 'fraudulent' in numeric_cols:
            numeric_cols.remove('fraudulent')
        X_tabular = df_processed[numeric_cols].values.astype(np.float32)
        train_df_columns = ['text'] + numeric_cols

        X_text_train, X_text_test, X_tabular_train, X_tabular_test, y_train, y_test = train_test_split(
            X_text, X_tabular, y, test_size=0.2, random_state=42, stratify=y)
        
        tokenizer = get_tokenizer('basic_english')
        def yield_tokens(data_iter):
            for text in data_iter:
                yield tokenizer(text)

        vocab = build_vocab_from_iterator(yield_tokens(X_text_train), specials=["<unk>", "<pad>"], max_tokens=VOCAB_SIZE)
        vocab.set_default_index(vocab["<unk>"])
        padding_idx = vocab['<pad>']
        text_pipeline = lambda x: vocab(tokenizer(x))
        
        train_dataset = JobDataset(X_text_train, X_tabular_train, pd.Series(y_train), text_pipeline)
        test_dataset = JobDataset(X_text_test, X_tabular_test, pd.Series(y_test), text_pipeline)
        train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True, collate_fn=collate_batch)
        test_loader = DataLoader(test_dataset, batch_size=BATCH_SIZE, collate_fn=collate_batch)
        
        pretrained_embedding_weights = get_glove_embedding_matrix(vocab, GLOVE_PATH, EMBEDDING_DIM)
        
        print("\n--- Initializing Model ---")
        device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
        
        pos_weight = torch.tensor([10.0], dtype=torch.float32).to(device)
        print(f"Using a manual positive weight for loss function: {pos_weight.item():.2f}")
        
        model = HybridRNNModel(len(vocab), EMBEDDING_DIM, HIDDEN_DIM, OUTPUT_DIM, N_LAYERS,
                             BIDIRECTIONAL, DROPOUT, X_tabular.shape[1], padding_idx, pretrained_embedding_weights).to(device)
        optimizer = optim.Adam(model.parameters(), lr=LEARNING_RATE)
        criterion = nn.BCEWithLogitsLoss(pos_weight=pos_weight).to(device)
        
        print(f"Model initialized on {device}.")
        print("\n--- Starting Model Training ---")
        for epoch in range(EPOCHS):
            model.train()
            epoch_loss = 0
            for text, tab, labels in train_loader:
                text, tab, labels = text.to(device), tab.to(device), labels.to(device)
                optimizer.zero_grad()
                predictions = model(text, tab).squeeze(1)
                loss = criterion(predictions, labels)
                loss.backward()
                optimizer.step()
                epoch_loss += loss.item()
            print(f'Epoch {epoch+1:02} | Training Loss: {epoch_loss/len(train_loader):.4f}')
        
        print("\n--- Training Complete ---")

        # ==============================================================================
        # 4. UPDATED: Evaluation with Threshold Tuning
        # ==============================================================================
        print("\n--- Evaluating Model Performance across different thresholds ---")
        model.eval()
        all_probs = []
        all_labels = []
        with torch.no_grad():
            for text, tab, labels in test_loader:
                text, tab, labels = text.to(device), tab.to(device), labels.to(device)
                predictions = model(text, tab).squeeze(1)
                # Get the raw probabilities by applying sigmoid
                probabilities = torch.sigmoid(predictions)
                all_probs.extend(probabilities.cpu().numpy())
                all_labels.extend(labels.cpu().numpy())
        
        # Now, test different thresholds
        thresholds = [0.2, 0.3, 0.4, 0.5, 0.6, 0.7]
        all_probs = np.array(all_probs)

        for threshold in thresholds:
            print(f"\n==================== Threshold: {threshold} ====================")
            # Apply the threshold to get final predictions
            all_preds = (all_probs >= threshold).astype(int)
            
            print(f"Accuracy: {accuracy_score(all_labels, all_preds):.4f}")
            print("\nClassification Report:")
            print(classification_report(all_labels, all_preds, target_names=['Real Job', 'Fake Job']))
            print("====================================================\n")

        # --- 5. Save the Final Model and Tokenizer ---
        print("\n--- Saving Model and Artifacts (based on last training run) ---")
        torch.save(model.state_dict(), MODEL_SAVE_PATH)
        torch.save(vocab, VOCAB_SAVE_PATH)
        with open(COLS_SAVE_PATH, 'wb') as f:
            pickle.dump(train_df_columns, f)
            
        print(f"✅ Model saved to: {MODEL_SAVE_PATH}")
        print(f"✅ Vocab saved to: {VOCAB_SAVE_PATH}")
        print(f"✅ Training columns saved to: {COLS_SAVE_PATH}")

--- Loading and Preparing Data ---
Loading GloVe embeddings...
GloVe embeddings loaded for 20000 words.

--- Initializing Model ---
Using a manual positive weight for loss function: 10.00
Model initialized on cuda.

--- Starting Model Training ---
Epoch 01 | Training Loss: 1.0537
Epoch 02 | Training Loss: 0.6813
Epoch 03 | Training Loss: 0.6327
Epoch 04 | Training Loss: 0.5481
Epoch 05 | Training Loss: 0.5319

--- Training Complete ---

--- Evaluating Model Performance across different thresholds ---

Accuracy: 0.8834

Classification Report:
              precision    recall  f1-score   support

    Real Job       0.99      0.89      0.94      3403
    Fake Job       0.26      0.73      0.38       173

    accuracy                           0.88      3576
   macro avg       0.62      0.81      0.66      3576
weighted avg       0.95      0.88      0.91      3576



Accuracy: 0.9066

Classification Report:
              precision    recall  f1-score   support

    Real Job       0.98    