In [9]:
import torch

# Report the installed PyTorch build and the CUDA/cuDNN versions it exposes.
print("Torch version:", torch.__version__)
print("CUDA version:", torch.version.cuda)
print("cuDNN version:", torch.backends.cudnn.version())

# Query availability once and reuse the answer for the device details below.
cuda_available = torch.cuda.is_available()
print("Is CUDA available?:", cuda_available)
print("Device count:", torch.cuda.device_count())
if cuda_available:
    # Name the device that is actually selected instead of assuming index 0,
    # so the two lines below always describe the same GPU.
    current_device = torch.cuda.current_device()
    print("Current device:", current_device)
    print("Device name:", torch.cuda.get_device_name(current_device))


Torch version: 2.8.0+cu126
CUDA version: 12.6
cuDNN version: 91002
Is CUDA available?: True
Device count: 1
Current device: 0
Device name: NVIDIA GeForce RTX 3050 Laptop GPU


In [4]:
import torch
from torchtext.datasets import AG_NEWS
from torchtext.data.utils import get_tokenizer
from torchtext.vocab import build_vocab_from_iterator

from torch.nn.utils.rnn import pad_sequence

# Special tokens. Padding gets its own vocabulary entry so that padded positions
# are distinguishable from genuinely unknown words.
UNK_TOKEN = "<unk>"
PAD_TOKEN = "<pad>"

# 1. Load data (iterator yielding (label, text))
train_iter = AG_NEWS(split="train")

# 2. Tokenizer
tokenizer = get_tokenizer("basic_english")

# 3. Build vocabulary from training data (CPU work)
def yield_tokens(data_iter):
    """Yield the token list of every (label, text) pair produced by `data_iter`."""
    for _, text in data_iter:
        yield tokenizer(text)

vocab = build_vocab_from_iterator(yield_tokens(train_iter), specials=[UNK_TOKEN, PAD_TOKEN])
# Any token that is not in the vocabulary maps to the <unk> index.
vocab.set_default_index(vocab[UNK_TOKEN])

# 4. Function to preprocess one example
def text_to_tensor(text: str):
    """Tokenize `text` and return its vocabulary indices as a 1-D int64 tensor."""
    indices = [vocab[token] for token in tokenizer(text)]
    return torch.tensor(indices, dtype=torch.long)

# 5. Example: prepare a batch
texts = ["hello world", "this is a test"]
tensors = [text_to_tensor(t) for t in texts]

# Pad to same length with the dedicated <pad> index (not <unk>).
# NOTE: a model that mean-pools over the sequence axis still averages in the
# <pad> embedding unless it masks positions equal to vocab[PAD_TOKEN].
batch = pad_sequence(tensors, batch_first=True, padding_value=vocab[PAD_TOKEN])  # shape: (batch_size, max_len)

labels = torch.tensor([0, 1], dtype=torch.long)  # dummy labels

# 6. Move to GPU if available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
batch = batch.to(device)
labels = labels.to(device)

# 7. Define a simple model
class SimpleModel(torch.nn.Module):
    """Bag-of-embeddings classifier: embed tokens, average them, apply a linear layer."""

    def __init__(self, vocab_size, embed_dim, num_class):
        """Create the embedding table and the classification head.

        Args:
            vocab_size: number of rows in the embedding table.
            embed_dim: width of each token embedding.
            num_class: number of output logits.
        """
        super().__init__()
        self.embedding = torch.nn.Embedding(num_embeddings=vocab_size, embedding_dim=embed_dim)
        self.fc = torch.nn.Linear(in_features=embed_dim, out_features=num_class)

    def forward(self, x):
        """Map token indices of shape (batch, seq_len) to logits of shape (batch, num_class)."""
        token_vectors = self.embedding(x)                    # (batch, seq_len, embed_dim)
        sentence_vector = torch.mean(token_vectors, dim=1)   # average over the sequence axis
        logits = self.fc(sentence_vector)
        return logits

# Build the classifier sized to the vocabulary, then place it on the selected device.
model = SimpleModel(len(vocab), embed_dim=32, num_class=4)
model.to(device)  # nn.Module.to moves parameters in place

# 8. Forward pass
output = model(batch)
print("Output shape:", output.shape)


OSError: /home/vikhil/miniconda3/envs/vikhil/lib/python3.10/site-packages/torchtext/lib/libtorchtext.so: undefined symbol: _ZN5torch6detail10class_baseC2ERKSsS3_SsRKSt9type_infoS6_

In [5]:
# Deactivate if you need to, then reactivate to be sure
# conda deactivate
# conda activate vikhil

!pip uninstall torch torchvision torchtext -y

Found existing installation: torch 2.8.0+cu126
Uninstalling torch-2.8.0+cu126:
  Successfully uninstalled torch-2.8.0+cu126
Found existing installation: torchvision 0.23.0+cu126
Uninstalling torchvision-0.23.0+cu126:
  Successfully uninstalled torchvision-0.23.0+cu126
Found existing installation: torchtext 0.18.0
Uninstalling torchtext-0.18.0:
  Successfully uninstalled torchtext-0.18.0


In [None]:
import os
import re
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from torch.nn.utils.rnn import pad_sequence
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from tensorflow.keras.preprocessing.text import Tokenizer
import nltk
from nltk.corpus import stopwords
import pickle

# --- Download NLTK data (if needed) ---
# Probe for the stop-word corpus first; download it only when NLTK reports it missing.
try:
    stopwords.words('english')
except LookupError:
    nltk.download('stopwords')
# A set gives O(1) membership checks when filtering words.
stop_words = set(stopwords.words('english'))

# ==============================================================================
# 1. Configuration & Global Functions
# ==============================================================================
# Dataset location. Defaults to the original author's path; set the
# FAKE_JOB_DATASET environment variable to run on another machine without
# editing this cell.
FILE_PATH = os.environ.get(
    'FAKE_JOB_DATASET',
    '/home/vikhil/GROUP_1-INFOSYS/Member_Vikhil/Datasets/fake_job_cleaned_dataset.csv',
)
# Artefacts written after training and read back for inference.
MODEL_SAVE_PATH = 'hybrid_model.pth'
TOKENIZER_SAVE_PATH = 'tokenizer.pickle'
COLS_SAVE_PATH = 'train_cols.pkl'

# --- Hyperparameters ---
VOCAB_SIZE = 20000      # tokenizer num_words and embedding table size
MAX_LEN = 512           # token length used when preparing text for prediction
EMBEDDING_DIM = 128     # width of each token embedding
HIDDEN_DIM = 256        # LSTM hidden size per direction
OUTPUT_DIM = 1          # single logit (binary classification)
N_LAYERS = 2            # stacked LSTM layers
BIDIRECTIONAL = True
DROPOUT = 0.5
EPOCHS = 5
BATCH_SIZE = 32
LEARNING_RATE = 1e-3

# --- Reusable Functions ---
def clean_text(text):
    """Normalise raw posting text before tokenisation.

    The input is coerced with ``str()`` and lower-cased, HTML-like tags are
    replaced by a space, every character other than a-z or whitespace is
    deleted, and words found in the module-level ``stop_words`` set are dropped.

    Returns:
        The remaining words joined by single spaces.
    """
    lowered = str(text).lower()
    without_tags = re.sub(r'<.*?>', ' ', lowered)
    letters_only = re.sub(r'[^a-z\s]', '', without_tags)
    kept_words = filter(lambda word: word not in stop_words, letters_only.split())
    return ' '.join(kept_words)

def create_features(df):
    """Build the model's feature frame from a job-posting DataFrame.

    Steps:
      1. Concatenate the free-text and categorical columns (missing values
         treated as empty strings) into one ``text`` column and clean it with
         ``clean_text``.
      2. Cast the three binary flag columns to float.
      3. One-hot encode the categorical columns.

    Args:
        df: DataFrame containing every column referenced below. It is NOT
            modified; all work happens on a copy.

    Returns:
        A new DataFrame with the added ``text`` column and the categorical
        columns replaced by float indicator columns.

    Note:
        With ``drop_first=True`` the first level of every categorical column has
        no indicator. For a frame where a column holds a single distinct value
        (e.g. a one-row prediction frame) that value is the dropped level, so a
        caller aligning to a training schema must set that indicator itself.
    """
    text_source_cols = [
        'title', 'location', 'department', 'company_profile', 'description',
        'requirements', 'benefits', 'employment_type', 'required_experience',
        'required_education', 'industry', 'function',
    ]
    flag_cols = ['telecommuting', 'has_company_logo', 'has_questions']
    categorical_cols = ['employment_type', 'required_experience', 'required_education', 'industry', 'function']

    # Work on a copy so the caller's frame keeps its original columns and dtypes.
    features = df.copy()

    combined = features[text_source_cols[0]].fillna('')
    for col in text_source_cols[1:]:
        combined = combined + ' ' + features[col].fillna('')
    features['text'] = combined.apply(clean_text)

    for col in flag_cols:
        features[col] = features[col].astype(float)

    # dtype=float keeps every indicator numeric. pandas >= 2 defaults to bool,
    # and a bool/float mix turns `.values` into an object array that cannot be
    # converted to a float tensor.
    return pd.get_dummies(features, columns=categorical_cols, dummy_na=True,
                          drop_first=True, dtype=float)

class JobDataset(Dataset):
    """Torch dataset pairing each posting's token ids with its tabular row and label.

    Args:
        texts: sequence of cleaned text strings (pandas Series or list).
        tabular: row-indexable container of numeric feature rows (a NumPy
            array in this notebook).
        labels: sequence of 0/1 targets (pandas Series, NumPy array or list).
        tokenizer: object exposing ``texts_to_sequences(list_of_str)``.

    All three containers are accessed by POSITION, so pandas objects with a
    shuffled or non-default index (e.g. the output of ``train_test_split``)
    are handled correctly.
    """

    def __init__(self, texts, tabular, labels, tokenizer):
        self.texts = texts
        self.tabular = tabular
        self.labels = labels
        self.tokenizer = tokenizer

    def __len__(self):
        return len(self.texts)

    @staticmethod
    def _at(container, idx):
        """Return the item at position `idx`, using `.iloc` for pandas objects."""
        return container.iloc[idx] if hasattr(container, 'iloc') else container[idx]

    def __getitem__(self, idx):
        """Return ``(token_ids int64, tabular float32, label float32)`` for one sample."""
        text = self._at(self.texts, idx)
        tabular_data = self._at(self.tabular, idx)
        label = self._at(self.labels, idx)
        tokens = self.tokenizer.texts_to_sequences([text])[0]
        return (
            torch.tensor(tokens, dtype=torch.long),
            torch.tensor(tabular_data, dtype=torch.float32),
            torch.tensor(label, dtype=torch.float32),
        )

def collate_batch(batch):
    """Merge ``(tokens, tabular, label)`` samples into batched tensors.

    Token sequences are right-padded with 0 to the longest sequence in the
    batch; tabular rows and labels are stacked along a new first dimension.

    Returns:
        ``(padded_tokens, tabular_batch, label_batch)``.
    """
    token_seqs, tabular_rows, targets = [], [], []
    for tokens, tabular_row, target in batch:
        token_seqs.append(tokens)
        tabular_rows.append(tabular_row)
        targets.append(target)
    padded_tokens = pad_sequence(token_seqs, batch_first=True, padding_value=0)
    tabular_batch = torch.stack(tabular_rows)
    label_batch = torch.stack(targets)
    return padded_tokens, tabular_batch, label_batch
    
class HybridRNNModel(nn.Module):
    """LSTM text encoder fused with a small dense branch for tabular features.

    Args:
        vocab_size: number of rows in the embedding table (index 0 is padding).
        embedding_dim: width of each token embedding.
        hidden_dim: LSTM hidden size per direction.
        output_dim: number of output logits.
        n_layers: number of stacked LSTM layers.
        bidirectional: whether the LSTM also reads the sequence backwards.
        dropout: dropout probability (embeddings, between LSTM layers, and on
            the final text representation).
        tabular_feature_count: number of tabular input features.
    """

    def __init__(self, vocab_size, embedding_dim, hidden_dim, output_dim, n_layers, bidirectional, dropout, tabular_feature_count):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim, padding_idx=0)
        self.lstm = nn.LSTM(
            embedding_dim, hidden_dim, num_layers=n_layers, bidirectional=bidirectional,
            # Inter-layer dropout only exists with more than one layer; passing
            # it for a single layer merely triggers a PyTorch warning.
            dropout=dropout if n_layers > 1 else 0.0,
            batch_first=True,
        )
        self.tabular_fc = nn.Linear(tabular_feature_count, 32)
        self.relu = nn.ReLU()
        # The text representation is one final hidden state per direction.
        lstm_output_size = hidden_dim * (2 if bidirectional else 1)
        self.fc_combined = nn.Linear(lstm_output_size + 32, output_dim)
        self.dropout = nn.Dropout(dropout)

    def forward(self, text, tabular_features):
        """Return logits of shape (batch, output_dim).

        Args:
            text: int64 token ids, shape (batch, seq_len), right-padded with
                the padding index (0).
            tabular_features: float tensor, shape (batch, tabular_feature_count).
        """
        embedded = self.dropout(self.embedding(text))

        # Pack the batch so the LSTM stops at each sequence's true length;
        # otherwise the forward direction's final state depends on how much
        # padding the batch happened to contain. Rows that are entirely padding
        # are clamped to length 1 because packing rejects zero-length sequences.
        lengths = (text != self.embedding.padding_idx).sum(dim=1).clamp(min=1).cpu()
        packed = nn.utils.rnn.pack_padded_sequence(
            embedded, lengths, batch_first=True, enforce_sorted=False)
        _, (hidden, _) = self.lstm(packed)

        if self.lstm.bidirectional:
            # Last layer: hidden[-2] is the forward pass, hidden[-1] the backward pass.
            text_repr = torch.cat((hidden[-2, :, :], hidden[-1, :, :]), dim=1)
        else:
            text_repr = hidden[-1, :, :]
        text_repr = self.dropout(text_repr)

        tabular_out = self.relu(self.tabular_fc(tabular_features))
        combined = torch.cat((text_repr, tabular_out), dim=1)
        return self.fc_combined(combined)

# ==============================================================================
# 5. Prediction Pipeline Functions
# ==============================================================================
def extract_structured_data(raw_text):
    r"""Heuristically split a pasted job posting into the dataset's columns.

    The first line becomes the title. Description, requirements and benefits are
    taken from the lower-cased text between the section keywords matched below;
    if no description section is found, the whole raw text is used as the
    description. Employment type is guessed from "full time"/"full-time" or
    "contract"; every other categorical field stays 'Unspecified'.

    Lines are separated by real line breaks; a literal backslash followed by
    "n" (as in text copied out of escaped JSON) is accepted as a separator too.

    Args:
        raw_text: the posting as one string.

    Returns:
        dict with one entry per raw dataset column.
    """
    data = {
        'title': '', 'location': '', 'department': '', 'company_profile': '',
        'description': '', 'requirements': '', 'benefits': '',
        'employment_type': 'Unspecified', 'required_experience': 'Unspecified',
        'required_education': 'Unspecified', 'industry': 'Unspecified',
        'function': 'Unspecified',
        'telecommuting': 0.0, 'has_company_logo': 1.0, 'has_questions': 0.0,
    }

    # Split on actual newlines (the previous '\\n' literal only matched the
    # two-character sequence backslash + n, so multi-line input was never split).
    lines = re.split(r'\r?\n|\\n', raw_text.strip())
    data['title'] = lines[0].strip() if lines else ''

    text_lower = raw_text.lower()

    desc_match = re.search(r'(job description|responsibilities)(.*)(qualifications|requirements)', text_lower, re.S)
    if desc_match:
        data['description'] = desc_match.group(2).strip()

    req_match = re.search(r'(qualifications|requirements)(.*)(benefits|why join)', text_lower, re.S)
    if req_match:
        data['requirements'] = req_match.group(2).strip()

    ben_match = re.search(r'(benefits|why join)(.*)', text_lower, re.S)
    if ben_match:
        data['benefits'] = ben_match.group(2).strip()

    # Accept "full time", "full-time" and "fulltime".
    if re.search(r'full[\s-]?time', text_lower):
        data['employment_type'] = 'Full-time'
    elif 'contract' in text_lower:
        data['employment_type'] = 'Contract'

    if not data['description']:
        data['description'] = raw_text
    return data

def predict_job_posting(input_data, model, tokenizer, train_df_cols, device):
    """Classify one raw job posting and print the verdict.

    Args:
        input_data: the posting as a raw string.
        model: trained model taking ``(text_tensor, tabular_tensor)``.
        tokenizer: fitted tokenizer exposing ``texts_to_sequences``.
        train_df_cols: feature column names saved at training time
            ('text' plus the tabular columns, in training order).
        device: torch device the model lives on.

    Raises:
        ValueError: if `input_data` is not a string.
    """
    model.eval()
    if not isinstance(input_data, str):
        raise ValueError("Input data must be a raw string.")

    print("Raw text detected. Parsing into structured format...")
    structured_data = extract_structured_data(input_data)
    df_pred = pd.DataFrame([structured_data])

    df_pred_processed = create_features(df_pred)
    text_to_predict = df_pred_processed['text']

    # Align to the training schema in one step: keep training columns in
    # training order and create the ones this single row did not produce.
    tabular_cols = [c for c in train_df_cols if c not in ['text', 'fraudulent']]
    tabular_frame = df_pred_processed.reindex(columns=tabular_cols, fill_value=0.0)

    # One-hot encoding a single row with drop_first=True drops the row's only
    # category, so the indicator for the observed value never exists at this
    # point. Switch it on explicitly when the training schema has it.
    # (This list must match the categorical columns used in create_features.)
    for col in ('employment_type', 'required_experience', 'required_education', 'industry', 'function'):
        indicator = f"{col}_{structured_data[col]}"
        if indicator in tabular_frame.columns:
            tabular_frame[indicator] = 1.0

    # Explicit float32 conversion avoids an object-dtype array when column
    # dtypes are mixed (bool / int / float).
    tabular_to_predict = tabular_frame.to_numpy(dtype='float32')

    # Truncate / right-pad every sequence to exactly MAX_LEN tokens with the
    # padding index 0 (pad_sequence has no `total_length` argument).
    tokenized = tokenizer.texts_to_sequences(text_to_predict)
    text_tensor = torch.zeros((len(tokenized), MAX_LEN), dtype=torch.long)
    for row, seq in enumerate(tokenized):
        seq = seq[:MAX_LEN]
        if seq:
            text_tensor[row, :len(seq)] = torch.tensor(seq, dtype=torch.long)
    text_tensor = text_tensor.to(device)
    tabular_tensor = torch.tensor(tabular_to_predict, dtype=torch.float32).to(device)

    with torch.no_grad():
        prediction = model(text_tensor, tabular_tensor)
        probability = torch.sigmoid(prediction).item()

    result = "Fake Job" if probability > 0.5 else "Real Job"
    # A real newline; the previous "\\n" printed a literal backslash-n.
    print(f"\n---> Prediction: {result} (Probability of being fake: {probability:.4f})")


# ==============================================================================
# 6. Main Execution Block
# ==============================================================================
def main():
    """Train the hybrid model, save its artefacts, then classify one pasted posting.

    Phase 1 reads FILE_PATH, builds features, trains for EPOCHS epochs on a
    stratified 80% split and writes the model weights, tokenizer and feature
    column list to disk. Phase 2 reloads those artefacts, prompts for a job
    posting on stdin and prints the prediction.
    """
    # --- PHASE 1: TRAINING ---
    print("--- Starting Training Phase ---")
    df = pd.read_csv(FILE_PATH)
    df_processed = create_features(df)

    y = df_processed['fraudulent'].values
    X_text = df_processed['text']

    # Only numeric/boolean columns can feed the tabular branch. The raw string
    # columns (title, description, ...) are still present in df_processed but
    # are already folded into 'text' and cannot be converted to a float tensor.
    tabular_cols_train = [
        col for col in df_processed.select_dtypes(include=['number', 'bool']).columns
        if col != 'fraudulent'
    ]
    # Saved schema: 'text' first, then the tabular columns in training order.
    train_df_columns = ['text'] + tabular_cols_train
    X_tabular = df_processed[tabular_cols_train].fillna(0).to_numpy(dtype=np.float32)

    # Stratified 80/20 split; the held-out 20% is currently not used.
    X_text_train, _, X_tabular_train, _, y_train, _ = train_test_split(
        X_text, X_tabular, y, test_size=0.2, random_state=42, stratify=y)

    tokenizer = Tokenizer(num_words=VOCAB_SIZE, oov_token="<OOV>")
    tokenizer.fit_on_texts(X_text_train)

    train_dataset = JobDataset(X_text_train, X_tabular_train, pd.Series(y_train), tokenizer)

    def _collate_truncated(batch):
        # Cap sequences at MAX_LEN so training sees the same maximum length as
        # prediction and very long postings cannot blow up a batch.
        return collate_batch([(tokens[:MAX_LEN], tab, lab) for tokens, tab, lab in batch])

    train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True,
                              collate_fn=_collate_truncated)

    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    model = HybridRNNModel(VOCAB_SIZE, EMBEDDING_DIM, HIDDEN_DIM, OUTPUT_DIM, N_LAYERS,
                           BIDIRECTIONAL, DROPOUT,
                           tabular_feature_count=len(tabular_cols_train)).to(device)
    optimizer = optim.Adam(model.parameters(), lr=LEARNING_RATE)
    criterion = nn.BCEWithLogitsLoss().to(device)

    print(f"\nStarting Model Training on {device}...")
    for epoch in range(EPOCHS):
        model.train()
        epoch_loss = 0
        for text, tab, labels in train_loader:
            text, tab, labels = text.to(device), tab.to(device), labels.to(device)
            optimizer.zero_grad()
            predictions = model(text, tab).squeeze(1)
            loss = criterion(predictions, labels)
            loss.backward()
            optimizer.step()
            epoch_loss += loss.item()
        print(f'Epoch {epoch+1:02} | Training Loss: {epoch_loss/len(train_loader):.4f}')

    print("\n--- Training Complete ---")

    # --- Save the trained model and tokenizer ---
    torch.save(model.state_dict(), MODEL_SAVE_PATH)
    with open(TOKENIZER_SAVE_PATH, 'wb') as handle:
        pickle.dump(tokenizer, handle, protocol=pickle.HIGHEST_PROTOCOL)
    with open(COLS_SAVE_PATH, 'wb') as handle:
        pickle.dump(train_df_columns, handle, protocol=pickle.HIGHEST_PROTOCOL)

    print(f"Model saved to: {MODEL_SAVE_PATH}")
    print(f"Tokenizer saved to: {TOKENIZER_SAVE_PATH}")
    print(f"Training columns saved to: {COLS_SAVE_PATH}")

    # --- PHASE 2: INFERENCE (PREDICTION) ---
    print("\n\n--- Starting Inference Phase ---")
    # Load the assets we just saved
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    # SECURITY: pickle.load executes arbitrary code from the file. These files
    # were written by this run; never point these paths at untrusted data.
    with open(TOKENIZER_SAVE_PATH, 'rb') as handle:
        loaded_tokenizer = pickle.load(handle)

    with open(COLS_SAVE_PATH, 'rb') as handle:
        loaded_cols = pickle.load(handle)

    # We need the number of tabular features to initialize the model structure correctly
    tabular_feature_count = len([c for c in loaded_cols if c != 'text'])

    loaded_model = HybridRNNModel(VOCAB_SIZE, EMBEDDING_DIM, HIDDEN_DIM, OUTPUT_DIM, N_LAYERS,
                                  BIDIRECTIONAL, DROPOUT,
                                  tabular_feature_count=tabular_feature_count).to(device)

    # map_location lets weights saved on a GPU load on a CPU-only machine too.
    loaded_model.load_state_dict(torch.load(MODEL_SAVE_PATH, map_location=device))
    print("Trained model and tokenizer loaded successfully.")

    # --- Get interactive user input ---
    input_text = input("\n>>> Please paste the job description here and press Enter:\n")

    # --- Make the prediction ---
    predict_job_posting(input_text, loaded_model, loaded_tokenizer, loaded_cols, device)


if __name__ == '__main__':
    if not os.path.exists(FILE_PATH):
        print(f"Error: Dataset file '{FILE_PATH}' not found. Please ensure it is in the correct directory.")
    else:
        main()!# This cleans out pip's local cache
pip cache purge