# Setup & Imports


In [31]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from wordcloud import WordCloud

# PyTorch imports
import torch
import torchtext

# Report library versions and GPU visibility so the run environment is recorded with the outputs.
print(f"PyTorch version: {torch.__version__}")
print(f"TorchText version: {torchtext.__version__}")
print(f"CUDA available: {torch.cuda.is_available()}")

PyTorch version: 2.3.0+cu121
TorchText version: 0.18.0+cpu
CUDA available: True


# Exploratory Data Analysis (EDA) and Feature Engineering

In [32]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from wordcloud import WordCloud

# Read the job-postings CSV into the working DataFrame `df` used by every later cell.
dataset_csv_path = '/home/vikhil/GROUP_1-INFOSYS/Member_Vikhil/Datasets/fake_job_cleaned_dataset.csv'
df = pd.read_csv(dataset_csv_path)



In [33]:
import string
import pandas as pd
from bs4 import BeautifulSoup
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from torchtext.data.utils import get_tokenizer
from torchtext.vocab import build_vocab_from_iterator
from torch.nn.utils.rnn import pad_sequence
import torch
import nltk

# Fetch the NLTK resources quietly; the calls are no-ops when the data is already present.
for nltk_resource in ('punkt', 'stopwords', 'wordnet'):
    nltk.download(nltk_resource, quiet=True)

class GPUTextPreprocessor:
    """
    Clean, tokenize, lemmatize and numericalize raw text for PyTorch models.

    On construction every input text is cleaned on the CPU (HTML stripped,
    lower-cased, punctuation, stop words and non-alphabetic tokens removed,
    optional spell correction, lemmatization) and a torchtext vocabulary is
    built over the cleaned corpus. `batch_encode` then converts the cleaned
    texts into a padded `torch.long` tensor located on `self.device`.

    Args:
        texts: Iterable of raw documents. NaN/None entries are cleaned to "".
            An empty iterable is allowed when only `clean_text` is needed.
        device: Device string used when CUDA is available; otherwise the
            instance falls back to 'cpu'.
        perform_spell_correction: If True, unknown words are corrected with
            pyspellchecker. When the package is missing a warning is printed
            and correction is switched off.

    Attributes:
        device: Resolved `torch.device`.
        cleaned_texts: Cleaned strings, one per input text, in input order.
        vocab: Vocabulary with '<unk>' (default index) and '<pad>' specials.
        padding_idx: Index of '<pad>' in `vocab`.
    """

    # Built once: deleting punctuation is the same translation for every document.
    _PUNCT_TABLE = str.maketrans('', '', string.punctuation)

    def __init__(self, texts, device='cuda', perform_spell_correction=False):

        # --- 1. Device Setup ---
        # Use the requested device only when CUDA is actually present.
        self.device = torch.device(device if torch.cuda.is_available() else 'cpu')

        # --- 2. Preprocessing Tools ---
        self.tokenizer = get_tokenizer("basic_english")
        self.stop_words = set(stopwords.words('english'))
        self.lemmatizer = WordNetLemmatizer()
        self.perform_spell_correction = perform_spell_correction

        if self.perform_spell_correction:
            # Note: You need 'pip install pyspellchecker' for this to work
            try:
                from spellchecker import SpellChecker
                self.spell = SpellChecker()
            except ImportError:
                print("Warning: 'spellchecker' not installed. Spell correction will be skipped.")
                self.perform_spell_correction = False

        # --- 3. Initial Processing (on CPU) ---
        self.cleaned_texts = [self.clean_text(t) for t in texts]

        # --- 4. Build Vocabulary ---
        def token_iterator():
            # Texts that became empty after cleaning contribute no tokens.
            for text in self.cleaned_texts:
                if text:
                    yield self.tokenizer(text)

        # Include '<unk>' (unknown) and '<pad>' (padding) tokens
        self.vocab = build_vocab_from_iterator(token_iterator(), specials=["<unk>", "<pad>"])
        self.vocab.set_default_index(self.vocab["<unk>"])

        # Store the padding index for use in batch_encode
        self.padding_idx = self.vocab["<pad>"]

    def _correct_spelling(self, tokens):
        """Return `tokens` with words unknown to the spell checker replaced by its suggestion.

        `SpellChecker.correction` may return None when it has no candidate; in
        that case the original token is kept so that later steps never receive
        None. Each distinct unknown word is corrected once and the result is
        reused for repeated occurrences.
        """
        misspelled = self.spell.unknown(tokens)
        fixes = {word: (self.spell.correction(word) or word) for word in misspelled}
        return [fixes.get(token, token) for token in tokens]

    def clean_text(self, text):
        """Clean one raw document and return its lemmatized tokens joined by spaces.

        Steps: NaN/None -> ""; strip HTML markup; lower-case; delete ASCII
        punctuation; tokenize; drop stop words and non-alphabetic tokens;
        optionally spell-correct; lemmatize. Runs entirely on the CPU.
        """
        if pd.isna(text):
            return ""
        text = BeautifulSoup(str(text), "html.parser").get_text()
        text = text.lower().translate(self._PUNCT_TABLE)
        tokens = [t for t in self.tokenizer(text) if t not in self.stop_words and t.isalpha()]

        if self.perform_spell_correction:
            tokens = self._correct_spelling(tokens)

        return ' '.join(self.lemmatizer.lemmatize(t) for t in tokens)

    def encode_tensor(self, cleaned_text):
        """Map an already-cleaned text to a 1-D `torch.long` tensor of vocabulary indices.

        The tensor is created directly on `self.device`, avoiding a separate
        host-to-device copy.
        """
        indices = [self.vocab[token] for token in self.tokenizer(cleaned_text)]
        return torch.tensor(indices, dtype=torch.long, device=self.device)

    def batch_encode(self, keep_empty=False):
        """
        Encode the cleaned texts and pad them to the longest sequence.

        Args:
            keep_empty: When False (default) texts that are empty after
                cleaning are skipped, so the result can have fewer rows than
                there were input texts. Pass True to keep one row per input
                text (empty texts become rows made only of the padding index),
                which keeps rows aligned with labels or DataFrame rows.

        Returns:
            A `(n_rows, max_len)` `torch.long` tensor on `self.device`, or an
            empty 1-D tensor when there is nothing to encode.
        """
        if keep_empty:
            selected = list(self.cleaned_texts)
        else:
            selected = [t for t in self.cleaned_texts if t]
        tensors = [self.encode_tensor(t) for t in selected]

        if not tensors:
            return torch.tensor([], dtype=torch.long, device=self.device)

        # pad_sequence keeps the result on the device of the input tensors.
        return pad_sequence(tensors, batch_first=True, padding_value=self.padding_idx)

# Feature Engineering: Numerical and Location

In [34]:
# --- 2. Feature Engineering: Numerical and Location ---

# Source text column -> name of its character-length feature (insertion order = column order).
length_features = {
    'title': 'title_len',
    'description': 'desc_len',
    'company_profile': 'profile_len',
    'requirements': 'req_len',
    'benefits': 'benefits_len',
}

# Replace missing text with '' first so that len() below only ever sees strings.
for source_col in length_features:
    df[source_col] = df[source_col].fillna('')

# 2.1 Text Length Features
for source_col, length_col in length_features.items():
    df[length_col] = df[source_col].map(len)

# 2.2 Location Features: keep the part before the first comma as the country code.
df['location'] = df['location'].fillna('Unknown, Unknown')
df['country_code'] = df['location'].map(lambda place: place.partition(',')[0].strip())

# 2.3 Seniority Flag: 1 when the experience level mentions a senior-type keyword.
senior_markers = ('senior', 'executive', 'director', 'manager')
df['required_experience'] = df['required_experience'].fillna('Unknown')
df['is_senior_role'] = df['required_experience'].str.lower().map(
    lambda level: int(any(marker in level for marker in senior_markers))
)

# 2.4 Salary Feature: 1 when a salary range is present, 0 when it is missing.
df['has_salary'] = df['salary_range'].notna().astype('int64')

print("Feature Engineering completed without length errors.")
print(df[['title_len', 'desc_len', 'country_code', 'has_salary']].head())

Feature Engineering completed without length errors.
   title_len  desc_len country_code  has_salary
0         16       905           US           1
1         41      2077           NZ           1
2         39       355           US           1
3         33      2600           US           1
4         19      1520           US           1


In [35]:
# Clean the raw text columns and build TF-IDF features for title, description
# and company profile. Requires `df` and the GPUTextPreprocessor class from
# earlier cells.
import time  # previously missing here: time.time() raised NameError on a fresh kernel

from scipy.sparse import hstack
from sklearn.feature_extraction.text import TfidfVectorizer

# 1. Instantiate the preprocessor with an empty corpus: only clean_text() is
#    used in this cell, so no vocabulary over real texts is needed.
preprocessor = GPUTextPreprocessor(texts=[])

# 2. Fill NaNs in the original text columns so clean_text always receives strings.
text_cols = ['title', 'description', 'company_profile', 'requirements', 'benefits']
for col in text_cols:
    df[col] = df[col].fillna('')

# 3. Create one cleaned_<col> column per text column and time the whole pass.
start_time = time.time()
for col in text_cols:
    df[f'cleaned_{col}'] = df[col].apply(preprocessor.clean_text)
end_time = time.time()

print(f"✅ Text cleaning completed in: {end_time - start_time:.2f} seconds.")
print(f"New column check: {'cleaned_title' in df.columns}")

# --- 4. TF-IDF VECTORIZATION ---
# Each field gets its own vectorizer so vocabularies and feature budgets stay separate.

# TF-IDF for TITLE (unigrams and bigrams)
tfidf_title_vectorizer = TfidfVectorizer(stop_words='english', max_features=1500, ngram_range=(1, 2))
title_features = tfidf_title_vectorizer.fit_transform(df['cleaned_title'])
print(f'Shape of Title TF-IDF features: {title_features.shape}')

# TF-IDF for DESCRIPTION
tfidf_desc_vectorizer = TfidfVectorizer(stop_words='english', max_features=4000)
desc_features = tfidf_desc_vectorizer.fit_transform(df['cleaned_description'])
print(f'Shape of Description TF-IDF features: {desc_features.shape}')

# TF-IDF for COMPANY PROFILE
tfidf_profile_vectorizer = TfidfVectorizer(stop_words='english', max_features=1500)
profile_features = tfidf_profile_vectorizer.fit_transform(df['cleaned_company_profile'])
print(f'Shape of Company Profile TF-IDF features: {profile_features.shape}')

# Combine all sparse text features column-wise (title | description | profile).
X_text_features = hstack([title_features, desc_features, profile_features])
print(f'\nShape of Combined Text features (Sparse): {X_text_features.shape}')

✅ Text cleaning completed in: 16.80 seconds.
New column check: True
Shape of Title TF-IDF features: (17880, 1500)
Shape of Description TF-IDF features: (17880, 4000)
Shape of Company Profile TF-IDF features: (17880, 1500)

Shape of Combined Text features (Sparse): (17880, 7000)


In [38]:
# Assemble the non-text feature matrix: one-hot categorical block + numeric/indicator block.
# Both scipy helpers are imported here so the cell does not rely on an earlier cell's imports.
from scipy.sparse import csr_matrix, hstack

# X_categorical_sparse (the OneHotEncoder output) must already exist. No cell
# visible in this notebook creates it, so fail early with an actionable message
# instead of a bare NameError at the hstack call below.
if 'X_categorical_sparse' not in globals():
    raise NameError(
        "X_categorical_sparse is not defined. Run the cell that fits the OneHotEncoder "
        "and creates X_categorical_sparse before building the non-text feature matrix."
    )

# Dense numerical/indicator features (8 columns).
# NOTE(review): req_len and benefits_len are engineered earlier but not used here — confirm this is intended.
numerical_columns = [
    'telecommuting',
    'has_company_logo',
    'has_questions',
    'title_len',
    'desc_len',
    'profile_len',
    'is_senior_role',
    'has_salary',
]
X_numerical_dense = df[numerical_columns].values

# Convert the dense array to CSR so it can be stacked with the sparse one-hot block.
X_numerical_sparse = csr_matrix(X_numerical_dense)

print(f'Shape of Numerical/Indicator Features (Sparse): {X_numerical_sparse.shape}')

# Stack the two sparse matrices (OHE and Numerical/Indicator) into the
# complete NON-TEXT feature matrix.
X_non_text_features = hstack([X_categorical_sparse, X_numerical_sparse])

print(f'Shape of FINAL Non-Text Feature Matrix: {X_non_text_features.shape}')

Shape of Numerical/Indicator Features (Sparse): (17880, 8)
Shape of FINAL Non-Text Feature Matrix: (17880, 1634)


# Final Feature Matrix and Saving

In [40]:
import os
import pickle

import numpy as np
import pandas as pd
from scipy.sparse import hstack, save_npz

# --- 5. Final Feature Matrix and Saving ---
# Inputs expected from earlier cells: X_text_features, X_categorical_sparse,
# X_numerical_sparse, df, the three fitted TF-IDF vectorizers and `encoder`.
# NOTE(review): `X_categorical_sparse` and `encoder` are not created in any cell
# visible in this notebook — confirm the cell that fits the one-hot encoder is present.

# Column-wise concatenation of every sparse block: Text + Categorical + Numerical.
X_final = hstack([X_text_features, X_categorical_sparse, X_numerical_sparse])
y = df['fraudulent'].values

print("\n--- Final Data Check ---")
print(f"Shape of FINAL Feature Matrix (X): {X_final.shape}")
print(f"Shape of Target Vector (y): {y.shape}")

# --- Saving ---
# All three artifacts are written next to the input dataset.
base_path = '/home/vikhil/GROUP_1-INFOSYS/Member_Vikhil/Datasets/'
save_path_X, save_path_y, save_path_encoders = (
    os.path.join(base_path, file_name)
    for file_name in ('X_final_features.npz', 'y_target.pkl', 'ml_encoders.pkl')
)

# Sparse feature matrix -> .npz
save_npz(save_path_X, X_final)

# Target vector -> pickle
with open(save_path_y, 'wb') as target_file:
    pickle.dump(y, target_file)

# Fitted vectorizers/encoder -> pickle, so future data can be transformed the same way.
fitted_tools = {
    'tfidf_title': tfidf_title_vectorizer,
    'tfidf_desc': tfidf_desc_vectorizer,
    'tfidf_profile': tfidf_profile_vectorizer,
    'ohe_categorical': encoder
}
with open(save_path_encoders, 'wb') as tools_file:
    pickle.dump(fitted_tools, tools_file)

print(
    f"\n✅ Successfully saved X to: {save_path_X}",
    f"✅ Successfully saved y to: {save_path_y}",
    f"✅ Successfully saved encoders to: {save_path_encoders}",
    sep="\n",
)


--- Final Data Check ---
Shape of FINAL Feature Matrix (X): (17880, 8634)
Shape of Target Vector (y): (17880,)

✅ Successfully saved X to: /home/vikhil/GROUP_1-INFOSYS/Member_Vikhil/Datasets/X_final_features.npz
✅ Successfully saved y to: /home/vikhil/GROUP_1-INFOSYS/Member_Vikhil/Datasets/y_target.pkl
✅ Successfully saved encoders to: /home/vikhil/GROUP_1-INFOSYS/Member_Vikhil/Datasets/ml_encoders.pkl


In [None]:
import string
import pandas as pd
from bs4 import BeautifulSoup
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from torchtext.data.utils import get_tokenizer
from torchtext.vocab import build_vocab_from_iterator
from torch.nn.utils.rnn import pad_sequence
import torch
import nltk

# Fetch the NLTK resources quietly; the calls are no-ops when the data is already present.
for nltk_resource in ('punkt', 'stopwords', 'wordnet'):
    nltk.download(nltk_resource, quiet=True)

class GPUTextPreprocessor:
    """
    Clean, tokenize, lemmatize and numericalize raw text for PyTorch models.

    On construction every input text is cleaned on the CPU (HTML stripped,
    lower-cased, punctuation, stop words and non-alphabetic tokens removed,
    optional spell correction, lemmatization) and a torchtext vocabulary is
    built over the cleaned corpus. `batch_encode` then converts the cleaned
    texts into a padded `torch.long` tensor located on `self.device`.

    NOTE(review): this class is also defined in an earlier cell of the
    notebook; this later definition replaces it once the cell runs. Consider
    keeping a single definition.

    Args:
        texts: Iterable of raw documents. NaN/None entries are cleaned to "".
            An empty iterable is allowed when only `clean_text` is needed.
        device: Device string used when CUDA is available; otherwise the
            instance falls back to 'cpu'.
        perform_spell_correction: If True, unknown words are corrected with
            pyspellchecker. When the package is missing a warning is printed
            and correction is switched off.

    Attributes:
        device: Resolved `torch.device`.
        cleaned_texts: Cleaned strings, one per input text, in input order.
        vocab: Vocabulary with '<unk>' (default index) and '<pad>' specials.
        padding_idx: Index of '<pad>' in `vocab`.
    """

    # Built once: deleting punctuation is the same translation for every document.
    _PUNCT_TABLE = str.maketrans('', '', string.punctuation)

    def __init__(self, texts, device='cuda', perform_spell_correction=False):

        # --- 1. Device Setup ---
        # Use the requested device only when CUDA is actually present.
        self.device = torch.device(device if torch.cuda.is_available() else 'cpu')

        # --- 2. Preprocessing Tools ---
        self.tokenizer = get_tokenizer("basic_english")
        self.stop_words = set(stopwords.words('english'))
        self.lemmatizer = WordNetLemmatizer()
        self.perform_spell_correction = perform_spell_correction

        if self.perform_spell_correction:
            # Note: You need 'pip install pyspellchecker' for this to work
            try:
                from spellchecker import SpellChecker
                self.spell = SpellChecker()
            except ImportError:
                print("Warning: 'spellchecker' not installed. Spell correction will be skipped.")
                self.perform_spell_correction = False

        # --- 3. Initial Processing (on CPU) ---
        self.cleaned_texts = [self.clean_text(t) for t in texts]

        # --- 4. Build Vocabulary ---
        def token_iterator():
            # Texts that became empty after cleaning contribute no tokens.
            for text in self.cleaned_texts:
                if text:
                    yield self.tokenizer(text)

        # Include '<unk>' (unknown) and '<pad>' (padding) tokens
        self.vocab = build_vocab_from_iterator(token_iterator(), specials=["<unk>", "<pad>"])
        self.vocab.set_default_index(self.vocab["<unk>"])

        # Store the padding index for use in batch_encode
        self.padding_idx = self.vocab["<pad>"]

    def _correct_spelling(self, tokens):
        """Return `tokens` with words unknown to the spell checker replaced by its suggestion.

        `SpellChecker.correction` may return None when it has no candidate; in
        that case the original token is kept so that later steps never receive
        None. Each distinct unknown word is corrected once and the result is
        reused for repeated occurrences.
        """
        misspelled = self.spell.unknown(tokens)
        fixes = {word: (self.spell.correction(word) or word) for word in misspelled}
        return [fixes.get(token, token) for token in tokens]

    def clean_text(self, text):
        """Clean one raw document and return its lemmatized tokens joined by spaces.

        Steps: NaN/None -> ""; strip HTML markup; lower-case; delete ASCII
        punctuation; tokenize; drop stop words and non-alphabetic tokens;
        optionally spell-correct; lemmatize. Runs entirely on the CPU.
        """
        if pd.isna(text):
            return ""
        text = BeautifulSoup(str(text), "html.parser").get_text()
        text = text.lower().translate(self._PUNCT_TABLE)
        tokens = [t for t in self.tokenizer(text) if t not in self.stop_words and t.isalpha()]

        if self.perform_spell_correction:
            tokens = self._correct_spelling(tokens)

        return ' '.join(self.lemmatizer.lemmatize(t) for t in tokens)

    def encode_tensor(self, cleaned_text):
        """Map an already-cleaned text to a 1-D `torch.long` tensor of vocabulary indices.

        The tensor is created directly on `self.device`, avoiding a separate
        host-to-device copy.
        """
        indices = [self.vocab[token] for token in self.tokenizer(cleaned_text)]
        return torch.tensor(indices, dtype=torch.long, device=self.device)

    def batch_encode(self, keep_empty=False):
        """
        Encode the cleaned texts and pad them to the longest sequence.

        Args:
            keep_empty: When False (default) texts that are empty after
                cleaning are skipped, so the result can have fewer rows than
                there were input texts. Pass True to keep one row per input
                text (empty texts become rows made only of the padding index),
                which keeps rows aligned with labels or DataFrame rows.

        Returns:
            A `(n_rows, max_len)` `torch.long` tensor on `self.device`, or an
            empty 1-D tensor when there is nothing to encode.
        """
        if keep_empty:
            selected = list(self.cleaned_texts)
        else:
            selected = [t for t in self.cleaned_texts if t]
        tensors = [self.encode_tensor(t) for t in selected]

        if not tensors:
            return torch.tensor([], dtype=torch.long, device=self.device)

        # pad_sequence keeps the result on the device of the input tensors.
        return pad_sequence(tensors, batch_first=True, padding_value=self.padding_idx)

In [None]:
# Build one combined free-text field per posting: title, company profile,
# description, requirements and benefits joined by single spaces (missing values -> '').
df['text'] = df['title'].fillna('').str.cat(
    [
        df[column].fillna('')
        for column in ('company_profile', 'description', 'requirements', 'benefits')
    ],
    sep=' ',
)


In [None]:
import time

# Work on the first 2000 descriptions only, to keep the demo run short.
descriptions = df['description'].iloc[:2000].tolist()

# Preview three raw samples (first 300 characters each) before any cleaning.
print("\n--- Sample Raw Descriptions (Before Cleaning) ---")
for position, raw_description in enumerate(descriptions[:3], start=1):
    print(f"[{position}] {raw_description[:300]}...\n")


print("-" * 30)
print("Running optimized preprocessor (spell correction OFF)...")
# Wall-clock start; the elapsed time is reported by a later cell.
start = time.time()

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Single TF-IDF model over the combined `text` column, capped at 5000 terms,
# with scikit-learn's built-in English stop-word list removed.
tfidf_vectorizer = TfidfVectorizer(max_features=5000, stop_words='english')
tfidf_features = tfidf_vectorizer.fit_transform(df['text'])
print('Shape of TF-IDF features:', tfidf_features.shape)


In [None]:

    
import time

# Start the timer in this cell so "Total execution time" covers only the
# cleaning + encoding below. Previously `start` came from an earlier cell, so
# any cell executed in between was counted as well.
start = time.time()
preprocessor_fast = GPUTextPreprocessor(descriptions, perform_spell_correction=False)

# Show cleaned versions of the same samples
print("\n--- Sample Cleaned Descriptions (After Cleaning) ---")
for i, cleaned in enumerate(preprocessor_fast.cleaned_texts[:3]):
    print(f"[{i+1}] {cleaned}\n")

# Numericalize and pad every non-empty cleaned description into one tensor.
padded_tensor_batch = preprocessor_fast.batch_encode()
end = time.time()

print(f"\nTotal execution time: {end - start:.2f} seconds")
print("Tensor shape:", padded_tensor_batch.shape)
# Guard the sample print: the batch is empty when no description survived cleaning.
if padded_tensor_batch.numel() > 0:
    print("First tensor sample:", padded_tensor_batch[0][:20])
print("-" * 30)
    


    

In [None]:
import pandas as pd

# Export the DataFrame, including every engineered column, to a versioned CSV
# so the modelling stage can start from this file.
new_file_path = '/home/vikhil/GROUP_1-INFOSYS/Member_Vikhil/Datasets/fake_job_features_ready_for_model_v1.csv'

# index=False keeps the DataFrame index out of the file. Any failure is
# reported as a message rather than raised, so the notebook keeps running.
try:
    df.to_csv(new_file_path, index=False)
    print(f"✅ Successfully saved the feature-engineered dataset to: {new_file_path}")
    print(f"Shape of the saved dataset: {df.shape}")
except Exception as save_error:
    print(f"❌ Error saving the file: {save_error}")