In [2]:
import pandas as pd
import numpy as np
import re
import nltk
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer, WordNetLemmatizer
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.model_selection import GridSearchCV
import xgboost as xgb
import tensorflow as tf


In [4]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, SpatialDropout1D
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.optimizers import Adam

In [6]:
from transformers import BertTokenizer, TFBertForSequenceClassification

  from .autonotebook import tqdm as notebook_tqdm





In [7]:
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Bindu\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Bindu\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [8]:
# Loading the dataset
df = pd.read_csv('fake_job_postings.csv')

In [9]:
def preprocess_text(text):
    if not isinstance(text, str):
        return ""  # Return an empty string for non-string values
    text = text.lower()  # Lowercase text
    text = re.sub(r'http\S+', '', text)  # Remove links
    text = re.sub(r'\n', ' ', text)  # Remove newline characters
    text = re.sub(r'\w*\d\w*', '', text)  # Remove words containing numbers
    text = re.sub(r'\s+', ' ', text).strip()  # Remove extra spaces
    text = re.sub(r'[^\w\s]', '', text)  # Remove special characters
    stop_words = set(stopwords.words('english'))
    words = [word for word in text.split() if word not in stop_words]
    text = ' '.join(words)
    return text

df['text'] = df['description'].apply(preprocess_text)

# Stemming and Lemmatization
stemmer = PorterStemmer()
lemmatizer = WordNetLemmatizer()

def stem_and_lemmatize(text):
    words = text.split()
    stemmed = [stemmer.stem(word) for word in words]
    lemmatized = [lemmatizer.lemmatize(word) for word in stemmed]
    return ' '.join(lemmatized)

df['text'] = df['text'].apply(stem_and_lemmatize)

# Check for missing values
df.dropna(subset=['text'], inplace=True)

# Convert target variable to binary
df['target'] = df['fraudulent'].apply(lambda x: 1 if x == 'yes' else 0)


In [10]:
# Split the dataset
train_data, temp_data = train_test_split(df, test_size=0.3, random_state=42)
test_data, val_data = train_test_split(temp_data, test_size=0.33, random_state=42)


In [11]:

# Extract features and labels
X_train = train_data['text']
y_train = train_data['target']
X_test = test_data['text']
y_test = test_data['target']
X_val = val_data['text']
y_val = val_data['target']


In [12]:
# TF-IDF Vectorization
vectorizer = TfidfVectorizer(max_features=5000)
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)
X_val_tfidf = vectorizer.transform(X_val)

In [13]:
from sklearn.metrics import accuracy_score

In [14]:
# XGBoost Model
xgb_model = xgb.XGBClassifier(random_state=42)
xgb_model.fit(X_train_tfidf, y_train)

In [15]:

# Predictions and evaluation
y_pred_train_xgb = xgb_model.predict(X_train_tfidf)
y_pred_test_xgb = xgb_model.predict(X_test_tfidf)

In [16]:
# Calculate accuracy
train_accuracy_xgb = accuracy_score(y_train, y_pred_train_xgb)
test_accuracy_xgb = accuracy_score(y_test, y_pred_test_xgb)


In [17]:

# Print accuracy
print("XGBoost - Training Accuracy:", train_accuracy_xgb)
print("XGBoost - Testing Accuracy:", test_accuracy_xgb)

XGBoost - Training Accuracy: 1.0
XGBoost - Testing Accuracy: 1.0


In [18]:

# Classification report for detailed performance metrics
print("XGBoost - Training Classification Report")
print(classification_report(y_train, y_pred_train_xgb))
print("XGBoost - Testing Classification Report")
print(classification_report(y_test, y_pred_test_xgb))

XGBoost - Training Classification Report
              precision    recall  f1-score   support

           0       1.00      1.00      1.00     12516

    accuracy                           1.00     12516
   macro avg       1.00      1.00      1.00     12516
weighted avg       1.00      1.00      1.00     12516

XGBoost - Testing Classification Report
              precision    recall  f1-score   support

           0       1.00      1.00      1.00      3593

    accuracy                           1.00      3593
   macro avg       1.00      1.00      1.00      3593
weighted avg       1.00      1.00      1.00      3593



In [19]:

# Hyperparameter Tuning
param_grid = {
    'n_estimators': [50, 100, 150],
    'learning_rate': [0.01, 0.1, 0.2],
    'max_depth': [3, 5, 7],
    'subsample': [0.8, 1.0]
}

In [20]:
grid_xgb = GridSearchCV(xgb.XGBClassifier(random_state=42), param_grid, cv=5, scoring='accuracy', n_jobs=-1)
grid_xgb.fit(X_train_tfidf, y_train)



In [21]:
best_xgb = grid_xgb.best_estimator_

In [22]:
# Evaluation of best model
y_pred_train_best_xgb = best_xgb.predict(X_train_tfidf)
y_pred_test_best_xgb = best_xgb.predict(X_test_tfidf)

In [23]:
# Calculate accuracy
train_accuracy_best_xgb = accuracy_score(y_train, y_pred_train_best_xgb)
test_accuracy_best_xgb = accuracy_score(y_test, y_pred_test_best_xgb)


In [24]:
# Print accuracy
print("Best XGBoost - Training Accuracy:", train_accuracy_best_xgb)
print("Best XGBoost - Testing Accuracy:", test_accuracy_best_xgb)


Best XGBoost - Training Accuracy: 1.0
Best XGBoost - Testing Accuracy: 1.0


In [25]:
# Classification report for detailed performance metrics
print("Best XGBoost - Training Classification Report")
print(classification_report(y_train, y_pred_train_best_xgb))
print("Best XGBoost - Testing Classification Report")
print(classification_report(y_test, y_pred_test_best_xgb))

Best XGBoost - Training Classification Report
              precision    recall  f1-score   support

           0       1.00      1.00      1.00     12516

    accuracy                           1.00     12516
   macro avg       1.00      1.00      1.00     12516
weighted avg       1.00      1.00      1.00     12516

Best XGBoost - Testing Classification Report
              precision    recall  f1-score   support

           0       1.00      1.00      1.00      3593

    accuracy                           1.00      3593
   macro avg       1.00      1.00      1.00      3593
weighted avg       1.00      1.00      1.00      3593



LSTM MODEL

In [26]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout, Bidirectional
from tensorflow.keras.callbacks import EarlyStopping


In [27]:
# Tokenization and padding
tokenizer = Tokenizer(num_words=5000)
tokenizer.fit_on_texts(X_train)
X_train_seq = tokenizer.texts_to_sequences(X_train)
X_test_seq = tokenizer.texts_to_sequences(X_test)
X_val_seq = tokenizer.texts_to_sequences(X_val)


In [28]:
maxlen = 100  # Set maximum sequence length

In [29]:
X_train_pad = pad_sequences(X_train_seq, maxlen=maxlen)
X_test_pad = pad_sequences(X_test_seq, maxlen=maxlen)
X_val_pad = pad_sequences(X_val_seq, maxlen=maxlen)

In [30]:
# LSTM Model
model = Sequential()
model.add(Embedding(input_dim=5000, output_dim=128))  # Removed input_length argument
model.add(Bidirectional(LSTM(64, return_sequences=True)))
model.add(Dropout(0.5))
model.add(Bidirectional(LSTM(32)))
model.add(Dense(1, activation='sigmoid'))

In [32]:
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# Early stopping to prevent overfitting
early_stopping = EarlyStopping(monitor='val_loss', patience=3, restore_best_weights=True)


In [33]:
# Train the model
history = model.fit(
    X_train_pad, y_train,
    epochs=10,
    batch_size=32,
    validation_data=(X_val_pad, y_val),
    callbacks=[early_stopping]
)

Epoch 1/10
[1m392/392[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m80s[0m 168ms/step - accuracy: 0.9854 - loss: 0.0647 - val_accuracy: 1.0000 - val_loss: 3.3312e-05
Epoch 2/10
[1m392/392[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m45s[0m 115ms/step - accuracy: 1.0000 - loss: 3.0868e-05 - val_accuracy: 1.0000 - val_loss: 1.7061e-05
Epoch 3/10
[1m392/392[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m47s[0m 120ms/step - accuracy: 1.0000 - loss: 1.6524e-05 - val_accuracy: 1.0000 - val_loss: 1.0675e-05
Epoch 4/10
[1m392/392[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m51s[0m 130ms/step - accuracy: 1.0000 - loss: 1.0464e-05 - val_accuracy: 1.0000 - val_loss: 6.9977e-06
Epoch 5/10
[1m392/392[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m48s[0m 122ms/step - accuracy: 1.0000 - loss: 6.8519e-06 - val_accuracy: 1.0000 - val_loss: 4.7212e-06
Epoch 6/10
[1m392/392[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m47s[0m 119ms/step - accuracy: 1.0000 - loss: 4.7215e-06 - val_accura

In [34]:
# Predictions and evaluation
y_pred_train_lstm = (model.predict(X_train_pad) > 0.5).astype("int32")
y_pred_test_lstm = (model.predict(X_test_pad) > 0.5).astype("int32")

[1m392/392[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m14s[0m 35ms/step
[1m113/113[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 29ms/step


In [35]:
# Calculate accuracy
train_accuracy_lstm = accuracy_score(y_train, y_pred_train_lstm)
test_accuracy_lstm = accuracy_score(y_test, y_pred_test_lstm)

In [36]:
# Print accuracy
print("LSTM - Training Accuracy:", train_accuracy_lstm)
print("LSTM - Testing Accuracy:", test_accuracy_lstm)

LSTM - Training Accuracy: 1.0
LSTM - Testing Accuracy: 1.0


In [37]:
# Classification report for detailed performance metrics
print("LSTM - Training Classification Report")
print(classification_report(y_train, y_pred_train_lstm))
print("LSTM - Testing Classification Report")
print(classification_report(y_test, y_pred_test_lstm))

LSTM - Training Classification Report
              precision    recall  f1-score   support

           0       1.00      1.00      1.00     12516

    accuracy                           1.00     12516
   macro avg       1.00      1.00      1.00     12516
weighted avg       1.00      1.00      1.00     12516

LSTM - Testing Classification Report
              precision    recall  f1-score   support

           0       1.00      1.00      1.00      3593

    accuracy                           1.00      3593
   macro avg       1.00      1.00      1.00      3593
weighted avg       1.00      1.00      1.00      3593



BERT BASED MODEL

In [38]:
! pip install torch




[notice] A new release of pip is available: 24.0 -> 24.1.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [39]:
! pip install transformers torch




[notice] A new release of pip is available: 24.0 -> 24.1.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [40]:
# Split the dataset
train_data, temp_data = train_test_split(df, test_size=0.3, random_state=42)
test_data, val_data = train_test_split(temp_data, test_size=0.33, random_state=42)

# Tokenization and encoding
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

In [41]:
def tokenize_function(examples):
    return tokenizer(examples['text'], padding='max_length', truncation=True)

In [42]:
! pip install datasets




[notice] A new release of pip is available: 24.0 -> 24.1.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [43]:
import pandas as pd
import numpy as np
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer, WordNetLemmatizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score
from transformers import BertTokenizer, BertForSequenceClassification
from transformers import Trainer, TrainingArguments
from datasets import Dataset

In [44]:
from datasets import Dataset
train_dataset = Dataset.from_pandas(train_data[['text', 'target']])
test_dataset = Dataset.from_pandas(test_data[['text', 'target']])
val_dataset = Dataset.from_pandas(val_data[['text', 'target']])

train_dataset = train_dataset.map(tokenize_function, batched=True)
test_dataset = test_dataset.map(tokenize_function, batched=True)
val_dataset = val_dataset.map(tokenize_function, batched=True)

Map: 100%|██████████| 12516/12516 [00:42<00:00, 294.01 examples/s]
Map: 100%|██████████| 3593/3593 [00:12<00:00, 288.12 examples/s]
Map: 100%|██████████| 1771/1771 [00:05<00:00, 296.92 examples/s]


In [45]:
train_dataset.set_format(type='torch', columns=['input_ids', 'attention_mask', 'target'])
test_dataset.set_format(type='torch', columns=['input_ids', 'attention_mask', 'target'])
val_dataset.set_format(type='torch', columns=['input_ids', 'attention_mask', 'target'])

In [46]:
! pip install torch torchvision torchaudio




[notice] A new release of pip is available: 24.0 -> 24.1.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [47]:
from transformers import BertTokenizer, TFBertForSequenceClassification
from transformers import Trainer, TrainingArguments
from datasets import Dataset
import tensorflow as tf

In [48]:
# Convert datasets to tf.data.Dataset
train_features = {x: train_dataset[x] for x in tokenizer.model_input_names}
train_tf_dataset = tf.data.Dataset.from_tensor_slices((train_features, train_dataset["target"])).batch(16)

In [49]:
val_features = {x: val_dataset[x] for x in tokenizer.model_input_names}
val_tf_dataset = tf.data.Dataset.from_tensor_slices((val_features, val_dataset["target"])).batch(16)

test_features = {x: test_dataset[x] for x in tokenizer.model_input_names}
test_tf_dataset = tf.data.Dataset.from_tensor_slices((test_features, test_dataset["target"])).batch(16)

In [50]:
! pip install transformers




[notice] A new release of pip is available: 24.0 -> 24.1.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [51]:
from transformers import BertTokenizer

In [52]:
# Initialize BERT tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Tokenize text
train_encodings = tokenizer(train_data['text'].tolist(), truncation=True, padding=True)
val_encodings = tokenizer(val_data['text'].tolist(), truncation=True, padding=True)
test_encodings = tokenizer(test_data['text'].tolist(), truncation=True, padding=True)


In [53]:
import tensorflow as tf

# Prepare TensorFlow datasets
train_dataset = tf.data.Dataset.from_tensor_slices((
    dict(train_encodings),
    train_data['target'].tolist()
))
val_dataset = tf.data.Dataset.from_tensor_slices((
    dict(val_encodings),
    val_data['target'].tolist()
))
test_dataset = tf.data.Dataset.from_tensor_slices((
    dict(test_encodings),
    test_data['target'].tolist()
))

# Batch and shuffle datasets
train_dataset = train_dataset.shuffle(len(train_data)).batch(16)
val_dataset = val_dataset.batch(16)
test_dataset = test_dataset.batch(16)

In [54]:
! pip show tensorflow transformers

Name: tensorflow
Version: 2.16.1
Summary: TensorFlow is an open source machine learning framework for everyone.
Home-page: https://www.tensorflow.org/
Author: Google Inc.
Author-email: packages@tensorflow.org
License: Apache 2.0
Location: C:\Users\Bindu\AppData\Local\Programs\Python\Python311\Lib\site-packages
Requires: tensorflow-intel
Required-by: tf_keras
---
Name: transformers
Version: 4.41.2
Summary: State-of-the-art Machine Learning for JAX, PyTorch and TensorFlow
Home-page: https://github.com/huggingface/transformers
Author: The Hugging Face team (past and future) with the help of all our contributors (https://github.com/huggingface/transformers/graphs/contributors)
Author-email: transformers@huggingface.co
License: Apache 2.0 License
Location: C:\Users\Bindu\AppData\Local\Programs\Python\Python311\Lib\site-packages
Requires: filelock, huggingface-hub, numpy, packaging, pyyaml, regex, requests, safetensors, tokenizers, tqdm
Required-by: 


In [55]:
from transformers import BertTokenizer, TFBertForSequenceClassification
import tensorflow as tf

In [56]:
# Load BERT tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

In [57]:
# Example dataset (replace with your actual dataset loading)
train_texts = ["Example text 1", "Example text 2"]
train_labels = [0, 1]  # Binary labels (0 or 1)

In [None]:
from transformers import TFBertForSequenceClassification

# Load pre-trained BERT model
model = TFBertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=2)

# Optimizer and loss function
optimizer = tf.keras.optimizers.Adam(learning_rate=1e-5)
loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
metrics = tf.metrics.SparseCategoricalAccuracy()

# Compile model
model.compile(optimizer=optimizer, loss=loss, metrics=[metrics])

# Train model
history = model.fit(
    train_dataset,
    validation_data=val_dataset,
    epochs=3,
    batch_size=16
)