In [1]:
import os
import pandas as pd
import numpy as np

import warnings
warnings.filterwarnings("ignore")

In [2]:
import torch

# Report whether PyTorch can see a CUDA device; when it can, name device 0.
if not torch.cuda.is_available():
    print("No GPU. Using CPU.")
else:
    print("GPU is available!")
    print(f"Using device: {torch.cuda.get_device_name(0)}")

GPU is available!
Using device: NVIDIA GeForce GTX 1650 Ti


# 1. Loading the Data

In [3]:
# Load the dataset; the path is relative, so it resolves against the current working directory.
df = pd.read_csv(r"final_data_with_summarization.csv")

In [4]:
# Model input text: use `text_summary` where it is present, otherwise fall back to `text`.
df["input"] = df["text_summary"].fillna(df["text"])

In [5]:
df["input"].isnull().sum()

np.int64(39)

In [6]:
# Feature text = title + " " + input. Missing values on either side become "",
# so a row whose `input` is null contributes only its title (plus the separator).
X = df["title"].fillna("") + " " + df["input"].fillna("")

In [7]:
# now fake = 1
# Swap the 0/1 encoding of `label`.
# The untouched labels are stashed in `label_raw` the first time this cell runs
# and the swap is always derived from that copy, so re-running the cell gives
# the same result instead of flipping the labels back.
# NOTE: any value other than 0 or 1 maps to NaN.
if "label_raw" not in df.columns:
    df["label_raw"] = df["label"]
df["label"] = df["label_raw"].map({0: 1, 1: 0})

In [8]:
Y = df["label"]

In [9]:
from sklearn.model_selection import train_test_split
# Hold out 20% of the rows as the test set; the fixed seed keeps the split reproducible.
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=42)

# Take 15% of the remaining 80% as validation: about 68% train / 12% val / 20% test overall.
X_train, X_val, Y_train, Y_val = train_test_split(X_train, Y_train, test_size=0.15, random_state=42)

In [10]:
X_train.shape, X_val.shape, X_test.shape

((49043,), (8655,), (14425,))

# 2. Tokenization 

We use the [sentence-transformers/all-MiniLM-L6-v2](https://huggingface.co/sentence-transformers/all-MiniLM-L6-v2) embedding model from Hugging Face.

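This embedding model is relatively lightweight and one of the most popular ones. It converts an input text into a 384-dimensional dense vector embedding. Note that, as loaded here, the model truncates inputs to 256 tokens (`max_seq_length: 256` in the model summary below), so longer texts are cut off before encoding.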


The produced embedding can be used for checking sentence similarity, clustering, information retrieval, or text classification by passing in the embeddings as input features.

In [11]:
from sentence_transformers import SentenceTransformer
from transformers import AutoTokenizer, AutoModel

In [12]:
model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')

In [13]:
model

SentenceTransformer(
  (0): Transformer({'max_seq_length': 256, 'do_lower_case': False}) with Transformer model: BertModel 
  (1): Pooling({'word_embedding_dimension': 384, 'pooling_mode_cls_token': False, 'pooling_mode_mean_tokens': True, 'pooling_mode_max_tokens': False, 'pooling_mode_mean_sqrt_len_tokens': False, 'pooling_mode_weightedmean_tokens': False, 'pooling_mode_lasttoken': False, 'include_prompt': True})
  (2): Normalize()
)

# 3. Generating Embeddings

In [14]:
# Convert every split to a plain Python list.
# Going through `pd.Series(...)` keeps this cell safe to re-run: a name that is
# already a list passes through unchanged, whereas `.values.tolist()` would
# raise AttributeError on the second run.
X_train = pd.Series(X_train).tolist()
X_val = pd.Series(X_val).tolist()
X_test = pd.Series(X_test).tolist()

Y_train = pd.Series(Y_train).tolist()
Y_val = pd.Series(Y_val).tolist()
Y_test = pd.Series(Y_test).tolist()

In [20]:
len(X_train), len(X_val), len(X_test)

(49043, 8655, 14425)

In [24]:
from tqdm import tqdm

In [25]:
def batch_encode(text_list, model, batch_size=2048):
    """Encode texts chunk by chunk, showing a progress bar over the chunks.

    Args:
        text_list: Sequence of inputs supporting ``len()`` and slicing.
        model: Object with an ``encode(batch)`` method returning one embedding
            per input, in order (a ``SentenceTransformer`` in this notebook).
        batch_size: Number of texts handed to ``model.encode`` per call. It
            sets the granularity of this wrapper's progress bar; no batching
            option is forwarded to ``model.encode`` itself.

    Returns:
        list: One embedding per input text, in input order. Empty input gives
        an empty list.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.
    """
    # A negative step makes range() empty, which would silently return no
    # embeddings at all; fail loudly instead.
    if batch_size < 1:
        raise ValueError(f"batch_size must be a positive integer, got {batch_size}")

    embeddings = []
    for start in tqdm(range(0, len(text_list), batch_size)):
        chunk = text_list[start:start + batch_size]
        embeddings.extend(model.encode(chunk))
    return embeddings

In [26]:
X_val_embeddings = batch_encode(X_val, model)

100%|██████████| 5/5 [00:37<00:00,  7.60s/it]


In [27]:
X_train_embeddings = batch_encode(X_train, model)

100%|██████████| 24/24 [03:36<00:00,  9.01s/it]


In [28]:
X_test_embeddings = batch_encode(X_test, model)

100%|██████████| 8/8 [01:03<00:00,  7.92s/it]


# 4. Model Training, Optimization and Evaluation

## 4.1 Logistic Regression

In [29]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, f1_score, classification_report

In [30]:
logreg = LogisticRegression(max_iter=1000, random_state=42)
logreg.fit(X_train_embeddings, Y_train)

0,1,2
,penalty,'l2'
,dual,False
,tol,0.0001
,C,1.0
,fit_intercept,True
,intercept_scaling,1
,class_weight,
,random_state,42
,solver,'lbfgs'
,max_iter,1000


In [31]:
train_preds = logreg.predict(X_train_embeddings)
val_preds = logreg.predict(X_val_embeddings)
test_preds = logreg.predict(X_test_embeddings)

In [32]:
print("Train")
print(classification_report(Y_train, train_preds))

Train
              precision    recall  f1-score   support

           0       0.85      0.84      0.84     25190
           1       0.83      0.84      0.83     23853

    accuracy                           0.84     49043
   macro avg       0.84      0.84      0.84     49043
weighted avg       0.84      0.84      0.84     49043



In [33]:
print("Val")
print(classification_report(Y_val, val_preds))

Val
              precision    recall  f1-score   support

           0       0.84      0.83      0.84      4558
           1       0.81      0.83      0.82      4097

    accuracy                           0.83      8655
   macro avg       0.83      0.83      0.83      8655
weighted avg       0.83      0.83      0.83      8655



In [34]:
print("Test")
print(classification_report(Y_test, test_preds))

Test
              precision    recall  f1-score   support

           0       0.84      0.84      0.84      7347
           1       0.83      0.84      0.83      7078

    accuracy                           0.84     14425
   macro avg       0.84      0.84      0.84     14425
weighted avg       0.84      0.84      0.84     14425



## 4.2 XGBoost

In [52]:
from xgboost import XGBClassifier

# NOTE: this rebinds `model`, which held the SentenceTransformer until now, so
# all embeddings must already be computed before this cell runs.
# `tree_method="gpu_hist"` and `predictor="gpu_predictor"` are deprecated since
# XGBoost 2.0; the supported form is `tree_method="hist"` plus `device`.
# Falling back to "cpu" keeps the cell runnable on machines without CUDA.
model = XGBClassifier(
    n_estimators=1000,         # upper bound on boosting rounds
    early_stopping_rounds=30,  # stop after 30 rounds without improvement on eval_set
    tree_method="hist",
    device="cuda" if torch.cuda.is_available() else "cpu",
    max_depth=2,
)

# The validation split is passed as eval_set, so it drives early stopping.
model.fit(
    X_train_embeddings, Y_train,
    eval_set=[(X_val_embeddings, Y_val)],
    verbose=True
)


[0]	validation_0-logloss:0.66553
[1]	validation_0-logloss:0.64552
[2]	validation_0-logloss:0.62933
[3]	validation_0-logloss:0.61613
[4]	validation_0-logloss:0.60418
[5]	validation_0-logloss:0.59511
[6]	validation_0-logloss:0.58593
[7]	validation_0-logloss:0.57694
[8]	validation_0-logloss:0.56929
[9]	validation_0-logloss:0.56412
[10]	validation_0-logloss:0.55849
[11]	validation_0-logloss:0.55330
[12]	validation_0-logloss:0.54869
[13]	validation_0-logloss:0.54481
[14]	validation_0-logloss:0.54075
[15]	validation_0-logloss:0.53695
[16]	validation_0-logloss:0.53372
[17]	validation_0-logloss:0.53090
[18]	validation_0-logloss:0.52783
[19]	validation_0-logloss:0.52509
[20]	validation_0-logloss:0.52246
[21]	validation_0-logloss:0.51931
[22]	validation_0-logloss:0.51603
[23]	validation_0-logloss:0.51336
[24]	validation_0-logloss:0.51019
[25]	validation_0-logloss:0.50783
[26]	validation_0-logloss:0.50539
[27]	validation_0-logloss:0.50334
[28]	validation_0-logloss:0.50167
[29]	validation_0-loglos

0,1,2
,objective,'binary:logistic'
,base_score,
,booster,
,callbacks,
,colsample_bylevel,
,colsample_bynode,
,colsample_bytree,
,device,
,early_stopping_rounds,30
,enable_categorical,False


In [53]:
train_preds = model.predict(X_train_embeddings)
val_preds = model.predict(X_val_embeddings)
test_preds = model.predict(X_test_embeddings)

In [54]:
print("XGBoost")
print(classification_report(Y_train, train_preds))

XGBoost
              precision    recall  f1-score   support

           0       0.93      0.92      0.93     25190
           1       0.92      0.92      0.92     23853

    accuracy                           0.92     49043
   macro avg       0.92      0.92      0.92     49043
weighted avg       0.92      0.92      0.92     49043



In [55]:
print("XGBoost")
print(classification_report(Y_val, val_preds))

XGBoost
              precision    recall  f1-score   support

           0       0.86      0.85      0.86      4558
           1       0.84      0.85      0.84      4097

    accuracy                           0.85      8655
   macro avg       0.85      0.85      0.85      8655
weighted avg       0.85      0.85      0.85      8655



In [56]:
print("XGBoost")
print(classification_report(Y_test, test_preds))

XGBoost
              precision    recall  f1-score   support

           0       0.86      0.86      0.86      7347
           1       0.86      0.85      0.86      7078

    accuracy                           0.86     14425
   macro avg       0.86      0.86      0.86     14425
weighted avg       0.86      0.86      0.86     14425



## 4.3 CNN and MLP

In [None]:
from torch.utils.data import Dataset, DataLoader
import torch.nn as nn
import torch.optim as optim

In [None]:
class EmbeddingDataset(Dataset):
    """In-memory dataset pairing precomputed embedding vectors with integer labels.

    Args:
        X: Sequence of equal-shaped arrays (one embedding per sample); stacked
            along a new first axis and stored as a float32 tensor.
        Y: Sequence of integer class labels, one per entry of ``X``; stored as
            an int64 tensor.

    Raises:
        ValueError: If ``X`` and ``Y`` do not contain the same number of samples.
    """

    def __init__(self, X, Y):
        self.X = torch.tensor(np.stack(X), dtype=torch.float32)
        self.Y = torch.tensor(Y, dtype=torch.long)
        # Without this check a longer X is silently truncated to len(Y), and a
        # shorter X only fails later with an IndexError while batches are drawn.
        if len(self.X) != len(self.Y):
            raise ValueError(
                f"X and Y must have the same length, got {len(self.X)} and {len(self.Y)}"
            )

    def __len__(self):
        """Return the number of samples."""
        return len(self.Y)

    def __getitem__(self, idx):
        """Return the ``(embedding, label)`` pair at position ``idx``."""
        return self.X[idx], self.Y[idx]


In [None]:
class SimpleCNN(nn.Module):
    """Single-layer 1-D CNN that treats each input vector as a one-channel sequence.

    Pipeline: Conv1d(1 -> 64, kernel 5, padding 2) -> ReLU -> global max-pool
    over the length axis -> Linear(64 -> num_classes).

    ``input_dim`` is accepted but never read; the adaptive pooling collapses
    the length axis to size 1 regardless of the input length.
    """

    def __init__(self, input_dim, num_classes):
        super().__init__()
        n_filters = 64
        self.conv1 = nn.Conv1d(in_channels=1, out_channels=n_filters, kernel_size=5, padding=2)
        self.relu = nn.ReLU()
        self.pool = nn.AdaptiveMaxPool1d(output_size=1)
        self.fc = nn.Linear(in_features=n_filters, out_features=num_classes)

    def forward(self, x):
        """Return the class scores produced by the final linear layer."""
        # Conv1d needs a channel axis, so insert one at position 1.
        feature_maps = self.relu(self.conv1(x.unsqueeze(dim=1)))
        # Global max over the length axis, then drop that size-1 axis.
        pooled = self.pool(feature_maps).squeeze(dim=-1)
        return self.fc(pooled)

In [None]:
batch_size = 128
# Wrap the precomputed embeddings and labels so they can be served in mini-batches.
train_ds = EmbeddingDataset(X_train_embeddings, Y_train)
val_ds = EmbeddingDataset(X_val_embeddings, Y_val)
# Only the training loader shuffles; the validation loader keeps the default order.
train_loader = DataLoader(train_ds, batch_size=batch_size, shuffle=True)
val_loader = DataLoader(val_ds, batch_size=batch_size)

In [None]:
# Use the GPU when one is available, otherwise the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
input_dim = len(X_train_embeddings[0])  # length of a single embedding vector
num_classes = 2
cnn = SimpleCNN(input_dim, num_classes).to(device)
# Cross-entropy over the model's raw class scores against integer class labels.
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(cnn.parameters(), lr=1e-3)

In [None]:
# Train the CNN for 30 epochs and print validation accuracy after each epoch.
for epoch in range(30):
    # Training pass: one optimizer step per mini-batch.
    cnn.train()
    for xb, yb in train_loader:
        xb, yb = xb.to(device), yb.to(device)
        optimizer.zero_grad()
        out = cnn(xb)
        loss = criterion(out, yb)
        loss.backward()
        optimizer.step()
    
    # Validation pass: eval mode, gradients disabled, count correct predictions.
    cnn.eval()
    correct, total = 0, 0
    with torch.no_grad():
        for xb, yb in val_loader:
            xb, yb = xb.to(device), yb.to(device)
            preds = cnn(xb).argmax(1)  # index of the highest score per row
            correct += (preds == yb).sum().item()
            total += yb.size(0)
    print(f"Epoch {epoch+1}: Val Acc = {correct/total:.4f}")

Epoch 1: Val Acc = 0.5266
Epoch 2: Val Acc = 0.5314
Epoch 3: Val Acc = 0.5261
Epoch 4: Val Acc = 0.5166
Epoch 5: Val Acc = 0.5559
Epoch 6: Val Acc = 0.5458
Epoch 7: Val Acc = 0.5561
Epoch 8: Val Acc = 0.5521
Epoch 9: Val Acc = 0.5475
Epoch 10: Val Acc = 0.5581
Epoch 11: Val Acc = 0.5521
Epoch 12: Val Acc = 0.5643
Epoch 13: Val Acc = 0.5579
Epoch 14: Val Acc = 0.5592
Epoch 15: Val Acc = 0.5633
Epoch 16: Val Acc = 0.5581
Epoch 17: Val Acc = 0.5668
Epoch 18: Val Acc = 0.5671
Epoch 19: Val Acc = 0.5599
Epoch 20: Val Acc = 0.5606
Epoch 21: Val Acc = 0.5655
Epoch 22: Val Acc = 0.5682
Epoch 23: Val Acc = 0.5539
Epoch 24: Val Acc = 0.5670
Epoch 25: Val Acc = 0.5635
Epoch 26: Val Acc = 0.5648
Epoch 27: Val Acc = 0.5710
Epoch 28: Val Acc = 0.5586
Epoch 29: Val Acc = 0.5634
Epoch 30: Val Acc = 0.5708


In [62]:
class SimpleMLP(nn.Module):
    """Two-layer perceptron: Linear(input_dim -> 128) -> ReLU -> Linear(128 -> num_classes)."""

    def __init__(self, input_dim, num_classes):
        super().__init__()
        hidden_units = 128
        self.fc1 = nn.Linear(in_features=input_dim, out_features=hidden_units)
        self.relu = nn.ReLU()
        self.fc2 = nn.Linear(in_features=hidden_units, out_features=num_classes)

    def forward(self, x):
        """Return the class scores produced by the second linear layer."""
        hidden = self.relu(self.fc1(x))
        return self.fc2(hidden)

mlp = SimpleMLP(input_dim, num_classes).to(device)
# NOTE: `optimizer` and `criterion` are rebound here, replacing the objects
# that were created for the CNN; the optimizer now updates only `mlp`'s parameters.
optimizer = optim.Adam(mlp.parameters(), lr=1e-3)
criterion = nn.CrossEntropyLoss()

In [None]:
# Train the MLP for 30 epochs and print validation accuracy after each epoch.
for epoch in range(30):
    # Training pass: one optimizer step per mini-batch.
    mlp.train()
    for xb, yb in train_loader:
        xb, yb = xb.to(device), yb.to(device)
        optimizer.zero_grad()
        out = mlp(xb)
        loss = criterion(out, yb)
        loss.backward()
        optimizer.step()
    
    
    # Validation pass: eval mode, gradients disabled, count correct predictions.
    mlp.eval()
    correct, total = 0, 0
    with torch.no_grad():
        for xb, yb in val_loader:
            xb, yb = xb.to(device), yb.to(device)
            preds = mlp(xb).argmax(1)  # index of the highest score per row
            correct += (preds == yb).sum().item()
            total += yb.size(0)
    print(f"Epoch {epoch+1}: Val Acc = {correct/total:.4f}")

In [None]:
# Evaluate the MLP on the test split with a single forward pass over all rows.
X_test_tensor = torch.tensor(np.stack(X_test_embeddings), dtype=torch.float32).to(device)
# NOTE(review): Y_test_tensor is not used in this cell; the report below is
# computed against the plain `Y_test` labels.
Y_test_tensor = torch.tensor(Y_test, dtype=torch.long).to(device)

mlp.eval()
with torch.no_grad():
    outputs = mlp(X_test_tensor)
    # Predicted class per row, moved back to the CPU as a NumPy array.
    preds = outputs.argmax(dim=1).cpu().numpy()

print(classification_report(Y_test, preds))

              precision    recall  f1-score   support

           0       0.88      0.91      0.90      7347
           1       0.91      0.87      0.89      7078

    accuracy                           0.89     14425
   macro avg       0.89      0.89      0.89     14425
weighted avg       0.89      0.89      0.89     14425

