In [3]:
import json
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from nltk import pos_tag
import nltk
import torch
import torch.nn as nn
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from scipy.sparse import hstack
from sklearn.preprocessing import MinMaxScaler
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score
from sklearn.decomposition import TruncatedSVD

nltk.download("stopwords")
nltk.download("wordnet")
nltk.download("punkt")
nltk.download('averaged_perceptron_tagger')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


True

In [2]:
lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words("english"))

# Data Preprocessing 
def check_tags_size(tags_list):
    assert len(tags_list) == 1, f"Expected list of size 1, but got {len(tags_list)}"

def check_clickbait_size(clickbait_list):
    assert len(clickbait_list) == 1, f"Expected list of size 1, but got {len(clickbait_list)}"

def clean_text(text):
    tokens = nltk.word_tokenize(text)
    tokens = [lemmatizer.lemmatize(token.lower()) for token in tokens if token.isalpha() and token.lower() not in stop_words]
    return ' '.join(tokens)

def preprocessData(fileName):
  data = []
  # Load data
  with open(fileName, "r") as file:
    for line in file:
      data.append(json.loads(line))

  # Convert to DataFrame
  df = pd.DataFrame(data)

  # Take only relevant columns to create training data
  df = df[["postText", "targetParagraphs", "targetTitle", "tags"]]

  # Convert tags to numbers
  df['tags'].apply(check_tags_size)
  df['tags'] = df['tags'].apply(lambda tags_list: tags_list[0])
  tag_mapping = {'phrase': 0, 'passage': 1, 'multi': 2}
  df['tags'] = df['tags'].map(tag_mapping)

  # Filter out data having multi tag
  df = df[df['tags'] != 2]

  # Concat targetTitle and targetParagraphs
  df['text'] = df['targetTitle'] + ' ' + df['targetParagraphs'].apply(' '.join)

  # Rename postText to clickbait to make things more relevant
  df.rename(columns={'postText': 'clickbait'}, inplace=True)

  df['clickbait'].apply(check_clickbait_size)
  df['clickbait'] = df['clickbait'].apply(lambda clickbait_list: clickbait_list[0])

  # Drop targetTitle, targetParagraphs as they are no more required
  df.drop(['targetTitle', 'targetParagraphs'], axis=1, inplace=True)

  # Clean Data
  df['clickbait'] = df['clickbait'].apply(clean_text)
  df['text'] = df['text'].apply(clean_text)
  return df

In [None]:
# Training data preprocessing
trainingFileName = "/content/drive/MyDrive/Colab Notebooks/webis-clickbait-22/train.jsonl"
validationFileName = "/content/drive/MyDrive/Colab Notebooks/webis-clickbait-22/validation.jsonl"
train_df = preprocessData(trainingFileName)
validation_df = preprocessData(validationFileName)
train_df.head()

Unnamed: 0,clickbait,tags,text
0,wes welker wanted dinner tom brady patriot qb ...,1,wes welker wanted dinner tom brady patriot qb ...
1,nasa set date full recovery ozone hole,0,hole ozone layer expected make full recovery n...
2,make employee happy paycheck,0,intellectual stimulation trump money employee ...
4,perfect way cook rice perfectly fluffy never s...,0,revealed perfect way cook rice perfectly fluff...
5,happens new airpods get lost stolen apple anyt...,1,happens apple airpods get lost stolen one bigg...


In [None]:
# Generating TFIDF for both clickbait and text column 
def generateTfidf(train_df, validation_df):
  # Create a TfidfVectorizer for trigrams
  clickbait_tfidf_vectorizer = TfidfVectorizer(ngram_range=(1, 3))
  text_tfidf_vectorizer = TfidfVectorizer(ngram_range=(1, 3))

  # Fit and transform the text data
  clickbait_tfidf = clickbait_tfidf_vectorizer.fit_transform(train_df["clickbait"])
  text_tfidf = text_tfidf_vectorizer.fit_transform(train_df["text"])

  clickbait_tfidf_validation = clickbait_tfidf_vectorizer.transform(validation_df["clickbait"])
  text_tfidf_validation = text_tfidf_vectorizer.transform(validation_df["text"])

  print("Train Clickbait - Number of features = ", len(clickbait_tfidf_vectorizer.get_feature_names_out()))
  print("Train Text - Number of features = ", len(text_tfidf_vectorizer.get_feature_names_out()))

  print("Validation Clickbait - Number of features = ", len(clickbait_tfidf_vectorizer.get_feature_names_out()))
  print("Validation Text - Number of features = ", len(text_tfidf_vectorizer.get_feature_names_out()))
  return clickbait_tfidf, text_tfidf, clickbait_tfidf_validation, text_tfidf_validation

train_clickbait_tfidf, train_text_tfidf, validation_clickbait_tfidf, validation_text_tfidf = generateTfidf(train_df, validation_df)

Train Clickbait - Number of features =  28520
Train Text - Number of features =  1167649
Validation Clickbait - Number of features =  28520
Validation Text - Number of features =  1167649


In [None]:
# Generating one hot encoded POS for clickbait 
def generatePos(train_df, validation_df):
  train_df["clickbait_pos"] = train_df["clickbait"].apply(lambda x: [tag for _, tag in pos_tag(word_tokenize(x))])
  validation_df["clickbait_pos"] = validation_df["clickbait"].apply(lambda x: [tag for _, tag in pos_tag(word_tokenize(x))])

  # Tokenize the POS tags
  pos_tokenizer = Tokenizer()
  pos_tokenizer.fit_on_texts(train_df['clickbait_pos'])

  # Convert the POS tags to integer sequences
  train_df['clickbait_pos_encoded'] = train_df['clickbait_pos'].apply(lambda x: pos_tokenizer.texts_to_sequences([x])[0]) 
  validation_df['clickbait_pos_encoded'] = validation_df['clickbait_pos'].apply(lambda x: pos_tokenizer.texts_to_sequences([x])[0]) 

  # Pad the sequences to a fixed length
  max_clickbait_length = train_df['clickbait'].apply(len).max() 
  print("max_clickbait_length ", max_clickbait_length)
  train_df['clickbait_pos_padded'] = pad_sequences(train_df['clickbait_pos_encoded'], maxlen=max_clickbait_length, padding='post', value=0).tolist() 
  validation_df['clickbait_pos_padded'] = pad_sequences(validation_df['clickbait_pos_encoded'], maxlen=max_clickbait_length, padding='post', value=0).tolist() 

generatePos(train_df, validation_df)

max_clickbait_length  96


In [None]:
def reduce_dimensions(train_tfidf, validation_tfidf, n_components=100):
    svd = TruncatedSVD(n_components=n_components, random_state=42)
    train_reduced = svd.fit_transform(train_tfidf)
    validation_reduced = svd.transform(validation_tfidf)
    return train_reduced, validation_reduced

# Set the desired number of components (dimensions) to keep
n_components = 200

train_clickbait_reduced, validation_clickbait_reduced = reduce_dimensions(train_clickbait_tfidf, validation_clickbait_tfidf, n_components)
train_text_reduced, validation_text_reduced = reduce_dimensions(train_text_tfidf, validation_text_tfidf, n_components)

def get_combined_features_reduced(df, clickbait_reduced, text_reduced):
    clickbait_pos_padded_array = np.array(df['clickbait_pos_padded'].tolist())
    combined_features = np.hstack([clickbait_reduced, text_reduced, clickbait_pos_padded_array])
    scaler = MinMaxScaler()
    scaled_combined_features = scaler.fit_transform(combined_features)
    return scaled_combined_features

X_train = get_combined_features_reduced(train_df, train_clickbait_reduced, train_text_reduced)
X_validation = get_combined_features_reduced(validation_df, validation_clickbait_reduced, validation_text_reduced)
print(X_train.shape)
print(X_train.shape)


y_train = train_df["tags"]
y_validation = validation_df["tags"]


(2641, 496)
(2641, 496)


In [None]:
X_train_tensor = torch.tensor(X_train, dtype=torch.float32)
y_train_tensor = torch.tensor(y_train.values, dtype=torch.float32).unsqueeze(1)

X_validation_tensor = torch.tensor(X_validation, dtype=torch.float32)
y_validation_tensor = torch.tensor(y_validation.values, dtype=torch.float32).unsqueeze(1)

## Logistic Regression Training


In [None]:
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
from sklearn.metrics import accuracy_score, f1_score

# Define the logistic regression model
class LogisticRegression(nn.Module):
    def __init__(self, input_dim):
        super(LogisticRegression, self).__init__()
        self.linear = nn.Linear(input_dim, 1)

    def forward(self, x):
        out = torch.sigmoid(self.linear(x))
        return out

# Set up hyperparameters
input_dim = X_train_tensor.shape[1]
learning_rate = 0.01
num_epochs = 1000
batch_size = 64

# Create DataLoader for mini-batch processing
train_dataset = TensorDataset(X_train_tensor, y_train_tensor)
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)

# Create DataLoader for validation dataset
validation_dataset = TensorDataset(X_validation_tensor, y_validation_tensor)
validation_loader = DataLoader(validation_dataset, batch_size=batch_size, shuffle=False)

# Initialize the model, loss function, and optimizer
model = LogisticRegression(input_dim)
criterion = nn.BCELoss()
optimizer = optim.SGD(model.parameters(), lr=learning_rate)

# Train the model and evaluate on validation dataset
for epoch in range(num_epochs):
    for i, (x, y) in enumerate(train_loader):
        # Clear gradients
        optimizer.zero_grad()

        # Forward pass
        outputs = model(x)

        # Calculate the loss
        loss = criterion(outputs, y)

        # Backward pass
        loss.backward()

        # Update the weights
        optimizer.step()

    # Calculate and print accuracy on training dataset
    # Calculate and print Macro-averaged F1 and accuracy on training dataset
    with torch.no_grad():
        train_outputs = model(X_train_tensor)
        train_preds = (train_outputs > 0.5).float()
        train_acc = accuracy_score(y_train_tensor, train_preds)
        train_f1 = f1_score(y_train_tensor, train_preds, average='macro')
    
    # Calculate and print Macro-averaged F1 and accuracy on validation dataset
    with torch.no_grad():
        model.eval()
        correct_preds = 0
        total_preds = 0
        y_val_true = []
        y_val_preds = []

        for x_val, y_val in validation_loader:
            val_outputs = model(x_val)
            val_preds = (val_outputs > 0.5).float()
            correct_preds += (val_preds == y_val).sum().item()
            total_preds += x_val.size(0)
            y_val_true.extend(y_val.tolist())
            y_val_preds.extend(val_preds.tolist())

        val_acc = correct_preds / total_preds
        val_f1 = f1_score(y_val_true, y_val_preds, average='macro')
        model.train()
    if (epoch + 1) % 50 == 0:
        print(f"Epoch: {epoch + 1}, Training F1: {train_f1:.4f}, Training Accuracy: {train_acc:.4f}, Validation F1: {val_f1:.4f}, Validation Accuracy: {val_acc:.4f}")


Epoch: 50, Training F1: 0.6241, Training Accuracy: 0.6354, Validation F1: 0.5599, Validation Accuracy: 0.5890
Epoch: 100, Training F1: 0.5636, Training Accuracy: 0.6058, Validation F1: 0.5229, Validation Accuracy: 0.5784
Epoch: 150, Training F1: 0.5626, Training Accuracy: 0.6191, Validation F1: 0.5192, Validation Accuracy: 0.5693
Epoch: 200, Training F1: 0.6089, Training Accuracy: 0.6445, Validation F1: 0.5700, Validation Accuracy: 0.5936
Epoch: 250, Training F1: 0.6704, Training Accuracy: 0.6766, Validation F1: 0.6000, Validation Accuracy: 0.6195
Epoch: 300, Training F1: 0.6794, Training Accuracy: 0.6838, Validation F1: 0.6494, Validation Accuracy: 0.6499
Epoch: 350, Training F1: 0.6489, Training Accuracy: 0.6676, Validation F1: 0.6182, Validation Accuracy: 0.6256
Epoch: 400, Training F1: 0.6937, Training Accuracy: 0.6944, Validation F1: 0.6525, Validation Accuracy: 0.6530
Epoch: 450, Training F1: 0.6889, Training Accuracy: 0.6891, Validation F1: 0.6306, Validation Accuracy: 0.6347
Ep

# Naive Bayes

In [None]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, f1_score

# Train a Multinomial Naive Bayes classifier
naive_bayes_classifier = MultinomialNB()
naive_bayes_classifier.fit(X_train, y_train)

# Calculate and print Macro-averaged F1 and training accuracy
train_preds_NB = naive_bayes_classifier.predict(X_train)
train_acc_NB = accuracy_score(y_train, train_preds_NB)
train_f1_NB = f1_score(y_train, train_preds_NB, average='macro')
print(f"Training F1 (Naive Bayes): {train_f1_NB:.4f}")
print(f"Training Accuracy (Naive Bayes): {train_acc_NB:.4f}")

# Calculate and print Macro-averaged F1 and validation accuracy
validation_preds_NB = naive_bayes_classifier.predict(X_validation)
validation_acc_NB = accuracy_score(y_validation, validation_preds_NB)
validation_f1_NB = f1_score(y_validation, validation_preds_NB, average='macro')
print(f"Validation F1 (Naive Bayes): {validation_f1_NB:.4f}")
print(f"Validation Accuracy (Naive Bayes): {validation_acc_NB:.4f}")


Training F1 (Naive Bayes): 0.4875
Training Accuracy (Naive Bayes): 0.5782
Validation F1 (Naive Bayes): 0.5335
Validation Accuracy (Naive Bayes): 0.5693


# SVM

In [None]:
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, f1_score

# Create an SVM classifier
svm_classifier = SVC(kernel='linear', C=1)

# Train the SVM classifier using the scaled training data
svm_classifier.fit(X_train, y_train)

# Calculate and print Macro-averaged F1 and training accuracy
train_preds_svm = svm_classifier.predict(X_train)
train_acc_svm = accuracy_score(y_train, train_preds_svm)
train_f1_svm = f1_score(y_train, train_preds_svm, average='macro')
print(f"Training F1 (SVM): {train_f1_svm:.4f}")
print(f"Training Accuracy (SVM): {train_acc_svm:.4f}")

# Calculate and print Macro-averaged F1 and validation accuracy
validation_preds_svm = svm_classifier.predict(X_validation)
validation_acc_svm = accuracy_score(y_validation, validation_preds_svm)
validation_f1_svm = f1_score(y_validation, validation_preds_svm, average='macro')
print(f"Validation F1 (SVM): {validation_f1_svm:.4f}")
print(f"Validation Accuracy (SVM): {validation_acc_svm:.4f}")

Training F1 (SVM): 0.7427
Training Accuracy (SVM): 0.7429
Validation F1 (SVM): 0.5251
Validation Accuracy (SVM): 0.5784
