In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/summarized/merged_summarized_fulltrain.csv
/kaggle/input/summarized/merged_summarized_test.csv


In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split

## Notebook was run on kaggle, replace with appropriate path
train_data = pd.read_csv('/kaggle/input/summarized/merged_summarized_fulltrain.csv', names=['text', 'label'])[1:]
test_data = pd.read_csv('/kaggle/input/summarized/merged_summarized_test.csv',  names=['text', 'label'])[1:]

df_train, df_val = train_test_split(train_data, test_size=0.2, random_state=42)

X_train = df_train['text']
y_train = df_train['label']

X_val = df_val['text']
y_val = df_val['label']

X_test = test_data['text']
y_test = test_data['label']
print(test_data.head())
print(train_data.head())

                                                text label
1  Thats why Jeff Bridges made waves this week wh...     0
2  In what football insiders are calling an unexp...     0
3  In a freak accident following Game 3 of the N....     0
4  North Koreas official news agency announced to...     0
5  According to the poll, conducted by the Univer...     0
                                                text label
1  A little less than a decade ago, hockey fans w...     0
2  The writers of the HBO series The Sopranos too...     0
3  Despite claims from the TV news outlet to offe...     0
4  Maybe I was a little short with him when I tol...     0
5  'Well, it lets people see the teams they norma...     0


In [None]:
from matplotlib import pyplot as plt
fig,ax = plt.subplots(nrows = 1, ncols = 3, figsize = (15,4))
ax[0].bar(["Satire","Hoax","Propaganda","Reliable"],df_train.groupby("label").count()["text"])
ax[0].set_title("Train set distribution")
ax[1].bar(["Satire","Hoax","Propaganda","Reliable"],df_val.groupby("label").count()["text"])
ax[1].set_title("Valid set distribution")
ax[2].bar(["Satire","Hoax","Propaganda","Reliable"],test_data.groupby("label").count()["text"])
ax[2].set_title("Test set distribution")

In [1]:
import tensorflow as tf
from transformers import (
    GPT2Tokenizer,
    GPT2ForSequenceClassification,
    TFGPT2ForSequenceClassification,
    AdamW,
    Trainer,
    TrainingArguments,
    DataCollatorWithPadding,
    set_seed,
    get_linear_schedule_with_warmup
)
import torch
from torch.utils.data import Dataset
import numpy as np
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

In [4]:
# Set seed for reproducibility.
set_seed(123)

MODEL_NAME = "gpt2"
MAX_LEN = 128
TRAIN_BATCH_SIZE = 64
EVAL_BATCH_SIZE = 64
NUM_EPOCHS = 5


device = "cuda" if torch.cuda.is_available() else "cpu"

label2id = {"Satire": 0, "Hoax": 1, "Propoganda": 2, "Reliable": 3}
id2label = {value: key for key, value in label2id.items()}

print(id2label) 
NUM_LABELS = len(label2id)

{0: 'Satire', 1: 'Hoax', 2: 'Propoganda', 3: 'Reliable'}


In [5]:
!pip install evaluate

Collecting evaluate
  Downloading evaluate-0.4.1-py3-none-any.whl.metadata (9.4 kB)
Collecting responses<0.19 (from evaluate)
  Downloading responses-0.18.0-py3-none-any.whl.metadata (29 kB)
Downloading evaluate-0.4.1-py3-none-any.whl (84 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.1/84.1 kB[0m [31m1.5 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25hDownloading responses-0.18.0-py3-none-any.whl (38 kB)
Installing collected packages: responses, evaluate
Successfully installed evaluate-0.4.1 responses-0.18.0


In [6]:
class CustomDataset(Dataset):
  def __init__(self, texts, labels, features, tokenizer):
    self.texts = texts
    self.labels = labels
    self.tokenizer = tokenizer
    self.features = features

  def __len__(self):
    return len(self.labels)

  def __getitem__(self, idx):
    text = self.texts.iloc[idx].lower()
    label = int(self.labels.iloc[idx]) - 1

    # Apply preprocessing steps using the tokenizer
    encoding = self.tokenizer(text, truncation=True, padding='max_length', max_length=MAX_LEN)
    features = self.features.iloc[idx]
    
    return {
        'input_ids': encoding['input_ids'],
        'attention_mask': encoding['attention_mask'],
        'labels': label,
        'feature_vector': features.astype(np.float32)
    }

import evaluate

acc = evaluate.load("accuracy")
f1_metric = evaluate.load("f1")

def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    return { 'accuracy': acc.compute(predictions=predictions, references=labels), 'f1': f1_metric.compute(predictions=predictions, references=labels, average="macro")}

Downloading builder script:   0%|          | 0.00/4.20k [00:00<?, ?B/s]

Downloading builder script:   0%|          | 0.00/6.77k [00:00<?, ?B/s]

In [15]:
from typing import Optional, Tuple
from torch import nn
from transformers.modeling_outputs import SequenceClassifierOutput

class CustomGPT2ForSequenceClassification(GPT2ForSequenceClassification):
    def __init__(self, config):
        super().__init__(config)
        self.dropout = nn.Dropout(0.1)
        self.classifier = nn.Linear(config.hidden_size + 3, config.num_labels)
        
    def forward(self, input_ids: Optional[torch.LongTensor] = None,
        past_key_values: Optional[Tuple[Tuple[torch.Tensor]]] = None,
        attention_mask: Optional[torch.FloatTensor] = None,
        labels: Optional[torch.LongTensor] = None,
        feature_vector: Optional[torch.FloatTensor] = None):
        
        outputs = self.transformer(input_ids, attention_mask=attention_mask)
        hidden_state = outputs[0]
        pooled_output = hidden_state[:,-1]

        # Concatenate feature vector with transformer output
        combined_output = torch.cat((pooled_output, feature_vector.to(device)), dim=1)

        # (Optional) Add dropout
        pooled_output = self.dropout(combined_output)
        logits = self.classifier(pooled_output)
        
        loss = None
        if labels is not None:
            loss_fct = nn.CrossEntropyLoss()
            loss = loss_fct(logits.view(-1, self.config.num_labels), labels.view(-1))

        return SequenceClassifierOutput(
            loss=loss,
            logits=logits,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
        )

In [8]:
from textblob import TextBlob
from nltk.sentiment import SentimentIntensityAnalyzer
import nltk

nltk.download('vader_lexicon')

# replace with RFE method
def extract_features(text):
    blob = TextBlob(text)
    sia = SentimentIntensityAnalyzer()
    sentiment_score = sia.polarity_scores(text)['compound']  # Using compound score for simplicity
    return [blob.sentiment.polarity, blob.sentiment.subjectivity, sentiment_score]

[nltk_data] Downloading package vader_lexicon to
[nltk_data]     /usr/share/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!




In [9]:
tokenizer = GPT2Tokenizer.from_pretrained(MODEL_NAME)
tokenizer.padding_side = "left"
tokenizer.pad_token = '[PAD]'

model = CustomGPT2ForSequenceClassification.from_pretrained(MODEL_NAME, pad_token_id=tokenizer.pad_token_id, num_labels = NUM_LABELS, id2label=id2label, label2id=label2id )
# resize model embedding to match new tokenizer
model.resize_token_embeddings(len(tokenizer))
model.to(device)
print(device)

tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/548M [00:00<?, ?B/s]

Some weights of CustomGPT2ForSequenceClassification were not initialized from the model checkpoint at gpt2 and are newly initialized: ['classifier.bias', 'classifier.weight', 'score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


cuda


In [None]:
from tqdm import tqdm 
def map_to_feat(data):
    feature_vectors = []

    # Iterate over the texts in X_train and extract features
    for text in tqdm(data, desc='Extracting features'):
        features = extract_features(text)
        feature_vectors.append(features)
    return feature_vectors

train_features = pd.DataFrame(map_to_feat(X_train), index=X_train.index)


Extracting features:  87%|████████▋ | 34145/39083 [06:53<01:02, 78.88it/s]

In [18]:
val_features = pd.DataFrame(map_to_feat(X_val), index=X_val.index)
# print(train_features.head())
print(val_features.dtypes)

Extracting features: 100%|██████████| 9771/9771 [01:58<00:00, 82.62it/s]

0    float64
1    float64
2    float64
dtype: object





In [19]:
test_features = pd.DataFrame(map_to_feat(X_test), index=X_test.index)

Extracting features: 100%|██████████| 3000/3000 [00:36<00:00, 82.49it/s]


In [20]:
train_dataset = CustomDataset(X_train, y_train, train_features, tokenizer)

val_dataset = CustomDataset(X_val, y_val, val_features, tokenizer)

test_dataset = CustomDataset(X_test, y_test, test_features, tokenizer)

data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

In [None]:
training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=NUM_EPOCHS,
    per_device_train_batch_size=TRAIN_BATCH_SIZE,
    per_device_eval_batch_size=EVAL_BATCH_SIZE,
    optim='adamw_torch',
    learning_rate=2e-5,
    weight_decay=0.01,
    evaluation_strategy="epoch",
    eval_accumulation_steps = 5,
    save_strategy="epoch",
    load_best_model_at_end=True,
    logging_steps=100,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

trainer.train()

In [44]:
model.eval()
x = trainer.predict(test_dataset,  metric_key_prefix="predict")
print(x)



PredictionOutput(predictions=array([[ 3.021332  ,  8.165882  ,  0.60301507,  0.38995174],
       [ 4.2303147 ,  4.78914   , -1.6888192 ,  0.6443556 ],
       [ 4.760718  , -1.4730589 , -3.2594483 ,  2.1674411 ],
       ...,
       [-0.39688212,  2.1488547 , -1.5574473 ,  7.5995545 ],
       [ 0.43418315, -1.7702341 , -0.18705364,  6.1374984 ],
       [-1.7151487 , -1.3822778 ,  0.07584005,  6.5434227 ]],
      dtype=float32), label_ids=array([0, 0, 0, ..., 3, 3, 3]), metrics={'predict_loss': 1.8830411434173584, 'predict_accuracy': {'accuracy': 0.5706666666666667}, 'predict_f1': {'f1': 0.5564668237882152}, 'predict_runtime': 22.6555, 'predict_samples_per_second': 132.418, 'predict_steps_per_second': 2.075})


In [72]:
predicted_probs = torch.nn.functional.softmax(torch.tensor(x.predictions), dim=-1)
y_pred = np.argmax(predicted_probs, axis = 1)

print(y_pred)

tensor([0, 0, 0,  ..., 2, 2, 3])


In [None]:
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.preprocessing import LabelEncoder
import seaborn as sns
import matplotlib.pyplot as plt

label_encoder = LabelEncoder()
y_test_encoded = label_encoder.fit_transform(y_test)

print("Classification Report:")
print(classification_report(y_test_encoded, y_pred))

# Print the confusion matrix
print("\nConfusion Matrix:")
conf_matrix = confusion_matrix(y_test_encoded, y_pred)
plt.figure(figsize=(8, 6))
sns.heatmap(conf_matrix, annot=True, cmap='Blues', fmt='g', xticklabels=label_encoder.classes_, yticklabels=label_encoder.classes_)
plt.xlabel('Predicted labels')
plt.ylabel('True labels')
plt.title('Confusion Matrix')
plt.show()
