<a href="https://colab.research.google.com/github/Njarrin/Final-LLM-class/blob/main/Final_Final_LLM_.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import warnings

import numpy as np
import pandas as pd
import tensorflow as tf
from sklearn.model_selection import train_test_split
from tqdm import tqdm
from transformers import BertTokenizer, TFBertForSequenceClassification, InputExample
from transformers import InputFeatures

In [2]:
# Load the labelled fake/real news articles directly from the project repository.
NEWS_CSV_URL = "https://raw.githubusercontent.com/Njarrin/Final-LLM-class/main/NJ%20fake_or_real_news%20-%20fake_or_real_news.csv"
df = pd.read_csv(NEWS_CSV_URL)

In [3]:
# Preview the first rows to confirm the expected columns (id, title, text, label) loaded.
df.head()

Unnamed: 0,id,title,text,label
0,8476,You Can Smell Hillary’s Fear,"Daniel Greenfield, a Shillman Journalism Fello...",FAKE
1,10294,Watch The Exact Moment Paul Ryan Committed Pol...,Google Pinterest Digg Linkedin Reddit Stumbleu...,FAKE
2,3608,Kerry to go to Paris in gesture of sympathy,U.S. Secretary of State John F. Kerry said Mon...,REAL
3,10142,Bernie supporters on Twitter erupt in anger ag...,"— Kaydee King (@KaydeeKing) November 9, 2016 T...",FAKE
4,875,The Battle of New York: Why This Primary Matters,It's primary day in New York and front-runners...,REAL


In [4]:
# The tokenizer and the classifier are loaded from the same checkpoint name.
# The classification head is newly initialized, so the model must be fine-tuned
# before its predictions mean anything.
PRETRAINED_CHECKPOINT = "bert-base-uncased"
tokenizer = BertTokenizer.from_pretrained(PRETRAINED_CHECKPOINT)
model = TFBertForSequenceClassification.from_pretrained(PRETRAINED_CHECKPOINT)

All PyTorch model weights were used when initializing TFBertForSequenceClassification.

Some weights or buffers of the TF 2.0 model TFBertForSequenceClassification were not initialized from the PyTorch model and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [5]:
# Encode the target as an integer column: REAL -> 0, anything else -> 1.
# The guard makes the cell safe to re-run: once 'label' has been dropped there is
# nothing left to encode, and the unguarded version raised KeyError on a second run.
if "label" in df.columns:
    df = (
        df
        .assign(fake=(df["label"] != "REAL").astype("int64"))
        .drop(columns="label")
    )

In [6]:
# Features are the article bodies; the target is the 0/1 'fake' flag.
X = df['text']
y = df['fake']

# Hold out 20% of the rows for validation, with a fixed seed for a repeatable split.
split_parts = train_test_split(X, y, test_size=0.2, random_state=42)
train_texts, test_texts, train_labels, test_labels = split_parts

In [7]:
def _as_input_examples(texts, labels):
    """Pair each text with its label in an InputExample (no guid, no second segment)."""
    paired = []
    for text, label in zip(texts, labels):
        paired.append(InputExample(guid=None, text_a=text, label=label))
    return paired


train_InputExamples = _as_input_examples(train_texts, train_labels)
validation_InputExamples = _as_input_examples(test_texts, test_labels)

In [8]:
def convert_examples_to_tf_dataset(examples, tokenizer, max_length=128):
    """Tokenize examples and wrap the encodings and labels in two tf.data Datasets.

    Args:
        examples: Iterable of ``InputExample`` objects (their ``text_a`` and
            ``label`` are read). Any other item is treated as the text itself
            and gets the label ``None``.
        tokenizer: Object exposing ``encode_plus`` whose result contains
            ``input_ids``, ``token_type_ids`` and ``attention_mask``.
        max_length: Length every encoded sequence is truncated or padded to.

    Returns:
        A ``(features, labels)`` tuple of ``tf.data.Dataset`` objects built with
        ``from_tensor_slices``. ``features`` yields dicts keyed ``input_ids``,
        ``attention_mask`` and ``token_type_ids``; ``labels`` yields the label of
        each kept example in the same order.

    Notes:
        Examples whose text is not a ``str`` (for example a missing value) are
        dropped together with their label, and a warning reports how many.
        NOTE(review): unlabeled items contribute ``None`` to the label list;
        whether ``from_tensor_slices`` accepts that is not handled here - confirm
        before using this function for pure inference input.
    """
    input_ids_list = []
    attention_mask_list = []
    token_type_ids_list = []
    label_list = []
    skipped = 0

    for e in tqdm(examples):
        if isinstance(e, InputExample):
            text, label = e.text_a, e.label
        else:
            text, label = e, None

        # Non-string text cannot be tokenized; skip it, but keep count so the
        # drop is visible instead of silent.
        if not isinstance(text, str):
            skipped += 1
            continue

        input_dict = tokenizer.encode_plus(
            text,
            add_special_tokens=True,
            max_length=max_length,
            return_token_type_ids=True,
            return_attention_mask=True,
            # `pad_to_max_length=True` is deprecated; this is its replacement.
            padding="max_length",
            truncation=True,
        )

        input_ids_list.append(input_dict["input_ids"])
        token_type_ids_list.append(input_dict["token_type_ids"])
        attention_mask_list.append(input_dict["attention_mask"])
        label_list.append(label)

    if skipped:
        warnings.warn(
            f"convert_examples_to_tf_dataset skipped {skipped} example(s) "
            "whose text was not a string."
        )

    return (
        tf.data.Dataset.from_tensor_slices({
            "input_ids": input_ids_list,
            "attention_mask": attention_mask_list,
            "token_type_ids": token_type_ids_list
        }),
        tf.data.Dataset.from_tensor_slices(label_list)
    )

In [9]:
# Names of the text and target columns in `df`.
DATA_COLUMN, LABEL_COLUMN = 'text', 'fake'

In [10]:
# Tokenize the training examples.
# NOTE: `train_labels` is rebound here, from the label Series produced by the
# train/test split to a tf.data.Dataset of labels.
train_dataset, train_labels = convert_examples_to_tf_dataset(
    examples=train_InputExamples,
    tokenizer=tokenizer,
    max_length=128,
)

100%|██████████| 3998/3998 [01:41<00:00, 39.24it/s]


In [11]:
# Tokenize the held-out examples with the same sequence length as the training set.
validation_dataset, validation_labels = convert_examples_to_tf_dataset(
    examples=validation_InputExamples,
    tokenizer=tokenizer,
    max_length=128,
)

100%|██████████| 1000/1000 [00:24<00:00, 40.73it/s]


In [12]:
# Pair every feature dict with its label, then group the pairs into batches of 32.
train_data = tf.data.Dataset.zip((train_dataset, train_labels))
train_data = train_data.batch(32)

validation_data = tf.data.Dataset.zip((validation_dataset, validation_labels))
validation_data = validation_data.batch(32)

In [13]:
# Fine-tune the classifier for one epoch.
# The learning rate was 3e-8, far below the 2e-5 to 5e-5 range normally used to
# fine-tune BERT; at that step size a single epoch leaves the newly initialized
# classification head essentially untrained. 3e-5 sits inside the usual range.
# The loss uses from_logits=True, so the model output is treated as raw logits.
model.compile(
    optimizer=tf.keras.optimizers.Adam(learning_rate=3e-5),
    loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True),
    metrics=[tf.keras.metrics.SparseCategoricalAccuracy('accuracy')]
)
# Keep the History object so loss/accuracy can be inspected afterwards
# (this also avoids the bare History repr as the cell output).
history = model.fit(train_data, epochs=1, validation_data=validation_data)



<keras.src.callbacks.History at 0x7bc0ea23d030>

In [24]:
# Texts to classify with the fine-tuned model; add more strings to score several at once.
pred_sentences = [
    'One of Bloomberg most notable post-mayoral activities has been his philanthropic efforts. He has given away billions of dollars to causes ranging from environmental protection to public health, but some say that his philanthropy is merely a way to enhance his public image and further his political ambitions.',
]

In [25]:
# Encode the sentences as one batch, using the same 128 max_length as the training data.
tf_batch = tokenizer(
    pred_sentences,
    max_length=128,
    padding=True,
    truncation=True,
    return_tensors='tf',
)
tf_outputs = model(tf_batch)

# Softmax over the last axis normalizes each row of the first model output
# (treated as logits, as in the training loss) into per-class scores.
tf_predictions = tf.nn.softmax(tf_outputs[0], axis=-1)

# Position in this list is the class index: 0 -> REAL, 1 -> FAKE,
# matching how the 'fake' column was encoded.
labels = ['REAL','FAKE']

# argmax along axis 1 picks the highest-scoring class index for every sentence.
label = tf.argmax(tf_predictions, axis=1).numpy()

for sentence, class_index in zip(pred_sentences, label):
    print(sentence, ": ", labels[class_index])

One of Bloomberg most notable post-mayoral activities has been his philanthropic efforts. He has given away billions of dollars to causes ranging from environmental protection to public health, but some say that his philanthropy is merely a way to enhance his public image and further his political ambitions. :  REAL
