In [3]:
!pip install datasets

Collecting datasets
  Downloading datasets-3.5.0-py3-none-any.whl.metadata (19 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py311-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.12.0,>=2023.1.0 (from fsspec[http]<=2024.12.0,>=2023.1.0->datasets)
  Downloading fsspec-2024.12.0-py3-none-any.whl.metadata (11 kB)
Downloading datasets-3.5.0-py3-none-any.whl (491 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m491.2/491.2 kB[0m [31m13.7 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m12.0 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading fsspec-2024.12.0-py3-none-any.

In [4]:
import os
import re
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Bidirectional, LSTM, Dense, Dropout
from tensorflow.keras.callbacks import EarlyStopping
import tensorflow as tf
from transformers import BertTokenizer, TFBertForSequenceClassification
from datasets import Dataset, DatasetDict

In [5]:
# Colab-only step: mount Google Drive at /content/drive so the dataset files
# stored under MyDrive can be read through ordinary filesystem paths.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [6]:
# Folder on the mounted Drive that holds the three tab-separated split files.
data_dir = '/content/drive/MyDrive/Fake_new_data_2'
files = ['train.tsv', 'valid.tsv', 'test.tsv']

# Read every split with header=None, so the first line is treated as data and
# the columns get integer labels; `dfs` keeps the same order as `files`.
dfs = []
for split_file in files:
    split_path = os.path.join(data_dir, split_file)
    dfs.append(pd.read_csv(split_path, sep='\t', header=None))

In [7]:
# Names for the 14 positional columns of the headerless split files, in order.
cols = [
    'id',
    'label',
    'statement',
    'subjects',
    'speaker',
    'job_title',
    'state_info',
    'party_affiliation',
    'barely_true_cnt',
    'false_cnt',
    'half_true_cnt',
    'mostly_true_cnt',
    'pants_on_fire_cnt',
    'context',
]

# Rename the columns of every loaded frame in place ...
for df in dfs:
    df.columns = cols

# ... then expose the splits under their own names (same objects as in `dfs`).
train_df, val_df, test_df = dfs[:3]

In [8]:
# Fit the label -> integer mapping on the training split only, then apply the
# fitted encoder to all three splits, storing the ids in a 'label_enc' column.
encoder = LabelEncoder()
encoder.fit(train_df['label'])
for frame in (train_df, val_df, test_df):
    frame['label_enc'] = encoder.transform(frame['label'])

# Number of distinct classes seen in training; sizes the model output layers.
num_labels = len(encoder.classes_)

In [9]:
def clean_text(s):
    """Normalise a statement before it is handed to the Keras tokenizer.

    The text is lower-cased, every character other than ``a-z``, ``0-9`` and
    the space is replaced by one space (runs of spaces are not collapsed), and
    leading/trailing whitespace is stripped.

    Args:
        s: Raw text. ``None`` and float NaN (how pandas represents a missing
            cell) are treated as empty text; any other non-string value is
            converted with ``str()`` first.

    Returns:
        The cleaned string; ``""`` when the input is missing.
    """
    # NaN is the only float that is not equal to itself, so `s != s` detects it
    # without needing numpy/pandas here.
    if s is None or (isinstance(s, float) and s != s):
        return ""
    if not isinstance(s, str):
        s = str(s)
    s = s.lower()
    s = re.sub(r"[^a-z0-9 ]", " ", s)
    return s.strip()

# Add a 'clean' column to each split holding the normalised statement text.
for frame in (train_df, val_df, test_df):
    frame['clean'] = frame['statement'].map(clean_text)

In [10]:
MAX_VOCAB = 20000  # passed to the tokenizer as num_words (and used as the embedding input size)
MAX_LEN = 50       # every sequence is padded / cut to this many token ids

# The word index is built from the training split only; words that cannot be
# mapped are represented by the '<OOV>' token.
tok = Tokenizer(num_words=MAX_VOCAB, oov_token='<OOV>')
tok.fit_on_texts(train_df['clean'])


def _encode_split(frame):
    """Return ``(padded_ids, one_hot_labels)`` for one split's DataFrame."""
    token_ids = tok.texts_to_sequences(frame['clean'])
    features = pad_sequences(token_ids, maxlen=MAX_LEN, padding='post')
    targets = tf.keras.utils.to_categorical(frame['label_enc'], num_labels)
    return features, targets


X_train, y_train = _encode_split(train_df)
X_val, y_val = _encode_split(val_df)
X_test, y_test = _encode_split(test_df)

In [12]:
# Seed Python, NumPy and TensorFlow in one call so that weight initialisation,
# dropout masks and batch shuffling repeat from run to run.
SEED = 42
tf.keras.utils.set_random_seed(SEED)

# Embedding -> bidirectional LSTM -> dropout -> dense -> softmax over the classes.
bilstm_model = Sequential([
    # `input_length` is deprecated in Keras 3 (it only triggers a warning); the
    # sequence length is taken from the data passed to fit().
    Embedding(MAX_VOCAB, 128),
    Bidirectional(LSTM(64)),
    Dropout(0.5),
    Dense(64, activation='relu'),
    Dense(num_labels, activation='softmax')
])

# Targets are one-hot vectors, hence categorical (not sparse) cross-entropy.
bilstm_model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

# Stop once val_loss has not improved for 2 epochs and roll back to the best weights.
early_stop = EarlyStopping(monitor='val_loss', patience=2, restore_best_weights=True)

# Keep the History object so the training curves can be inspected later,
# instead of leaving its repr as the cell output.
bilstm_history = bilstm_model.fit(
    X_train, y_train,
    validation_data=(X_val, y_val),
    epochs=10,
    batch_size=32,
    callbacks=[early_stop]
)

Epoch 1/10
[1m320/320[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 11ms/step - accuracy: 0.1972 - loss: 1.7594 - val_accuracy: 0.2430 - val_loss: 1.7087
Epoch 2/10
[1m320/320[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 9ms/step - accuracy: 0.2868 - loss: 1.6393 - val_accuracy: 0.2321 - val_loss: 1.7506
Epoch 3/10
[1m320/320[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 13ms/step - accuracy: 0.4466 - loss: 1.3931 - val_accuracy: 0.2313 - val_loss: 1.9169


<keras.src.callbacks.history.History at 0x7f6ed2299810>

In [13]:
# Held-out performance of the BiLSTM: evaluate() returns [loss, accuracy].
bilstm_test_metrics = bilstm_model.evaluate(X_test, y_test)
print(bilstm_test_metrics)


def predict_bilstm(text):
    """Classify one raw statement with the BiLSTM model.

    The text goes through the same steps as the training data: ``clean_text``,
    the fitted tokenizer ``tok``, then post-padding to ``MAX_LEN``.

    Args:
        text: The raw statement to classify.

    Returns:
        The result of ``encoder.inverse_transform`` for the highest-scoring
        class, i.e. a one-element array holding the predicted label string.
    """
    cleaned = clean_text(text)
    token_ids = tok.texts_to_sequences([cleaned])
    padded = pad_sequences(token_ids, maxlen=MAX_LEN, padding='post')
    scores = bilstm_model.predict(padded)
    best_class = np.argmax(scores)
    return encoder.inverse_transform([best_class])

[1m40/40[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 5ms/step - accuracy: 0.2292 - loss: 1.7300
[1.7126680612564087, 0.23756906390190125]


In [14]:
def _to_hf_dataset(frame):
    """Wrap the raw statement/label columns of one split in a Hugging Face Dataset."""
    return Dataset.from_pandas(frame[['statement', 'label']])


# BERT works on the raw statements (not the 'clean' column) and the string labels.
full = DatasetDict({
    'train': _to_hf_dataset(train_df),
    'validation': _to_hf_dataset(val_df),
    'test': _to_hf_dataset(test_df)
})

bert_tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')


def tokenize_fn(batch):
    """Tokenise a batch of statements and attach integer class ids.

    Args:
        batch: A batch from ``Dataset.map(batched=True)`` with 'statement' and
            'label' entries.

    Returns:
        The tokenizer output (every sequence padded / truncated to 128 tokens)
        with an extra 'labels' entry produced by the fitted ``encoder``.
    """
    enc = bert_tokenizer(batch['statement'], padding='max_length', truncation=True, max_length=128)
    enc['labels'] = encoder.transform(batch['label'])
    return enc


raw = full.map(tokenize_fn, batched=True)
raw.set_format(type='tensorflow', columns=['input_ids','attention_mask','labels'])


def _to_tf(split, shuffle):
    """Build the batched tf.data pipeline for one split of ``raw``.

    `label_cols` is passed as a plain string rather than a one-element list:
    the list form triggers the datasets warning about its changing output
    structure, whereas the string form yields the labels as a bare tensor
    under both the old and the new behaviour.
    """
    return raw[split].to_tf_dataset(
        columns=['input_ids', 'attention_mask'],
        label_cols='labels',
        shuffle=shuffle,
        batch_size=8,
    )


# Only the training stream is shuffled; validation and test keep dataset order.
tf_train = _to_tf('train', shuffle=True)
tf_val = _to_tf('validation', shuffle=False)
tf_test = _to_tf('test', shuffle=False)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

Map:   0%|          | 0/10240 [00:00<?, ? examples/s]

Map:   0%|          | 0/1284 [00:00<?, ? examples/s]

Map:   0%|          | 0/1267 [00:00<?, ? examples/s]

Old behaviour: columns=['a'], labels=['labels'] -> (tf.Tensor, tf.Tensor)  
             : columns='a', labels='labels' -> (tf.Tensor, tf.Tensor)  
New behaviour: columns=['a'],labels=['labels'] -> ({'a': tf.Tensor}, {'labels': tf.Tensor})  
             : columns='a', labels='labels' -> (tf.Tensor, tf.Tensor) 


In [15]:
from tensorflow.keras.optimizers import Adam

In [16]:
# Pretrained bert-base-uncased encoder with a classification head sized to the
# number of label classes.
bert_model = TFBertForSequenceClassification.from_pretrained(
    'bert-base-uncased',
    num_labels=num_labels,
)

# Labels are integer class ids and the loss is told to expect raw logits,
# hence sparse categorical cross-entropy with from_logits=True.
loss_fn = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
bert_model.compile(optimizer='adam', loss=loss_fn, metrics=['accuracy'])

# The optimizer was created from its string name, so the fine-tuning learning
# rate is set on the instantiated optimizer after compile().
bert_model.optimizer.learning_rate.assign(2e-5)

bert_model.fit(tf_train, validation_data=tf_val, epochs=10)


Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

All PyTorch model weights were used when initializing TFBertForSequenceClassification.

Some weights or buffers of the TF 2.0 model TFBertForSequenceClassification were not initialized from the PyTorch model and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<tf_keras.src.callbacks.History at 0x7f6ec027dc90>

In [23]:
# Held-out performance of the fine-tuned BERT model: [loss, accuracy].
bert_test_metrics = bert_model.evaluate(tf_test)
print(bert_test_metrics)


def predict_bert(text):
    """Classify one raw statement with the fine-tuned BERT model.

    Args:
        text: The raw statement to classify; it is tokenised with the same
            settings used for training (padded / truncated to 128 tokens).

    Returns:
        The result of ``encoder.inverse_transform`` for the arg-max class of
        the first row of logits, i.e. a one-element array with the label string.
    """
    encoded = bert_tokenizer(
        text,
        return_tensors='tf',
        truncation=True,
        padding='max_length',
        max_length=128,
    )
    output = bert_model(encoded)
    class_ids = tf.argmax(output.logits, axis=1).numpy()
    first_id = class_ids[0]
    return encoder.inverse_transform([first_id])


# Quick smoke test of both predictors on one example statement each.
bilstm_example = "Alex Sink funneled three quarters of a million dollars in no-bid contracts to Bank of America."
bert_example = "Transgender individuals in the U.S. have a 1-in-12 chance of being murdered."
print(predict_bilstm(bilstm_example))
print(predict_bert(bert_example))


[4.088626861572266, 0.2525651156902313]
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 30ms/step
['half-true']
['pants-fire']
