CNN for Text Classification

In [None]:
import numpy as np
import pandas as pd
import tensorflow as tf
from sklearn.model_selection import train_test_split
from tensorflow.keras.layers import (
    BatchNormalization,
    Conv1D,
    Dense,
    Dropout,
    Embedding,
    GlobalMaxPooling1D,
    MultiHeadAttention,
)
from tensorflow.keras.models import Sequential
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer, text_to_word_sequence

In [None]:
# Load the GossipCop "fake" news table from the Colab working directory.
data = pd.read_csv(filepath_or_buffer="/content/gossipcop_fake.csv")

In [None]:
# List the column names to confirm which fields the CSV provides.
print(data.columns)

Index(['id', 'news_url', 'title', 'tweet_ids'], dtype='object')


In [None]:
# Preview the first rows; a bare last expression lets the notebook render the
# DataFrame with its rich display instead of a plain-text print.
data.head()

                     id                                           news_url  \
0  gossipcop-2493749932  www.dailymail.co.uk/tvshowbiz/article-5874213/...   
1  gossipcop-4580247171  hollywoodlife.com/2018/05/05/paris-jackson-car...   
2   gossipcop-941805037  variety.com/2017/biz/news/tax-march-donald-tru...   
3  gossipcop-2547891536  www.dailymail.co.uk/femail/article-3499192/Do-...   
4  gossipcop-5476631226  variety.com/2018/film/news/list-2018-oscar-nom...   

                                               title  \
0  Did Miley Cyrus and Liam Hemsworth secretly ge...   
1  Paris Jackson & Cara Delevingne Enjoy Night Ou...   
2  Celebrities Join Tax March in Protest of Donal...   
3  Cindy Crawford's daughter Kaia Gerber wears a ...   
4      Full List of 2018 Oscar Nominations – Variety   

                                           tweet_ids  
0  284329075902926848\t284332744559968256\t284335...  
1  992895508267130880\t992897935418503169\t992899...  
2  853359353532829696\t853359

Feature Extraction

In [None]:
# Titles are the only text feature used. Missing titles become empty strings so
# the tokenizer never receives NaN. Defined here so the cell runs on a fresh
# kernel instead of relying on `texts` from a later cell.
texts = data['title'].fillna("").values

# Fit a word-level vocabulary on the titles (capped by frequency via num_words)
# and convert each title to a list of integer word ids.
tokenizer = Tokenizer(num_words=10000)
tokenizer.fit_on_texts(texts)
sequences = tokenizer.texts_to_sequences(texts)
word_index = tokenizer.word_index

In [None]:
# Every title is forced to exactly this many token ids.
max_sequence_length = 100
# Defaults spelled out: shorter titles are zero-padded at the front, longer
# ones lose tokens from the front.
x_data = pad_sequences(
    sequences,
    maxlen=max_sequence_length,
    padding="pre",
    truncating="pre",
)

In [None]:
# Binary text classifier: embedding -> 1D convolution -> max-pool -> dense head.
model = Sequential([
    # An explicit input spec replaces Embedding's deprecated `input_length`
    # argument and lets Keras build the model as soon as it is constructed.
    tf.keras.Input(shape=(max_sequence_length,), dtype="int32"),
    # +1 because index 0 is reserved for padding and is not in word_index.
    Embedding(input_dim=len(word_index) + 1, output_dim=128),
    Conv1D(filters=128, kernel_size=5, activation='relu'),
    GlobalMaxPooling1D(),
    Dropout(0.5),
    Dense(64, activation='relu'),
    Dropout(0.5),
    # Single sigmoid unit: the output is a probability for one binary label.
    Dense(1, activation='sigmoid')
])



In [8]:
# Raw titles as an array of strings; missing titles are replaced by "".
texts = data["title"].fillna(value="").values

In [9]:
# Learn the vocabulary from the titles, then encode every title as word ids.
tokenizer = Tokenizer(num_words=10000)
tokenizer.fit_on_texts(texts)
word_index = tokenizer.word_index  # word -> integer id mapping learned above
sequences = tokenizer.texts_to_sequences(texts)

In [10]:
# Fixed input length expected by the network defined in the next cell.
max_sequence_length = 100
# Pad/truncate (at the front, the pad_sequences default) to that length.
x_data = pad_sequences(sequences, maxlen=max_sequence_length, padding="pre", truncating="pre")

In [11]:
# Feature extractor: maps each padded title to a 32-dimensional vector.
# NOTE(review): this model is never compiled or fit before `predict` is called
# on it, so the extracted features come from randomly initialised weights.
model = Sequential([
    # An explicit input spec replaces Embedding's deprecated `input_length`
    # argument and lets Keras build the model as soon as it is constructed.
    tf.keras.Input(shape=(max_sequence_length,), dtype="int32"),
    # +1 because index 0 is reserved for padding and is not in word_index.
    Embedding(input_dim=len(word_index) + 1, output_dim=128),
    Conv1D(filters=128, kernel_size=5, activation='relu'),
    BatchNormalization(),
    GlobalMaxPooling1D(),
    Dense(64, activation='relu'),
    BatchNormalization(),
    # Final layer width sets the feature dimensionality.
    Dense(32, activation='relu')
])

In [12]:
# Run every padded title through the network and persist the resulting matrix.
features = model.predict(x=x_data)
np.save(file="extracted_features.npy", arr=features)
print(
    "Feature extraction completed. Features saved as 'extracted_features.npy'."
)

[1m167/167[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 12ms/step
Feature extraction completed. Features saved as 'extracted_features.npy'.


Dropout for Feature Extraction

In [14]:
# Titles with missing values blanked out.
texts = data["title"].fillna(value="").values

# Vocabulary is learned from these titles, which are then encoded as word ids.
tokenizer = Tokenizer(num_words=10000)
tokenizer.fit_on_texts(texts)
word_index = tokenizer.word_index
sequences = tokenizer.texts_to_sequences(texts)

In [15]:
# Target length for every encoded title.
max_sequence_length = 100
# Front-pad with zeros / front-truncate (the pad_sequences defaults).
x_data = pad_sequences(
    sequences, maxlen=max_sequence_length, padding="pre", truncating="pre"
)

In [16]:
# Feature extractor variant that uses Dropout in place of BatchNormalization.
# NOTE(review): Dropout is only active when a model runs in training mode.
# `model.predict` runs in inference mode, so these layers pass values through
# unchanged during feature extraction. The model is also never fit, so the
# features come from randomly initialised weights.
model = Sequential([
    # An explicit input spec replaces Embedding's deprecated `input_length`
    # argument and lets Keras build the model as soon as it is constructed.
    tf.keras.Input(shape=(max_sequence_length,), dtype="int32"),
    # +1 because index 0 is reserved for padding and is not in word_index.
    Embedding(input_dim=len(word_index) + 1, output_dim=128),
    Conv1D(filters=128, kernel_size=5, activation='relu'),
    Dropout(0.5),
    GlobalMaxPooling1D(),
    Dense(64, activation='relu'),
    Dropout(0.5),
    # Final layer width sets the feature dimensionality.
    Dense(32, activation='relu')
])

In [17]:
# Extract features with the dropout variant and store them in their own file.
features = model.predict(x=x_data)
np.save(file="extracted_features_with_dropout.npy", arr=features)
print(
    "Feature extraction completed with Dropout. Features saved as 'extracted_features_with_dropout.npy'."
)

[1m167/167[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 13ms/step
Feature extraction completed with Dropout. Features saved as 'extracted_features_with_dropout.npy'.


Transfer Learning

In [20]:
pip install transformers



In [18]:
# Titles to feed to the pretrained model; missing titles become "".
texts = data["title"].fillna(value="").values

In [24]:
from transformers import BertTokenizer, TFBertModel

In [23]:
# Load the pretrained 'bert-base-uncased' tokenizer and TensorFlow BERT encoder
# (the captured output shows the files being fetched from the Hugging Face Hub).
# Note: this rebinds `tokenizer`, which earlier cells bound to a Keras Tokenizer.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
bert_model = TFBertModel.from_pretrained('bert-base-uncased')

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFBertModel: ['cls.predictions.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing TFBertModel from a PyTorch model trained on another task or with another architecture (e.g. initializing a TFBertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertModel from a PyTorch model that you expect to be exactly identical (e.g. initializing a TFBertForSequenceClassification model from a BertForSequenceClassification model).
All the weights of TFBertModel were initialized from the PyTorch model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertModel for predictions w

Data Augmentation

In [25]:
# Titles are the model input; missing titles become empty strings.
texts = data['title'].fillna('').values
# NOTE(review): `id` holds per-article identifiers such as 'gossipcop-2493749932'
# (see the data.head() output above), not class labels. The classifier in this
# section is compiled with binary_crossentropy and a sigmoid output, which
# expects 0/1 targets. The loaded CSV has no label column (id, news_url, title,
# tweet_ids), so the real/fake label presumably has to come from combining this
# file with a "real" counterpart. TODO confirm the intended label source.
labels = data['id']

In [26]:
# Vocabulary cap for the tokenizer and padded length (in tokens) of each title.
max_words, max_len = 10000, 50

# Fit the vocabulary on the titles and encode them as lists of word ids.
tokenizer = Tokenizer(num_words=max_words)
tokenizer.fit_on_texts(texts)
word_index = tokenizer.word_index
sequences = tokenizer.texts_to_sequences(texts)

print(f"Found {len(word_index)} unique tokens.")

Found 8872 unique tokens.


In [27]:
# Width of each learned word vector in the Embedding layer.
embedding_dim = 100

# Bring every encoded title to exactly max_len token ids.
padded_sequences = pad_sequences(sequences, maxlen=max_len)

# Hold out 20% of the rows for validation; the fixed seed keeps the split stable.
X_train, X_val, y_train, y_val = train_test_split(
    padded_sequences,
    labels,
    test_size=0.2,
    random_state=42,
)

In [28]:
# Binary CNN classifier over padded title sequences.
model = Sequential([
    # An explicit input spec replaces Embedding's deprecated `input_length`
    # argument, so the model is built here and `summary()` below can report
    # output shapes and parameter counts.
    tf.keras.Input(shape=(max_len,), dtype="int32"),
    Embedding(input_dim=max_words, output_dim=embedding_dim),
    Conv1D(128, 5, activation='relu'),
    GlobalMaxPooling1D(),
    Dropout(0.5),
    Dense(64, activation='relu'),
    # Single sigmoid unit, paired with binary_crossentropy below.
    Dense(1, activation='sigmoid')
])
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
model.summary()



In [29]:
def augment_text(text):
    """Return `text` with its word order reversed.

    The input is split into words with Keras' `text_to_word_sequence`
    (called with its default settings) and the words are re-joined with
    single spaces in reverse order.
    """
    return " ".join(reversed(text_to_word_sequence(text)))

In [35]:
# Build a word-reversed copy of every title and encode it exactly like the
# original titles (same tokenizer, same padded length).
augmented_texts = [augment_text(text) for text in texts]
augmented_sequences = tokenizer.texts_to_sequences(augmented_texts)
augmented_padded = pad_sequences(augmented_sequences, maxlen=max_len)

# Recover which rows belong to the training split. train_test_split chooses
# rows from the sample count and random_state alone, so repeating it on a plain
# index array with the SAME test_size/random_state as the split cell yields the
# same partition. Slicing `augmented_padded[:n]` would instead take the first n
# rows of the whole dataset, which misaligns samples with labels and leaks
# validation titles into training.
train_idx, _ = train_test_split(
    np.arange(len(padded_sequences)), test_size=0.2, random_state=42
)

# Rebuild the training set from the un-augmented arrays, so re-running this
# cell does not keep doubling it: original rows first, then their augmented
# twins, with the labels repeated in the same order.
labels_arr = np.asarray(labels)
X_train = np.concatenate(
    [padded_sequences[train_idx], augmented_padded[train_idx]], axis=0
)
y_train = np.concatenate([labels_arr[train_idx], labels_arr[train_idx]], axis=0)

In [37]:
import tensorflow as tf

# Eager execution is intentionally left enabled. This cell used to call
# tf.compat.v1.disable_eager_execution() after Keras models had already been
# built. TensorFlow documents that switch as valid only before any graphs, ops,
# or tensors have been created, so flipping it mid-session is unsupported.

In [39]:
# Sanity check: sizes of the (augmented) training set and the validation set.
print(X_train.shape, X_val.shape, sep="\n")

(8516, 50)
(1065, 50)


In [41]:
# Confirm the validation inputs and targets have the same number of rows.
print(X_val.shape, y_val.shape)

(1065, 50) (1065,)


In [45]:
# The network ends in a single sigmoid unit, so the matching loss is
# binary_crossentropy (as in the original compile call for this model).
# sparse_categorical_crossentropy expects one output per class and would treat
# the lone output unit as a one-class softmax.
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
# Record the TensorFlow version this notebook was run against.
print(tf.__version__)

2.17.1


Attention Mechanism

In [48]:
# Titles are the model input; missing titles become empty strings.
texts = data['title'].fillna('').values
# NOTE(review): `id` holds per-article identifiers such as 'gossipcop-2493749932'
# (see the data.head() output near the top), not class labels. The loaded CSV
# has no label column (id, news_url, title, tweet_ids), so a usable 0/1 target
# presumably has to be constructed elsewhere. TODO confirm the intended label
# source before training on these.
labels = data['id']

In [49]:
# Vocabulary cap handed to the Tokenizer below.
max_words = 10000
# Number of token ids every title is padded or truncated to.
max_len = 50

In [53]:
# Fit the vocabulary on the titles and encode each title as word ids.
tokenizer = Tokenizer(num_words=max_words)
tokenizer.fit_on_texts(texts)
word_index = tokenizer.word_index
sequences = tokenizer.texts_to_sequences(texts)

print(f"Found {len(word_index)} unique tokens.")

# Bring every encoded title to exactly max_len ids (pad_sequences defaults:
# zero-pad and truncate at the front).
padded_sequences = pad_sequences(sequences, maxlen=max_len, padding="pre", truncating="pre")

Found 8872 unique tokens.


In [62]:
# 80/20 train/validation split with a fixed seed for reproducibility.
X_train, X_val, y_train, y_val = train_test_split(
    padded_sequences,
    labels,
    test_size=0.2,
    random_state=42,
)
class AttentionLayer(tf.keras.layers.Layer):
    """Self-attention followed by max-pooling and a single sigmoid unit.

    The input attends to itself through a 2-head `MultiHeadAttention`, the
    attended sequence is max-pooled with `GlobalMaxPooling1D`, and the pooled
    vector is mapped to one sigmoid output.

    Args:
        attention_dim: Size of each attention head (`key_dim`).
        **kwargs: Standard Keras `Layer` keyword arguments (e.g. `name`),
            forwarded to the base class.
    """
    def __init__(self, attention_dim, **kwargs):
        super().__init__(**kwargs)
        self.attention = MultiHeadAttention(num_heads=2, key_dim=attention_dim)
        # Sub-layers are created once here rather than on every forward pass.
        self.pool = GlobalMaxPooling1D()
        self.dense = Dense(1, activation='sigmoid')
    def call(self, inputs):
        """Apply attention, pooling and the sigmoid head to `inputs`.

        Assumes `inputs` is a rank-3 (batch, steps, features) tensor, as
        required by GlobalMaxPooling1D. TODO confirm against the caller.
        """
        # Query and value are the same tensor, i.e. self-attention.
        att_output = self.attention(inputs, inputs)
        output = self.pool(att_output)
        return self.dense(output)