In [1]:
import tensorflow as tf
import keras as K
import myutils 
%load_ext autoreload
%reload_ext autoreload
%autoreload 2 


Settings = "Balanced"
# Settings = "UnBalancedReal"
# Settings = "UnBalancedFake"

allNews = myutils.initDF()

# tf.config.list_logical_devices(), tf.config.list_physical_devices()

Loading data from processed file


## Create balanced dataset and create train, validation and test sets

In [2]:
from sklearn.model_selection import train_test_split 
# Pop the label column
balancedLabels = allNews.pop('label')

# Split the data into train, test, and validation sets
x_train, x_test, y_train, y_test = train_test_split(allNews, balancedLabels, test_size=0.1, train_size=0.9, shuffle=False)
x_train, x_val, y_train, y_val = train_test_split(x_train, y_train, test_size=0.2222, train_size=0.7778, shuffle=True)

## Create unbalanced dataset and create train, validation and test sets
### Only half fake news

In [3]:
# FakeNews label = 1
x_trainUF, y_trainUF = myutils.unbalance(x_train, y_train, 1)

### Only half real news

In [4]:
# RealNews label = 0
x_trainUR, y_trainUR = myutils.unbalance(x_train, y_train, 0)

## Choosing the train set by settings (balanced or unbalanced) 

In [None]:
if Settings == "UnBalancedReal":
    x_train = x_trainUR
    y_train = y_trainUR
elif Settings == "UnBalancedFake":
    x_train = x_trainUF
    y_train = y_trainUF

## Dict Index

In [5]:
from keras.preprocessing.text import Tokenizer
import numpy as np

vectorizer = Tokenizer(num_words=10000)

vectorizer.fit_on_texts(x_train.title)
vectorizer.fit_on_texts(x_train.text)
vectorizer.fit_on_texts(x_train.subject)

vocab_size = len(vectorizer.word_index) + 1

### Text conversion

In [6]:

from keras_preprocessing.sequence import pad_sequences

def convert(df, columns = ['title', 'text', 'subject']):
    for column in columns:
        df[column] = vectorizer.texts_to_sequences(df[column])
    return df

x_train = convert(x_train)
x_val = convert(x_val)

### Text Padding/Truncation for Normalization

In [7]:
import numpy as np


def Normalize(df, columns = ['title', 'text'], MAX_LEN = 2000, PaddingKind = 'post'):
    tmp = {}
    for column in columns:
        tmp[column] = pad_sequences(df[column].tolist(), maxlen=MAX_LEN, padding=PaddingKind)
    tmp['subject'] = pad_sequences(df['subject'].tolist(), maxlen=2, padding=PaddingKind)
    out = np.empty((len(tmp['text']), MAX_LEN*2+2))
    for i in range(len(tmp['text'])):
        out[i] = np.append(np.append(tmp['text'][i], tmp['title'][i]),  tmp['subject'][i])
    return out

x_train = Normalize(x_train)
x_val = Normalize(x_val)

y_train = np.array(y_train)
y_val = np.array(y_val)

x_train.shape, x_val.shape, y_train.shape, y_val.shape

((31429, 4002), (8979, 4002), (31429,), (8979,))

## Embedding

In [8]:
import numpy as np
embedding_dict = {}
with open("./embeddings/glove.6B.300d.txt") as f:
    for line in f:
        words, coefficients = line.split(maxsplit=1)
        coefficients = np.fromstring(coefficients, "f", sep=" ")
        embedding_dict[words] = coefficients

### Embedding Matrix

In [9]:
import numpy as np
embedding_dim = 300
vocab_size = len(vectorizer.word_index) + 2

embedding_matrix = np.zeros((vocab_size, embedding_dim))
for word, i in vectorizer.word_index.items():
    embedding_vector = vectorizer.word_index.get(word)
    if embedding_vector is not None:
        embedding_matrix[i] = embedding_vector
        
embedding_matrix.shape

(126078, 300)

# Tensorflow embedding layer

In [10]:
embedding_layer = K.layers.Embedding(vocab_size, embedding_dim, embeddings_initializer= K.initializers.Constant(embedding_matrix), trainable=False, input_length=4002)

# Tensorflow CNN model

In [11]:
# # input = K.layers.Input(shape=(None, ), dtype='int64')
# embedded_input = embedding_layer(input)
# x = K.layers.Sequential()
# x = K.layers.Conv1D(128, 4, padding='same', activation='relu')(embedding_layer)
# x = K.layers.MaxPooling1D(2)(x)
# x = K.layers.Conv1D(64, 4, padding='same', activation='relu')(x)
# x = K.layers.MaxPooling1D(2)(x) 
# x = K.layers.Conv1D(32, 4, padding='same', activation='relu')(x)
# x = K.layers.GlobalMaxPooling1D()(x)
# x = K.layers.Dense(128, activation='relu')(x)
# # x = K.layers.Dropout(0.5)(x)
# predictions = K.layers.Dense(1, activation='sigmoid')(x)
# model = K.models.Model(input, predictions)
# model.summary()

In [12]:
model = K.Sequential()
model.add(embedding_layer) 
model.add(K.layers.Conv1D(128, 4, padding='same', activation='relu'))
model.add(K.layers.MaxPooling1D(2))
model.add(K.layers.Conv1D(64, 4, padding='same', activation='relu'))
model.add(K.layers.MaxPooling1D(2))
model.add(K.layers.Conv1D(32, 4, padding='same', activation='relu'))
# model.add(K.layers.GlobalMaxPooling1D())
model.add(K.layers.MaxPooling1D(2))
model.add(K.layers.Flatten())
model.add(K.layers.Dense(256, activation='relu'))
model.add(K.layers.Dense(1, activation='sigmoid'))

Metal device set to: Apple M1 Pro

systemMemory: 32.00 GB
maxCacheSize: 10.67 GB



In [13]:
early_stopping = K.callbacks.EarlyStopping(
    monitor='val_loss',
    mode='min',
    patience=3,
    min_delta=0.005,
    restore_best_weights=True   
)

model.compile(
    # optimizer=K.optimizers.Adam(learning_rate=0.01),
    optimizer=K.optimizers.Adam(learning_rate=0.001), 
    loss=K.losses.BinaryCrossentropy(),
    metrics=['accuracy']
)
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 4002, 300)         37823400  
                                                                 
 conv1d (Conv1D)             (None, 4002, 128)         153728    
                                                                 
 max_pooling1d (MaxPooling1D  (None, 2001, 128)        0         
 )                                                               
                                                                 
 conv1d_1 (Conv1D)           (None, 2001, 64)          32832     
                                                                 
 max_pooling1d_1 (MaxPooling  (None, 1000, 64)         0         
 1D)                                                             
                                                                 
 conv1d_2 (Conv1D)           (None, 1000, 32)          8

## Model Train

In [14]:
hist = model.fit(x_train, y_train, 
    batch_size=128, 
    epochs=100,
    validation_data=(x_val, y_val), 
    callbacks=[early_stopping]
)

Epoch 1/100


2023-04-29 12:33:50.229256: W tensorflow/tsl/platform/profile_utils/cpu_utils.cc:128] Failed to get CPU frequency: 0 Hz


Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
 45/246 [====>.........................] - ETA: 51s - loss: 0.1672 - accuracy: 0.9361

KeyboardInterrupt: 

In [None]:
import matplotlib.pyplot as plt

plt.plot(hist.history['accuracy'], label='accuracy')
plt.plot(hist.history['val_accuracy'], label = 'val_accuracy')
plt.xlabel('Epoch')
plt.ylabel('Accuracy')
plt.ylim([0.5, 1])
plt.legend(loc='lower right')

NameError: name 'hist' is not defined