Sources:

- https://www.kaggle.com/code/yacharki/10-classses-yahoo-answers-41-cnn

Stappen:
1. Download libraries
2. Bouw preprocessing functie
3. Laad de dataset
4. Laat NA waardes vallen
5. Split de X en y waardes
6. Preprocess de X waardes
7. Tokenize de X waardes

In [12]:
import pandas as pd # For loading the dataframe
import matplotlib.pyplot as plt # For plotting the graphs
import pickle # For saving and loading
import numpy as np # Used for processing
import seaborn as sns # For the confusion matrix

import nltk # Used for preprocessing
import string # Used for preprocessing
from nltk.tokenize import word_tokenize # Used for preprocessing
from nltk.corpus import stopwords # Used for preprocessing
from nltk.stem   import WordNetLemmatizer # Used for preprocessing

from tensorflow.keras.preprocessing.text import Tokenizer # For the word embedding
from tensorflow.keras.preprocessing.sequence import pad_sequences # For the word embedding
from tensorflow.keras.models import model_from_json

from sklearn.model_selection import train_test_split # For splitting the data

from tensorflow.keras import Sequential # Used for building the CNN
from tensorflow.keras.layers import Conv1D, GlobalMaxPooling1D, Activation, Dense, Dropout # Used for building the CNN

from sklearn.metrics import classification_report # Used for model evaluation
from sklearn.metrics import confusion_matrix # Used for model evaluation

In [13]:
def preprocess_text(text):
    
    def processor(j):
        j = ' '.join([c for c in j if c not in string.punctuation and c not in string.digits])
        tokens = word_tokenize(j, 'english')
        lowered = [x.lower() for x in tokens]
        lemmatiser = WordNetLemmatizer()
        lemmatized = [lemmatiser.lemmatize(word) for word in lowered]
        sw = stopwords.words('english')
        stopped = [word for word in lemmatized if word.lower() not in sw]
        return stopped
    
    end = []
    for i in text[:]:
        i = i.split(" ")
        end.append(processor(i))
        
    return end

In [14]:
# Loading the dataset
df = pd.read_csv('WELFake_Dataset.csv', index_col = 0)
df = df.dropna()

In [15]:
# Splitting the data

X = df['text']
y = df['label']

#X = X[:1000]
#y = y[:1000]

# Splitting

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 101) # Separating train data
X_test, X_val, y_test, y_val = train_test_split(X_test, y_test, test_size = 0.5, random_state = 101) # Separating train data

X_train = X_train.values
X_test = X_test.values
X_val = X_val.values
y_train = y_train.values
y_test = y_test.values
y_val = y_val.values

In [16]:
X_train = preprocess_text(X_train)
X_test = preprocess_text(X_test)
X_val = preprocess_text(X_val)

In [17]:
t = Tokenizer(oov_token='<UNK>')
# fit the tokenizer on train documents
t.fit_on_texts(X_train)
t.word_index['<PAD>'] = 0

In [18]:
X_train = t.texts_to_sequences(X_train)
X_test = t.texts_to_sequences(X_test)
X_val = t.texts_to_sequences(X_val)

In [19]:
X_train = pad_sequences(X_train, maxlen=800)
X_test = pad_sequences(X_test, maxlen=800)
X_val = pad_sequences(X_val, maxlen=800)

In [20]:
VOCAB_SIZE = len(t.word_index)
EMBED_SIZE = 800

In [21]:
from tensorflow.keras.layers import Embedding, MaxPooling1D, Flatten
import tensorflow as tf
model = Sequential()
# The Embedding layer
model.add(Embedding(VOCAB_SIZE, EMBED_SIZE, input_length=800))
# The first one dimensional convolutional layer (32,4,same,relu)
model.add(Conv1D(filters=32, kernel_size=4, padding='same', activation='relu'))
model.add(Conv1D(filters=32, kernel_size=4, padding='same', activation='relu'))
# The first Max pooling layer (2)
model.add(MaxPooling1D(pool_size=2))
# The first Dropout layer (10%)
model.add(Dropout(rate=0.10))
# The second one dimensional convolutional layer (32,4,same,relu)
model.add(Conv1D(filters=64, kernel_size=4, padding='same', activation='relu'))
model.add(Conv1D(filters=64, kernel_size=4, padding='same', activation='relu'))
# The second Max pooling layer (2)
model.add(MaxPooling1D(pool_size=2))
# The second Dropout layer (10%)
model.add(Dropout(rate=0.10))
# The third one dimensional convolutional layer (32,4,same,relu)
model.add(Conv1D(filters=64, kernel_size=4, padding='same', activation='relu'))
model.add(Conv1D(filters=64, kernel_size=4, padding='same', activation='relu'))
# The third Max pooling layer (2)
model.add(MaxPooling1D(pool_size=2))
# The third Dropout layer (10%)
model.add(Dropout(rate=0.10))
# The Flattening layer
model.add(Flatten())
# The First Dense Layer (256,relu)
model.add(Dense(256, activation='relu'))
# The Second Dense Layer or Prediction layer (1,sigmoid)
model.add(Dense(10, activation='softmax'))
# Compiling the Model using the Binary_Crossontropy as a loss function and accuracy as a meseaure and Adam as an Optimizer
model.compile(loss='SparseCategoricalCrossentropy', optimizer=tf.keras.optimizers.Adam(1e-4), metrics=['accuracy'])
# Displaying the Model Schema
model.summary()

Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_1 (Embedding)     (None, 800, 800)          265860000 
                                                                 
 conv1d_6 (Conv1D)           (None, 800, 32)           102432    
                                                                 
 conv1d_7 (Conv1D)           (None, 800, 32)           4128      
                                                                 
 max_pooling1d_3 (MaxPooling  (None, 400, 32)          0         
 1D)                                                             
                                                                 
 dropout_3 (Dropout)         (None, 400, 32)           0         
                                                                 
 conv1d_8 (Conv1D)           (None, 400, 64)           8256      
                                                      

In [22]:
with tf.device('/GPU:0'):
    history1 = model.fit(X_train, y_train, validation_data=(X_val,y_val),epochs=20, batch_size=64, verbose=1)

Epoch 1/20
203/895 [=====>........................] - ETA: 10:22:26 - loss: 0.6676 - accuracy: 0.6838

KeyboardInterrupt: 

In [7]:
max_features = max([len(s.split()) for s in X])

In [8]:
# Word embedding

tokenizer = Tokenizer()
tokenizer.fit_on_texts(X_prep)
encoded_docs = tokenizer.texts_to_sequences(X_prep)
X_train = pad_sequences(encoded_docs, maxlen=max_features, padding='post')

#tokenizer = Tokenizer() # Tokenizing the text into numbers
#tokenizer.fit_on_texts(texts = X_pre) # Building the vocabulary
#X = tokenizer.texts_to_sequences(texts = X_pre) # Tokenized words as X data from the train dataset
#X_train = pad_sequences(sequences = X, maxlen = max_features, padding = 'post')

In [9]:
encoded_docs = tokenizer.texts_to_sequences(X_test_prep)
X_test = pad_sequences(encoded_docs, maxlen=max_features, padding='post')

In [10]:
# Padding and transforming to array

#X = pad_sequences(sequences = X, maxlen = max_features, padding = 'pre')
# X = An array shaped 20, 512 with the indexed words in there.
y_train = y_train.values
y_test = y_test.values
# Y is transformed from an index class to a array

In [11]:
# Hyperparameters

batch_size = 64
embedding_dims = 1 #Length of the token vectors
filters = 250 #number of filters in your Convnet
kernel_size = 3 # a window size of 3 tokens
hidden_dims = 250 #number of neurons at the normal feedforward NN
epochs = 8
maxlen = max_features

In [12]:
from tensorflow.keras.layers import Flatten

cnn_model = Sequential(name = "CNN_model")
cnn_model.add(Conv1D(filters, kernel_size, padding = 'valid', 
                 activation = 'relu', strides = 1, input_shape = (maxlen,embedding_dims), name = "1st_layer"))
cnn_model.add(GlobalMaxPooling1D(name = "2nd_layer"))
cnn_model.add(Dense(hidden_dims, name = "3rd_layer"))
#cnn_model.add(Dropout(0.2, name = "4th_layer"))
cnn_model.add(Activation('relu', name = "5th_layer"))
cnn_model.add(Dense(1, name = "6th_layer"))
cnn_model.add(Activation('sigmoid', name = "7th_layer"))
# Compiling the model
cnn_model.compile(loss = 'binary_crossentropy', optimizer = 'adam', metrics = ['accuracy'])

In [13]:
history = cnn_model.fit(X_train, y_train, 
          batch_size = batch_size, epochs = epochs, validation_data = (X_test,y_test))

Epoch 1/8
Epoch 2/8
Epoch 3/8
Epoch 4/8
Epoch 5/8
Epoch 6/8
Epoch 7/8
Epoch 8/8
