# Part 1 - Pre processing

The preprocessing is the result of the work done in work/data_prep_1.ipynb, work/data_prep_2.ipynb, and work/data_prep_3.ipynb. It is omitted here for brevity, since it is very long.

# Part 2 - Simple model

In [None]:
import pandas as pd
import numpy as np

# Load the dataset; the cells below read its 'type' and 'rich_tokens' columns.
complete_data = pd.read_parquet('p4_rich_tokens.parquet')

In [None]:
# We create a simple classification model: fake news or real news.
# For each token, we calculate the average occurrence rate in the fake news and in the real news.
# We take the ratio of the average occurrence rates to calculate the importance of each token in the classification.
# The simple model is a weighted sum of the real-ness and the fake-ness of each token.
# The final classification is the sign of the weighted sum.
# No training is required, since we only use the average occurrence rates:

def vocabulary(rich_tokens):
    """Return the set of distinct tokens found in an iterable of token strings.

    Args:
        rich_tokens: Iterable of strings, each holding whitespace-separated
            tokens.

    Returns:
        A set containing every distinct token.
    """
    vocab = set()
    for tokens_string in rich_tokens:
        # split() with no argument splits on any run of whitespace, so
        # repeated, leading or trailing spaces (and empty strings) never add
        # a bogus empty-string "token" the way split(" ") does.
        vocab.update(tokens_string.split())
    return vocab


# Partition the articles by label: four types count as fake, two as real.
fake_type_labels = ['fake', 'conspiracy', 'bias', 'hate']
real_type_labels = ['reliable', 'political']

is_fake_row = complete_data['type'].isin(fake_type_labels)
is_real_row = complete_data['type'].isin(real_type_labels)
fake_data = complete_data[is_fake_row]
real_data = complete_data[is_real_row]

# Token strings of each class.
fake_rich_tokens = fake_data['rich_tokens']
real_rich_tokens = real_data['rich_tokens']

# Distinct tokens per class, and their union.
fake_vocab = vocabulary(fake_rich_tokens)
real_vocab = vocabulary(real_rich_tokens)
complete_vocab = fake_vocab.union(real_vocab)

# Report the three vocabulary sizes.
for description, vocab in (
    ('Number of tokens in the complete vocabulary:', complete_vocab),
    ('Number of tokens in the fake vocabulary:', fake_vocab),
    ('Number of tokens in the real vocabulary:', real_vocab),
):
    print(description, len(vocab))

# Repeat the vocabulary construction, this time on an 80/10/10 split into
# training, validation and test sets.
from sklearn.model_selection import train_test_split

# Carve off 20% of the rows, then halve that part into validation and test.
train_data, holdout_data = train_test_split(complete_data, test_size=0.2, random_state=42)
validation_data, test_data = train_test_split(holdout_data, test_size=0.5, random_state=42)

# Labels that count as fake and as real news respectively.
fake_labels = ['fake', 'conspiracy', 'bias', 'hate']
real_labels = ['reliable', 'political']

# --- Training split ---
train_fake_data = train_data[train_data['type'].isin(fake_labels)]
train_real_data = train_data[train_data['type'].isin(real_labels)]
train_fake_rich_tokens = train_fake_data['rich_tokens']
train_real_rich_tokens = train_real_data['rich_tokens']
train_fake_vocab = vocabulary(train_fake_rich_tokens)
train_real_vocab = vocabulary(train_real_rich_tokens)
train_complete_vocab = train_fake_vocab.union(train_real_vocab)

for description, vocab in (
    ('Number of tokens in the train complete vocabulary:', train_complete_vocab),
    ('Number of tokens in the train fake vocabulary:', train_fake_vocab),
    ('Number of tokens in the train real vocabulary:', train_real_vocab),
):
    print(description, len(vocab))

# --- Validation split ---
validation_fake_data = validation_data[validation_data['type'].isin(fake_labels)]
validation_real_data = validation_data[validation_data['type'].isin(real_labels)]
validation_fake_rich_tokens = validation_fake_data['rich_tokens']
validation_real_rich_tokens = validation_real_data['rich_tokens']
validation_fake_vocab = vocabulary(validation_fake_rich_tokens)
validation_real_vocab = vocabulary(validation_real_rich_tokens)
validation_complete_vocab = validation_fake_vocab.union(validation_real_vocab)

# --- Test split ---
test_fake_data = test_data[test_data['type'].isin(fake_labels)]
test_real_data = test_data[test_data['type'].isin(real_labels)]
test_fake_rich_tokens = test_fake_data['rich_tokens']
test_real_rich_tokens = test_real_data['rich_tokens']
test_fake_vocab = vocabulary(test_fake_rich_tokens)
test_real_vocab = vocabulary(test_real_rich_tokens)
test_complete_vocab = test_fake_vocab.union(test_real_vocab)

# Every token seen in any of the three splits.
complete_vocab = train_complete_vocab.union(validation_complete_vocab, test_complete_vocab)

In [None]:
from collections import Counter


def vocabulary(rich_tokens):
    """Count how often each token occurs across all token sequences.

    Each element of ``rich_tokens`` is a string that is split on whitespace;
    the returned ``Counter`` maps every token to its total number of
    occurrences.
    """
    return Counter(
        token
        for sequence in rich_tokens
        for token in sequence.split()
    )

# Token counts for the whole corpus and for each class of the training split.
complete_vocab, train_fake_vocab, train_real_vocab = [
    vocabulary(frame['rich_tokens'])
    for frame in (complete_data, train_fake_data, train_real_data)
]

vocabulary_size = len(complete_vocab)
print('Number of tokens in the complete vocabulary:', vocabulary_size)

In [None]:

# Corpus-wide tokens seen more than 1000 times; all other tokens are ignored below.
frequent_words = {
    token: total
    for token, total in complete_vocab.items()
    if total > 1000
}


def _keep_frequent(token_counts):
    """Return the entries of ``token_counts`` whose token is in ``frequent_words``."""
    return {
        token: total
        for token, total in token_counts.items()
        if token in frequent_words
    }


def _ten_most_common(token_counts):
    """Return the ten ``(token, count)`` pairs with the highest counts."""
    ranked = sorted(token_counts.items(), key=lambda pair: pair[1], reverse=True)
    return ranked[:10]


train_fake_occurances = _keep_frequent(train_fake_vocab)
train_real_occurances = _keep_frequent(train_real_vocab)

print('Top 10 fake tokens:')
print(_ten_most_common(train_fake_occurances))
print('Top 10 real tokens:')
print(_ten_most_common(train_real_occurances))

# Token strings of each class of the training split.
train_fake_rich_tokens = train_fake_data['rich_tokens']
train_real_rich_tokens = train_real_data['rich_tokens']

from collections import Counter

def size_of_token_sequence_list_v2(sequence_list, allowed_words=None):
    """Count the token occurrences in ``sequence_list`` that are in a vocabulary.

    Args:
        sequence_list: Iterable of strings of whitespace-separated tokens.
        allowed_words: Container of tokens to count. Defaults to the
            module-level ``frequent_words``, which is what this function
            always used before the parameter existed.

    Returns:
        The total number of token occurrences (duplicates included) whose
        token is in ``allowed_words``.
    """
    if allowed_words is None:
        # Looked up at call time so the default follows the current global.
        allowed_words = frequent_words
    # Count directly instead of building a per-token Counter that was only
    # ever summed.
    return sum(
        1
        for sequence in sequence_list
        for token in sequence.split()
        if token in allowed_words
    )


# Total number of frequent-token occurrences in each class of the training split.
train_fake_richer_token_count = size_of_token_sequence_list_v2(train_fake_rich_tokens)
train_real_richer_token_count = size_of_token_sequence_list_v2(train_real_rich_tokens)

# Occurrence rate of every frequent token within its class.
train_fake_token_rates = {
    word: train_fake_occurances[word] / train_fake_richer_token_count
    for word in train_fake_vocab
    if word in frequent_words
}
train_real_token_rates = {
    word: train_real_occurances[word] / train_real_richer_token_count
    for word in train_real_vocab
    if word in frequent_words
}

# Signed importance of each frequent token: positive when the token is
# relatively more common in real news, negative when it is relatively more
# common in fake news. The magnitude is (larger rate / smaller rate) - 1.
train_token_importance = {}
for token in frequent_words:
    fake_rate = train_fake_token_rates.get(token)
    real_rate = train_real_token_rates.get(token)
    if fake_rate is None or real_rate is None:
        # Seen in only one class (or in neither): no ratio exists, so the
        # token gets a neutral score.
        train_token_importance[token] = 0
    elif real_rate > fake_rate:
        train_token_importance[token] = (real_rate / fake_rate - 1)
    else:
        train_token_importance[token] = -((fake_rate / real_rate) - 1)

# Rank the tokens from most real-leaning to most fake-leaning.
train_sorted_tokens = sorted(
    train_token_importance.items(),
    key=lambda item: item[1],
    reverse=True,
)

# Show both ends of the ranking (50 tokens each).
print('The 50 most positive tokens:')
print(train_sorted_tokens[:50])
print('The 50 most negative tokens:')
print(train_sorted_tokens[-50:])

# The 50 tokens closest to zero, excluding those that are exactly zero.
print('The 50 most neutral tokens:')
nonzero_importances = [item for item in train_token_importance.items() if item[1] != 0]
nonzero_importances.sort(key=lambda item: abs(item[1]))
print(nonzero_importances[:50])

In [None]:
def classify(sequence, token_importance):
    """Return the summed importance of the tokens in ``sequence``.

    ``sequence`` is split on whitespace; tokens that are missing from
    ``token_importance`` contribute nothing to the score.
    """
    total = 0
    for word in sequence.split():
        # Unknown tokens add 0, which leaves the running total unchanged.
        total += token_importance.get(word, 0)
    return total

def bin_classify(sequence, token_importance):
    """Return True when the ``classify`` score of ``sequence`` is strictly positive."""
    return classify(sequence, token_importance) > 0

# Evaluate the simple model on the training split.
Y = train_data['type']
Y_pred = train_data['rich_tokens'].apply(lambda x: bin_classify(x, train_token_importance))

print('y_pred:', Y_pred)
# True = real news ('reliable' / 'political'); every other label = False.
Y_bin = Y.isin(['reliable', 'political'])
Y_pred_bin = Y_pred

# Remove the rows whose label is missing from both Y and Y_pred.
# The mask has to be built from the raw labels: Y_bin comes out of isin() and
# is therefore already strictly boolean (a NaN label becomes False), so the
# previous mask, Y_bin.isin([True, False]), kept every row and silently
# scored unlabeled articles as fake.
mask = Y.notna()
Y_bin = Y_bin[mask]
Y_pred_bin = Y_pred_bin[mask]

from sklearn.metrics import accuracy_score

accuracy = accuracy_score(Y_bin, Y_pred_bin)
print('Training accuracy:', accuracy)

In [None]:
# Export the model. So the importance of each token, in csv:
import csv


# Write a header row followed by one "token,importance" row per token.
# newline='' is what the csv module documentation asks for: without it the
# writer's own '\r\n' line endings are translated a second time on Windows,
# which leaves a blank line after every row.
with open('simple_model_importance.csv', 'w', encoding='utf-8', newline='') as file:
    writer = csv.writer(file)
    writer.writerow(['token', 'importance'])
    writer.writerows(train_token_importance.items())


In [None]:
print('Token importance:', train_token_importance)

# Total weight on each side of zero.
importance_values = list(train_token_importance.values())
positive_importance = sum(value for value in importance_values if value > 0)
negative_importance = sum(value for value in importance_values if value < 0)

print('Sum of positive importance:', positive_importance)
print('Sum of negative importance:', negative_importance)

(the extra scraped data experiment was removed from this notebook but is present in work/part2.ipynb)

# Part 3 - Complex model (preparation)

In [None]:
import pandas as pd
import numpy as np

# Load the dataset from 'p4_rich_tokens.parquet' for the complex-model preparation.
complete_data = pd.read_parquet('p4_rich_tokens.parquet')

In [None]:
# Keep one row per distinct "rich_tokens" value, then drop the rows that
# have no label.
print('Removing duplicates...')
complete_data = complete_data.drop_duplicates(subset='rich_tokens')
complete_data = complete_data.dropna(subset=['type'])
# Persist only the 'type' and 'cleaned_content' columns. The file is written
# once: the earlier full-frame write and re-read of this same path was
# overwritten by this reduced version straight away.
complete_data[['type', 'cleaned_content']].to_parquet('pre_processed_news.parquet')
print('Done.')

# Part 3 - Complex model 1 (TF-IDF, Logreg, random forest)

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn import metrics

# Hold out 20% of the articles; the remaining 80% is the training set.
X_train, X_test, y_train, y_test = train_test_split(
    complete_data['cleaned_content'],
    complete_data['type'],
    test_size=0.2,
    random_state=42,
)


def _is_real_news(label):
    """Return True for the 'reliable' and 'political' labels, False otherwise."""
    return label in ["reliable", "political"]


# Binarise the labels: True = real news, False = everything else.
y_train = y_train.apply(_is_real_news)
y_test = y_test.apply(_is_real_news)

# Halve the held-out part into a test set and a validation set.
X_test, X_val, y_test, y_val = train_test_split(X_test, y_test, test_size=0.5, random_state=42)

# Report the size of every split.
for split_name, split in (
    ('X_train: ', X_train),
    ('X_test: ', X_test),
    ('y_train: ', y_train),
    ('y_test: ', y_test),
    ('X_val: ', X_val),
    ('y_val: ', y_val),
):
    print(split_name, split.shape)

In [None]:
# TF-IDF features with English stop words removed; max_df / min_df bound the
# document frequency of the terms that are kept.
tfidf_vectorizer = TfidfVectorizer(stop_words='english', max_df=0.9, min_df=0.005)

print('Fitting the vectorizer...')
# Fit on the training texts only ...
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)
# ... and reuse the fitted vectorizer to transform the test texts.
X_test_tfidf = tfidf_vectorizer.transform(X_test)
print('Done. matrix: ', X_train_tfidf.shape)

In [None]:
# Baseline classifier: multinomial Naive Bayes trained on the TF-IDF training matrix.
clf = MultinomialNB()
print('Fitting the model...')
clf.fit(X_train_tfidf, y_train)
print('Done.')

In [None]:
# Predict the test set with the current clf and report its accuracy.
predicted = clf.predict(X_test_tfidf)

accuracy = metrics.accuracy_score(y_test, predicted)
print('TF-IDF Accuracy: ', accuracy)

Log reg:

In [None]:
# Let's now use logistic regression:
from sklearn.linear_model import LogisticRegression

# Logistic regression with default constructor arguments, trained on the same
# TF-IDF training matrix. NOTE: this rebinds clf to a new model.
clf = LogisticRegression()
print('Fitting the model...')
clf.fit(X_train_tfidf, y_train)
print('Done.')

In [None]:
# Predict the test set with the current clf and report its accuracy.
predicted = clf.predict(X_test_tfidf)

accuracy = metrics.accuracy_score(y_test, predicted)
print('Logreg accuracy: ', accuracy)

Random forests

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Random forest: 200 trees, depth capped at 15, fixed seed, all CPU cores (n_jobs=-1).
# NOTE: this rebinds clf to a new model.
clf = RandomForestClassifier(n_estimators=200, random_state=42, max_depth=15, max_features='sqrt', n_jobs=-1)
print('Fitting the model...')
clf.fit(X_train_tfidf, y_train)
print('Done.')

In [None]:
# Inspect the fitted forest: per-feature importances and the two main hyperparameters.
print('Feature importances: ', clf.feature_importances_)
print('Number of estimators: ', clf.n_estimators)
print('Max depth: ', clf.max_depth)

In [None]:
# Predict the test set with the current clf and report its accuracy.
predicted = clf.predict(X_test_tfidf)

accuracy = metrics.accuracy_score(y_test, predicted)

print('Random forest accuracy: ', accuracy)

In [None]:
# Second random forest: same settings as the first one, but with 300 trees.
clf2 = RandomForestClassifier(n_estimators=300, random_state=42, max_depth=15, max_features='sqrt', n_jobs=-1)
print('Fitting the model...')
clf2.fit(X_train_tfidf, y_train)

In [None]:
# Predict the test set with clf2 and report its accuracy.
predicted = clf2.predict(X_test_tfidf)

accuracy = metrics.accuracy_score(y_test, predicted)

print('Random forest attempt 2: ', accuracy)

# Part 3 - Complex model (GRUs)

Note: This segment was trained in a Kaggle notebook to make use of the free GPUs available there. File references have been renamed here so that they can be accessed normally.

In [None]:
# Load 'pre_processed_news.parquet'; the next cell reads its 'cleaned_content' and 'type' columns.
complete_data = pd.read_parquet('pre_processed_news.parquet')

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn import metrics

# Hold out 20% of the articles; the remaining 80% is the training set.
X_train, X_test, y_train, y_test = train_test_split(
    complete_data['cleaned_content'],
    complete_data['type'],
    test_size=0.2,
    random_state=42,
)


def _is_real_news(label):
    """Return True for the 'reliable' and 'political' labels, False otherwise."""
    return label in ["reliable", "political"]


# Binarise the labels: True = real news, False = everything else.
y_train = y_train.apply(_is_real_news)
y_test = y_test.apply(_is_real_news)

# Halve the held-out part into a test set and a validation set.
X_test, X_val, y_test, y_val = train_test_split(X_test, y_test, test_size=0.5, random_state=42)

# Report the size of every split.
for split_name, split in (
    ('X_train: ', X_train),
    ('X_test: ', X_test),
    ('y_train: ', y_train),
    ('y_test: ', y_test),
    ('X_val: ', X_val),
    ('y_val: ', y_val),
):
    print(split_name, split.shape)

In [None]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [None]:
# Keras tokenizer created with num_words=25000 and fitted on the training texts only.
tokenizer = Tokenizer(num_words=25000) 
tokenizer.fit_on_texts(X_train)

print("Fit")

# Convert the training and test texts to sequences with the fitted tokenizer.
X_train_seq = tokenizer.texts_to_sequences(X_train)
X_test_seq = tokenizer.texts_to_sequences(X_test)

In [None]:
import pickle

# Persist the fitted tokenizer and both token-sequence lists, one pickle file each.
for filename, payload in (
    ('tokenizer.pickle', tokenizer),
    ('X_train_seq.pickle', X_train_seq),
    ('X_test_seq.pickle', X_test_seq),
):
    with open(filename, 'wb') as handle:
        pickle.dump(payload, handle, protocol=pickle.HIGHEST_PROTOCOL)

In [None]:
import pickle

# Reload the pickled tokenizer and token sequences.
# NOTE(review): these paths point at a Kaggle input directory
# ('/kaggle/input/fakenews-tokens/'), not at the files written by the dump
# cell above; adjust them when running outside Kaggle.
# NOTE(review): pickle.load can execute arbitrary code from the file, so only
# load pickles that you produced yourself.
with open('/kaggle/input/fakenews-tokens/tokenizer.pickle', 'rb') as handle:
    loaded_tokenizer = pickle.load(handle)

with open('/kaggle/input/fakenews-tokens/X_train_seq.pickle', 'rb') as handle:
    X_train_seq = pickle.load(handle)

# Loading X_test_seq
with open('/kaggle/input/fakenews-tokens/X_test_seq.pickle', 'rb') as handle:
    X_test_seq = pickle.load(handle)

In [None]:
# Pad the token sequences with maxlen=800 and padding='post'; 800 matches the
# input_shape=(800,) of the Embedding layer defined further down.
X_train_pad = pad_sequences(X_train_seq, maxlen=800, padding='post')
X_test_pad = pad_sequences(X_test_seq, maxlen=800, padding='post')

In [None]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, GRU, Dense

In [None]:
import tensorflow as tf
# Environment check: print the TensorFlow version and how many GPUs it lists.
print("TensorFlow version:", tf.__version__)

gpus = tf.config.experimental.list_physical_devices('GPU')
print("Num GPUs Available: ", len(gpus))

In [None]:
# Batch settings. batch_size is reused by the DataGenerator instances below.
# NOTE(review): steps_per_epoch is only displayed here; the visible model.fit
# call does not pass it.
total_samples = len(X_train_pad)  # Total number of samples in your training data
batch_size = 512  # Assuming this is the batch size you've chosen
steps_per_epoch = total_samples // batch_size
# Bare expression: shown as the cell output in a notebook.
steps_per_epoch, total_samples

In [None]:
#with tpu_strategy.scope():
from keras.layers import Embedding, GRU, Dense, Dropout

# Build and compile the GRU classifier on the first GPU.
with tf.device('/GPU:0'):
    model = Sequential([
        # input_dim=25000 matches the tokenizer's num_words; input_shape=(800,)
        # matches the maxlen used for padding.
        Embedding(input_dim=25000, output_dim=52, input_shape=(800,)),  # Adjust these parameters as needed
        # Three stacked GRU layers (324 -> 200 -> 64 units), each followed by
        # dropout; only the last one has return_sequences=False.
        GRU(units=324, return_sequences=True), 
        Dropout(0.2),
        GRU(units=200, return_sequences=True), 
        Dropout(0.2),
        GRU(units=64, return_sequences=False), 
        Dropout(0.2),
        # Dense head ending in a single sigmoid unit for the binary label.
        Dense(128, activation='relu'),
        Dropout(0.4),
        Dense(1, activation='sigmoid')
    ])
    #model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'], steps_per_execution=32)
    model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
    

model.summary()

In [None]:
from tensorflow.keras.utils import Sequence
import numpy as np   

class DataGenerator(Sequence):
    """Serve ``(x, y)`` mini-batches to Keras by slicing two aligned arrays.

    Args:
        x_set: Model inputs, indexable along the first axis.
        y_set: Labels aligned with ``x_set`` (any array-like, e.g. a pandas
            Series).
        batch_size: Number of samples per batch; the last batch may be
            smaller.
        **kwargs: Forwarded unchanged to the Keras ``Sequence`` base class.
    """

    def __init__(self, x_set, y_set, batch_size, **kwargs):
        # Initialise the Keras base class; newer Keras versions warn when a
        # Sequence subclass skips this call.
        super().__init__(**kwargs)
        # Store plain NumPy arrays so that batches are always sliced by
        # position and reach Keras as arrays, whatever container was passed.
        self.x = np.asarray(x_set)
        self.y = np.asarray(y_set)
        self.batch_size = batch_size

    def __len__(self):
        """Return the number of batches per epoch: ceil(samples / batch_size)."""
        return int(np.ceil(len(self.x) / float(self.batch_size)))

    def __getitem__(self, idx):
        """Return batch number ``idx`` as an ``(inputs, labels)`` tuple."""
        start = idx * self.batch_size
        stop = start + self.batch_size
        return self.x[start:stop], self.y[start:stop]

# Wrap the padded training and test arrays, with their labels, in batch generators.
train_gen = DataGenerator(X_train_pad, y_train, batch_size)
test_gen = DataGenerator(X_test_pad, y_test, batch_size)

In [None]:
# Train for 8 epochs, reading batches from train_gen.
# NOTE(review): validation_data is the *test* generator, while the X_val /
# y_val split created earlier is not used in the visible code; confirm
# whether the validation split was meant to be used here.
#history = model.fit(X_train_pad, y_train, epochs=1,batch_size=BATCH_SIZE, validation_split=0.2)
history = model.fit(train_gen, epochs=8, validation_data=test_gen)

In [None]:
# Save the model to 'model3.h5'.
model.save('model3.h5')