In [20]:
#Enable autoreloading of imported modules
%load_ext autoreload
%autoreload 2

#Import required packages
import sys,os
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import kagglehub
from sklearn.feature_extraction.text import TfidfVectorizer

#Add the repo root (one level up from this notebook) to sys.path
sys.path.insert(0, os.path.abspath("../"))

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [21]:
#Fetch the NLTK resources needed by this notebook
#   (packages that are already installed are only reported as up-to-date)
import nltk

download_ok=None
for resource_name in ('wordnet', 'averaged_perceptron_tagger_eng', 'punkt_tab'):
    download_ok=nltk.download(resource_name)

download_ok #status returned by the last download call

[nltk_data] Downloading package wordnet to C:\Users\Philipp
[nltk_data]     Hoffmann\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger_eng to
[nltk_data]     C:\Users\Philipp Hoffmann\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger_eng is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package punkt_tab to C:\Users\Philipp
[nltk_data]     Hoffmann\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


True

In [22]:
#Download latest version of dataset
print("Load or download dataset...")
path = kagglehub.dataset_download("clmentbisaillon/fake-and-real-news-dataset") #path to downloaded dataset
#   (if already downloaded, will not download again)

fake_path=os.path.join(path, "Fake.csv")    #path to dataset with fake news
true_path=os.path.join(path, "True.csv")    #path to dataset with true news

#Read into dataframes
print("Loading fake.csv ...")
fake_df=pd.read_csv(fake_path)
print("Loading true.csv ...")
true_df=pd.read_csv(true_path)

#Label data (1=true, 0=fakenews)
true_df['label']=1
fake_df['label']=0

#Join dataframes
#   ignore_index=True gives the combined frame a fresh 0..n-1 index; without it both
#   halves keep their own row labels and the index of df contains duplicates
df=pd.concat([true_df, fake_df], ignore_index=True)
print("Done")

Load or download dataset...
Loading fake.csv ...
Loading true.csv ...
Done


In [23]:
from courselib.utils.splits import train_test_split

#Fraction of the data passed to train_test_split as the training share
training_data_fraction=0.8

#Split the labelled dataframe; the 'label' column holds the class
#   (train_df and test_df are indexed by column name in the following cells)
df_, train_df, test_df=train_test_split(
    df,
    training_data_fraction=training_data_fraction,
    class_column_name='label',
    return_numpy=False,
)

In [24]:
from courselib.utils.preprocessing import labels_encoding

#Labels as NumPy arrays taken from the 'label' column (values 0 and 1)
Y_train=train_df['label'].to_numpy()
Y_test=test_df['label'].to_numpy()

#Same labels with every 0 replaced by -1 (passed with class_labels=[-1,1] in the SVM cells)
Y_train_neg=np.where(Y_train==0, -1, Y_train)
Y_test_neg=np.where(Y_test==0, -1, Y_test)

#Labels run through labels_encoding with the class list [0,1]
Y_train_enc=labels_encoding(Y_train, labels=[0,1])
Y_test_enc=labels_encoding(Y_test, labels=[0,1])

In [25]:
#Display the training labels as a quick sanity check
Y_train

array([1, 1, 1, ..., 1, 0, 0], shape=(35918,))

In [26]:
from courselib.optimizers import GDOptimizer
from extensions.vectorization_and_tokenization import multi_column_vectorizer

max_features=1000       #passed as max_features_per_column
col_names=['title']     #dataframe columns that are vectorized

#Fit the vectorizer once, on the training data only, and apply it to the test data
vectorizer=multi_column_vectorizer(col_names=col_names, max_features_per_column=max_features, tokenizer=None)
X_train=vectorizer.fit_transform(train_df, col_names=col_names)
X_test=vectorizer.transform(test_df, col_names=col_names)

#Non-sparse versions of the same features: reuse the already fitted vectorizer via
#   transform() instead of fitting it a second time, so both representations are
#   guaranteed to come from the same fit
X_train_np=vectorizer.transform(train_df, col_names=col_names, sparse=False)
X_test_np=vectorizer.transform(test_df, col_names=col_names, sparse=False)

In [27]:
from courselib.models.glm import LogisticRegression
from courselib.utils.metrics import binary_accuracy

#Training settings shared by the model cells below
epochs=100              #number of epochs passed to fit()
lr=0.01                 #learning rate
bs=X_train.shape[0]     #batch size = number of training rows (full batch)

#Single GDOptimizer instance that every model cell below receives
optimizer=GDOptimizer(lr)

Original logistic regression (courselib `LogisticRegression`) on dense NumPy features

In [28]:
#Set up the model with all-zero weights (one per feature column) and zero bias
w=np.zeros(X_train_np.shape[1])
b=0
model=LogisticRegression(w, b, optimizer=optimizer)

#Fit on the dense training features using the shared epochs / batch size
model.fit(X_train_np, y=Y_train, num_epochs=epochs, batch_size=bs)

#Accuracy on the training and on the test split
train_accuracy=binary_accuracy(
    y_pred=model.decision_function(X_train_np),
    y_true=Y_train,
    class_labels=[0,1],
)
test_accuracy=binary_accuracy(
    y_pred=model.decision_function(X_test_np),
    y_true=Y_test,
    class_labels=[0,1],
)
print(f'Train accuracy: {train_accuracy}%')
print(f'Test accuracy: {test_accuracy}%')


Train accuracy: 52.39434266941366%
Test accuracy: 51.915367483296215%


New logistic regression (`LogisticRegression_S`) on dense NumPy features

In [29]:
from extensions.sparse_array_compatible_models import LogisticRegression_S

#Set up the model with all-zero weights (one per feature column) and zero bias
w=np.zeros(X_train_np.shape[1])
b=0
model=LogisticRegression_S(w, b, optimizer=optimizer)

#Fit on the dense training features using the shared epochs / batch size
model.fit(X_train_np, y=Y_train, num_epochs=epochs, batch_size=bs)

#Accuracy on the training and on the test split
train_accuracy=binary_accuracy(
    y_pred=model.decision_function(X_train_np),
    y_true=Y_train,
    class_labels=[0,1],
)
test_accuracy=binary_accuracy(
    y_pred=model.decision_function(X_test_np),
    y_true=Y_test,
    class_labels=[0,1],
)
print(f'Train accuracy: {train_accuracy}%')
print(f'Test accuracy: {test_accuracy}%')

Train accuracy: 52.39434266941366%
Test accuracy: 51.915367483296215%


New logistic regression (`LogisticRegression_S`) on sparse features

In [30]:
#Set up the model with all-zero weights (one per feature column) and zero bias
w=np.zeros(X_train.shape[1])
b=0
model=LogisticRegression_S(w, b, optimizer=optimizer)

#Fit directly on the sparse training features
model.fit(X_train, y=Y_train, num_epochs=epochs, batch_size=bs)

#Accuracy on the training and on the test split
train_accuracy=binary_accuracy(
    y_pred=model.decision_function(X_train),
    y_true=Y_train,
    class_labels=[0,1],
)
test_accuracy=binary_accuracy(
    y_pred=model.decision_function(X_test),
    y_true=Y_test,
    class_labels=[0,1],
)
print(f'Train accuracy: {train_accuracy}%')
print(f'Test accuracy: {test_accuracy}%')

Train accuracy: 52.39434266941366%
Test accuracy: 51.915367483296215%


Original linear SVM (courselib `LinearSVM`) on dense NumPy features

In [31]:
from courselib.models.svm import LinearSVM

#All-zero initial weights and zero bias; the weight vector is sized from X_train_np,
#   the matrix that is actually passed to fit() in this cell
w=np.zeros(X_train_np.shape[1]) #initial weights
b=0 #initial bias
model=LinearSVM(w,b, optimizer=optimizer)
#Train on the dense features with the -1/1 labels
model.fit(X_train_np, y=Y_train_neg, num_epochs=epochs, batch_size=bs)
#Evaluate on the training and on the test split
train_accuracy=binary_accuracy(y_pred=model.decision_function(X_train_np), y_true=Y_train_neg, class_labels=[-1,1])
test_accuracy=binary_accuracy(y_pred=model.decision_function(X_test_np), y_true=Y_test_neg, class_labels=[-1,1])
print(f'Train accuracy: {train_accuracy}%')
print(f'Test accuracy: {test_accuracy}%')

Train accuracy: 52.39434266941366%
Test accuracy: 51.915367483296215%


New linear SVM (`LinearSVM_S`) on dense NumPy features

In [32]:
from extensions.sparse_array_compatible_models import LinearSVM_S

#All-zero initial weights and zero bias; the weight vector is sized from X_train_np,
#   the matrix that is actually passed to fit() in this cell
w=np.zeros(X_train_np.shape[1]) #initial weights
b=0 #initial bias
model=LinearSVM_S(w,b, optimizer=optimizer)
#Train on the dense features with the -1/1 labels
model.fit(X_train_np, y=Y_train_neg, num_epochs=epochs, batch_size=bs)
#Evaluate on the training and on the test split
train_accuracy=binary_accuracy(y_pred=model.decision_function(X_train_np), y_true=Y_train_neg, class_labels=[-1,1])
test_accuracy=binary_accuracy(y_pred=model.decision_function(X_test_np), y_true=Y_test_neg, class_labels=[-1,1])
print(f'Train accuracy: {train_accuracy}%')
print(f'Test accuracy: {test_accuracy}%')

Train accuracy: 52.39434266941366%
Test accuracy: 51.915367483296215%


New linear SVM (`LinearSVM_S`) on sparse features

In [33]:
#Set up the model with all-zero weights (one per feature column) and zero bias
w=np.zeros(X_train.shape[1])
b=0
model=LinearSVM_S(w, b, optimizer=optimizer)

#Fit directly on the sparse training features with the -1/1 labels
model.fit(X_train, y=Y_train_neg, num_epochs=epochs, batch_size=bs)

#Accuracy on the training and on the test split
train_accuracy=binary_accuracy(
    y_pred=model.decision_function(X_train),
    y_true=Y_train_neg,
    class_labels=[-1,1],
)
test_accuracy=binary_accuracy(
    y_pred=model.decision_function(X_test),
    y_true=Y_test_neg,
    class_labels=[-1,1],
)
print(f'Train accuracy: {train_accuracy}%')
print(f'Test accuracy: {test_accuracy}%')

Train accuracy: 52.39434266941366%
Test accuracy: 51.915367483296215%


### With z-score normalization (standardized features)

In [34]:
import scipy.sparse as sp
from courselib.utils.normalization import standardize
from extensions.normalization_ext import standardize_sparse_matrix

#Number of training rows; needed to cut the stacked matrices apart again
k=X_train.shape[0]

#Stack training and test rows so that each representation is standardized in one call
#   NOTE(review): the standardization presumably uses statistics of the stacked data,
#   i.e. including the test rows - confirm that this is intended
X=sp.vstack([X_train, X_test])
X_np=np.vstack([X_train_np, X_test_np])

X, offset=standardize_sparse_matrix(X)  #offset is passed to the *_S models in the cells below
X_np=standardize(X_np)

#Cut the standardized matrices back into training and test rows
#   NOTE(review): this overwrites X_train, X_test, X_train_np and X_test_np, so running
#   this cell a second time standardizes the already standardized data again
X_train, X_test=X[:k], X[k:]
X_train_np, X_test_np=X_np[:k], X_np[k:]


Original logistic regression (courselib `LogisticRegression`) on standardized dense features

In [35]:
#initialize model
#   the weight vector is sized from X_train_np, the matrix that is actually passed to fit()
w=np.zeros(X_train_np.shape[1]) #initial weights
b=0 #initial bias
model=LogisticRegression(w,b, optimizer=optimizer)

#Train model on the standardized dense features
model.fit(X_train_np, y=Y_train, num_epochs=epochs, batch_size=bs)
#Evaluate on the training and on the test split
train_accuracy=binary_accuracy(y_pred=model.decision_function(X_train_np), y_true=Y_train, class_labels=[0,1])
test_accuracy=binary_accuracy(y_pred=model.decision_function(X_test_np), y_true=Y_test, class_labels=[0,1])
print(f'Train accuracy: {train_accuracy}%')
print(f'Test accuracy: {test_accuracy}%')

Train accuracy: 92.42997939751658%
Test accuracy: 92.13808463251671%


New logistic regression (`LogisticRegression_S`) on standardized sparse features

In [36]:
#Set up the model with all-zero weights and zero bias; offset is the second value
#   returned by standardize_sparse_matrix
w=np.zeros(X_train.shape[1])
b=0
model=LogisticRegression_S(w, b, optimizer=optimizer, offset=offset)

#Fit on the standardized sparse training features
model.fit(X_train, y=Y_train, num_epochs=epochs, batch_size=bs)

#Accuracy on the training and on the test split
train_accuracy=binary_accuracy(
    y_pred=model.decision_function(X_train),
    y_true=Y_train,
    class_labels=[0,1],
)
test_accuracy=binary_accuracy(
    y_pred=model.decision_function(X_test),
    y_true=Y_test,
    class_labels=[0,1],
)
print(f'Train accuracy: {train_accuracy}%')
print(f'Test accuracy: {test_accuracy}%')

Train accuracy: 92.42997939751658%
Test accuracy: 92.13808463251671%


Original linear SVM (courselib `LinearSVM`) on standardized dense features

In [37]:
#All-zero initial weights and zero bias; the weight vector is sized from X_train_np,
#   the matrix that is actually passed to fit() in this cell
w=np.zeros(X_train_np.shape[1]) #initial weights
b=0 #initial bias
model=LinearSVM(w,b, optimizer=optimizer)
#Train on the standardized dense features with the -1/1 labels
model.fit(X_train_np, y=Y_train_neg, num_epochs=epochs, batch_size=bs)
#Evaluate on the training and on the test split
train_accuracy=binary_accuracy(y_pred=model.decision_function(X_train_np), y_true=Y_train_neg, class_labels=[-1,1])
test_accuracy=binary_accuracy(y_pred=model.decision_function(X_test_np), y_true=Y_test_neg, class_labels=[-1,1])
print(f'Train accuracy: {train_accuracy}%')
print(f'Test accuracy: {test_accuracy}%')

Train accuracy: 94.71852553037475%
Test accuracy: 93.98663697104676%


New linear SVM (`LinearSVM_S`) on standardized sparse features

In [38]:
from extensions.sparse_array_compatible_models import LinearSVM_S

#Set up the model with all-zero weights and zero bias; offset is the second value
#   returned by standardize_sparse_matrix
w=np.zeros(X_train.shape[1])
b=0
model=LinearSVM_S(w, b, optimizer=optimizer, offset=offset)

#Fit on the standardized sparse training features with the -1/1 labels
model.fit(X_train, y=Y_train_neg, num_epochs=epochs, batch_size=bs)

#Accuracy on the training and on the test split
train_accuracy=binary_accuracy(
    y_pred=model.decision_function(X_train),
    y_true=Y_train_neg,
    class_labels=[-1,1],
)
test_accuracy=binary_accuracy(
    y_pred=model.decision_function(X_test),
    y_true=Y_test_neg,
    class_labels=[-1,1],
)
print(f'Train accuracy: {train_accuracy}%')
print(f'Test accuracy: {test_accuracy}%')

Train accuracy: 94.71852553037475%
Test accuracy: 93.98663697104676%
