In [137]:
#Enable autoreloading of imported modules
%load_ext autoreload
%autoreload 2

#Import required packages
import sys,os
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import kagglehub
from sklearn.feature_extraction.text import TfidfVectorizer

#Add the repo root (one level up from this notebook) to sys.path
sys.path.insert(0, os.path.abspath("../"))


The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


### 1. Load data

In [138]:
#Download latest version of dataset
print("Load or download dataset...")
path = kagglehub.dataset_download("clmentbisaillon/fake-and-real-news-dataset") #path to downloaded dataset
#   (if already downloaded, will not download again)

fake_path=os.path.join(path, "Fake.csv")    #path to dataset with true news
true_path=os.path.join(path, "True.csv")    #path to dataset with fake news
print("Done")

Load or download dataset...
Done


In [139]:
#Read into dataframes
fake_df=pd.read_csv(fake_path)
true_df=pd.read_csv(true_path)

#Label data (1=true, 0=fakenews)
true_df['label']=1
fake_df['label']=0

#Join dataframes
df=pd.concat([true_df, fake_df])


In [140]:
df

Unnamed: 0,title,text,subject,date,label
0,"As U.S. budget fight looms, Republicans flip t...",WASHINGTON (Reuters) - The head of a conservat...,politicsNews,"December 31, 2017",1
1,U.S. military to accept transgender recruits o...,WASHINGTON (Reuters) - Transgender people will...,politicsNews,"December 29, 2017",1
2,Senior U.S. Republican senator: 'Let Mr. Muell...,WASHINGTON (Reuters) - The special counsel inv...,politicsNews,"December 31, 2017",1
3,FBI Russia probe helped by Australian diplomat...,WASHINGTON (Reuters) - Trump campaign adviser ...,politicsNews,"December 30, 2017",1
4,Trump wants Postal Service to charge 'much mor...,SEATTLE/WASHINGTON (Reuters) - President Donal...,politicsNews,"December 29, 2017",1
...,...,...,...,...,...
23476,McPain: John McCain Furious That Iran Treated ...,21st Century Wire says As 21WIRE reported earl...,Middle-east,"January 16, 2016",0
23477,JUSTICE? Yahoo Settles E-mail Privacy Class-ac...,21st Century Wire says It s a familiar theme. ...,Middle-east,"January 16, 2016",0
23478,Sunnistan: US and Allied ‘Safe Zone’ Plan to T...,Patrick Henningsen 21st Century WireRemember ...,Middle-east,"January 15, 2016",0
23479,How to Blow $700 Million: Al Jazeera America F...,21st Century Wire says Al Jazeera America will...,Middle-east,"January 14, 2016",0


### 2. Preprocess data

In [141]:
max_terms=1000 #Maximal number of terms to consider

corpus=list(df['title'])
vectorizer=TfidfVectorizer(max_features=1000)
X_sparse=vectorizer.fit_transform(corpus)

#Convert to numpy
X=X_sparse.toarray()
Y=df['label'].to_numpy()

### 3. First (Test) Training and Evaluation

In [142]:
#Split into train and test data
from courselib.utils.splits import train_test_split_np

training_data_fraction=.8

X_train,Y_train, X_test, Y_test=train_test_split_np(X,Y, training_data_fraction)

In [143]:
# Initialize model
from courselib.models.glm import LogisticRegression
from courselib.optimizers import GDOptimizer

lr=10 # learining rate
w=np.zeros(X.shape[1]) # initial weights
b=0 # initial bias

optimizer=GDOptimizer(learning_rate=lr)
model=LogisticRegression(w,b,optimizer)

In [144]:
#Train model
epochs=100 # number of epochs
bs=100 # batch size

model.fit(X_train, Y_train, num_epochs=epochs, batch_size=bs)

In [145]:
#Evaluate
from courselib.utils.metrics import binary_accuracy

print(f"Final train accuracy {binary_accuracy(y_pred=model(X_train), y_true=Y_train, class_labels=[0,1]): .2f}%")
print(f"Final test accuracy {binary_accuracy(y_pred=model(X_test), y_true=Y_test, class_labels=[0,1]): .2f}%")

Final train accuracy  95.34%
Final test accuracy  94.16%
