In [1]:
#Enable autoreloading of imported modules
%load_ext autoreload
%autoreload 2

#Import required packages
import sys,os
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import kagglehub
from sklearn.feature_extraction.text import TfidfVectorizer

#Add the repo root (one level up from this notebook) to sys.path
sys.path.insert(0, os.path.abspath("../"))

In [2]:
#Install necessary resources from nltk
import nltk

nltk.download('wordnet')
nltk.download('averaged_perceptron_tagger_eng')
nltk.download('stopwords')
nltk.download('punkt_tab')


[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\phili\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger_eng to
[nltk_data]     C:\Users\phili\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger_eng is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\phili\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\phili\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


True

### Load data

In [35]:
#Download latest version of dataset
print("Load or download dataset...")
path = kagglehub.dataset_download("clmentbisaillon/fake-and-real-news-dataset") #path to downloaded dataset
#   (if already downloaded, will not download again)

fake_path=os.path.join(path, "Fake.csv")    #path to dataset with true news
true_path=os.path.join(path, "True.csv")    #path to dataset with fake news

#Read into dataframes
print("Loading fake.csv ...")
fake_df=pd.read_csv(fake_path)
print("Loading true.csv ...")
true_df=pd.read_csv(true_path)

#Label data (1=true, 0=fakenews)
true_df['label']=1
fake_df['label']=0

#Join dataframes
df=pd.concat([true_df, fake_df])
print("Done")


#Convert labels to numpy
Y=df['label'].to_numpy()

Load or download dataset...
Loading fake.csv ...
Loading true.csv ...
Done


## 1.  Examples: Tokenizers

In [5]:
from vectorization_and_tokenization import stemming_tokenizer, basic_word_tokenizer, lemmatization_tokenizer

In [5]:
text=df['text'].iloc[0]
text

'WASHINGTON (Reuters) - The head of a conservative Republican faction in the U.S. Congress, who voted this month for a huge expansion of the national debt to pay for tax cuts, called himself a “fiscal conservative” on Sunday and urged budget restraint in 2018. In keeping with a sharp pivot under way among Republicans, U.S. Representative Mark Meadows, speaking on CBS’ “Face the Nation,” drew a hard line on federal spending, which lawmakers are bracing to do battle over in January. When they return from the holidays on Wednesday, lawmakers will begin trying to pass a federal budget in a fight likely to be linked to other issues, such as immigration policy, even as the November congressional election campaigns approach in which Republicans will seek to keep control of Congress. President Donald Trump and his Republicans want a big budget increase in military spending, while Democrats also want proportional increases for non-defense “discretionary” spending on programs that support educat

In [6]:
print(basic_word_tokenizer(text))
print(lemmatization_tokenizer(text))
print(stemming_tokenizer(text))

['washington', 'reuters', 'the', 'head', 'of', 'a', 'conservative', 'republican', 'faction', 'in', 'the', 'u.s.', 'congress', 'who', 'voted', 'this', 'month', 'for', 'a', 'huge', 'expansion', 'of', 'the', 'national', 'debt', 'to', 'pay', 'for', 'tax', 'cuts', 'called', 'himself', 'a', 'fiscal', 'conservative', 'on', 'sunday', 'and', 'urged', 'budget', 'restraint', 'in', '2018', 'in', 'keeping', 'with', 'a', 'sharp', 'pivot', 'under', 'way', 'among', 'republicans', 'u.s.', 'representative', 'mark', 'meadows', 'speaking', 'on', 'cbs', 'face', 'the', 'nation', 'drew', 'a', 'hard', 'line', 'on', 'federal', 'spending', 'which', 'lawmakers', 'are', 'bracing', 'to', 'do', 'battle', 'over', 'in', 'january', 'when', 'they', 'return', 'from', 'the', 'holidays', 'on', 'wednesday', 'lawmakers', 'will', 'begin', 'trying', 'to', 'pass', 'a', 'federal', 'budget', 'in', 'a', 'fight', 'likely', 'to', 'be', 'linked', 'to', 'other', 'issues', 'such', 'as', 'immigration', 'policy', 'even', 'as', 'the', 'n

#### with stopword removal:

In [7]:
from vectorization_and_tokenization import STOPWORDS_EN

print(basic_word_tokenizer(text, stop_words=STOPWORDS_EN))
print(lemmatization_tokenizer(text, stop_words=STOPWORDS_EN))
print(stemming_tokenizer(text, stop_words=STOPWORDS_EN))

['washington', 'reuters', 'head', 'conservative', 'republican', 'faction', 'u.s.', 'congress', 'voted', 'month', 'huge', 'expansion', 'national', 'debt', 'pay', 'tax', 'cuts', 'called', 'fiscal', 'conservative', 'sunday', 'urged', 'budget', 'restraint', '2018', 'keeping', 'sharp', 'pivot', 'way', 'among', 'republicans', 'u.s.', 'representative', 'mark', 'meadows', 'speaking', 'cbs', 'face', 'nation', 'drew', 'hard', 'line', 'federal', 'spending', 'lawmakers', 'bracing', 'battle', 'january', 'return', 'holidays', 'wednesday', 'lawmakers', 'begin', 'trying', 'pass', 'federal', 'budget', 'fight', 'likely', 'linked', 'issues', 'immigration', 'policy', 'even', 'november', 'congressional', 'election', 'campaigns', 'approach', 'republicans', 'seek', 'keep', 'control', 'congress', 'president', 'donald', 'trump', 'republicans', 'want', 'big', 'budget', 'increase', 'military', 'spending', 'democrats', 'also', 'want', 'proportional', 'increases', 'non-defense', 'discretionary', 'spending', 'progr

## 2. Examples: Vectorization

In [4]:
from vectorization_and_tokenization import vectorize_text_data
from courselib.utils.normalization import standardize

In [9]:
max_features=5

X, feature_names=vectorize_text_data(df, col_names=['title', 'text'], max_features_per_column=max_features, tokenizer=None)

In [10]:
X

array([[0.        , 0.        , 0.        , ..., 0.21994713, 0.72057681,
        0.50237702],
       [0.        , 0.        , 1.        , ..., 0.14057239, 0.76537666,
        0.50436112],
       [0.        , 0.        , 0.        , ..., 0.40430209, 0.73050876,
        0.26677694],
       ...,
       [0.78321562, 0.        , 0.62175019, ..., 0.29624731, 0.70517685,
        0.36502427],
       [0.        , 0.        , 1.        , ..., 0.24418239, 0.64153826,
        0.41431543],
       [0.        , 1.        , 0.        , ..., 0.19803696, 0.78045118,
        0.43402373]], shape=(44898, 10))

In [11]:
feature_names

{'title': array(['in', 'of', 'to', 'trump', 'video'], dtype=object),
 'text': array(['and', 'in', 'of', 'the', 'to'], dtype=object)}

####  with bag of words vectorization:

In [12]:
X, feature_names=vectorize_text_data(df,vectorization='bag_of_words', col_names=['title'], max_features_per_column=max_features)
X

array([[0, 0, 0, 0, 0],
       [0, 0, 1, 0, 0],
       [0, 0, 0, 0, 0],
       ...,
       [1, 0, 1, 0, 0],
       [0, 0, 1, 0, 0],
       [0, 1, 0, 0, 0]], shape=(44898, 5))

#### with normalization of output vectors:

In [13]:
X, feature_names=vectorize_text_data(df, col_names=['title', 'text'], max_features_per_column=max_features, normalizer=standardize)
X

array([[-0.48115251, -0.44009197, -0.67932176, ..., -0.58940876,
         0.1917217 ,  0.70638113],
       [-0.48115251, -0.44009197,  1.90136318, ..., -1.14378776,
         0.45105625,  0.71789513],
       [-0.48115251, -0.44009197, -0.67932176, ...,  0.69818631,
         0.2492151 , -0.66083466],
       ...,
       [ 1.88981099, -0.44009197,  0.92521959, ..., -0.05650344,
         0.10257537, -0.09069354],
       [-0.48115251, -0.44009197,  1.90136318, ..., -0.42014181,
        -0.26581171,  0.19534906],
       [-0.48115251,  2.79528734, -0.67932176, ..., -0.74243648,
         0.53831866,  0.30971875]], shape=(44898, 10))

#### with stop word removal:

In [14]:
X, feature_names=vectorize_text_data(df, col_names=['title'], max_features_per_column=max_features,  stop_words='english', normalizer=standardize)
feature_names


{'title': array(['hillary', 'obama', 'says', 'trump', 'video'], dtype=object)}

#### with custom tokenizer:

In [15]:
X, feature_names=vectorize_text_data(df, col_names=['title'], max_features_per_column=max_features,  stop_words='english', normalizer=standardize, tokenizer=stemming_tokenizer)
feature_names

{'title': array(["'s", 'say', 'trump', 'u.s.', 'video'], dtype=object)}

## 3. Comparison of different tokenizers for vectorization

In [42]:
from courselib.utils.splits import train_test_split_np
from courselib.utils.metrics import binary_accuracy
from courselib.models.glm import LogisticRegression
from courselib.models.svm import LinearSVM
from courselib.optimizers import GDOptimizer


In [75]:
tokenizers={'None': None, 'basic': basic_word_tokenizer, 'lemmatization': lemmatization_tokenizer, 'stemming': stemming_tokenizer}


lr=0.01 # learining rate
max_features=100 #maximal features to consider
columns=['title']
last_layers=[10,2]


epochs=100 # number of epochs
bs=100 # batch size
training_data_fraction=.8

optimizer=GDOptimizer(learning_rate=lr)
    

In [76]:
from courselib.utils.splits import train_test_split
from courselib.utils.preprocessing import labels_encoding


training_data_fraction=.8
df_, train_df, test_df=train_test_split(df, training_data_fraction=training_data_fraction, class_column_name='label', return_numpy=False)
Y_train=train_df['label'].to_numpy()
Y_test=test_df['label'].to_numpy()
Y_train_neg=Y_train.copy()
Y_train_neg[Y_train_neg==0]=-1
Y_test_neg=Y_test.copy()
Y_test_neg[Y_test_neg==0]=-1
Y_train_encoded=labels_encoding(Y_train, neg_value=0)
Y_test_encoded=labels_encoding(Y_test, neg_value=0)


In [77]:
import time
from vectorization_and_tokenization import multi_column_vectorizer


vect_start=time.time()
vectorizer=multi_column_vectorizer(col_names=columns,vectorization='bag_of_words',  max_features_per_column=max_features,ngram_range=(1,1), stop_words=None, tokenizer=None)
X_train=vectorizer.fit_transform(train_df, sparse=False)
X_test=vectorizer.transform(test_df, sparse=False)

vect_end=time.time()

In [78]:
np.vstack([X_train, X_test])

array([[0, 0, 0, ..., 0, 1, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]], shape=(44898, 100))

In [81]:
import time
from vectorization_and_tokenization import multi_column_vectorizer
from courselib.utils.metrics import accuracy
from courselib.models.nn import MLP

results=[]
for ngrams in [(1,1)]:#,(1,2),(1,3)]:
    for vectorization in ['tf-idf']:#, 'bag_of_words']:
        for stop_words in [None]:#, 'english']:
            for tok_name, tok in [('None',None)]: #tokenizers.items():
                #Vectorize:
                vect_start=time.time()
                vectorizer=multi_column_vectorizer(col_names=columns,vectorization=vectorization,  max_features_per_column=max_features,ngram_range=ngrams, stop_words=stop_words, tokenizer=tok)
                X_train=vectorizer.fit_transform(train_df,sparse=False)
                X_test=vectorizer.transform(test_df, sparse=False)
                X=np.vstack([X_train, X_test])
                mean=np.mean(X, axis=0)
                std=np.std(X, axis=0)
                X_train=(X_train-mean)/std
                X_test=(X_test-mean)/std
                

                vect_end=time.time()



                #Logistic Regression
                # Initialize model
                w=np.zeros(X_train.shape[1]) # initial weights
                b=0 # initial bias
                model_lr=LogisticRegression(w,b,optimizer)
                
                #Train model
                train_start=time.time()
                model_lr.fit(X_train, Y_train, num_epochs=epochs, batch_size=bs)
                train_end=time.time()
                
                #Evaluate:
                train_accuracy=np.round(binary_accuracy(y_pred=model_lr(X_train), y_true=Y_train, class_labels=[0,1]), 4)
                test_accuracy=np.round(binary_accuracy(y_pred=model_lr(X_test), y_true=Y_test, class_labels=[0,1]), 4)
                
                result_lr={'model': 'LogisticRegression',
                    'vectorization type': vectorization,
                    'tokenizer': tok_name,
                    'stop_words': stop_words,
                    'ngram': ngrams,
                    'train accuracy [%]': train_accuracy,
                    'test accuracy [%]': test_accuracy,
                    'vectorization time [s]': np.round(vect_end-vect_start,2),
                    'training time [s]': np.round(train_end-train_start,2)
                    }
                results.append(result_lr)


                #SVM
                # Initialize model
                w=np.zeros(X_train.shape[1]) # initial weights
                b=0 # initial bias
                model_svm=LinearSVM(w,b,optimizer)
                
                #Train model
                train_start=time.time()
                model_svm.fit(X_train, Y_train_neg, num_epochs=epochs, batch_size=bs)
                train_end=time.time()
                
                #Evaluate:
                train_accuracy=np.round(binary_accuracy(y_pred=model_svm(X_train), y_true=Y_train_neg, class_labels=[-1,1]), 4)
                test_accuracy=np.round(binary_accuracy(y_pred=model_svm(X_test), y_true=Y_test_neg, class_labels=[-1,1]), 4)
                
                result_svm={'model': 'Linear SVM',
                    'vectorization type': vectorization,
                    'tokenizer': tok_name,
                    'stop_words': stop_words,
                    'ngram': ngrams,
                    'train accuracy [%]': train_accuracy,
                    'test accuracy [%]': test_accuracy,
                    'vectorization time [s]': np.round(vect_end-vect_start,2),
                    'training time [s]': np.round(train_end-train_start,2)
                    }
                results.append(result_svm)
                


                #MLP
                widths=[X_train.shape[1]]+last_layers
                model_mlp=MLP(widths=widths, optimizer=optimizer)
                train_start=time.time()
                model_mlp.fit(X_train, Y_train_encoded, num_epochs=epochs, batch_size=bs)
                train_end=time.time()
                train_accuracy=accuracy(y_pred=model_mlp.decision_function(X_train), y_true=Y_train_encoded)
                test_accuracy=accuracy(y_pred=model_mlp.decision_function(X_test), y_true=Y_test_encoded)

                result_mlp={'model': 'MLP',
                    'vectorization type': vectorization,
                    'tokenizer': tok_name,
                    'stop_words': stop_words,
                    'ngram': ngrams,
                    'train accuracy [%]': train_accuracy,
                    'test accuracy [%]': test_accuracy,
                    'vectorization time [s]': np.round(vect_end-vect_start,2),
                    'training time [s]': np.round(train_end-train_start,2)
                    }
                results.append(result_mlp)
                print(result_mlp)




                

{'model': 'MLP', 'vectorization type': 'tf-idf', 'tokenizer': 'None', 'stop_words': None, 'ngram': (1, 1), 'train accuracy [%]': np.float64(77.39016649033911), 'test accuracy [%]': np.float64(77.8173719376392), 'vectorization time [s]': np.float64(1.01), 'training time [s]': np.float64(20.75)}


In [82]:
result_df=pd.DataFrame(results)
result_df

Unnamed: 0,model,vectorization type,tokenizer,stop_words,ngram,train accuracy [%],test accuracy [%],vectorization time [s],training time [s]
0,LogisticRegression,tf-idf,,,"(1, 1)",88.8301,88.4187,1.01,2.11
1,Linear SVM,tf-idf,,,"(1, 1)",86.8228,86.6481,1.01,2.88
2,MLP,tf-idf,,,"(1, 1)",77.390166,77.817372,1.01,20.75
