In [12]:
#Enable autoreloading of imported modules
%load_ext autoreload
%autoreload 2

#Import required packages
import sys,os
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import kagglehub
from sklearn.feature_extraction.text import TfidfVectorizer

#Add the repo root (one level up from this notebook) to sys.path
sys.path.insert(0, os.path.abspath("../"))

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [13]:
#Download the NLTK resources used later in this notebook
#   (NLTK reports "already up-to-date" and skips the download when a package is present)
import nltk

nltk.download('wordnet')    #WordNet corpus -- presumably needed by lemmatization_tokenizer; confirm in extensions
nltk.download('averaged_perceptron_tagger_eng')    #English POS tagger -- presumably used for POS-aware lemmatization; confirm
nltk.download('punkt_tab')    #Punkt tokenizer tables -- presumably used for word tokenization; confirm


[nltk_data] Downloading package wordnet to C:\Users\Philipp
[nltk_data]     Hoffmann\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger_eng to
[nltk_data]     C:\Users\Philipp Hoffmann\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger_eng is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package punkt_tab to C:\Users\Philipp
[nltk_data]     Hoffmann\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


True

### Load data

In [14]:
#Download latest version of dataset
print("Load or download dataset...")
path = kagglehub.dataset_download("clmentbisaillon/fake-and-real-news-dataset") #path to downloaded dataset
#   (if already downloaded, will not download again)

fake_path=os.path.join(path, "Fake.csv")    #path to dataset with fake news
true_path=os.path.join(path, "True.csv")    #path to dataset with true news

#Read into dataframes
print("Loading fake.csv ...")
fake_df=pd.read_csv(fake_path)
print("Loading true.csv ...")
true_df=pd.read_csv(true_path)

#Label data (1=true, 0=fakenews)
true_df['label']=1
fake_df['label']=0

#Join dataframes
#   ignore_index=True gives the combined frame a fresh 0..n-1 index; without it both
#   source frames keep their own 0-based index and every label appears twice,
#   which makes label-based lookups (df.loc[i]) ambiguous.
df=pd.concat([true_df, fake_df], ignore_index=True)
print("Done")


Load or download dataset...
Loading fake.csv ...
Loading true.csv ...
Done


In [15]:
from courselib.utils.splits import train_test_split

#Fraction of the rows that goes into the training set; the remainder is held out for testing
training_data_fraction=0.8

#Split into train and test frames (return_numpy=False keeps them as DataFrames)
df_, train_df, test_df=train_test_split(
    df,
    training_data_fraction=training_data_fraction,
    class_column_name='label',
    return_numpy=False,
)

In [16]:
from courselib.utils.preprocessing import labels_encoding

#Class labels as plain numpy vectors (1=true, 0=fakenews)
Y_train, Y_test=(frame['label'].to_numpy() for frame in (train_df, test_df))

#Same labels with the 0 class remapped to -1 (the LinearSVM runs are trained on these)
Y_train_neg=np.where(Y_train==0, -1, Y_train)
Y_test_neg=np.where(Y_test==0, -1, Y_test)

#Encoded labels produced by labels_encoding (the MLP runs are trained on these)
Y_train_enc, Y_test_enc=(labels_encoding(y, labels=[0,1]) for y in (Y_train, Y_test))


## 1. Examples: Tokenizers

In [57]:
from extensions.vectorization_and_tokenization import stemming_tokenizer, basic_word_tokenizer, lemmatization_tokenizer

In [18]:
text=df['text'].iloc[0]
text

'WASHINGTON (Reuters) - The head of a conservative Republican faction in the U.S. Congress, who voted this month for a huge expansion of the national debt to pay for tax cuts, called himself a “fiscal conservative” on Sunday and urged budget restraint in 2018. In keeping with a sharp pivot under way among Republicans, U.S. Representative Mark Meadows, speaking on CBS’ “Face the Nation,” drew a hard line on federal spending, which lawmakers are bracing to do battle over in January. When they return from the holidays on Wednesday, lawmakers will begin trying to pass a federal budget in a fight likely to be linked to other issues, such as immigration policy, even as the November congressional election campaigns approach in which Republicans will seek to keep control of Congress. President Donald Trump and his Republicans want a big budget increase in military spending, while Democrats also want proportional increases for non-defense “discretionary” spending on programs that support educat

In [19]:
print(basic_word_tokenizer(text))
print(lemmatization_tokenizer(text))
print(stemming_tokenizer(text))

['washington', 'reuters', 'the', 'head', 'of', 'a', 'conservative', 'republican', 'faction', 'in', 'the', 'u.s.', 'congress', 'who', 'voted', 'this', 'month', 'for', 'a', 'huge', 'expansion', 'of', 'the', 'national', 'debt', 'to', 'pay', 'for', 'tax', 'cuts', 'called', 'himself', 'a', 'fiscal', 'conservative', 'on', 'sunday', 'and', 'urged', 'budget', 'restraint', 'in', '2018', 'in', 'keeping', 'with', 'a', 'sharp', 'pivot', 'under', 'way', 'among', 'republicans', 'u.s.', 'representative', 'mark', 'meadows', 'speaking', 'on', 'cbs', 'face', 'the', 'nation', 'drew', 'a', 'hard', 'line', 'on', 'federal', 'spending', 'which', 'lawmakers', 'are', 'bracing', 'to', 'do', 'battle', 'over', 'in', 'january', 'when', 'they', 'return', 'from', 'the', 'holidays', 'on', 'wednesday', 'lawmakers', 'will', 'begin', 'trying', 'to', 'pass', 'a', 'federal', 'budget', 'in', 'a', 'fight', 'likely', 'to', 'be', 'linked', 'to', 'other', 'issues', 'such', 'as', 'immigration', 'policy', 'even', 'as', 'the', 'n

#### with stop word removal:

In [20]:
from extensions.vectorization_and_tokenization import ENGLISH_STOP_WORDS

print(basic_word_tokenizer(text, stop_words=ENGLISH_STOP_WORDS))
print(lemmatization_tokenizer(text, stop_words=ENGLISH_STOP_WORDS))
print(stemming_tokenizer(text, stop_words=ENGLISH_STOP_WORDS))

['washington', 'reuters', 'head', 'conservative', 'republican', 'faction', 'u.s.', 'congress', 'voted', 'month', 'huge', 'expansion', 'national', 'debt', 'pay', 'tax', 'cuts', 'called', 'fiscal', 'conservative', 'sunday', 'urged', 'budget', 'restraint', '2018', 'keeping', 'sharp', 'pivot', 'way', 'republicans', 'u.s.', 'representative', 'mark', 'meadows', 'speaking', 'cbs', 'face', 'nation', 'drew', 'hard', 'line', 'federal', 'spending', 'lawmakers', 'bracing', 'battle', 'january', 'return', 'holidays', 'wednesday', 'lawmakers', 'begin', 'trying', 'pass', 'federal', 'budget', 'fight', 'likely', 'linked', 'issues', 'immigration', 'policy', 'november', 'congressional', 'election', 'campaigns', 'approach', 'republicans', 'seek', 'control', 'congress', 'president', 'donald', 'trump', 'republicans', 'want', 'big', 'budget', 'increase', 'military', 'spending', 'democrats', 'want', 'proportional', 'increases', 'non-defense', 'discretionary', 'spending', 'programs', 'support', 'education', 'sc

## 2. Examples: Vectorization

In [22]:
from extensions.vectorization_and_tokenization import multi_column_vectorizer

In [58]:
max_features=5
vectorizer=multi_column_vectorizer(col_names=['title', 'text'], max_features_per_column=max_features, tokenizer=None)

X_train=vectorizer.fit_transform(train_df, col_names=['title', 'text'])
X_train

<Compressed Sparse Column sparse matrix of dtype 'float64'
	with 212735 stored elements and shape (35918, 10)>

In [24]:
X_test=vectorizer.transform(test_df, col_names=['title', 'text'])

In [25]:
vectorizer.get_feature_names_out()

{'title': array(['in', 'of', 'to', 'trump', 'video'], dtype=object),
 'text': array(['and', 'in', 'of', 'the', 'to'], dtype=object)}

#### as numpy arrays:

In [26]:
X_train=vectorizer.fit_transform(train_df, col_names=['title'], sparse=False)
X_train

array([[0.        , 0.        , 0.        , 1.        , 0.        ],
       [0.        , 0.        , 0.        , 0.        , 0.        ],
       [0.        , 0.        , 0.        , 1.        , 0.        ],
       ...,
       [0.        , 0.        , 1.        , 0.        , 0.        ],
       [0.        , 0.80104832, 0.59859968, 0.        , 0.        ],
       [0.        , 0.79328016, 0.        , 0.60885679, 0.        ]],
      shape=(35918, 5))

In [27]:
X_test=vectorizer.transform(test_df, col_names=['title'], sparse=False)
X_test

array([[0.69878939, 0.        , 0.        , 0.        , 0.71532747],
       [0.        , 0.        , 0.        , 0.        , 0.        ],
       [0.        , 1.        , 0.        , 0.        , 0.        ],
       ...,
       [0.        , 0.79328016, 0.        , 0.60885679, 0.        ],
       [0.        , 0.        , 0.38093265, 0.78250837, 0.49251493],
       [1.        , 0.        , 0.        , 0.        , 0.        ]],
      shape=(8980, 5))

#### with bag-of-words vectorization:

In [28]:
vectorizer=multi_column_vectorizer(col_names=['title'], vectorization='bag-of-words', max_features_per_column=max_features, tokenizer=None)
X_train=vectorizer.fit_transform(train_df, col_names=['title'])
vectorizer.get_feature_names_out()


{'title': array(['in', 'of', 'to', 'trump', 'video'], dtype=object)}

#### with stop word removal:

In [29]:
vectorizer=multi_column_vectorizer(col_names=['title'], max_features_per_column=max_features, stop_words='english', tokenizer=None)
X_train=vectorizer.fit_transform(train_df, col_names=['title'])
vectorizer.get_feature_names_out()


{'title': array(['hillary', 'obama', 'says', 'trump', 'video'], dtype=object)}

#### with custom tokenizer:

In [30]:
vectorizer=multi_column_vectorizer(col_names=['title'], max_features_per_column=max_features, tokenizer=basic_word_tokenizer)
X_train=vectorizer.fit_transform(train_df, col_names=['title'])
vectorizer.get_feature_names_out()

{'title': array(['in', 'of', 'to', 'trump', 'video'], dtype=object)}

## 3. Comparison of different tokenizers for vectorization

In [35]:
from courselib.utils.metrics import binary_accuracy, accuracy
from extensions.sparse_array_compatible_models import LogisticRegression_S, LinearSVM_S
from courselib.models.nn import MLP
from courselib.optimizers import GDOptimizer


In [54]:
# Experiment grid: run_experiments trains one model per combination of the options below.

# Tokenizers to compare, keyed by the name shown in the results table
# ('None' passes tokenizer=None to the vectorizer -- presumably its built-in tokenization; confirm)
tokenizers={'None': None, 'basic': basic_word_tokenizer, 'lemmatization': lemmatization_tokenizer, 'stemming': stemming_tokenizer}
columns_list=[['title'], ['text'], ['title', 'text']] # dataframe columns to vectorize (one run per entry)
models=['LogisticRegression', 'LinearSVM', 'MLP'] # model names that run_experiments dispatches on

lrs=[0.01] # learning rates
max_features_list=[None] # values passed as max_features_per_column (None = no explicit limit given)
C_list=[10] # C values for SVM
hidden_layer_widths_list=[[10], [10,10]]  # widths of the MLP hidden layers (everything except input and output layer)
vectorizations=['tf-idf', 'bag-of-words'] # vectorization schemes
stop_words_options=[None, 'english'] # stop word settings passed to the vectorizer
ngram_ranges=[(1,1)] # n-gram ranges passed to the vectorizer
epochs_list=[100] # number of epochs
bss=[len(train_df)] # batch sizes; len(train_df) means full batch
z_score_options=[True] # whether to apply z-score normalization after vectorization
sparse_options=[True] # passed as sparse= to the vectorizer (False yields dense numpy arrays)

In [55]:
import time
from IPython.display import clear_output, display
import scipy.sparse as sp
from courselib.utils.normalization import standardize
from extensions.normalization_ext import standardize_sparse_matrix

column_order=['sparse','z-score','columns','model', '# epochs','learning rate','batch size','C','widths','vectorization','tokenizer','stop_words',
    'ngram range','# features','train accuracy [%]','test accuracy [%]','vectorization time [s]','training time [s]'
    ]

def run_experiments(z_score_options,sparse_options, columns_list, models, vectorizations, tokenizers, stop_words_options, ngram_ranges, max_features_list, lrs, bs, epochs_list, save_to_file=False):
    results=[]



    def display_results():
        df_result=pd.DataFrame(results, columns=column_order)
        clear_output(wait=True)
        display(df_result.style.hide(axis="index"))
        
    for sparse in z_score_options: 
        for z_score in z_score_options:
            for columns in columns_list:
                for max_features in max_features_list:
                    if max_features is None and not sparse:
                        continue #Not enough storage
                    for ngrams in ngram_ranges:
                        for vectorization in vectorizations:
                            for stop_words in stop_words_options: 
                                for tok_name, tok in tokenizers.items():
                                    #Vectorize:
                                    vect_start=time.time()
                                    vectorizer=multi_column_vectorizer(col_names=columns, vectorization=vectorization, max_features_per_column=max_features,
                                                                    ngram_range=ngrams, stop_words=stop_words, tokenizer=tok)
                                    X_train=vectorizer.fit_transform(train_df, sparse=sparse)
                                    X_test=vectorizer.transform(test_df, sparse=sparse)
                                    
                                    len=X_train.shape[0] #length of X_train
                                    
                                    if z_score:
                                        #Apply z-score normalization
                                        if sparse:
                                            X=sp.vstack([X_train, X_test])
                                            X, offset=standardize_sparse_matrix(X)
                                            X_train, X_test=X[:len], X[len:]
                                        else:
                                            X=np.vstack([X_train, X_test])
                                            X=standardize(X)
                                            X_train, X_test=X[:len], X[len:]
                                            offset=None
                                            
                                    vect_end=time.time()
                                    
                                    num_features=X_train.shape[1]
                                
                                    
                                    for lr in lrs:
                                        optimizer=GDOptimizer(learning_rate=lr)
                                        for bs in bss:
                                            for epochs in epochs_list:
                                                result={'# epochs': epochs,
                                                    'learning rate': lr,
                                                    'batch size':bs,
                                                    'vectorization': vectorization,
                                                    'tokenizer': tok_name,
                                                    'stop_words': stop_words,
                                                    'ngram range': ngrams,
                                                    '# features': num_features,
                                                    'vectorization time [s]': np.round(vect_end-vect_start,2),
                                                    'z-score': z_score, 
                                                    'columns': columns, 
                                                    'sparse': sparse
                                                    }
                                                for m in models:
                                                    if m=='MLP' and not sparse: #MLP does not support sparse operations
                                                        for hidden_layer_widths in hidden_layer_widths_list:
                                                            #initialize model
                                                            widths=[num_features]+hidden_layer_widths+[2] #layer widths
                                                            model=MLP(widths=widths, optimizer=optimizer)
                                                            #Train model
                                                            train_start=time.time()
                                                            model.fit(X_train, y=Y_train_enc, num_epochs=epochs, batch_size=bs)
                                                            train_end=time.time()
                                                            #Evaluate
                                                            train_accuracy=np.round(accuracy(y_pred=model.decision_function(X_train), y_true=Y_train_enc),4)
                                                            test_accuracy=np.round(accuracy(y_pred=model.decision_function(X_test), y_true=Y_test_enc),4)
                                                            
                                                            result.update({'model': 'MLP', 
                                                            'C': None,
                                                            'widths': widths,
                                                            'train accuracy [%]': train_accuracy,
                                                            'test accuracy [%]': test_accuracy,
                                                            'training time [s]': np.round(train_end-train_start,2)
                                                            })
                                                            results.append(result.copy())
                                                            display_results()
                                                        
                                                    elif m=='LinearSVM':
                                                        for C in C_list:
                                                            #initialize model
                                                            w=np.zeros(num_features) #initial weights
                                                            b=0 #initial bias
                                                            model=LinearSVM_S(w,b, optimizer=optimizer, offset=offset)
                                                            #Train model
                                                            train_start=time.time()
                                                            model.fit(X_train, y=Y_train_neg, num_epochs=epochs, batch_size=bs)
                                                            train_end=time.time()
                                                            #Evaluate
                                                            train_accuracy=np.round(binary_accuracy(y_pred=model.decision_function(X_train), y_true=Y_train_neg, class_labels=[-1,1]),4)
                                                            test_accuracy=np.round(binary_accuracy(y_pred=model.decision_function(X_test), y_true=Y_test_neg, class_labels=[-1,1]),4)
                                                            
                                                            result.update({'model': 'LinearSVM', 
                                                            'C': C,
                                                            'widths': None,
                                                            'train accuracy [%]': train_accuracy,
                                                            'test accuracy [%]': test_accuracy,
                                                            'training time [s]': np.round(train_end-train_start,2)
                                                            })
                                                            results.append(result.copy())
                                                            display_results()
                                                            
                                                    elif m=='LogisticRegression':
                                                        #initialize model
                                                        w=np.zeros(num_features) #initial weights
                                                        b=0 #initial bias
                                                        model=LogisticRegression_S(w,b, optimizer=optimizer, offset=offset)
                                                        #Train model
                                                        train_start=time.time()
                                                        model.fit(X_train, y=Y_train, num_epochs=epochs, batch_size=bs)
                                                        train_end=time.time()
                                                        #Evaluate
                                                        train_accuracy=np.round(binary_accuracy(y_pred=model.decision_function(X_train), y_true=Y_train, class_labels=[0,1]),4)
                                                        test_accuracy=np.round(binary_accuracy(y_pred=model.decision_function(X_test), y_true=Y_test, class_labels=[0,1]),4)
                                                        
                                                        result.update({'model': 'LogisticRegression', 
                                                        'C': None,
                                                        'widths': None,
                                                        'train accuracy [%]': train_accuracy,
                                                        'test accuracy [%]': test_accuracy,
                                                        'training time [s]': np.round(train_end-train_start,2)
                                                        })
                                                        results.append(result.copy())
                                                        display_results()

    result_df=pd.DataFrame(results)
    return result_df
                                                    
                                            

                                            
                        
                        
                        
                
        
        
        

In [56]:
#Run the full grid defined above; the results table refreshes after every trained model
result_df=run_experiments(
    z_score_options=z_score_options,
    sparse_options=sparse_options,
    columns_list=columns_list,
    models=models,
    vectorizations=vectorizations,
    tokenizers=tokenizers,
    stop_words_options=stop_words_options,
    ngram_ranges=ngram_ranges,
    max_features_list=max_features_list,
    lrs=lrs,
    bs=bss,
    epochs_list=epochs_list,
    save_to_file=False,
)

sparse,z-score,columns,model,# epochs,learning rate,batch size,C,widths,vectorization,tokenizer,stop_words,ngram range,# features,train accuracy [%],test accuracy [%],vectorization time [s],training time [s]
True,True,['title'],LogisticRegression,100,0.01,35918,,,tf-idf,,,"(1, 1)",19585,97.4219,94.9666,0.56,0.52
True,True,['title'],LinearSVM,100,0.01,35918,10.0,,tf-idf,,,"(1, 1)",19585,99.8775,93.8641,0.56,0.5
True,True,['title'],LogisticRegression,100,0.01,35918,,,tf-idf,basic,,"(1, 1)",24938,98.4158,96.2806,5.06,0.51
True,True,['title'],LinearSVM,100,0.01,35918,10.0,,tf-idf,basic,,"(1, 1)",24938,99.8274,95.2895,5.06,0.5
True,True,['title'],LogisticRegression,100,0.01,35918,,,tf-idf,lemmatization,,"(1, 1)",20896,98.0177,96.1136,33.93,0.5
True,True,['title'],LinearSVM,100,0.01,35918,10.0,,tf-idf,lemmatization,,"(1, 1)",20896,99.9109,95.3229,33.93,0.5
True,True,['title'],LogisticRegression,100,0.01,35918,,,tf-idf,stemming,,"(1, 1)",17373,97.5138,95.8018,11.55,0.48
True,True,['title'],LinearSVM,100,0.01,35918,10.0,,tf-idf,stemming,,"(1, 1)",17373,99.8914,95.412,11.55,0.5
True,True,['title'],LogisticRegression,100,0.01,35918,,,tf-idf,,english,"(1, 1)",19311,96.8846,93.8976,0.56,0.39
True,True,['title'],LinearSVM,100,0.01,35918,10.0,,tf-idf,,english,"(1, 1)",19311,99.8608,92.6392,0.56,0.42


KeyboardInterrupt: 