# 1️⃣ Explore the train dataset, choose a representative sample
---
Your data is rather big, it'll be much faster if you begin experimenting with a smaller subset of it.

In [1]:
import pandas as pd

# Load only a sample of the dataset first
texts = pd.read_csv("dataset/data.csv")
print(texts.head())

   label                                              title  \
0      1  As U.S. budget fight looms, Republicans flip t...   
1      1  U.S. military to accept transgender recruits o...   
2      1  Senior U.S. Republican senator: 'Let Mr. Muell...   
3      1  FBI Russia probe helped by Australian diplomat...   
4      1  Trump wants Postal Service to charge 'much mor...   

                                                text       subject  \
0  WASHINGTON (Reuters) - The head of a conservat...  politicsNews   
1  WASHINGTON (Reuters) - Transgender people will...  politicsNews   
2  WASHINGTON (Reuters) - The special counsel inv...  politicsNews   
3  WASHINGTON (Reuters) - Trump campaign adviser ...  politicsNews   
4  SEATTLE/WASHINGTON (Reuters) - President Donal...  politicsNews   

                 date  
0  December 31, 2017   
1  December 29, 2017   
2  December 31, 2017   
3  December 30, 2017   
4  December 29, 2017   


In [2]:
print(texts.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 39942 entries, 0 to 39941
Data columns (total 5 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   label    39942 non-null  int64 
 1   title    39942 non-null  object
 2   text     39942 non-null  object
 3   subject  39942 non-null  object
 4   date     39942 non-null  object
dtypes: int64(1), object(4)
memory usage: 1.5+ MB
None


In [3]:
print(texts.isnull().sum())  # Count missing values per column

label      0
title      0
text       0
subject    0
date       0
dtype: int64


In [4]:
print(texts['label'].value_counts())  # Count fake (0) vs real (1) news

label
1    19999
0    19943
Name: count, dtype: int64


In [5]:
texts.groupby('subject').label.mean()

subject
Government News    0.0
News               0.0
left-news          0.0
politics           0.0
politicsNews       1.0
worldnews          1.0
Name: label, dtype: float64

### Clean Data:
1. lowering & punctuation (custom_preprocessor)
2. tokenization -> CountVectorizer
3. remove stop words -> CountVectorizer (stop_words)
4. lemmatize -> (custom preprocessor)
5. removed rare words -> CountVectorizer (max_features)
6. CountVectorize -> Bag of Words
7. train test split
8. classification


In [6]:
import re
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
import nltk
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords

# Download necessary NLTK resources
nltk.download('punkt')  # Correct resource for tokenization
nltk.download('wordnet')  # For lemmatization
nltk.download('stopwords')  # For stopwords removal

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\renad\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\renad\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\renad\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [7]:

stop_words = set(stopwords.words('english'))  # Load stopwords list

def custom_preprocessor(text):
    """Preprocess text by removing numbers, punctuation, and stopwords, then apply lemmatization."""
    
    text = re.sub(r'\d+', '', text)  # Remove numbers
    text = re.sub(r'[^\w\s]', '', text.lower())  # Remove punctuation & convert to lowercase

    tokens = text.split()
    lemmatizer = WordNetLemmatizer()
    lemmatized_tokens = [lemmatizer.lemmatize(token) for token in tokens]

    return ' '.join(lemmatized_tokens)


In [8]:
# Function to create features from text data using TF-IDF
from sklearn.feature_extraction.text import TfidfVectorizer

def create_features(texts, max_features=3000, ngram_range=(1, 2)):
    """Convert text data into numerical features using TF-IDF Vectorizer."""
    
    vectorizer = TfidfVectorizer(
        stop_words='english',
        max_features=max_features,  # Keep the 5000 most frequent words
        ngram_range=ngram_range
    )

    tokens = vectorizer.fit_transform(texts)
    return tokens, vectorizer

In [9]:
# Function to apply the learned features on new text data
def apply_features(texts, vectorizer):
    """Transform new text data using the trained CountVectorizer."""
    
    tokens = vectorizer.transform(texts)
    features = vectorizer.get_feature_names_out()
    x_df = pd.DataFrame(tokens.toarray(), columns=features)
    return x_df

# 2️⃣ Build a classical NLP model
---
You have several choices here

How will you preprocess the data?

Think about choice between stemming or lemmatization

How many rare words should you remove?

Should you use n-grams?

Think what you want to do with different text fields?

Maybe you can begin with one and check what the accuracy is.

Think about the choice of classifier

SVM, Logisitic Regression or Multinomial Bayes could be a good choice. Do you remember why?

Select the best model and save it

In [10]:

x_train, x_test, y_train, y_test = train_test_split(
    texts[['title', 'text', 'subject']],
    texts['label'],
    test_size=0.2,
    random_state=62
)

In [11]:
x_train

Unnamed: 0,title,text,subject
10093,Obama to delay Spain visit until government is...,MADRID (Reuters) - U.S. President Barack Obama...,politicsNews
31028,GUY WHO MADE MILLIONS Selling “Science” To Kid...,William Stanford Nye or Bill Nye is an America...,politics
11148,U.S. tightens visa waiver rules for visitors a...,WASHINGTON (Reuters) - The United States on Th...,politicsNews
31828,WATCH! Anti-Trump Hag Gets Kicked Off Flight A...,Via: GATEWAY PUNDIT,politics
33475,RIDICULOUS! SECRET SERVICE Investigating “Poss...,Are they kidding? This was even a lead story o...,politics
...,...,...,...
36107,TUCKER ON COMEY’S FIRING: “Dictatorship by the...,TUCKER CARLSON Spoke out tonight on the firing...,Government News
2163,Key in NAFTA talks is 'not tearing apart what ...,WASHINGTON (Reuters) - Mexico’s Economy Minist...,politicsNews
9809,"Driven up the wall by Trump, Mexico looks to r...","MEXICO CITY (Reuters) - At first, Mexico’s gov...",politicsNews
15384,Judge orders Rosneft CEO Sechin to appear as w...,MOSCOW (Reuters) - A Russian judge on Wednesda...,worldnews


In [12]:
# Convert Text into Numerical Representation
from sklearn.feature_extraction.text import TfidfVectorizer

# Concatenate text columns into a single feature 
x_train_text = x_train['title'] + " " + x_train['text'] 
x_test_text = x_test['title'] + " " + x_test['text'] 


X_train_vec, vectorizer = create_features(x_train_text, 3000 , (1,2))
X_test_vec = vectorizer.transform(x_test_text)

In [13]:
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score

models = {
    "Naive Bayes": MultinomialNB(),
    "Logistic Regression": LogisticRegression(max_iter=1000),
    "SVM": SVC(kernel='linear')
}

best_model = None
best_accuracy = 0
best_model_name = ""

for name, model in models.items():
    # Train model
    model.fit(X_train_vec, y_train) 
    # Make predictions
    y_pred = model.predict(X_test_vec)  
    # Evaluate accuracy
    accuracy = accuracy_score(y_test, y_pred)  

    print(f"🔹 Model: {name} - Accuracy: {accuracy:.4f}")  

    # Check if this model is the best so far
    if accuracy > best_accuracy:
        best_accuracy = accuracy
        best_model = model
        best_model_name = name

# Print the best model
print(f"\nBest Model: {best_model_name} - Accuracy: {best_accuracy:.4f}")


🔹 Model: Naive Bayes - Accuracy: 0.9432
🔹 Model: Logistic Regression - Accuracy: 0.9881
🔹 Model: SVM - Accuracy: 0.9945

Best Model: SVM - Accuracy: 0.9945


### max_features , ngram_range Experments


In [72]:
max_features_values = [1000, 3000, 5000]
ngram_ranges = [(1,1), (1,2)]
results = []

for max_features in max_features_values:
    for ngram_range in ngram_ranges:

        X_train_vec, vectorizer = create_features(x_train_text, max_features, ngram_range)
        X_test_vec = vectorizer.transform(x_test_text)
        
        # the best model
        model = SVC(kernel='linear')
        model.fit(X_train_vec, y_train)
        
        y_pred = model.predict(X_test_vec)
        accuracy = accuracy_score(y_test, y_pred)
        
        results.append({
            "max_features": max_features,
            "ngram_range": ngram_range,
            "accuracy": accuracy
        })

results_df = pd.DataFrame(results)
print(results_df)

   max_features ngram_range  accuracy
0          1000      (1, 1)  0.989611
1          1000      (1, 2)  0.991739
2          3000      (1, 1)  0.992490
3          3000      (1, 2)  0.994492
4          5000      (1, 1)  0.992114
5          5000      (1, 2)  0.993992


Increasing max_features and optimizing the ngram_range improved the model's accuracy, reaching a peak performance of 99.45% at max_features=3000 and ngram_range=(1,2).

In [14]:
# Save the best model and vectorizer
import joblib

joblib.dump(best_model, 'fake_news_classifier_svc.pkl')
joblib.dump(vectorizer, 'tfidf_vectorizer.pkl')

['tfidf_vectorizer.pkl']

# 3️⃣ Build a Word2Vec-based classifier
---
You have 2 options with regards to embeddings:

You can create you own

You can take ready-to-use embeddings (you can finetune it too)



You have also a couple of options with regards to the model

Calculate average/max over the document vector and use any classical classifier

Use Conv1D Classifier or Kim's CNN architecture

Note that if you use this, you'll have to make all documents the same length (the same number of tokens). You can do it either by padding or by truncating

### Option 2: Use Pre-trained Word2Vec Embeddings

In [15]:
import gensim

word2vec_model_google = gensim.models.KeyedVectors.load_word2vec_format(
    'GoogleNews-vectors-negative300.bin',
    binary=True
)

###
We chose to use both classical classifier SVM and Kim’s CNN because each model processes data differently.

* Classical classifiers require fixed-length inputs, so we used word embedding averaging to convert each text into a single fixed-length vector.
* CNNs need to preserve word order, so we used word embedding sequences with padding to ensure all inputs have the same length.

Each model has its own way of handling data, which is why we applied two different preprocessing methods.

### SVC classical classifier (average)

In [16]:
import numpy as np

def embed_text(text, word2vec_model):
    word_vectors = [word2vec_model[word] for word in text if word in word2vec_model]
    if len(word_vectors) == 0:
        return np.zeros(word2vec_model.vector_size)  
    return np.mean(word_vectors, axis=0)  

def embed_texts(texts, word2vec_model):
    return np.array([embed_text(text, word2vec_model) for text in texts])

x_train_embed = embed_texts(x_train_text, word2vec_model_google)
x_test_embed = embed_texts(x_test_text, word2vec_model_google)

In [17]:
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score

# Train an SVM classifier
clf = SVC() 
clf.fit(x_train_embed, y_train)

# Make predictions
y_pred_classical = clf.predict(x_test_embed)

# Evaluate accuracy
accuracy_classical = accuracy_score(y_test, y_pred_classical)
print(f"Accuracy of the Classical Classifier (SVC): {accuracy_classical}")


Accuracy of the Classical Classifier (SVC): 0.8564275879334085


* TF-IDF + SVM (Accuracy: 0.9945)

Uses TF-IDF, which preserves word importance and distribution.
Works well with SVM, leading to higher accuracy.

* Word2Vec + SVM (Accuracy: 0.8597)

Uses Word2Vec, averaging word embeddings into a single vector.
Loses contextual details, reducing accuracy.

### Kim's CNN architecture (padding)

In [18]:
import numpy as np
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Model
from tensorflow.keras.layers import (
    Input, Dense, Conv1D, MaxPooling1D, Concatenate, Dropout, Flatten, Embedding
)
from tensorflow.keras.utils import to_categorical

In [19]:
# Define constants
MAX_VOCAB_SIZE = 10000  # Limit vocabulary size
MAX_SEQUENCE_LENGTH = 100  # Limit max sequence length (prevents excessive memory usage)
EMBEDDING_DIM = 300  # Word2Vec embedding size
FILTER_SIZES = [3, 4, 5]  # Different filter sizes
NUM_FILTERS = 100  # Number of filters per channel
DROPOUT_RATE = 0.5  # Dropout rate

# Tokenize the text
tokenizer = Tokenizer(num_words=MAX_VOCAB_SIZE, oov_token="<OOV>")
tokenizer.fit_on_texts(x_train_text)

x_train_seq = tokenizer.texts_to_sequences(x_train_text)
x_test_seq = tokenizer.texts_to_sequences(x_test_text)

# Apply padding
x_train_padded = pad_sequences(x_train_seq, maxlen=MAX_SEQUENCE_LENGTH, padding='post', truncating='post')
x_test_padded = pad_sequences(x_test_seq, maxlen=MAX_SEQUENCE_LENGTH, padding='post', truncating='post')

# Convert labels to categorical format
num_classes = len(set(y_train))
y_train_cat = to_categorical(y_train, num_classes=num_classes)
y_test_cat = to_categorical(y_test, num_classes=num_classes)

# Create embedding matrix using pre-trained Word2Vec
def create_embedding_matrix(word2vec_model, tokenizer, vocab_size, embedding_dim):
    embedding_matrix = np.zeros((vocab_size, embedding_dim))
    for word, index in tokenizer.word_index.items():
        if index < vocab_size:
            embedding_matrix[index] = word2vec_model[word] if word in word2vec_model else np.zeros(embedding_dim)
    return embedding_matrix

embedding_matrix = create_embedding_matrix(word2vec_model_google, tokenizer, MAX_VOCAB_SIZE, EMBEDDING_DIM)


In [20]:
# Define CNN Model
def yoon_kim_cnn():
    sequence_input = Input(shape=(MAX_SEQUENCE_LENGTH,), dtype='int32')

    # Channel 1: Static pre-trained word2vec (non-trainable)
    embedding_static = Embedding(
        input_dim=MAX_VOCAB_SIZE, output_dim=EMBEDDING_DIM,
        weights=[embedding_matrix], input_length=MAX_SEQUENCE_LENGTH,
        trainable=False, name='static_channel'
    )(sequence_input)

    # Channel 2: Non-static pre-trained word2vec (trainable)
    embedding_non_static = Embedding(
        input_dim=MAX_VOCAB_SIZE, output_dim=EMBEDDING_DIM,
        weights=[embedding_matrix], input_length=MAX_SEQUENCE_LENGTH,
        trainable=True, name='non_static_channel'
    )(sequence_input)

    # Channel 3: Random initialized embeddings (trainable)
    embedding_random = Embedding(
        input_dim=MAX_VOCAB_SIZE, output_dim=EMBEDDING_DIM,
        input_length=MAX_SEQUENCE_LENGTH, trainable=True, name='random_channel'
    )(sequence_input)

    # Apply convolutions
    conv_blocks = []
    for embedding_layer, name in zip([embedding_static, embedding_non_static, embedding_random], ["static", "non_static", "random"]):
        for filter_size in FILTER_SIZES:
            conv = Conv1D(
                filters=NUM_FILTERS // 3, kernel_size=filter_size,
                padding='valid', activation='relu', strides=1,
                name=f'conv_{name}_{filter_size}'
            )(embedding_layer)
            max_pool = MaxPooling1D(pool_size=MAX_SEQUENCE_LENGTH - filter_size + 1, name=f'maxpool_{name}_{filter_size}')(conv)
            conv_blocks.append(max_pool)

    # Concatenate all pooled features
    z = Concatenate()(conv_blocks)
    z = Flatten()(z)
    z = Dropout(DROPOUT_RATE)(z)

    outputs = Dense(num_classes, activation='softmax')(z)
    model = Model(sequence_input, outputs)
    
    return model

In [21]:
# Create and compile the model
model = yoon_kim_cnn()
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
model.summary()

# Train the model
model.fit(x_train_padded, y_train_cat, validation_data=(x_test_padded, y_test_cat), epochs=10, batch_size=64)

# Evaluate the model
test_loss, test_accuracy = model.evaluate(x_test_padded, y_test_cat, verbose=0)
print(f"Test Accuracy: {test_accuracy:.4f}")



Epoch 1/10
[1m500/500[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m65s[0m 121ms/step - accuracy: 0.9519 - loss: 0.1102 - val_accuracy: 0.9984 - val_loss: 0.0055
Epoch 2/10
[1m500/500[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m76s[0m 153ms/step - accuracy: 0.9987 - loss: 0.0051 - val_accuracy: 0.9990 - val_loss: 0.0045
Epoch 3/10
[1m500/500[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m82s[0m 164ms/step - accuracy: 0.9994 - loss: 0.0016 - val_accuracy: 0.9987 - val_loss: 0.0045
Epoch 4/10
[1m500/500[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m83s[0m 167ms/step - accuracy: 0.9999 - loss: 4.1401e-04 - val_accuracy: 0.9989 - val_loss: 0.0043
Epoch 5/10
[1m500/500[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m73s[0m 147ms/step - accuracy: 0.9998 - loss: 5.8657e-04 - val_accuracy: 0.9986 - val_loss: 0.0062
Epoch 6/10
[1m500/500[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m62s[0m 124ms/step - accuracy: 0.9999 - loss: 4.2188e-04 - val_accuracy: 0.9987 - val_loss: 0.006

In [23]:
import pickle

# Save trained CNN model
model.save("cnn_model.keras")

# Save tokenizer
with open("tokenizer.pkl", "wb") as handle:
    pickle.dump(tokenizer, handle)