# Predicting Labels for the Validation Data Using the Trained Model and TF-IDF Vectorizer


In [1]:
import re
import nltk
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords

# Fetch the NLTK corpora used below: WordNet (for the lemmatizer) and the
# stopword lists (for `stop_words`).
for _resource in ('wordnet', 'stopwords'):
    nltk.download(_resource)

# English stopwords held in a set.
stop_words = {word for word in stopwords.words('english')}

def custom_preprocessor(text):
    """Normalise raw text: drop digits and punctuation, lowercase, lemmatize.

    Steps, in order:
      1. Remove every run of digits.
      2. Lowercase the text and remove every character that is neither a
         word character nor whitespace (underscores are word characters and
         are therefore kept).
      3. Split on whitespace and lemmatize each token with WordNet, using
         the lemmatizer's default part of speech.
      4. Re-join the tokens with single spaces, which also collapses any
         run of whitespace in the input.

    NOTE(review): stopwords are NOT removed here, and the module-level
    ``stop_words`` set is not used by this function. The behaviour is left
    unchanged on purpose: presumably the pickled TF-IDF vectorizer was
    fitted with exactly this preprocessing, so changing it could make the
    features inconsistent with training -- confirm against the training
    notebook before adding a stopword filter.

    Args:
        text (str): Raw document text.

    Returns:
        str: The cleaned, lemmatized text with tokens separated by one space.
    """
    without_digits = re.sub(r'\d+', '', text)  # Remove numbers
    cleaned = re.sub(r'[^\w\s]', '', without_digits.lower())  # Remove punctuation & lowercase

    lemmatizer = WordNetLemmatizer()
    return ' '.join(lemmatizer.lemmatize(token) for token in cleaned.split())

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\renad\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\renad\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [3]:
import pandas as pd
import joblib

# Load the original validation data
validation_data = pd.read_csv("dataset/validation_data.csv")

# Load the trained model and TF-IDF Vectorizer.
# NOTE(security): joblib.load unpickles arbitrary objects, so only load
# artifacts that come from a trusted training run.
# NOTE(review): presumably the pickled vectorizer refers to
# `custom_preprocessor` by name, which would be why that function has to be
# defined before this cell runs -- confirm against the training notebook.
model = joblib.load('fake_news_classifier_svc.pkl')
vectorizer = joblib.load('tfidf_vectorizer.pkl')

# Build one document per row from title + text. Missing values are replaced
# with empty strings first: concatenating NaN with a string yields NaN for
# the whole row, and a NaN document cannot be vectorized.
validation_texts = (
    validation_data['title'].fillna("") + " " + validation_data['text'].fillna("")
)

# Transform text into numerical features using the already-fitted TF-IDF
# vectorizer (transform only; nothing is re-fitted on the validation data).
validation_features = vectorizer.transform(validation_texts)

# Predict a label for every row. The original note on this step read
# "replace 2 with either 0 or 1", i.e. the incoming `label` values are
# placeholders that get overwritten by the model's predictions.
predicted_labels = model.predict(validation_features)
assert len(predicted_labels) == len(validation_data), "expected one prediction per row"

# Update the label column with the new predictions
validation_data['label'] = predicted_labels

# Save the result to a separate file (the input CSV is left untouched);
# index=False keeps pandas from adding an extra index column.
validation_data.to_csv("dataset/validation_data_predicted_.csv", index=False)

print("✅ The file has been successfully updated!")


✅ The file has been successfully updated!


In [4]:
# Display the frame for a visual check of the `label` column; pandas
# truncates the rendering of long frames, so only the first and last rows
# are shown.
validation_data

Unnamed: 0,label,title,text,subject,date
0,1,UK's May 'receiving regular updates' on London...,LONDON (Reuters) - British Prime Minister Ther...,worldnews,"September 15, 2017"
1,1,UK transport police leading investigation of L...,LONDON (Reuters) - British counter-terrorism p...,worldnews,"September 15, 2017"
2,1,Pacific nations crack down on North Korean shi...,WELLINGTON (Reuters) - South Pacific island na...,worldnews,"September 15, 2017"
3,1,Three suspected al Qaeda militants killed in Y...,"ADEN, Yemen (Reuters) - Three suspected al Qae...",worldnews,"September 15, 2017"
4,1,Chinese academics prod Beijing to consider Nor...,BEIJING (Reuters) - Chinese academics are publ...,worldnews,"September 15, 2017"
...,...,...,...,...,...
4951,0,McPain: John McCain Furious That Iran Treated ...,21st Century Wire says As 21WIRE reported earl...,Middle-east,"January 16, 2016"
4952,0,JUSTICE? Yahoo Settles E-mail Privacy Class-ac...,21st Century Wire says It s a familiar theme. ...,Middle-east,"January 16, 2016"
4953,0,Sunnistan: US and Allied ‘Safe Zone’ Plan to T...,Patrick Henningsen 21st Century WireRemember ...,Middle-east,"January 15, 2016"
4954,0,How to Blow $700 Million: Al Jazeera America F...,21st Century Wire says Al Jazeera America will...,Middle-east,"January 14, 2016"
