In [1]:
import pandas as pd
import re
import nltk
import pickle
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import PassiveAggressiveClassifier
from sklearn import metrics
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords

# Download required NLTK data
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('stopwords')

# Load dataset
file_path = r'C:\Users\Nikki\OneDrive\Desktop\Pictures\Documents\BTECH\SLASHMARK\BASIC PROJECTS\Fake_News_Detection-master\train.csv'
train_df = pd.read_csv(file_path)

# Ensure required columns exist
if 'Statement' not in train_df.columns or 'Label' not in train_df.columns:
    raise ValueError("Dataset must contain 'Statement' and 'Label' columns.")

# Drop missing values
train_df.dropna(subset=['Statement', 'Label'], inplace=True)

# Convert categorical labels ('FAKE'/'REAL') to binary (0/1) if necessary
if train_df['Label'].dtype == 'object':
    train_df['Label'] = train_df['Label'].map({'FAKE': 0, 'REAL': 1})

# Text preprocessing function
lemmatizer = WordNetLemmatizer()
stpwrds = set(stopwords.words('english'))
from nltk.tokenize import wordpunct_tokenize

def clean_text(text):
    text = re.sub(r'[^a-zA-Z\s]', '', text).lower()
    words = wordpunct_tokenize(text)  # Alternative tokenizer
    words = [lemmatizer.lemmatize(word) for word in words if word not in stpwrds]
    return ' '.join(words)


# Apply preprocessing
# train_df['Statement'] = train_df['Statement'].astype(str).apply(clean_text)

# Split data
X_train, X_test, Y_train, Y_test = train_test_split(train_df['Statement'], train_df['Label'], test_size=0.3, random_state=1)

# Convert text to TF-IDF vectors
tfidf_v = TfidfVectorizer()
tfidf_X_train = tfidf_v.fit_transform(X_train)
tfidf_X_test = tfidf_v.transform(X_test)

# Train model
classifier = PassiveAggressiveClassifier()
classifier.fit(tfidf_X_train, Y_train)

# Evaluate model
Y_pred = classifier.predict(tfidf_X_test)
accuracy = round(metrics.accuracy_score(Y_test, Y_pred) * 100, 2)
print(f'Model Accuracy: {accuracy}%')

# Save model and vectorizer
pickle.dump(classifier, open('./model.pkl', 'wb'))
pickle.dump(tfidf_v, open('./vectorizer.pkl', 'wb'))

# Load model and vectorizer
loaded_model = pickle.load(open('./model.pkl', 'rb'))
loaded_vectorizer = pickle.load(open('./vectorizer.pkl', 'rb'))

# Function to detect fake news
def fake_news_det(news):
    processed_news = clean_text(news)
    vectorized_input = loaded_vectorizer.transform([processed_news])
    prediction = loaded_model.predict(vectorized_input)
    print("🛑 FAKE News 📰" if prediction[0] == 0 else "✅ REAL News 📰")

# Example usage
fake_news_det("Breaking news! The president just announced a major policy change.")


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Nikki\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Nikki\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Nikki\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Model Accuracy: 56.15%
✅ REAL News 📰
