In [92]:
import numpy as np
import pandas as pd
import re
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score
import nltk

In [70]:
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [71]:
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

In [72]:
def process_text(text):
    if not isinstance(text, str):
        return ''
    tokens = word_tokenize(re.sub('[^a-zA-Z]', ' ', text.lower()))
    lemmatized = [lemmatizer.lemmatize(word) for word in tokens if word not in stop_words]
    return ' '.join(lemmatized)

In [73]:
news_dataset = pd.read_csv('/content/WELFake_Dataset.csv')

In [74]:
news_dataset['content'] = news_dataset['title'] + ' ' + news_dataset['text']

In [75]:
!pip install nltk
import nltk

nltk.download('punkt_tab')



[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


True

In [None]:
print("Applying lemmatization. This might take some time...")
news_dataset['content'] = news_dataset['content'].apply(process_text)
print("Lemmatization complete.")

Applying lemmatization. This might take some time...
Lemmatization complete.


In [None]:
news_dataset.to_csv('/content/preprocessed_dataset.csv', index=False)
print("Preprocessed data has been saved to 'preprocessed_news_dataset.csv'.")

Preprocessed data has been saved to 'preprocessed_news_dataset.csv'.


In [76]:
data = pd.read_csv('/content/preprocessed_dataset.csv')

In [77]:
print(data.head())

   Unnamed: 0                                              title  \
0           0  LAW ENFORCEMENT ON HIGH ALERT Following Threat...   
1           1                                                NaN   
2           2  UNBELIEVABLE! OBAMA’S ATTORNEY GENERAL SAYS MO...   
3           3  Bobby Jindal, raised Hindu, uses story of Chri...   
4           4  SATAN 2: Russia unvelis an image of its terrif...   

                                                text  label  \
0  No comment is expected from Barack Obama Membe...      1   
1     Did they post their votes for Hillary already?      1   
2   Now, most of the demonstrators gathered last ...      1   
3  A dozen politically active pastors came here f...      0   
4  The RS-28 Sarmat missile, dubbed Satan 2, will...      1   

                                             content  
0  law enforcement high alert following threat co...  
1                                                NaN  
2  unbelievable obama attorney general say char

In [78]:
print(data['content'].isnull().sum())

641


In [79]:
data['content'] = data['content'].fillna('No content available')

In [80]:
print(data['content'].isnull().sum())

0


In [81]:
from sklearn.feature_extraction.text import TfidfVectorizer

vectorizer = TfidfVectorizer(max_features=5000)

X = vectorizer.fit_transform(data['content']).toarray()

print(X.shape)


(72134, 5000)


In [82]:
y = data['label'].values

print(y[:10])

[1 1 1 0 1 1 1 1 1 1]


In [83]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

print(f"Training set shape: {X_train.shape}, Testing set shape: {X_test.shape}")


Training set shape: (57707, 5000), Testing set shape: (14427, 5000)


In [84]:
from sklearn.linear_model import LogisticRegression

log_reg = LogisticRegression()

log_reg.fit(X_train, y_train)

y_pred_log_reg = log_reg.predict(X_test)

from sklearn.metrics import accuracy_score
log_reg_accuracy = accuracy_score(y_test, y_pred_log_reg)
print(f"Logistic Regression Accuracy: {log_reg_accuracy * 100:.2f}%")


Logistic Regression Accuracy: 94.67%


In [85]:
from sklearn.naive_bayes import MultinomialNB

naive_bayes = MultinomialNB()

naive_bayes.fit(X_train, y_train)

y_pred_naive_bayes = naive_bayes.predict(X_test)

naive_bayes_accuracy = accuracy_score(y_test, y_pred_naive_bayes)
print(f"Naive Bayes Accuracy: {naive_bayes_accuracy * 100:.2f}%")


Naive Bayes Accuracy: 84.91%


In [86]:
if log_reg_accuracy > naive_bayes_accuracy:
    print("Logistic Regression is the best model.")
    best_model = log_reg
else:
    print("Naive Bayes is the best model.")
    best_model = naive_bayes

Logistic Regression is the best model.


In [87]:
import pickle

In [88]:
with open('/content/best_model.pkl', 'wb') as model_file:
    pickle.dump(best_model, model_file)

with open('/content/vectorizer.pkl', 'wb') as vectorizer_file:
    pickle.dump(vectorizer, vectorizer_file)

In [89]:
with open('/content/best_model.pkl', 'rb') as model_file:
    best_model = pickle.load(model_file)

with open('/content/vectorizer.pkl', 'rb') as vectorizer_file:
    vectorizer = pickle.load(vectorizer_file)

In [95]:
new_article = input("Enter the new article text to classify as Real or Fake: ")

new_article_transformed = vectorizer.transform([new_article]).toarray()

prediction = best_model.predict(new_article_transformed)

print(f"Prediction: {'Real' if prediction == 1 else 'Fake'}")

Enter the new article text to classify as Real or Fake: america give grand piano horse wednesday november lucas wilde america give grand piano horse america given grand piano horse expecting quality tune particularly looking forward beethoven ninth beamed horse supporter piano enthusiast jay cooper horse never given piano frankly establishment allow last change come america change better lot doubter doubter soon silenced graceful note chopin mozart maybe even little richard horse dobbin williams said really sure expected horse absolutely qualified play piano mean look hoof way general even sit chair properly earth anyone think good idea cooper grinned made piano great democrat elizabeth king said wanted get pianist low medium standard piano thumped anything exciting would perfectly reasonable background music people spoken people wanted horse god bless america get best newsthump story mailbox every friday free currently witterings add
Prediction: Real
