In [1]:
pip install nltk

Note: you may need to restart the kernel to use updated packages.


In [24]:
import nltk

# Fetch every NLTK resource this notebook relies on:
#   - "punkt" / "punkt_tab": tokenizer models used by word_tokenize
#     (recent NLTK releases load the tokenizer from "punkt_tab")
#   - "stopwords": corpus read by stopwords.words('english')
# Without the stopwords corpus a fresh machine fails with LookupError.
for _resource in ("punkt", "punkt_tab", "stopwords"):
    nltk.download(_resource)

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Pravallika\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [25]:
import pandas as pd
import seaborn as sns
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
import re
import string
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

# Read both corpora and tag each row with its label in a "class" column:
# 0 for the fake.csv rows, 1 for the true.csv rows.
fake_data = pd.read_csv("fake.csv").assign(**{"class": 0})
true_data = pd.read_csv("true.csv").assign(**{"class": 1})

# Set the last 10 rows of each corpus aside for manual testing and
# remove them from the frames that feed the merged data set.
fake_data_manual_testing = fake_data.tail(10)
fake_data = fake_data.drop(index=fake_data.index[-10:])

true_data_manual_testing = true_data.tail(10)
true_data = true_data.drop(index=true_data.index[-10:])

# Stack the two labelled corpora row-wise (original row labels are kept)
data_merge = pd.concat([fake_data, true_data])

# Preprocess text
stop_words = set(stopwords.words('english'))

# Patterns are written as raw strings: "\[", "\S" and "\." are not valid
# Python string escapes, so non-raw literals containing them trigger
# invalid-escape warnings on current Python versions. They are compiled once
# here instead of on every call.
_BRACKETED_RE = re.compile(r'\[.*?\]')
_URL_RE = re.compile(r'https?://\S+|www\.\S+')
_HTML_TAG_RE = re.compile(r'<.*?>+')
_PUNCTUATION_RE = re.compile('[%s]' % re.escape(string.punctuation))

def preprocess_text(text):
    """Normalise one document into a space-separated string of content words.

    Steps, in order: lowercase; strip [bracketed] spans, URLs, <tags> and
    ASCII punctuation; tokenize with ``word_tokenize``; keep only purely
    alphabetic tokens that are not in ``stop_words``.

    Args:
        text: The raw document. Anything that is not a ``str`` (for example
            the NaN pandas uses for an empty CSV cell) is treated as empty.

    Returns:
        The cleaned tokens joined by single spaces; ``''`` when nothing
        survives the filtering or the input is not a string.
    """
    if not isinstance(text, str):
        # Missing values would otherwise crash on .lower()
        return ''
    text = text.lower()
    text = _BRACKETED_RE.sub('', text)
    text = _URL_RE.sub('', text)
    text = _HTML_TAG_RE.sub('', text)
    text = _PUNCTUATION_RE.sub('', text)
    words = word_tokenize(text)
    words = [word for word in words if word.isalpha() and word not in stop_words]
    return ' '.join(words)

# Clean every document with preprocess_text (overwrites the "text" column)
data_merge["text"] = data_merge["text"].apply(preprocess_text)

# Features are the cleaned text; targets are the values of the "class" column
x, y = data_merge["text"], data_merge["class"]

# Hold out 25% of the rows for evaluation; the fixed seed keeps the split repeatable
x_train, x_test, y_train, y_test = train_test_split(
    x, y, random_state=42, test_size=0.25
)

# TF-IDF features: fitted on the training split only, then applied to the test split
vectorization = TfidfVectorizer()
xv_train = vectorization.fit_transform(x_train)
xv_test = vectorization.transform(x_test)

# Train logistic regression (fit returns the estimator itself)
LR = LogisticRegression().fit(xv_train, y_train)

# Predict on the held-out split and score the predictions
y_pred = LR.predict(xv_test)
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
classification_report_result = classification_report(y_true=y_test, y_pred=y_pred)

print("Accuracy:", accuracy)
print("\nClassification Report:\n", classification_report_result)


Accuracy: 0.9872549019607844

Classification Report:
               precision    recall  f1-score   support

           0       0.99      0.99      0.99      5894
           1       0.99      0.99      0.99      5326

    accuracy                           0.99     11220
   macro avg       0.99      0.99      0.99     11220
weighted avg       0.99      0.99      0.99     11220



In [26]:
from sklearn.tree import DecisionTreeClassifier

# Seed the tree so that re-running the notebook rebuilds the same model that
# gets pickled below: scikit-learn permutes the candidate features at every
# split, so an unseeded fit can differ between runs. 42 matches the seed
# already used for train_test_split.
DT = DecisionTreeClassifier(random_state=42)
DT.fit(xv_train, y_train)

DecisionTreeClassifier()

<1x173262 sparse matrix of type '<class 'numpy.float64'>'
	with 161 stored elements in Compressed Sparse Row format>

In [27]:
# Decision-tree predictions for the held-out split
# (despite the name, this holds what predict() returns, not a DataFrame)
pred_df = DT.predict(xv_test)

In [28]:
# Accuracy of the decision tree on the held-out split
accuracy_score(y_test, DT.predict(xv_test))

0.9967023172905526

In [29]:
# Per-class precision / recall / F1 for the decision tree
print(classification_report(y_true=y_test, y_pred=pred_df))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00      5894
           1       1.00      1.00      1.00      5326

    accuracy                           1.00     11220
   macro avg       1.00      1.00      1.00     11220
weighted avg       1.00      1.00      1.00     11220



In [30]:
def output_label(n):
    """Translate a numeric class label into its display text.

    Returns "fake news" for 0, "not fake news" for 1, and None for any
    other value.
    """
    if n == 0:
        return "fake news"
    return "not fake news" if n == 1 else None

def manual_testing(news):
    """Classify one raw news text with both trained models and print the results.

    The text is cleaned with ``preprocess_text``, vectorised with the
    already-fitted ``vectorization`` object, and passed to ``LR`` and ``DT``.
    Nothing is returned; the two predicted labels are printed.
    """
    cleaned = preprocess_text(news)
    # transform() expects an iterable of documents, hence the one-item list
    features = vectorization.transform([cleaned])

    lr_label = output_label(LR.predict(features)[0])
    dt_label = output_label(DT.predict(features)[0])

    print("\n\nLogistic Regression Prediction:", lr_label)
    print("Decision Tree Prediction:", dt_label)

# Prompt for a news text (input() already returns a str) and classify it
news_input = input("Enter the news for testing: ")
manual_testing(news_input)


Enter the news for testing: hi


Logistic Regression Prediction: fake news
Decision Tree Prediction: fake news


In [31]:
import pickle

In [32]:
# Persist the fitted decision tree to model.pkl
with open('model.pkl', mode='wb') as model_file:
    pickle.dump(DT, model_file)


In [33]:
# Read the decision tree back from model.pkl.
# NOTE: unpickling can execute arbitrary code - only load pickle files you trust.
with open('model.pkl', mode='rb') as model_file:
    loaded_model = pickle.load(model_file)

In [34]:
# Persist the fitted TF-IDF vectorizer to vect.pkl; the pickled model needs
# features produced by this same fitted vectorizer.
with open('vect.pkl', mode='wb') as vectorizer_file:
    pickle.dump(vectorization, vectorizer_file)

In [35]:
# Sanity check: the reloaded model predicts on one row of the test matrix
k = loaded_model.predict(xv_test[1])
k

array([1], dtype=int64)