<a href="https://colab.research.google.com/github/SanjayTummala/Fake-News-Detection-using-ML-and-NLP/blob/main/Fake_News_Detection_using_ML_and_NLP.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **1. Import Libraries**

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
import matplotlib.pyplot as plt
import itertools
from sklearn import svm
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn import metrics
import spacy
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS
import string
import re
import nltk
import collections
from nltk.corpus import stopwords
from sklearn.feature_extraction import DictVectorizer
from sklearn.pipeline import Pipeline, FeatureUnion
!pip install empath
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
import pickle

# **2. Import Dataset**

In [None]:
# Read True.csv into df_true and Fake.csv into df_fake (in that order)
df_true, df_fake = (pd.read_csv(csv_name) for csv_name in ("True.csv", "Fake.csv"))

# **3. Perform Exploratory Data Analysis**

In [None]:
# Add the target column: every row loaded from True.csv gets isfake = 1.
# NOTE(review): despite the column name, 1 marks the True.csv articles in this
# notebook; the confusion-matrix cells further down likewise display label 1 as 'Real'.
df_true = df_true.assign(isfake=1)
df_true.head()

In [None]:
# Add the target column: every row loaded from Fake.csv gets isfake = 0
df_fake = df_fake.assign(isfake=0)
df_fake.head()

In [None]:
# Stack both frames (True.csv rows first, then Fake.csv rows) under a fresh 0..n-1 index
df = pd.concat([df_true, df_fake], ignore_index=True)
df

In [None]:
# Remove the 'date' column. Rebinding `df` with errors='ignore' (instead of an
# in-place drop) keeps this cell safe to re-run: a second execution is a no-op
# rather than a KeyError for the column that is already gone.
df = df.drop(columns=['date'], errors='ignore')

In [None]:
# Combine title and text into a single 'original' column.
# Missing values are replaced with '' first: a NaN in either column would turn the
# whole concatenation into NaN, and 'original' is later passed to the text
# tokeniser via preprocess().
df['original'] = df['title'].fillna('') + ' ' + df['text'].fillna('')
df.head()

In [None]:
# Show the combined title + text of the row labelled 0
df['original'][0]

# **4. Perform Data Cleaning**

In [None]:
# Download the NLTK "stopwords" corpus, which the next cell reads via stopwords.words('english')
nltk.download("stopwords")

In [None]:
from nltk.corpus import stopwords
# NLTK's English stop words followed by a few extra tokens, kept together in one list
stop_words = stopwords.words('english') + ['from', 'subject', 're', 'edu', 'use']

In [None]:
def preprocess(text):
    """Tokenise `text`, dropping stop words and tokens of three characters or fewer.

    A token is kept only when it is longer than 3 characters, is not in gensim's
    STOPWORDS, and is not in the notebook-level `stop_words` list.

    NOTE(review): a later cell defines `preprocess` again (without the
    `stop_words` check); that later definition is the one applied to the data.

    Args:
        text: the raw text to tokenise.

    Returns:
        list of the tokens that passed all three filters, in their original order.
    """
    # Imported locally because the notebook-level `import gensim` sits in a later
    # cell; without this the function raises NameError if called right after it
    # is defined.
    from gensim.parsing.preprocessing import STOPWORDS
    from gensim.utils import simple_preprocess

    # Set membership avoids scanning the whole stop_words list for every token.
    custom_stop_words = frozenset(stop_words)

    return [
        token
        for token in simple_preprocess(text)
        if token not in STOPWORDS
        and len(token) > 3
        and token not in custom_stop_words
    ]

In [None]:
# Import the required library
import gensim
from gensim.parsing.preprocessing import STOPWORDS

# Tokenisation helper used to build the 'clean' column
def preprocess(text):
    """Return the tokens of `text` that are longer than 3 characters and not gensim stop words.

    Tokens come from gensim.utils.simple_preprocess. The custom `stop_words`
    list built earlier in the notebook is not consulted here.
    """
    return [
        word
        for word in gensim.utils.simple_preprocess(text)
        if len(word) > 3 and word not in STOPWORDS
    ]

# Tokenise every combined article; each 'clean' entry is the list returned by preprocess()
original_text = df['original']
df['clean'] = original_text.apply(preprocess)


In [None]:
# Raw combined text of the row labelled 0, for comparison with its cleaned tokens in the next cell
df['original'][0]

In [None]:
# Print the token list that preprocess() produced for the row labelled 0
print(df['clean'][0])

In [None]:
# Display the frame, which now includes the 'clean' column
df

In [None]:
# Flatten the per-article token lists in df.clean into one flat list of words
list_of_words = [word for tokens in df.clean for word in tokens]

In [None]:
# Preview only the first tokens: list_of_words holds every token of every article,
# so echoing the whole list would flood the cell output and bloat the saved notebook.
list_of_words[:20]

In [None]:
# Total number of tokens collected across all articles (duplicates included)
len(list_of_words)

In [None]:
# Vocabulary size: the number of distinct tokens in list_of_words
total_words = len(set(list_of_words))
total_words

In [None]:
# Re-assemble each token list into one space-separated string
df['clean_joined'] = df['clean'].apply(" ".join)

In [None]:
# Display the frame, which now includes the 'clean_joined' column
df

In [None]:
# Cleaned, space-joined text of the row labelled 0
df['clean_joined'][0]

# **5. Visualize Cleaned Up Dataset**

In [None]:
# Display the cleaned frame once more before plotting
df

In [None]:
# Import the required libraries
import matplotlib.pyplot as plt
import seaborn as sns

# Horizontal bar chart: how many rows fall under each 'subject' value
fig, ax = plt.subplots(figsize=(8, 8))
sns.countplot(y="subject", data=df, ax=ax)
ax.set_title("Number of Samples in Each Subject")
ax.set_xlabel("Count")
ax.set_ylabel("Subject")
plt.show()


In [None]:
# Import required libraries
import matplotlib.pyplot as plt
import seaborn as sns

# Plot how many rows carry each 'isfake' value
plt.figure(figsize=(8, 6))
sns.countplot(x="isfake", data=df, palette="coolwarm")

# Earlier cells set isfake = 1 on the True.csv rows and isfake = 0 on the Fake.csv
# rows, so tick 0 is the fake class and tick 1 is the real class. The tick labels
# below follow that encoding.
plt.title("Count of 0 and 1 in 'isfake'")
plt.xlabel("isfake (0 = fake, 1 = real)")
plt.ylabel("Count")
plt.xticks([0, 1], labels=["Fake (0)", "Real (1)"])
plt.show()


In [None]:
# Import the required library
from wordcloud import WordCloud

# Word cloud of the cleaned text for rows with isfake == 1
# (the rows that were loaded from True.csv)
wc = WordCloud(max_words=2000, width=1600, height=800, stopwords=stop_words)
wc.generate(" ".join(df.loc[df.isfake == 1, "clean_joined"]))

plt.figure(figsize=(20, 20))
plt.imshow(wc, interpolation="bilinear")
plt.axis("off")  # hide the axes; only the image matters
plt.show()


In [None]:
# Import the required library
from wordcloud import WordCloud

# Word cloud of the cleaned text for rows with isfake == 0
# (the rows that were loaded from Fake.csv)
wc = WordCloud(max_words=2000, width=1600, height=800, stopwords=stop_words)
wc.generate(" ".join(df.loc[df.isfake == 0, "clean_joined"]))

plt.figure(figsize=(20, 20))
plt.imshow(wc, interpolation="bilinear")
plt.axis("off")  # hide the axes; only the image matters
plt.show()


# **6. Train Models and Plot Confusion Matrices**

In [None]:
def plot_confusion_matrix(cm, classes,
                          normalize=False,
                          title='Confusion matrix',
                          cmap=plt.cm.Blues):
    """Draw a confusion matrix as a colour-mapped image with a value in every cell.

    Args:
        cm: 2-D NumPy array; rows are drawn along the 'True label' axis and
            columns along the 'Predicted label' axis.
        classes: tick labels, one per row/column of `cm`.
        normalize: when True, every row of `cm` is divided by its row sum
            before anything is drawn.
        title: title placed above the plot.
        cmap: matplotlib colormap used for the image.

    Returns:
        None. The plot is drawn on the current matplotlib figure.
    """
    # Normalise first so that the image colours, the colour bar and the cell
    # text all describe the same numbers.
    if normalize:
        cm = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis]
        print("Normalized confusion matrix")
    else:
        print('Confusion matrix')

    plt.imshow(cm, interpolation='nearest', cmap=cmap)
    plt.title(title)
    plt.colorbar()
    tick_marks = np.arange(len(classes))
    plt.xticks(tick_marks, classes, rotation=45)
    plt.yticks(tick_marks, classes)

    # Cells darker than half the maximum get white text so the number stays readable.
    thresh = cm.max() / 2.
    for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])):
        value = cm[i, j]
        # Normalised values are fractions; two decimals keeps the labels compact.
        label = format(value, '.2f') if normalize else value
        plt.text(j, i, label,
                 horizontalalignment="center",
                 color="white" if value > thresh else "black")

    plt.tight_layout()
    plt.ylabel('True label')
    plt.xlabel('Predicted label')

In [None]:
# Import required libraries
import pickle
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn import metrics
from sklearn.model_selection import train_test_split

# Inputs: df['clean_joined'] (preprocessed text) and df['isfake'] (binary target)

# Hold out 20% of the rows for testing; random_state fixes the split
X_train, X_test, Y_train, Y_test = train_test_split(df['clean_joined'], df['isfake'], test_size=0.2, random_state=42)

# Fit the TF-IDF vectorizer on the training text only, then reuse the fitted
# vectorizer to transform the test text
tfidf_vectorizer = TfidfVectorizer()
tfidf1_train = tfidf_vectorizer.fit_transform(X_train)
tfidf1_test = tfidf_vectorizer.transform(X_test)

# Train a Multinomial Naive Bayes classifier on the TF-IDF features
clf = MultinomialNB()
clf.fit(tfidf1_train, Y_train)

# Save the trained model with pickle. The context manager makes sure the file
# handle is flushed and closed as soon as the dump finishes.
# NOTE(review): only the classifier is saved here; tfidf_vectorizer is not persisted by this cell.
with open('tfidf_nb', 'wb') as model_file:
    pickle.dump(clf, model_file)

# Predict on the held-out rows and report accuracy
pred = clf.predict(tfidf1_test)
score = metrics.accuracy_score(Y_test, pred)
print("Accuracy with Multinomial Naive Bayes:   %0.3f" % score)


In [None]:
# Importing required libraries for visualization
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import confusion_matrix

# Confusion matrix of `pred` against Y_test, with rows/columns ordered label 0, then label 1
cm = confusion_matrix(Y_test, pred, labels=[0, 1])  # label 0 is displayed as 'Fake', label 1 as 'Real'


def plot_confusion_matrix(cm, classes):
    """Show `cm` as an annotated seaborn heatmap.

    `classes` supplies the tick labels for both axes; rows are labelled
    'True Labels' and columns 'Predicted Labels'.
    """
    fig, ax = plt.subplots(figsize=(8, 6))
    sns.heatmap(
        cm,
        annot=True,
        fmt='d',
        cmap='Blues',
        xticklabels=classes,
        yticklabels=classes,
        ax=ax,
    )
    ax.set_xlabel('Predicted Labels')
    ax.set_ylabel('True Labels')
    ax.set_title('Confusion Matrix')
    plt.show()


# Draw the matrix computed above
plot_confusion_matrix(cm, classes=['Fake', 'Real'])


In [None]:
# Train a random forest on the same TF-IDF features.
# random_state is fixed (42, the same seed used for the train/test split) so the
# reported accuracy is reproducible from run to run.
clf = RandomForestClassifier(random_state=42)
clf.fit(tfidf1_train, Y_train)

# Save the trained model; the context manager closes the file handle once the dump is done
with open('tfidf_rf', 'wb') as model_file:
    pickle.dump(clf, model_file)

# Predict on the held-out rows and report accuracy
pred = clf.predict(tfidf1_test)
score = metrics.accuracy_score(Y_test, pred)
print("Accuracy with RandomForestClassifier:   %0.3f" % score)

In [None]:
# Importing required libraries for visualization
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import confusion_matrix

# Confusion matrix of the current `pred` against Y_test, with rows/columns ordered
# label 0, then label 1
cm = confusion_matrix(Y_test, pred, labels=[0, 1])  # label 0 is displayed as 'Fake', label 1 as 'Real'


def plot_confusion_matrix(cm, classes, title='Confusion Matrix'):
    """Show `cm` as an annotated seaborn heatmap.

    Args:
        cm: confusion matrix of integer counts (cells are formatted with 'd').
        classes: tick labels used on both axes.
        title: plot title. Defaults to 'Confusion Matrix', so calls that pass
            only `cm` and `classes` produce the same figure as before.
    """
    plt.figure(figsize=(8, 6))
    sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', xticklabels=classes, yticklabels=classes)
    plt.xlabel('Predicted Labels')
    plt.ylabel('True Labels')
    plt.title(title)
    plt.show()


# Name the model in the title so this figure can be told apart from the
# Naive Bayes confusion matrix plotted earlier in the notebook.
plot_confusion_matrix(cm, classes=['Fake', 'Real'], title='Confusion Matrix - RandomForestClassifier')
