<a href="https://colab.research.google.com/github/SURESHBEEKHANI/Natural-Language-Processing/blob/main/fake_news_classifier_.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Importing pandas library and aliasing it as pd
import pandas as pd

# Importing text preprocessing techniques from the sklearn library
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer, HashingVectorizer

# Importing stopwords from the nltk.corpus module
from nltk.corpus import stopwords

# Importing PorterStemmer from the nltk.stem module
from nltk.stem import PorterStemmer

# Importing re module for regular expressions
import re


In [None]:
# Load the training set into a DataFrame from the Colab working directory.
TRAIN_CSV_PATH = '/content/train.csv'
df = pd.read_csv(TRAIN_CSV_PATH)

In [None]:
# Preview the top five rows to check that the columns loaded as expected.
df.head(n=5)

In [None]:
# Feature frame: every column of df except the target column 'label'.
# Note this is taken before the null rows are dropped from df further down.
x = df.drop(columns='label')


In [None]:
# Preview the top five rows of the feature frame x.
x.head(n=5)


In [None]:
# Target series: the 'label' column of df, selected for all rows.
y = df.loc[:, 'label']


In [None]:
# Preview the top five values of the target series y.
y.head(n=5)


In [None]:
# Shape of df as (rows, columns), measured before rows with nulls are dropped
df.shape


In [None]:
# Keep only the rows that have no missing value in any column.
df = df.dropna(axis=0, how='any')

In [None]:
# Shape of df again, now that rows containing nulls have been removed
df.shape

In [None]:
# Preview the top ten rows of the cleaned DataFrame.
df.head(n=10)

In [None]:
# Work on an independent (deep) copy so later edits do not touch df itself.
messages = df.copy(deep=True)


In [None]:
# Renumber the rows 0..n-1 so that positional lookups such as
# messages['title'][i] work after dropna() left gaps in the index.
# The frame is rebuilt from df instead of being mutated in place, so re-running
# this cell always produces the same result; an in-place reset_index() inserts
# an extra index column (or raises) each time the cell is executed again.
# As before, the old index is kept as a regular column.
messages = df.reset_index()

In [None]:
# Preview the top ten rows of messages.
messages.head(n=10)

In [None]:
# Show the top ten rows of the messages DataFrame.
messages.head(n=10)


In [None]:
# Download NLTK's 'stopwords' resource; the preprocessing cell below calls
# stopwords.words('english'), which reads from this resource.
import nltk
nltk.download('stopwords')


In [None]:
# Turn every headline in messages['title'] into a space-joined string of
# lower-case, stemmed, non-stopword tokens. corpus keeps the row order of
# messages, so corpus[i] belongs to row i.
ps = PorterStemmer()

# Build the stopword lookup once. The previous version evaluated
# stopwords.words('english') for every single token and searched the resulting
# list linearly; a set built up front gives the same membership answers with
# constant-time lookups.
english_stopwords = set(stopwords.words('english'))

corpus = []
for title in messages['title']:
    # Replace everything that is not an ASCII letter with a space, then
    # lower-case the text and split it on whitespace.
    tokens = re.sub('[^a-zA-Z]', ' ', title).lower().split()
    # Filter out stopwords first, then stem the remaining tokens.
    stems = [ps.stem(token) for token in tokens if token not in english_stopwords]
    # Store the processed headline as a single string.
    corpus.append(' '.join(stems))


In [None]:
# Spot-check the first few processed headlines. Printing the whole corpus
# would put one entry per dataset row into the cell output.
print(corpus[:10])

In [None]:
# Bag-of-words features: count unigrams, bigrams and trigrams in each
# processed headline, keeping only the 5000 most frequent n-grams.
from sklearn.feature_extraction.text import CountVectorizer

cv = CountVectorizer(ngram_range=(1, 3), max_features=5000)

# Learn the vocabulary and vectorise the corpus, then convert the result to a
# dense array of token counts (one row per document).
doc_term_matrix = cv.fit_transform(corpus)
X = doc_term_matrix.toarray()


In [None]:
# Print the dense token-count matrix X produced by the CountVectorizer
print(X)

In [None]:
# Shape of X as (documents, n-gram features); max_features=5000 caps the
# second dimension
X.shape



In [None]:
# Target vector: the 'label' column of messages, for all rows.
Y = messages.loc[:, 'label']

In [None]:
# Hold out part of the data for evaluation.
from sklearn.model_selection import train_test_split

# test_size=0.33 -> 33% of the rows go to the test set, 67% to training.
# random_state=0 fixes the shuffle so the split is the same on every run.
split_options = {'test_size': 0.33, 'random_state': 0}

# Result order: train features, test features, train labels, test labels.
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, **split_options)


In [None]:
# Report the dimensions of each piece of the train/test split.
for split_name, split_data in (("X_train", X_train), ("X_test", X_test),
                               ("Y_train", Y_train), ("Y_test", Y_test)):
    print("Shape of " + split_name + ":", split_data.shape)

In [None]:
# Vocabulary (n-grams) learned by the fitted CountVectorizer cv.
feature_names = cv.get_feature_names_out()

# Peek at the first 20 entries.
first_features = feature_names[:20]
print(first_features)



In [None]:
# Show the configuration of the CountVectorizer (includes the max_features and
# ngram_range values set when cv was created)
cv.get_params()


In [None]:
# Label each column of the training count matrix with its n-gram.
feature_names = cv.get_feature_names_out()
count_df = pd.DataFrame(data=X_train, columns=feature_names)

# Print the top five rows as plain text.
print(count_df.head(n=5))

In [None]:
import matplotlib.pyplot as plt
import numpy as np
import itertools

def plot_confusion_matrix(cm, classes,
                          normalize=False,
                          title='Confusion matrix',
                          cmap=plt.cm.Blues):
    """
    Draw a confusion matrix as an annotated heat map on the current figure.

    Adapted from the scikit-learn example:
    http://scikit-learn.org/stable/auto_examples/model_selection/plot_confusion_matrix.html

    Also prints one line stating whether the matrix was normalized.

    Parameters:
    - cm: Confusion matrix, a 2D numpy array. Rows are labelled as the true
      class and columns as the predicted class.
    - classes: List of class labels (e.g., ['class1', 'class2']), used as tick
      labels on both axes in the order given.
    - normalize: If True, divide each row by its sum so every cell shows the
      fraction of that row; rows that sum to zero are left as zeros.
    - title: Title of the plot.
    - cmap: Color map for the plot.

    Returns:
    - None.
    """
    # Normalize BEFORE drawing. Previously the image was rendered from the raw
    # counts and only the text annotations (and their colour threshold) used
    # the normalized values, so colours and numbers disagreed.
    if normalize:
        row_totals = cm.sum(axis=1)[:, np.newaxis]
        # A row with no samples would give 0/0 = NaN; dividing by 1 instead
        # keeps that row at zero.
        row_totals[row_totals == 0] = 1
        cm = cm.astype('float') / row_totals
        print("Normalized confusion matrix")
    else:
        print('Confusion matrix, without normalization')

    # Heat map of the (possibly normalized) matrix with a colour scale.
    plt.imshow(cm, interpolation='nearest', cmap=cmap)
    plt.title(title)
    plt.colorbar()
    tick_marks = np.arange(len(classes))
    plt.xticks(tick_marks, classes, rotation=45)  # rotated so long labels fit
    plt.yticks(tick_marks, classes)

    # Cells above half of the maximum get white text, the rest black text.
    thresh = cm.max() / 2.
    cell_format = '.2f' if normalize else 'd'

    # Write each cell's value at its position (x = column j, y = row i).
    for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])):
        plt.text(j, i, format(cm[i, j], cell_format),
                 horizontalalignment="center",
                 color="white" if cm[i, j] > thresh else "black")

    plt.tight_layout()
    plt.ylabel('True label')
    plt.xlabel('Predicted label')


In [None]:
# Baseline model: Multinomial Naive Bayes from scikit-learn
from sklearn.naive_bayes import MultinomialNB

# Created with default constructor arguments; it is fitted in a later cell
classifier = MultinomialNB()


In [None]:
# Importing necessary modules for metrics and utility functions
from sklearn import metrics
import numpy as np
import itertools


In [None]:
# Fit the Naive Bayes model on the training split and predict the test split.
pred = classifier.fit(X_train, Y_train).predict(X_test)

# Fraction of test rows whose predicted label matches the true label.
score = metrics.accuracy_score(y_true=Y_test, y_pred=pred)
print("Accuracy: %0.3f" % score)

# Confusion matrix of true labels against predicted labels.
cm = metrics.confusion_matrix(y_true=Y_test, y_pred=pred)

# NOTE(review): confusion_matrix orders classes by sorted label value, so
# 'FAKE' is attached to the smaller label value here - confirm that this
# matches the dataset's label encoding.
plot_confusion_matrix(cm, classes=['FAKE', 'REAL'])


In [None]:
# Refit the same classifier on the training split.
classifier.fit(X_train, Y_train)

# Predict the test split and measure the accuracy of those predictions.
pred = classifier.predict(X_test)
score = metrics.accuracy_score(y_true=Y_test, y_pred=pred)

# Leave the accuracy as the cell's last expression so it is displayed.
score


# Passive Aggressive Classifier Algorithm

In [None]:
# Linear model for comparison: scikit-learn's PassiveAggressiveClassifier
from sklearn.linear_model import PassiveAggressiveClassifier

# Created with default constructor arguments; the next cell creates a fresh
# instance under the same name before fitting
linear_clf = PassiveAggressiveClassifier()


In [None]:
# Fit and evaluate the Passive Aggressive classifier.
# random_state is fixed (0, matching the train/test split) because the learner
# shuffles the training data; without a seed the reported accuracy and
# confusion matrix change from run to run.
linear_clf = PassiveAggressiveClassifier(random_state=0)

# Train on the training split.
linear_clf.fit(X_train, Y_train)

# Predict the held-out test split.
pred = linear_clf.predict(X_test)

# Fraction of test rows predicted correctly.
score = metrics.accuracy_score(Y_test, pred)
print("accuracy:   %0.3f" % score)

# Confusion matrix of true labels against predicted labels, printed and plotted.
cm = metrics.confusion_matrix(Y_test, pred)
print("confusion matrix:")
print(cm)
# NOTE(review): confusion_matrix orders classes by sorted label value, so
# 'FAKE Data' is attached to the smaller label value - confirm that this
# matches the dataset's label encoding.
plot_confusion_matrix(cm, classes=['FAKE Data', 'REAL Data'])

Multinomial Naive Bayes Classifier with Hyperparameter Tuning

In [None]:
# Starting model for the alpha search: Multinomial Naive Bayes with alpha=0.1.
# The search loop in the next cell replaces it with the best-scoring model.
classifier = MultinomialNB(alpha=0.1)


In [None]:
# Search over the smoothing parameter alpha of Multinomial Naive Bayes and keep
# the best-scoring model in `classifier`.
# NOTE(review): models are compared on the test split, so the best score is an
# optimistic estimate; confirm whether a separate validation split or
# cross-validation should be used for the selection.
previous_score = 0

# Candidate values 0.1, 0.2, ..., 0.9. The grid deliberately starts at 0.1:
# alpha=0 turns smoothing off, which gives zero probabilities for unseen
# n-grams and makes scikit-learn emit warnings.
for alpha in np.arange(1, 10) / 10:
    # Model with the current smoothing strength.
    sub_classifier = MultinomialNB(alpha=alpha)

    # Train on the training split.
    sub_classifier.fit(X_train, Y_train)

    # Predict the test split.
    y_pred = sub_classifier.predict(X_test)

    # Accuracy of those predictions.
    score = metrics.accuracy_score(Y_test, y_pred)

    # Strict '>' keeps the first (smallest) alpha when scores tie.
    if score > previous_score:
        classifier = sub_classifier
        previous_score = score

    # Report every candidate, not only the winner.
    print("Alpha: {}, Score: {}".format(alpha, score))


In [None]:
# Recreate the train/test split of X and Y.
from sklearn.model_selection import train_test_split

# Same settings as the earlier split: 33% of the rows for testing, 67% for
# training, with random_state=0 so the partition is reproducible.
split_options = {'test_size': 0.33, 'random_state': 0}
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, **split_options)


In [None]:
# Multinomial Naive Bayes with default settings, refitted and re-evaluated.
from sklearn.naive_bayes import MultinomialNB

# Create the model and train it on X_train / Y_train in one step
# (fit returns the fitted estimator).
classifier = MultinomialNB().fit(X_train, Y_train)

# Predict the labels of the test split.
pred = classifier.predict(X_test)

# Fraction of test rows predicted correctly.
score = metrics.accuracy_score(y_true=Y_test, y_pred=pred)
print("Accuracy:   %0.3f" % score)

# Confusion matrix of true labels against predicted labels.
cm = metrics.confusion_matrix(y_true=Y_test, y_pred=pred)

# NOTE(review): confusion_matrix orders classes by sorted label value, so
# 'FAKE' is attached to the smaller label value here - confirm that this
# matches the dataset's label encoding.
plot_confusion_matrix(cm, classes=['FAKE', 'REAL'])
