<a href="https://colab.research.google.com/github/SURESHBEEKHANI/Natural-Language-Processing/blob/main/faker_news_classifier_using_lstm.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Fake News Classification

In [None]:
# Importing the Pandas library as 'pd' to work with datasets
import pandas as pd


In [None]:
# Read the training CSV into a DataFrame
train_csv_path = '/content/train.csv'
data = pd.read_csv(train_csv_path)

In [None]:
# Preview the top of the dataset (head() returns 5 rows by default)
data.head()

In [None]:
# Print a summary of the DataFrame (columns, non-null counts, dtypes)
data.info()

In [None]:
# Show the (rows, columns) dimensions of the dataset before any cleaning
data.shape


In [None]:
# Count missing values per column
data.isna().sum()

In [None]:
# Keep only the rows in which every column has a value
data = data.dropna(axis=0, how='any')

In [None]:
# Confirm that no missing values remain after dropping incomplete rows
data.isna().sum()

In [None]:
# Show the (rows, columns) dimensions again, now that rows with nulls are gone
data.shape


In [None]:
# Independent features: every column except the target column 'label'
X = data.drop(columns=['label'])

In [None]:
# Dependent feature: the 'label' column is the training target
y = data.loc[:, 'label']

In [None]:
# Report the dimensions of the feature frame and of the target series
for description, frame in (
    ("The shape of X variable:", X),
    ("The shape of y variable:", y),
):
    print(description, frame.shape)

In [None]:
# Import the TensorFlow library
import tensorflow as tf

# Display the installed TensorFlow version as the cell output
tf.__version__

In [None]:
# Import the Embedding layer from tensorflow.keras
from tensorflow.keras.layers import Embedding

# Import the pad_sequences function for setting sequence representation to one size
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Import the Sequential model from tensorflow.keras
from tensorflow.keras.models import Sequential

# Import the one_hot function for one-hot encoding of text
from tensorflow.keras.preprocessing.text import one_hot

# Import the LSTM layer from tensorflow.keras
from tensorflow.keras.layers import LSTM

# Import the Dense layer from tensorflow.keras
from tensorflow.keras.layers import Dense


In [None]:
# Vocabulary size: passed to one_hot() when encoding words and used as the
# Embedding layer's input dimension
vocab_size = 5_000

# Text Preprocessing

In [None]:
# Work on an independent copy of the input features so X itself stays untouched
messenge = X.copy(deep=True)

In [None]:
# Renumber the rows 0..n-1 (dropna left gaps in the index); the previous index
# is kept as a regular column
messenge = messenge.reset_index()


In [None]:
# Import the nltk library for text preprocessing
import nltk

# Import the re library for removing punctuations using regular expressions
import re

# Import the stopwords from nltk.corpus to filter out unmeaningful words
from nltk.corpus import stopwords


In [None]:
# Download the NLTK stopwords corpus; the preprocessing step below reads the
# English stopword list from it
nltk.download('stopwords')


In [None]:
# Data Preprocessing

# Import the PorterStemmer from nltk for reducing words to their root form
from nltk.stem.porter import PorterStemmer

# Define the PorterStemmer object
ps = PorterStemmer()

# Build the English stopword set once, outside the loop. Calling
# stopwords.words('english') for every word rebuilds the whole list each time
# and then scans it linearly; a set built once gives constant-time lookups.
stop_words = set(stopwords.words('english'))

# Cleaned, stemmed titles, in the same order as the rows of `messenge`
corpus = []

for title in messenge['title']:
    # Strip punctuation (anything that is neither a word character nor
    # whitespace), lowercase, and split into words
    words = re.sub(r'[^\w\s]', '', title).lower().split()
    # Drop stopwords, stem the remaining words, and join them back into a
    # single string
    review = ' '.join(ps.stem(word) for word in words if word not in stop_words)
    corpus.append(review)


In [None]:
# Preview the first few cleaned titles instead of dumping the entire list
# into the cell output
corpus[:5]

# One-Hot Representation

In [None]:
# Encode every cleaned title as a list of integer word indices using Keras'
# one_hot with a vocabulary of vocab_size
one_hot_rep = [one_hot(words, vocab_size) for words in corpus]

# Preview the first few encodings instead of dumping the entire list
one_hot_rep[:5]

# Embeddings Representation

In [None]:
# Every encoded title is brought to exactly this many entries
sent_length = 20

# Pad/truncate each index sequence in one_hot_rep to sent_length entries;
# padding='pre' puts the padding at the front of the sequence
embedded_docs = pad_sequences(
    one_hot_rep,
    maxlen=sent_length,
    padding='pre',
)

# Show the padded index matrix
print(embedded_docs)


In [None]:
# Number of padded documents (rows of embedded_docs)
print(embedded_docs.shape[0])

In [None]:
# Print how many padded, encoded sentences there are (one per row of
# embedded_docs)
print(len(embedded_docs))


In [None]:
# Inspect a single padded document: the row at index 100
print(embedded_docs[100, :])

In [None]:
# Dimension of the dense embedding vector learned for each vocabulary index
dim = 40

# Create the LSTM model
model = Sequential()

# Add an embedding layer
# input_dim is the size of the vocabulary
# output_dim is the dimension of the dense embedding (taken from `dim` so the
#   value is defined in exactly one place)
# input_length is the length of input sequences
model.add(Embedding(input_dim=vocab_size, output_dim=dim, input_length=sent_length))

# Add an LSTM layer with 100 units
model.add(LSTM(100))

# Add a Dense layer with 1 unit and a sigmoid activation function for binary classification
model.add(Dense(1, activation='sigmoid'))

# Compile the model: binary cross-entropy loss, Adam optimizer, accuracy metric
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

In [None]:
# Print a summary of the model's layers
model.summary()

In [None]:
import numpy as np

# Model inputs: the padded index sequences as a NumPy array
X_final = np.array(embedded_docs, copy=True)

# Targets: the label column as a NumPy array
y_final = np.array(y, copy=True)


# Split the Dataset into Training and Test Sets

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 30% of the samples for testing and train on the remaining 70%;
# the fixed random_state makes the split repeatable
X_train, X_test, Y_train, Y_test = train_test_split(
    X_final,
    y_final,
    test_size=0.30,
    random_state=42,
)

# Show each of the four resulting arrays
for split_name, split_values in (
    ("X_train:", X_train),
    ("X_test:", X_test),
    ("Y_train:", Y_train),
    ("Y_test:", Y_test),
):
    print(split_name, split_values)


# Model Training

In [None]:
# Fit the model on the training split and report validation metrics on the
# test split after every epoch
#   epochs     - number of full passes over the training data
#   batch_size - number of samples per gradient update

model.fit(
    x=X_train,
    y=Y_train,
    batch_size=64,
    epochs=5,
    validation_data=(X_test, Y_test),
)


# Adding Dropout

In [None]:
from tensorflow.keras.layers import Dropout

# Dimension of the dense embedding vector
embedding_vector_features = 40

# Initialize the model (this replaces the previously trained `model`)
model = Sequential()

# Add embedding layer
# 'vocab_size' is the size of the vocabulary
# 'embedding_vector_features' is the dimension of the dense embedding
# 'input_length' is the length of input sequences
model.add(Embedding(vocab_size, embedding_vector_features, input_length=sent_length))

# Add dropout layer to prevent overfitting
# '0.3' is the dropout rate: 30% of the units are randomly set to zero during training
model.add(Dropout(0.3))

# Add LSTM layer
# '100' is the number of units in the LSTM layer
model.add(LSTM(100))

# Add another dropout layer to prevent overfitting
# '0.5' is the dropout rate: 50% of the units are randomly set to zero during training
model.add(Dropout(0.5))

# Add output layer
# 'Dense(1)' means we have one output neuron
# 'activation='sigmoid'' means we use the sigmoid activation function for binary classification
model.add(Dense(1, activation='sigmoid'))

# Compile the model
# 'loss='binary_crossentropy'' is the loss function used for binary classification
# 'optimizer='adam'' is the optimizer used to update the weights
# 'metrics=['accuracy']' means we want to track accuracy during training
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

# Summary of the model
model.summary()

# Train the dropout model. Rebuilding `model` above discards the weights
# learned earlier, so without this fit the evaluation that follows would score
# a randomly initialized network. Same settings as the first training run.
model.fit(X_train, Y_train, validation_data=(X_test, Y_test), epochs=5, batch_size=64)

# Performance Metrics And Accuracy

In [None]:
import numpy as np
from sklearn.metrics import confusion_matrix, accuracy_score

# The model ends in a single sigmoid unit, so predict() yields one probability
# per sample in a column of width 1. np.argmax over that last axis is always 0,
# which would label every sample as class 0. Threshold the probability at 0.5
# instead and flatten to a 1-D label vector.
y_prob = model.predict(X_test)
y_pred = (y_prob > 0.5).astype("int32").ravel()

# Compute confusion matrix
cm = confusion_matrix(Y_test, y_pred)

# Compute accuracy score
accuracy = accuracy_score(Y_test, y_pred)

# Print the confusion matrix and accuracy score
print("Confusion Matrix:")
print(cm)
print("Accuracy:", accuracy)
