In [None]:
# Import necessary libraries
import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Embedding, Conv1D, GlobalMaxPooling1D,LSTM, Dropout
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix

### Loading Datasets for Training

In [None]:
# Read the three source CSV files: fake articles, real articles, and an extra news file
df_fake, df_true, df_true2 = (
    pd.read_csv(csv_path)
    for csv_path in (
        "datasets/news/Fake.csv",
        "datasets/news/True.csv",
        "datasets/news/News.csv",
    )
)

In [None]:
# Attach the target column: 0 marks rows from the fake-news file, 1 marks rows
# from the real-news file.
# NOTE(review): df_true2 is not given a label here. Presumably News.csv already
# ships a "label" column - confirm; if it does not, its rows end up with NaN
# labels once the frames are concatenated.
df_fake["label"] = 0
df_true["label"] = 1
df_true.head()

In [None]:
# Stack the three frames on top of each other (row-wise is the concat default)
df_marge = pd.concat([df_fake, df_true, df_true2])
df_marge.head(10)

In [None]:
# Only the article body and its label are kept for modelling
df = df_marge.drop(columns=["title", "subject", "date"])
df.columns

In [None]:
# Total number of cells (rows x columns) in the frame, not the row count
df.size

### The final dataset is `df`

### Data Preprocessing

In [None]:
# Discard every row whose 'text' value is missing (rows are the dropna default)
df = df.dropna(subset=['text'])
df.size

In [None]:
# Shuffle all rows. The fixed seed makes the row order - and therefore the
# train/test split computed from it later - repeatable across kernel restarts.
df = df.sample(frac=1, random_state=42)
df.head()

In [None]:
# Give the shuffled frame a fresh 0..n-1 index; drop=True discards the old
# index instead of materialising it as an extra "index" column
df = df.reset_index(drop=True)
df.columns

In [None]:
# Preview the first rows of the frame
df.head()

#### Creating a function to convert the text to lowercase and remove extra spaces, special characters, URLs and links.

In [None]:
import re
import string
def wordopt(text):
    """Normalise a raw article string before tokenisation.

    Steps, in order:

    1. lowercase the text;
    2. delete square-bracketed spans such as ``[video]``;
    3. replace URLs (``http://``, ``https://`` or ``www.`` prefixes) with a space;
    4. replace HTML-like tags (``<...>``) with a space;
    5. replace every remaining non-word character with a space;
    6. delete leftover punctuation (in practice only the underscore, which
       counts as a word character and so survives step 5);
    7. delete every token that contains a digit;
    8. collapse runs of whitespace into one space and strip both ends.

    URLs and tags must be handled before step 5: once ``://``, ``.``, ``<``
    and ``>`` have been turned into spaces the URL and tag patterns can no
    longer match, and their fragments would survive as ordinary tokens.

    Parameters
    ----------
    text : str
        Raw text.

    Returns
    -------
    str
        Cleaned text made of word characters separated by single spaces.
    """
    text = text.lower()
    text = re.sub(r'\[.*?\]', '', text)
    # A space (rather than '') keeps the words on either side from being glued together.
    text = re.sub(r'https?://\S+|www\.\S+', ' ', text)
    text = re.sub(r'<.*?>+', ' ', text)
    text = re.sub(r'\W', ' ', text)
    text = re.sub('[%s]' % re.escape(string.punctuation), '', text)
    text = re.sub(r'\w*\d\w*', '', text)
    return re.sub(r'\s+', ' ', text).strip()

In [None]:
# Clean every article body element by element
df["text"] = df["text"].map(wordopt)

#### Lemmatization
##### Lemmatization is the process of reducing words to their base or root form, which can help to group together words with similar meanings and reduce the number of unique words in a dataset. 

In [None]:
import nltk
from nltk.stem import WordNetLemmatizer
# Fetch the NLTK data packages this cell relies on: 'punkt' for
# nltk.word_tokenize and 'wordnet' for the lemmatizer
nltk.download('punkt')
nltk.download('wordnet')
# One shared lemmatizer instance, reused by the helper functions below
lemmatizer = WordNetLemmatizer()
# Define a function to lemmatize a single word
def lemmatize_word(word):
    """Return the lemma of a single word.

    Thin wrapper around the module-level ``lemmatizer``; only the word is
    passed, so the lemmatizer's own defaults apply to everything else.
    """
    return lemmatizer.lemmatize(word)
# Define a function to lemmatize a list of words
def lemmatize_text(text):
    """Tokenise ``text`` with NLTK, lemmatise each token, and re-join with single spaces."""
    return ' '.join(map(lemmatize_word, nltk.word_tokenize(text)))

In [None]:
# Lemmatise every cleaned article body element by element
df['text'] = df['text'].map(lemmatize_text)

In [None]:
# X holds the article text (model input), Y the 0/1 label (target)
X, Y = df["text"], df["label"]

In [None]:
# Hold out 20% of the rows for testing; the seed fixes which rows are picked
X_train, X_test, y_train, y_test = train_test_split(
    X,
    Y,
    test_size=0.2,
    random_state=42,
)

### Feature Extraction

#### Tokenization
##### Tokenization is the process of dividing a text into smaller units; here each word is mapped to an integer index.

In [None]:
# Tokenizer settings
num_words = 5000  # size of the vocabulary the tokenizer keeps (most frequent words)
max_len = 500     # every article is padded / truncated to this many tokens

# defining tokenizer - fitted on the training split only, so no test text
# influences the vocabulary
tokenizer = Tokenizer(num_words=num_words)
tokenizer.fit_on_texts(X_train)

# Converting text to sequence
X_train = tokenizer.texts_to_sequences(X_train)
X_test = tokenizer.texts_to_sequences(X_test)

# The tokenizer only emits indices below num_words, so an embedding table never
# needs more rows than that, even though word_index lists every word seen.
vocab_size = min(num_words, len(tokenizer.word_index) + 1)

# padding (zeros are appended after the tokens)
X_train = pad_sequences(X_train, padding='post', maxlen=max_len)
X_test = pad_sequences(X_test, padding='post', maxlen=max_len)

# Exporting Tokenizer - create the target folder first so a fresh checkout
# does not fail with FileNotFoundError
import os
import joblib
os.makedirs("models/fakeNews", exist_ok=True)
joblib.dump(tokenizer,"models/fakeNews/tokenizer")

### Convolutional Neural Networks (CNNs)
#### CNNs are commonly used for text classification tasks such as fake news detection. They can learn to detect patterns and features in the text by using convolutional layers and pooling layers.

In [None]:
# Build the CNN as one layer list:
# embedding -> 1-D convolution -> global max pooling -> dense head with dropout
CNN = Sequential([
    Embedding(input_dim=vocab_size, output_dim=50, input_length=max_len),
    Conv1D(filters=64, kernel_size=3, activation='relu'),
    GlobalMaxPooling1D(),
    Dense(units=64, activation='relu'),
    Dropout(rate=0.2),
    Dense(units=1, activation='sigmoid'),
])
# Single sigmoid output, so binary cross-entropy; accuracy is tracked while training
CNN.compile(
    optimizer='adam',
    loss='binary_crossentropy',
    metrics=['accuracy'],
)

In [None]:
# Fit the CNN for 10 epochs in mini-batches of 32.
# NOTE(review): the test split is passed as validation data here, so the
# per-epoch validation numbers are computed on the same rows that are used for
# the final evaluation - confirm this is intended.
CNN.fit(
    X_train,
    y_train,
    batch_size=32,
    epochs=10,
    verbose=1,
    validation_data=(X_test, y_test),
)

In [None]:
# Score the CNN on the test split; rounding turns each sigmoid output into 0 or 1
y_pred = np.round(CNN.predict(X_test))
acc_score = accuracy_score(y_test, y_pred)
cm = confusion_matrix(y_test, y_pred)
print(f'Accuracy: {round(acc_score*100,2)}%')
print("Confusion Matrix: ", cm)
# Persist the trained network to disk
CNN.save('models/fakeNews/CNN.h5')

### Recurrent Neural Networks (RNNs)
#### RNNs are another popular choice for text classification tasks. They can process sequential data by using feedback loops, allowing them to capture the context and meaning of the text.

In [None]:
# Build the recurrent model as one layer list:
# embedding -> single LSTM layer (with input and recurrent dropout) -> sigmoid output
RNN = Sequential([
    Embedding(5000, 128, input_length=X_train.shape[1]),
    LSTM(128, dropout=0.2, recurrent_dropout=0.2),
    Dense(1, activation='sigmoid'),
])
# Single sigmoid output, so binary cross-entropy; accuracy is tracked while training
RNN.compile(
    optimizer='adam',
    loss='binary_crossentropy',
    metrics=['accuracy'],
)

In [None]:
# Fit the LSTM for 5 epochs in mini-batches of 64, with 20% of the training
# data set aside for validation. (An earlier configuration used batch_size=32
# and epochs=10.)
RNN.fit(
    X_train,
    y_train,
    epochs=5,
    batch_size=64,
    validation_split=0.2,
)

In [None]:
# Score the LSTM on the test split; rounding turns each sigmoid output into 0 or 1
y_pred = np.round(RNN.predict(X_test))
acc_score = accuracy_score(y_test, y_pred)
cm = confusion_matrix(y_test, y_pred)
print(f'Accuracy: {round(acc_score*100,2)}%')
print("Confusion Matrix: ", cm)
# Persist the trained network to disk
RNN.save('models/fakeNews/RNN.h5')

In [None]:
def manual_testing(news):
    """Run one raw article through both trained models and print their outputs.

    The text is prepared the same way as the training data: cleaned with
    ``wordopt``, lemmatised with ``lemmatize_text``, converted to word indices
    with the fitted ``tokenizer`` and post-padded to the training sequence
    length. The same padded array is then fed to both ``CNN`` and ``RNN``.

    Parameters
    ----------
    news : str
        Raw article text.

    Returns
    -------
    None
        The two model outputs are printed, not returned. Each is the sigmoid
        output of the respective network (training labels: 0 = fake, 1 = real).
    """
    cleaned = lemmatize_text(wordopt(news))
    sequences = tokenizer.texts_to_sequences([cleaned])
    # padding='post' mirrors how X_train / X_test were padded; the pad_sequences
    # default is 'pre', which would shift every token relative to training.
    padded = pad_sequences(sequences, padding='post', maxlen=X_train.shape[1])
    # Both models need the fixed-length padded array, not the raw index list.
    pred_CNN = CNN.predict(padded)
    pred_RNN = RNN.predict(padded)
    print("\n\nCNN Prediction: {} \nRNN Prediction: {}".format(pred_CNN,pred_RNN))

In [None]:
# Read one article from the prompt (input() already yields a str) and classify it
news = input()
manual_testing(news)