# Main Library

In [1]:
# Reading Data
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Preprocessing Data And Analysis
import string
import nltk, re
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Building Model
import tensorflow.keras as k
from tensorflow.keras.layers import Dense, LSTM, GlobalAveragePooling1D, Bidirectional, Embedding
from sklearn.metrics import accuracy_score, confusion_matrix

# Reading Data

In [2]:
fake_data = pd.read_csv(r"D:\Courses language programming\Natural Language Processing\Projects For NLP\Data\Fake and real news dataset\Fake.csv")
real_data = pd.read_csv(r"D:\Courses language programming\Natural Language Processing\Projects For NLP\Data\Fake and real news dataset\True.csv")

In [3]:
fake_data.head(3)

Unnamed: 0,title,text,subject,date
0,Donald Trump Sends Out Embarrassing New Year’...,Donald Trump just couldn t wish all Americans ...,News,"December 31, 2017"
1,Drunk Bragging Trump Staffer Started Russian ...,House Intelligence Committee Chairman Devin Nu...,News,"December 31, 2017"
2,Sheriff David Clarke Becomes An Internet Joke...,"On Friday, it was revealed that former Milwauk...",News,"December 30, 2017"


In [4]:
real_data.head(3)

Unnamed: 0,title,text,subject,date
0,"As U.S. budget fight looms, Republicans flip t...",WASHINGTON (Reuters) - The head of a conservat...,politicsNews,"December 31, 2017"
1,U.S. military to accept transgender recruits o...,WASHINGTON (Reuters) - Transgender people will...,politicsNews,"December 29, 2017"
2,Senior U.S. Republican senator: 'Let Mr. Muell...,WASHINGTON (Reuters) - The special counsel inv...,politicsNews,"December 31, 2017"


# Analysis Data

In [5]:
def Drop_col(data):
    columns = ["subject", "date"]
    for col in columns:
        data = data.drop(col, axis=1)
    return data

fake_data = Drop_col(fake_data)
real_data = Drop_col(real_data)

In [6]:
real_data.head(2)

Unnamed: 0,title,text
0,"As U.S. budget fight looms, Republicans flip t...",WASHINGTON (Reuters) - The head of a conservat...
1,U.S. military to accept transgender recruits o...,WASHINGTON (Reuters) - Transgender people will...


In [7]:
fake_data.head(2)

Unnamed: 0,title,text
0,Donald Trump Sends Out Embarrassing New Year’...,Donald Trump just couldn t wish all Americans ...
1,Drunk Bragging Trump Staffer Started Russian ...,House Intelligence Committee Chairman Devin Nu...


In [8]:
real_data["Label"] = "Real"
fake_data["Label"] = "Fake"

In [9]:
fake_data.head(2)

Unnamed: 0,title,text,Label
0,Donald Trump Sends Out Embarrassing New Year’...,Donald Trump just couldn t wish all Americans ...,Fake
1,Drunk Bragging Trump Staffer Started Russian ...,House Intelligence Committee Chairman Devin Nu...,Fake


In [10]:
real_data.head(2)

Unnamed: 0,title,text,Label
0,"As U.S. budget fight looms, Republicans flip t...",WASHINGTON (Reuters) - The head of a conservat...,Real
1,U.S. military to accept transgender recruits o...,WASHINGTON (Reuters) - Transgender people will...,Real


In [11]:
training_data = pd.concat([fake_data, real_data], axis=0)

In [12]:
training_data.head(5)

Unnamed: 0,title,text,Label
0,Donald Trump Sends Out Embarrassing New Year’...,Donald Trump just couldn t wish all Americans ...,Fake
1,Drunk Bragging Trump Staffer Started Russian ...,House Intelligence Committee Chairman Devin Nu...,Fake
2,Sheriff David Clarke Becomes An Internet Joke...,"On Friday, it was revealed that former Milwauk...",Fake
3,Trump Is So Obsessed He Even Has Obama’s Name...,"On Christmas day, Donald Trump announced that ...",Fake
4,Pope Francis Just Called Out Donald Trump Dur...,Pope Francis used his annual Christmas Day mes...,Fake


In [13]:
training_data.shape

(44898, 3)

In [14]:
training_data.loc[training_data["Label"]=="Fake", "Label"] = 1
training_data.loc[training_data["Label"]=="Real", "Label"] = 0

training_data["Label"] = training_data["Label"].astype("int32")

In [15]:
training_data.head(5)

Unnamed: 0,title,text,Label
0,Donald Trump Sends Out Embarrassing New Year’...,Donald Trump just couldn t wish all Americans ...,1
1,Drunk Bragging Trump Staffer Started Russian ...,House Intelligence Committee Chairman Devin Nu...,1
2,Sheriff David Clarke Becomes An Internet Joke...,"On Friday, it was revealed that former Milwauk...",1
3,Trump Is So Obsessed He Even Has Obama’s Name...,"On Christmas day, Donald Trump announced that ...",1
4,Pope Francis Just Called Out Donald Trump Dur...,Pope Francis used his annual Christmas Day mes...,1


In [16]:
training_data["new_text"] = training_data["title"] + "  " + training_data["text"]

In [17]:
training_data.head(3)

Unnamed: 0,title,text,Label,new_text
0,Donald Trump Sends Out Embarrassing New Year’...,Donald Trump just couldn t wish all Americans ...,1,Donald Trump Sends Out Embarrassing New Year’...
1,Drunk Bragging Trump Staffer Started Russian ...,House Intelligence Committee Chairman Devin Nu...,1,Drunk Bragging Trump Staffer Started Russian ...
2,Sheriff David Clarke Becomes An Internet Joke...,"On Friday, it was revealed that former Milwauk...",1,Sheriff David Clarke Becomes An Internet Joke...


In [18]:
training_data.head(2)

Unnamed: 0,title,text,Label,new_text
0,Donald Trump Sends Out Embarrassing New Year’...,Donald Trump just couldn t wish all Americans ...,1,Donald Trump Sends Out Embarrassing New Year’...
1,Drunk Bragging Trump Staffer Started Russian ...,House Intelligence Committee Chairman Devin Nu...,1,Drunk Bragging Trump Staffer Started Russian ...


# Preprocessing ==> Data

In [19]:
stop_word = stopwords.words("english")
punc = list(string.punctuation)
def remove_stopwords(text):
    text = text.lower()
    
    text = " ".join([word for word in word_tokenize(text) if ((word not in stop_word) and word not in punc)])
    
    text = re.sub("[^a-z]", " ", text)
    
    return text

In [20]:
training_data["Final_text"] = training_data["new_text"].apply(remove_stopwords)

In [21]:
training_data.drop(["title", "text", "new_text"], axis=1, inplace=True)

In [22]:
training_data.head(5)

Unnamed: 0,Label,Final_text
0,1,donald trump sends embarrassing new year eve...
1,1,drunk bragging trump staffer started russian c...
2,1,sheriff david clarke becomes internet joke thr...
3,1,trump obsessed even obama name coded website...
4,1,pope francis called donald trump christmas spe...


# Splitting Data

In [23]:
X = training_data["Final_text"]
Y = training_data["Label"]

# Make Tokenization

In [24]:
Max_vocab_size = 20000
max_sequance_lenght = 100
Embeding_dim = 100

In [25]:
tokenize = Tokenizer(oov_token="<OOV>")
tokenize.fit_on_texts(X)
word_idx = tokenize.word_index

text2seq = tokenize.texts_to_sequences(X)

pad_seq = pad_sequences(text2seq, maxlen=max_sequance_lenght, padding="pre", truncating="pre")

print("The Padding Sequance Shape is  --> ", pad_seq.shape)

The Padding Sequance Shape is  -->  (44898, 100)


In [26]:
len(word_idx)

115946

In [27]:
input_length = max(len(seq) for seq in text2seq)

vocabulary_size = len(word_idx) + 1

vocabulary_size, input_length

(115947, 5086)

In [28]:
x_train, x_test, y_train, y_test = train_test_split(pad_seq, Y, train_size=0.7, shuffle=True)

# Building Model

In [29]:
model = k.models.Sequential(
[
    Embedding(vocabulary_size, input_length, input_length=max_sequance_lenght),
    Bidirectional(LSTM(15, return_sequences=True)),
    GlobalAveragePooling1D(),
    Dense(1, activation="sigmoid")
])

model.compile(optimizer="adam", loss="binary_crossentropy", metrics=["accuracy"])

model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 100, 5086)         589706442 
                                                                 
 bidirectional (Bidirection  (None, 100, 30)           612240    
 al)                                                             
                                                                 
 global_average_pooling1d (  (None, 30)                0         
 GlobalAveragePooling1D)                                         
                                                                 
 dense (Dense)               (None, 1)                 31        
                                                                 
Total params: 590318713 (-1933692444.00 Byte)
Trainable params: 590318713 (-1933692444.00 Byte)
Non-trainable params: 0 (0.00 Byte)
________________________________________________________

  total_memory_size += weight_shape * per_param_size


In [None]:
history = model.fit(x_train, y_train, epochs=10, validation_data=(x_test, y_test), verbose=2)

In [None]:
plt.plot(history.history["loss"], label="Loss")
plt.plot(history.history["val_loss"], label="Val_Loss")

plt.xlabel("Epochs")
plt.ylabel("Loss")

plt.title("Loss Vs Epochs")

plt.legend()
plt.grid()

In [None]:
plt.plot(history.history["accuracy"], label="accuracy")
plt.plot(history.history["val_accuracy"], label="val_accuracy")

plt.xlabel("Epochs")
plt.ylabel("Accuracy")

plt.title("Accuracy Vs Epochs")

plt.legend()
plt.grid()

In [None]:
text = input().split()

new_text = ""
for word in word_tokenize(text):
    if word not in stop_words and word not in punc:
        new_text += word
        new_text += " "

test_sequace = tokenize.texts_to_sequences([new_text])
test_padding = pad_sequences(test_sequace, maxlen=116, padding="pre", truncating="pre")


# test_sequace
prediction = model.predict(test_padding)

if np.argmax(prediction) == 1: print("This Massage is -->  SPAM ")
else: print("This Massage is -->  HAM ")