In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

import matplotlib.pyplot as plt
import seaborn as sns

import re
import string

#for machine learning models
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import GlobalAveragePooling1D, Dense, Dropout, LSTM, GlobalMaxPool1D
from tensorflow.keras import preprocessing
from tensorflow.keras.layers import Embedding
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import plot_model

from sklearn.metrics import confusion_matrix, classification_report
from sklearn.model_selection import train_test_split
# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found beneath the Kaggle input directory.
for folder, _subdirs, names in os.walk('/kaggle/input'):
    for path in (os.path.join(folder, name) for name in names):
        print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# Exploratory Data Analysis.

In [None]:
# Read the True.csv articles into a frame and preview the first rows.
df_true = pd.read_csv(filepath_or_buffer="../input/fake-and-real-news-dataset/True.csv")
df_true.head(n=5)

In [None]:
# Read the Fake.csv articles into a frame and preview the first rows.
df_fake = pd.read_csv(filepath_or_buffer="../input/fake-and-real-news-dataset/Fake.csv")
df_fake.head(n=5)

In [None]:
# Attach the binary target to each source frame: 0 for df_true, 1 for df_fake.
for frame, target in ((df_true, 0), (df_fake, 1)):
    frame["label"] = target

In [None]:
# Stack both sources into one frame. ignore_index=True gives every row a unique
# label; without it each source keeps its own 0..n index and labels repeat.
df = pd.concat([df_true, df_fake], axis=0, ignore_index=True)
df.head()

In [None]:
# Number of missing values per column.
df.isna().sum()

In [None]:
# Number of rows before de-duplication.
df.shape[0]

In [None]:
# Remove rows that are exact duplicates across all columns.
df = df.drop_duplicates()

In [None]:
# Preview the de-duplicated frame.
df.head(n=5)

In [None]:
# Lookup table used by fix_contractions: lowercase contraction -> expanded form.
# Keys use the plain ASCII apostrophe. Several values list more than one
# possible expansion separated by " / " (e.g. "he's"); the whole value is
# substituted as-is.
contractions = {
"ain't": "am not / are not",
"aren't": "are not / am not",
"can't": "cannot",
"can't've": "cannot have",
"'cause": "because",
"could've": "could have",
"couldn't": "could not",
"couldn't've": "could not have",
"didn't": "did not",
"doesn't": "does not",
"don't": "do not",
"hadn't": "had not",
"hadn't've": "had not have",
"hasn't": "has not",
"haven't": "have not",
"he'd": "he had / he would",
"he'd've": "he would have",
"he'll": "he shall / he will",
"he'll've": "he shall have / he will have",
"he's": "he has / he is",
"how'd": "how did",
"how'd'y": "how do you",
"how'll": "how will",
"how's": "how has / how is",
"i'd": "I had / I would",
"i'd've": "I would have",
"i'll": "I shall / I will",
"i'll've": "I shall have / I will have",
"i'm": "I am",
"i've": "I have",
"isn't": "is not",
"it'd": "it had / it would",
"it'd've": "it would have",
"it'll": "it shall / it will",
"it'll've": "it shall have / it will have",
"it's": "it has / it is",
"let's": "let us",
"ma'am": "madam",
"mayn't": "may not",
"might've": "might have",
"mightn't": "might not",
"mightn't've": "might not have",
"must've": "must have",
"mustn't": "must not",
"mustn't've": "must not have",
"needn't": "need not",
"needn't've": "need not have",
"o'clock": "of the clock",
"oughtn't": "ought not",
"oughtn't've": "ought not have",
"shan't": "shall not",
"sha'n't": "shall not",
"shan't've": "shall not have",
"she'd": "she had / she would",
"she'd've": "she would have",
"she'll": "she shall / she will",
"she'll've": "she shall have / she will have",
"she's": "she has / she is",
"should've": "should have",
"shouldn't": "should not",
"shouldn't've": "should not have",
"so've": "so have",
"so's": "so as / so is",
"that'd": "that would / that had",
"that'd've": "that would have",
"that's": "that has / that is",
"there'd": "there had / there would",
"there'd've": "there would have",
"there's": "there has / there is",
"they'd": "they had / they would",
"they'd've": "they would have",
"they'll": "they shall / they will",
"they'll've": "they shall have / they will have",
"they're": "they are",
"they've": "they have",
"to've": "to have",
"wasn't": "was not",
"we'd": "we had / we would",
"we'd've": "we would have",
"we'll": "we will",
"we'll've": "we will have",
"we're": "we are",
"we've": "we have",
"weren't": "were not",
"what'll": "what shall / what will",
"what'll've": "what shall have / what will have",
"what're": "what are",
"what's": "what has / what is",
"what've": "what have",
"when's": "when has / when is",
"when've": "when have",
"where'd": "where did",
"where's": "where has / where is",
"where've": "where have",
"who'll": "who shall / who will",
"who'll've": "who shall have / who will have",
"who's": "who has / who is",
"who've": "who have",
"why's": "why has / why is",
"why've": "why have",
"will've": "will have",
"won't": "will not",
"won't've": "will not have",
"would've": "would have",
"wouldn't": "would not",
"wouldn't've": "would not have",
"y'all": "you all",
"y'all'd": "you all would",
"y'all'd've": "you all would have",
"y'all're": "you all are",
"y'all've": "you all have",
"you'd": "you had / you would",
"you'd've": "you would have",
"you'll": "you shall / you will",
"you'll've": "you shall have / you will have",
"you're": "you are",
"you've": "you have"
}

# Lookup table used by word_abbrev: abbreviation / slang token -> expansion.
# Every key must be lowercase because word_abbrev lowercases the word before
# looking it up (the former "IG" key could therefore never match).
abbreviations = {
    "$" : " dollar ",
    "€" : " euro ",
    "4ao" : "for adults only",
    "a.m" : "before midday",
    "a3" : "anytime anywhere anyplace",
    "aamof" : "as a matter of fact",
    "acct" : "account",
    "adih" : "another day in hell",
    "afaic" : "as far as i am concerned",
    "afaict" : "as far as i can tell",
    "afaik" : "as far as i know",
    "afair" : "as far as i remember",
    "afk" : "away from keyboard",
    "app" : "application",
    "approx" : "approximately",
    "apps" : "applications",
    "asap" : "as soon as possible",
    "asl" : "age, sex, location",
    "atk" : "at the keyboard",
    "ave." : "avenue",
    "aymm" : "are you my mother",
    "ayor" : "at your own risk",
    "b&b" : "bed and breakfast",
    "b+b" : "bed and breakfast",
    "b.c" : "before christ",
    "b2b" : "business to business",
    "b2c" : "business to customer",
    "b4" : "before",
    "b4n" : "bye for now",
    "b@u" : "back at you",
    "bae" : "before anyone else",
    "bak" : "back at keyboard",
    "bbbg" : "bye bye be good",
    "bbc" : "british broadcasting corporation",
    "bbias" : "be back in a second",
    "bbl" : "be back later",
    "bbs" : "be back soon",
    "be4" : "before",
    "bfn" : "bye for now",
    "blvd" : "boulevard",
    "bout" : "about",
    "brb" : "be right back",
    "bros" : "brothers",
    "brt" : "be right there",
    "bsaaw" : "big smile and a wink",
    "btw" : "by the way",
    "bwl" : "bursting with laughter",
    "c/o" : "care of",
    "cet" : "central european time",
    "cf" : "compare",
    "cia" : "central intelligence agency",
    "csl" : "can not stop laughing",
    "cu" : "see you",
    "cul8r" : "see you later",
    "cv" : "curriculum vitae",
    "cwot" : "complete waste of time",
    "cya" : "see you",
    "cyt" : "see you tomorrow",
    "dae" : "does anyone else",
    "dbmib" : "do not bother me i am busy",
    "diy" : "do it yourself",
    "dm" : "direct message",
    "dwh" : "during work hours",
    "e123" : "easy as one two three",
    "eet" : "eastern european time",
    "eg" : "example",
    "embm" : "early morning business meeting",
    "encl" : "enclosed",
    "encl." : "enclosed",
    "etc" : "and so on",
    "faq" : "frequently asked questions",
    "fawc" : "for anyone who cares",
    "fb" : "facebook",
    "fc" : "fingers crossed",
    "fig" : "figure",
    "fimh" : "forever in my heart",
    "ft." : "feet",
    "ft" : "featuring",
    "ftl" : "for the loss",
    "ftw" : "for the win",
    "fwiw" : "for what it is worth",
    "fyi" : "for your information",
    "g9" : "genius",
    "gahoy" : "get a hold of yourself",
    "gal" : "get a life",
    "gcse" : "general certificate of secondary education",
    "gfn" : "gone for now",
    "gg" : "good game",
    "gl" : "good luck",
    "glhf" : "good luck have fun",
    "gmt" : "greenwich mean time",
    "gmta" : "great minds think alike",
    "gn" : "good night",
    "g.o.a.t" : "greatest of all time",
    "goat" : "greatest of all time",
    "goi" : "get over it",
    "gps" : "global positioning system",
    "gr8" : "great",
    "gratz" : "congratulations",
    "gyal" : "girl",
    "h&c" : "hot and cold",
    "hp" : "horsepower",
    "hr" : "hour",
    "hrh" : "his royal highness",
    "ht" : "height",
    "ibrb" : "i will be right back",
    "ic" : "i see",
    "icq" : "i seek you",
    "icymi" : "in case you missed it",
    "idc" : "i do not care",
    "idgadf" : "i do not give a damn fuck",
    "idgaf" : "i do not give a fuck",
    "idk" : "i do not know",
    "ie" : "that is",
    "i.e" : "that is",
    "ifyp" : "i feel your pain",
    "ig" : "instagram",
    "iirc" : "if i remember correctly",
    "ilu" : "i love you",
    "ily" : "i love you",
    "imho" : "in my humble opinion",
    "imo" : "in my opinion",
    "imu" : "i miss you",
    "iow" : "in other words",
    "irl" : "in real life",
    "j4f" : "just for fun",
    "jic" : "just in case",
    "jk" : "just kidding",
    "jsyk" : "just so you know",
    "l8r" : "later",
    "lb" : "pound",
    "lbs" : "pounds",
    "ldr" : "long distance relationship",
    "lmao" : "laugh my ass off",
    "lmfao" : "laugh my fucking ass off",
    "lol" : "laughing out loud",
    "ltd" : "limited",
    "ltns" : "long time no see",
    "m8" : "mate",
    "mf" : "motherfucker",
    "mfs" : "motherfuckers",
    "mfw" : "my face when",
    "mofo" : "motherfucker",
    "mph" : "miles per hour",
    "mr" : "mister",
    "mrw" : "my reaction when",
    "ms" : "miss",
    "mte" : "my thoughts exactly",
    "nagi" : "not a good idea",
    "nbc" : "national broadcasting company",
    "nbd" : "not big deal",
    "nfs" : "not for sale",
    "ngl" : "not going to lie",
    "nhs" : "national health service",
    "nrn" : "no reply necessary",
    "nsfl" : "not safe for life",
    "nsfw" : "not safe for work",
    "nth" : "nice to have",
    "nvr" : "never",
    "nyc" : "new york city",
    "oc" : "original content",
    "og" : "original",
    "ohp" : "overhead projector",
    "oic" : "oh i see",
    "omdb" : "over my dead body",
    "omg" : "oh my god",
    "omw" : "on my way",
    "p.a" : "per annum",
    "p.m" : "after midday",
    "pm" : "prime minister",
    "poc" : "people of color",
    "pov" : "point of view",
    "pp" : "pages",
    "ppl" : "people",
    "prw" : "parents are watching",
    "ps" : "postscript",
    "pt" : "point",
    "ptb" : "please text back",
    "pto" : "please turn over",
    "qpsa" : "what happens",
    "ratchet" : "rude",
    "rbtl" : "read between the lines",
    "rlrt" : "real life retweet",
    "rofl" : "rolling on the floor laughing",
    "roflol" : "rolling on the floor laughing out loud",
    "rotflmao" : "rolling on the floor laughing my ass off",
    "rt" : "retweet",
    "ruok" : "are you ok",
    "sfw" : "safe for work",
    "sk8" : "skate",
    "smh" : "shake my head",
    "sq" : "square",
    "srsly" : "seriously",
    "ssdd" : "same stuff different day",
    "tbh" : "to be honest",
    "tbs" : "tablespoonful",
    "tbsp" : "tablespoonful",
    "tfw" : "that feeling when",
    "thks" : "thank you",
    "tho" : "though",
    "thx" : "thank you",
    "tia" : "thanks in advance",
    "til" : "today i learned",
    "tl;dr" : "too long i did not read",
    "tldr" : "too long i did not read",
    "tmb" : "tweet me back",
    "tntl" : "trying not to laugh",
    "ttyl" : "talk to you later",
    "u" : "you",
    "u2" : "you too",
    "u4e" : "yours for ever",
    "utc" : "coordinated universal time",
    "w/" : "with",
    "w/o" : "without",
    "w8" : "wait",
    "wassup" : "what is up",
    "wb" : "welcome back",
    "wtf" : "what the fuck",
    "wtg" : "way to go",
    "wtpa" : "where the party at",
    "wuf" : "where are you from",
    "wuzup" : "what is up",
    "wywh" : "wish you were here",
    "yd" : "yard",
    "ygtr" : "you got that right",
    "ynk" : "you never know",
    "zzz" : "sleeping bored and tired"
}

def fix_contractions(text:str)->str:
    """Expand the contractions found in ``text``.

    Each whitespace-delimited token is looked up (case-insensitively) in the
    module-level ``contractions`` table and, when present, replaced by the
    table's value. Tokens that are not in the table, and all whitespace between
    tokens, are left untouched.

    Replacement is done token by token. A plain ``str.replace`` over the whole
    text would also rewrite the token wherever it occurs as a substring of
    another word (expanding "he's" would corrupt "she's").

    Input:    text    Text that may contain contractions
    return:   the text with every known contraction expanded
    """
    def _expand(match):
        token = match.group(0)
        # Normalise the typographic apostrophe (U+2019) so "don’t" matches "don't".
        key = token.replace("\u2019", "'").lower()
        return contractions.get(key, token)

    return re.sub(r"\S+", _expand, text)

def word_abbrev(word:str)->str:
    """Return the expansion of ``word`` from ``abbreviations``.

    The lookup uses the lowercased word; when there is no entry the word is
    returned unchanged (original casing included).
    """
    key = word.lower()
    try:
        return abbreviations[key]
    except KeyError:
        return word

# Replace all abbreviations
def replace_abbrev(text:str)->str:
    """Expand every abbreviation in ``text``.

    The text is split on whitespace, each word is passed through
    ``word_abbrev`` and the results are joined with single spaces. The result
    has no leading or trailing space, so its length and a ``split(" ")`` word
    count are not inflated by padding.

    Input:    text    Text whose words may be abbreviations
    return:   the text with known abbreviations expanded
    """
    return " ".join(word_abbrev(word) for word in text.split())

def clean_text(raw_html:str)->str:
    '''Normalise a raw text string.

    Steps, in order: expand contractions, strip HTML tags, URLs, @mentions and
    #hashtags, lowercase, drop runs of "!", "?" and ".", turn "<3" into
    "heart", drop numbers, keep only ASCII printable characters, turn text
    emoticons into "sadface" / "smile", strip punctuation and finally expand
    abbreviations.

    Input:    raw_html    Raw text, possibly containing HTML markup
    return:   clean_text  Clean text
    '''
    # Expand contractions first, while the apostrophes are still present.
    cleaned = fix_contractions(raw_html)

    cleaned = re.sub(r'<.*?>', '', cleaned)             # HTML tags
    cleaned = re.sub(r"http\S+", "", cleaned)           # URLs
    cleaned = re.sub("@[A-Za-z0-9_]+", "", cleaned)     # @mentions
    cleaned = re.sub("#[A-Za-z0-9_]+", "", cleaned)     # #hashtags
    cleaned = cleaned.lower()
    cleaned = re.sub(r'([!?.]){2,}', "", cleaned)       # "!!!", "...", "?!"
    # Replacement words are padded with spaces so they never fuse with the
    # neighbouring word ("news:)" -> "news smile", not "newssmile"); the
    # whitespace is normalised again by replace_abbrev.
    cleaned = re.sub(r'<3', " heart ", cleaned)
    cleaned = re.sub(r'[-+]?[.\d]*[\d]+[:,.\d]*', "", cleaned)  # numbers

    # Keep ASCII printable characters only. Emoji are non-ASCII, so this also
    # removes them and no separate emoji pass is needed afterwards.
    cleaned = ''.join(ch for ch in cleaned if ch in string.printable)

    # Text emoticons -> words (must run before punctuation is stripped).
    cleaned = re.sub(r'[8:=;][\'\-]?[(\\/]', " sadface ", cleaned)
    cleaned = re.sub(r'[8:=;][\'\-]?[)dDp]', " smile ", cleaned)

    # Remove punctuation, then expand abbreviations word by word.
    cleaned = cleaned.translate(str.maketrans('', '', string.punctuation))
    return replace_abbrev(cleaned)

In [None]:
# Run the cleaning pipeline over every article body.
df["clean_text"] = df["text"].map(clean_text)

## Text length

In [None]:
# Character count of each cleaned text.
df["text_length"] = df["clean_text"].map(len)

In [None]:
# Side-by-side distributions of cleaned-text length, one panel per class.
fig, axs = plt.subplots(nrows=1, ncols=2, figsize=(16, 6))
fig.tight_layout(pad=5.0)

for panel, (target, legend_name) in zip(axs, ((0, "True News"), (1, "Fake News"))):
    sns.distplot(df.loc[df["label"] == target, "text_length"], ax=panel, label=legend_name)

axs[0].set_title("Text Length True News", fontsize=18)
axs[1].set_title("Text Length Fake News", fontsize=18)

In [None]:
# Both classes overlaid on one axis for a direct comparison of text length.
plt.figure(figsize=(12, 6))
shared_ax = plt.gca()
for target, legend_name in ((0, "True News"), (1, "Fake News")):
    sns.distplot(df.loc[df["label"] == target, "text_length"], label=legend_name, ax=shared_ax)
plt.legend(loc="best")

## Count Words

In [None]:
# Number of words in each cleaned text. split() with no argument splits on any
# run of whitespace and ignores leading/trailing whitespace, so padding or
# repeated spaces do not produce empty "words" (split(" ") counts them).
df["count_words"] = df["clean_text"].apply(lambda x: len(x.split()))

In [None]:
# Side-by-side distributions of word count, one panel per class.
fig, axs = plt.subplots(nrows=1, ncols=2, figsize=(16, 6))
fig.tight_layout(pad=5.0)

for panel, (target, legend_name) in zip(axs, ((0, "True News"), (1, "Fake News"))):
    sns.distplot(df.loc[df["label"] == target, "count_words"], ax=panel, label=legend_name)

axs[0].set_title("Word count True News", fontsize=18)
axs[1].set_title("Word count Fake News", fontsize=18)

In [None]:
# Both classes overlaid on one axis for a direct comparison of word count.
plt.figure(figsize=(12, 6))
shared_ax = plt.gca()
for target, legend_name in ((0, "True News"), (1, "Fake News")):
    sns.distplot(df.loc[df["label"] == target, "count_words"], label=legend_name, ax=shared_ax)
plt.legend(loc="best")

In [None]:
# Class balance: number of rows per label.
plt.figure(figsize=(14,6))
# Pass the column as x= explicitly; a bare positional argument is interpreted
# as `data` by current seaborn releases rather than as the variable to count.
sns.countplot(x=df["label"])

In [None]:
# Exact number of rows per label, most frequent first.
df["label"].value_counts(ascending=False)

## Text processing

In [None]:
# Shuffle the rows and rebuild a clean 0..n-1 index.
# A fixed random_state makes the shuffle identical on every run.
df = df.sample(frac=1, random_state=42).reset_index(drop=True)

In [None]:
# Hold out 20% of the rows for testing. random_state makes the split
# reproducible; stratify keeps the label proportions equal in both parts.
# NOTE(review): the split uses the raw "text" column, not "clean_text" —
# confirm that training on uncleaned text is intended.
x_train, x_test, y_train, y_test = train_test_split(
    df["text"],
    df["label"],
    test_size=0.2,
    random_state=42,
    stratify=df["label"],
)

In [None]:
# Number of tokens per article fed to the model; pad_sequences below pads or
# truncates every sequence to this length
maxlen = 60

# Vocabulary cap passed to the tokenizer (num_words)
max_words = 10000

# Tokenization: the vocabulary is learned from the training texts only
tokenizer = Tokenizer(num_words = max_words)
tokenizer.fit_on_texts(x_train)

# word -> integer index mapping learned by the tokenizer
word_index = tokenizer.word_index

# Convert each text into a list of word indexes
train_sequences = tokenizer.texts_to_sequences(x_train)
test_sequences = tokenizer.texts_to_sequences(x_test)

# Pad/truncate the sequences to a fixed length (maxlen = 60, defined above)
data_train = pad_sequences(train_sequences, maxlen = maxlen)
data_test = pad_sequences(test_sequences, maxlen = maxlen)


In [None]:
# Reverse lookup (integer index -> word) used to decode padded sequences
word_index_reverse = dict((idx, token) for token, idx in word_index.items())

# Show rows 1-3 of the padded training data, first as indexes and then decoded;
# index 0 is the padding value and is rendered as "-"
for row in range(1,4):
    encoded = data_train[row]
    print("\n")
    print("Index Sequence:")
    print(encoded)
    print("\n")
    print("Text Sequence:")
    decoded = []
    for idx in encoded:
        decoded.append("-" if idx == 0 else word_index_reverse[idx])
    print(' '.join(decoded))

## Building the model.

Using transfer learning (pre-trained GloVe word embeddings) with an LSTM

In [None]:
# Path of the GloVe embedding file (100-dimensional vectors)
glove_dir = "../input/glove6b/glove.6B.100d.txt"

# Parse the text file: each line is a word followed by its vector components
embeddings_index = {}

# The context manager closes the file even if parsing fails, and the explicit
# encoding avoids depending on the platform's default codec.
with open(glove_dir, encoding="utf-8") as f:
    for line in f:
        values = line.split()
        word = values[0]
        coefs = np.asarray(values[1:],dtype='float32')
        embeddings_index[word] = coefs

print('Found %s word vectors.' % len(embeddings_index))

**Preparing the GloVe word-embedding matrix**

In [None]:
# Must match the dimensionality of the loaded GloVe vectors
embedding_dim = 100

# Row i holds the GloVe vector of the word whose tokenizer index is i.
# Rows stay all-zero for words without a GloVe vector; words whose index is
# max_words or above are skipped.
embedding_matrix = np.zeros((max_words,embedding_dim))
for word, i in word_index.items():
    if i >= max_words:
        continue
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is None:
        continue
    embedding_matrix[i] = embedding_vector

**Creating the model**

In [None]:
# Embedding -> LSTM (full sequence output) -> max over time -> dense classifier
# with a single sigmoid output for the binary label.
model = Sequential([
    Embedding(max_words, embedding_dim, input_length=maxlen),
    LSTM(64, return_sequences=True),
    GlobalMaxPool1D(),
    Dense(32, activation='relu'),
    Dense(1, activation='sigmoid'),
])
model.compile(loss="binary_crossentropy", optimizer="adam", metrics=["acc"])
model.summary()

In [None]:
embedding_layer = model.layers[0]

# Load the pre-trained GloVe matrix into the embedding layer
embedding_layer.set_weights([embedding_matrix])

# Freeze the layer so training does not update the pre-trained vectors
embedding_layer.trainable = False

# Keras documents that a change to `trainable` only takes effect after the model
# is compiled again, so recompile with the same settings as before.
model.compile(optimizer="adam",loss="binary_crossentropy", metrics=["acc"])

In [None]:
# callbacks

# Early stopping: stop when val_loss has not improved for 3 epochs and roll
# back to the best weights seen
early_stop = tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=3,
                                              restore_best_weights=True)

# Reduce the learning rate by a factor of 5 when val_loss plateaus for 2 epochs
reduce_lr = tf.keras.callbacks.ReduceLROnPlateau(monitor='val_loss', factor=0.2,
                                                  patience=2, min_lr=0.00001,verbose=1)


# Fit the model.
# Validation data is carved out of the training set (validation_split) instead
# of using the test set: both callbacks select the model based on val_loss, so
# validating on the test set would leak it into model selection and make the
# final test metrics optimistic.
history = model.fit(data_train, np.asarray(y_train),
    epochs=100,
    batch_size=32,
    validation_split=0.1,
    callbacks = [early_stop,reduce_lr],
)

In [None]:
# Training vs. validation accuracy per epoch.
plt.figure(figsize=(16, 6))
for metric_key, legend_name in (("acc", "Training Acc"), ("val_acc", "Validation Acc")):
    plt.plot(history.history[metric_key], label=legend_name)
plt.xlabel("Epochs", fontsize=14)
plt.ylabel("Acc", fontsize=14)
plt.legend(loc="best")
plt.grid()

In [None]:
# Training vs. validation loss per epoch.
plt.figure(figsize=(16, 6))
for metric_key, legend_name in (("loss", "Training Loss"), ("val_loss", "Validation Loss")):
    plt.plot(history.history[metric_key], label=legend_name)
plt.xlabel("Epochs", fontsize=14)
plt.ylabel("Loss", fontsize=14)
plt.legend(loc="best")
plt.grid()

## Model Evaluation

In [None]:
# Predicted probabilities for the test set, thresholded at 0.5 into class labels
probabilities = model.predict(data_test)
predictions = [1 if p >= 0.5 else 0 for p in probabilities.ravel()]

# classification_report expects (y_true, y_pred); with the arguments swapped
# the reported precision and recall of each class are exchanged.
print(classification_report(y_test, predictions))

In [None]:
# Write the trained model to an .h5 file in the working directory.
model.save(filepath="fake_and_real_news_detector.h5")