In [None]:
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import pandas as pd 
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from scipy.sparse import hstack
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split

import warnings
warnings.filterwarnings("ignore")

%matplotlib inline 

In [None]:
# Read the two labelled corpora from the working directory: 'True.csv' holds
# the genuine articles and 'Fake.csv' the fabricated ones.
true_news_df, fake_news_df = (
    pd.read_csv(csv_name) for csv_name in ('True.csv', 'Fake.csv')
)

In [None]:
true_news_df.head()


In [None]:
fake_news_df.tail()


In [None]:
true_news_df['subject'].value_counts()


In [None]:
sns.countplot(data = true_news_df, y = 'subject')
plt.show()


In [None]:
sns.countplot(data = fake_news_df, y = 'subject', 
              order = fake_news_df['subject'].value_counts().index)
plt.show()

In [None]:
true_news_df.head()


In [None]:
# Attach the class label to each corpus (0 for the true frame, 1 for the fake
# frame), then stack both into a single frame with a fresh 0..n-1 index.
true_news_df['is_fake'], fake_news_df['is_fake'] = 0, 1
df_final = pd.concat([true_news_df, fake_news_df], ignore_index = True)


In [None]:
df_final.head()


In [None]:
sns.countplot(data = df_final, x = 'is_fake', palette = 'viridis')
plt.show()


In [None]:
df_final.head()


In [None]:
df_final.drop(['date'], axis = 1, inplace = True)


In [None]:
df_final.head()


In [None]:
import nltk
# The preprocessing step uses both the stop-word list and WordNetLemmatizer.
# The lemmatizer reads the 'wordnet' corpus, which was never downloaded here,
# so on a machine without it lemmatization failed with LookupError.
# NOTE(review): some NLTK releases additionally require 'omw-1.4' for WordNet;
# fetching it is harmless when it is not needed.
for nltk_resource in ('stopwords', 'wordnet', 'omw-1.4'):
    nltk.download(nltk_resource)
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer



In [None]:
english_stopwords = stopwords.words('english')


In [None]:
english_stopwords[0: 10]


In [None]:
import string
import wordcloud
import missingno as msno



In [None]:
msno.matrix(df_final, color = (0.5, 0.5, 0.6))
plt.show()


In [None]:
df_final.info()


In [None]:
df_final.head()


In [None]:
wordcloud.WordCloud()


In [None]:
def text_preprocessor(text):
    """Normalise one document before vectorization.

    The text has ASCII punctuation removed, is split on whitespace, and each
    token is lowercased and lemmatized with WordNet. Tokens found in
    ``english_stopwords`` are dropped and the rest are joined with single
    spaces.

    Parameters
    ----------
    text : str
        Raw title or article body. A non-string value (for example NaN coming
        from an empty CSV cell) or a blank string yields ``''``.

    Returns
    -------
    str
        The surviving tokens separated by single spaces.
    """
    # Guard first: missing values would otherwise crash on `.translate`.
    if not isinstance(text, str) or not text.strip():
        return ''

    # Build the helpers once per document instead of once per word, and use a
    # set so each stop-word membership test is a hash lookup.
    lemmatizer = WordNetLemmatizer()
    stopword_set = set(english_stopwords)

    text = text.translate(str.maketrans('', '', string.punctuation))

    kept_words = []
    # split() with no argument collapses runs of whitespace, so no empty
    # tokens leak into the output (split(' ') produced '' for double spaces).
    for word in text.split():
        word = lemmatizer.lemmatize(word.lower())
        if word not in stopword_set:
            kept_words.append(word)
    return ' '.join(kept_words)


In [None]:
df_final.head()


In [None]:
df_final['title preprocessed'] = df_final['title'].apply(text_preprocessor)


In [None]:
df_final.head()


In [None]:
df_final['text preprocessed'] = df_final['text'].apply(text_preprocessor)


In [None]:
df_final.head()


In [None]:
df_final.drop(['title', 'text'], axis = 1, inplace = True)


In [None]:
df_final.head()


In [None]:
# Pull the cleaned titles of each class into their own Series.
title_column = 'title preprocessed'
fake_news = df_final.loc[df_final['is_fake'] == 1, title_column]
true_news = df_final.loc[df_final['is_fake'] == 0, title_column]


In [None]:
# Concatenate all titles of a class into one string for the word cloud.
# The separator must be a space: joining on '' glued the last word of every
# title to the first word of the next one, creating words that do not exist.
fake_news_title = ' '.join(fake_news.to_list())
true_news_title = ' '.join(true_news.to_list())


In [None]:
# Fake News Titles
word_cloud = wordcloud.WordCloud().generate(fake_news_title)
plt.imshow(word_cloud, interpolation = 'bilinear')
plt.show()


In [None]:
# True News Titles
word_cloud = wordcloud.WordCloud().generate(true_news_title)
plt.imshow(word_cloud, interpolation = 'bilinear')
plt.show()

In [None]:
#encoding
X = df_final.drop(['is_fake'], axis = 1)
y = df_final['is_fake']

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.3, random_state = 101)


In [None]:
print("The shape of training input data: {}".format(X_train.shape))
print("The shape of testing input data: {}".format(X_test.shape))
print("The shape of training output data: {}".format(y_train.shape))
print("The shape of testing output data: {}".format(y_test.shape))

In [None]:
X_train.head()


In [None]:
# Bag-of-words features. Titles and bodies each get their own vectorizer:
# refitting a single CountVectorizer on the text column overwrote the
# vocabulary learned from the titles, so the title features could never be
# reproduced for new data once this cell had run.
title_vectorizer = CountVectorizer()
title_train_bow = title_vectorizer.fit_transform(X_train['title preprocessed'])
title_test_bow = title_vectorizer.transform(X_test['title preprocessed'])

text_vectorizer = CountVectorizer()
text_train_bow = text_vectorizer.fit_transform(X_train['text preprocessed'])
text_test_bow = text_vectorizer.transform(X_test['text preprocessed'])

In [None]:
X_train_transformed = hstack((title_train_bow, text_train_bow))
X_test_transformed = hstack((title_test_bow, text_test_bow))

Logistic Regression


In [None]:
model = LogisticRegression()
model.fit(X_train_transformed, y_train)
y_predictions = model.predict(X_test_transformed)

In [None]:
y_predictions


In [None]:
accuracy_score(y_predictions, y_test)


In [None]:
#K Neighbors Classifier


In [None]:
from sklearn.neighbors import KNeighborsClassifier
model = KNeighborsClassifier()
model.fit(X_train_transformed, y_train)
y_predictions = model.predict(X_test_transformed)


In [None]:
# scikit-learn metrics take (y_true, y_pred). With the arguments reversed the
# confusion matrix is transposed and precision/recall are swapped in the report.
print(accuracy_score(y_test, y_predictions))
print(confusion_matrix(y_test, y_predictions))
print(classification_report(y_test, y_predictions))

In [None]:
#Gradient Boosting Classifier
from sklearn.ensemble import GradientBoostingClassifier
# Seeded with the same value as the train/test split so reruns are comparable.
model = GradientBoostingClassifier(random_state = 101)
model.fit(X_train_transformed, y_train)
y_predictions = model.predict(X_test_transformed)
# Metrics take (y_true, y_pred); the reversed order transposed the confusion
# matrix and swapped precision with recall.
print(accuracy_score(y_test, y_predictions))
print(confusion_matrix(y_test, y_predictions))
print(classification_report(y_test, y_predictions))


In [None]:
#Decision Tree Classifier
# Seeded with the same value as the train/test split so reruns are comparable.
model = DecisionTreeClassifier(random_state = 101)
model.fit(X_train_transformed, y_train)
y_predictions = model.predict(X_test_transformed)
# Metrics take (y_true, y_pred); the reversed order transposed the confusion
# matrix and swapped precision with recall.
print(accuracy_score(y_test, y_predictions))
print(confusion_matrix(y_test, y_predictions))
print(classification_report(y_test, y_predictions))


In [None]:
#Random Forest Classifier
from sklearn.ensemble import RandomForestClassifier
# Seeded with the same value as the train/test split so reruns are comparable.
model = RandomForestClassifier(random_state = 101)
model.fit(X_train_transformed, y_train)
y_predictions = model.predict(X_test_transformed)
# Metrics take (y_true, y_pred); the reversed order transposed the confusion
# matrix and swapped precision with recall.
print(accuracy_score(y_test, y_predictions))
print(confusion_matrix(y_test, y_predictions))
print(classification_report(y_test, y_predictions))


In [None]:
#TFIDF Vectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
# One vectorizer per column: refitting a shared TfidfVectorizer on the text
# column discarded the vocabulary and IDF weights learned from the titles.
title_tfidf_vectorizer = TfidfVectorizer()
title_train_tfidf = title_tfidf_vectorizer.fit_transform(X_train['title preprocessed'])
title_test_tfidf = title_tfidf_vectorizer.transform(X_test['title preprocessed'])

text_tfidf_vectorizer = TfidfVectorizer()
text_train_tfidf = text_tfidf_vectorizer.fit_transform(X_train['text preprocessed'])
text_test_tfidf = text_tfidf_vectorizer.transform(X_test['text preprocessed'])


In [None]:
X_train_transformed = hstack((title_train_tfidf, text_train_tfidf))
X_test_transformed = hstack((title_test_tfidf, text_test_tfidf))


In [None]:
#Word2Vec Vectorizer
import gensim
from gensim.models import Word2Vec

train_title = [text.split(' ') for text in X_train['title preprocessed']]
test_title = [text.split(' ') for text in X_test['title preprocessed']]


In [None]:
word2vec = Word2Vec(train_title, min_count = 2)


In [None]:
word2vec.wv.similarity('trump', 'hillary')


In [None]:
word2vec.wv.similarity('apple', 'email')


In [None]:
word2vec.wv.most_similar('trump')


In [None]:
word2vec.wv.most_similar('laptop')


In [None]:
train_title[0]


In [None]:
# Represent every training title by the mean of its word vectors.
# Only tokens present in the Word2Vec vocabulary contribute, and the mean is
# taken over those tokens. The previous version divided by the leaked loop
# index (j + 1), i.e. by the total token count including unknown words, which
# shrank the vectors of titles containing rare words. A title with no known
# token gets an all-zero vector so every title keeps a row of the same width.
avgword2vec_train = []
for title_tokens in train_title:
    known_vectors = [word2vec.wv[token] for token in title_tokens if token in word2vec.wv]
    if known_vectors:
        avgword2vec_train.append(np.mean(known_vectors, axis = 0))
    else:
        avgword2vec_train.append(np.zeros(word2vec.wv.vector_size, dtype = 'float32'))

In [None]:
avgword2vec_df_train = pd.DataFrame()
avgword2vec_df_test = pd.DataFrame()

In [None]:
from tqdm import tqdm 
y_train.reset_index(drop = True, inplace = True)
y_test.reset_index(drop = True, inplace = True)



In [None]:
# Collect the per-title vectors and build the frame in one call.
# DataFrame.append inside a loop copied the whole frame on every iteration
# (minutes of runtime) and no longer exists in pandas 2.x.

y_train_transformed = []
train_vector_rows = []
for i in tqdm(range(len(avgword2vec_train))):
    title_vector = avgword2vec_train[i]
    # Entries that are not 1-D vectors (e.g. a scalar 0 for a title with no
    # known word) cannot form a feature row; skip them together with their label.
    if np.ndim(title_vector) != 1:
        continue
    train_vector_rows.append(title_vector)
    y_train_transformed.append(y_train[i])
avgword2vec_df_train = pd.DataFrame(train_vector_rows)

In [None]:
# Represent every test title by the mean of its word vectors.
# Only tokens present in the Word2Vec vocabulary contribute, and the mean is
# taken over those tokens rather than over the leaked loop index (j + 1),
# which counted unknown words too. A title with no known token gets an
# all-zero vector so every title keeps a row of the same width.
avgword2vec_test = []
for title_tokens in test_title:
    known_vectors = [word2vec.wv[token] for token in title_tokens if token in word2vec.wv]
    if known_vectors:
        avgword2vec_test.append(np.mean(known_vectors, axis = 0))
    else:
        avgword2vec_test.append(np.zeros(word2vec.wv.vector_size, dtype = 'float32'))

In [None]:
# Collect the per-title vectors and build the frame in one call.
# DataFrame.append inside a loop copied the whole frame on every iteration
# (minutes of runtime) and no longer exists in pandas 2.x.

y_test_transformed = []
test_vector_rows = []
for i in tqdm(range(len(avgword2vec_test))):
    title_vector = avgword2vec_test[i]
    # Entries that are not 1-D vectors (e.g. a scalar 0 for a title with no
    # known word) cannot form a feature row; skip them together with their label.
    if np.ndim(title_vector) != 1:
        continue
    test_vector_rows.append(title_vector)
    y_test_transformed.append(y_test[i])
avgword2vec_df_test = pd.DataFrame(test_vector_rows)

In [None]:

model = LogisticRegression()
model.fit(avgword2vec_df_train, y_train_transformed)
y_predictions = model.predict(avgword2vec_df_test)

In [None]:
y_predictions


In [None]:
accuracy_score(y_predictions, y_test_transformed)


In [None]:
# classification_report takes (y_true, y_pred); the reversed order swapped
# precision and recall for each class.
print(classification_report(y_test_transformed, y_predictions))
df_final.head()


In [None]:
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state = 101, test_size = 0.3)


In [None]:
X_train.shape


In [None]:
#Embedding Representation

from tensorflow.keras.layers import Embedding
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import one_hot
from tensorflow.keras.layers import LSTM
from tensorflow.keras.layers import Dense

In [None]:
X_train.head()


In [None]:
text_train = X_train['text preprocessed'].tolist()
text_test = X_test['text preprocessed'].tolist()

In [None]:
vocab_size = 5000


In [None]:
# Map every word to an integer id in [1, vocab_size) with the hashing trick.
# `one_hot` hashes with Python's built-in hash(), which is salted per
# interpreter process, so the ids (and any model trained on them) changed on
# every kernel restart. 'md5' gives the same ids on every run.
from tensorflow.keras.preprocessing.text import hashing_trick

text_train_one_hot = [hashing_trick(words, vocab_size, hash_function = 'md5') for words in text_train]
text_test_one_hot = [hashing_trick(words, vocab_size, hash_function = 'md5') for words in text_test]

In [None]:
text_train_one_hot[0][0: 10]


In [None]:
sent_length = 1000
embedded_docs = pad_sequences(text_train_one_hot, padding = 'post', maxlen = sent_length)
print(embedded_docs)

In [None]:
len(embedded_docs)


In [None]:
len(embedded_docs[0])


In [None]:
embedding_vector_features = 100


In [None]:
y_train.shape


In [None]:
X_train.shape


In [None]:
#Recurrent Neural Network (RNN)

from tensorflow.keras.preprocessing.text import one_hot
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dropout
import tensorflow as tf
from tensorflow.keras.layers import SimpleRNN

In [None]:
embedding_vector_features = 100


In [None]:
# Stacked SimpleRNN binary classifier over the padded token-id sequences.
model = Sequential()

model.add(Embedding(vocab_size, embedding_vector_features, input_length = sent_length))
model.add(SimpleRNN(128, activation = 'relu', return_sequences = True))
model.add(Dropout(0.2))

model.add(SimpleRNN(64, activation = 'relu', return_sequences = True))
model.add(Dropout(0.2))

# The last recurrent layer must return only its final state. With
# return_sequences = True the network produced one probability per timestep
# (batch, sent_length, 1) while the labels hold one value per document.
model.add(SimpleRNN(32, activation = 'relu', return_sequences = False))
model.add(Dropout(0.2))

model.add(Dense(16, activation = 'relu'))
model.add(Dropout(0.1))

model.add(Dense(1, activation = 'sigmoid'))
# summary() prints the table itself and returns None; wrapping it in print()
# added a stray "None" line.
model.summary()

In [None]:
opt = tf.keras.optimizers.Adam(learning_rate = 1e-3, epsilon = 1e-07)


In [None]:
model.compile(optimizer = opt, loss = 'BinaryCrossentropy',
             metrics = ['accuracy'])

In [None]:
X_train_embedded = np.array(embedded_docs).astype('float32')
y_train_new = np.array(y_train).astype('float32').reshape((-1, 1))

In [None]:
X_train_embedded.shape


In [None]:
y_train_new.shape


In [None]:
import tensorflow as tf


In [None]:
tf.config.list_physical_devices('CPU')


In [None]:
tf.config.list_physical_devices('GPU')


In [None]:
model.fit(X_train_embedded, y_train_new, epochs = 2)


In [None]:
print(tf.__version__)


In [None]:
#Gated Recurrent Units (GRU)
from tensorflow.keras.layers import GRU

# Stacked GRU binary classifier. Each averaged word2vec vector is treated as a
# sequence of length avgword2vec_df_train.shape[1] with one feature per step.
model = Sequential()

model.add(GRU(128, input_shape = (avgword2vec_df_train.shape[1], 1),
         activation = 'relu', return_sequences = True))
model.add(Dropout(0.2))

model.add(GRU(64, activation = 'relu', return_sequences = True))
model.add(Dropout(0.2))

# The last recurrent layer must return only its final state; with
# return_sequences = True the model emitted one probability per step instead
# of one per sample.
model.add(GRU(32, activation = 'relu', return_sequences = False))
model.add(Dropout(0.1))

model.add(Dense(8, activation = 'relu'))
model.add(Dropout(0.1))

model.add(Dense(1, activation = 'sigmoid'))


In [None]:
opt = tf.keras.optimizers.Adam(learning_rate = 1e-3, epsilon = 1e-07)


In [None]:
model.compile(optimizer = opt, loss = 'BinaryCrossentropy',
             metrics = ['accuracy'])

In [None]:
# The GRU was declared with input_shape = (avgword2vec_df_train.shape[1], 1),
# so it has to be trained on the averaged word2vec features, not on X_train
# (a DataFrame of strings). The labels collected alongside those features
# (y_train_transformed) are used so rows and targets stay aligned.
X_train_w2v_seq = np.asarray(avgword2vec_df_train, dtype = 'float32')[..., np.newaxis]
y_train_w2v = np.asarray(y_train_transformed, dtype = 'float32').reshape((-1, 1))
model.fit(X_train_w2v_seq, y_train_w2v, epochs = 2)


In [None]:
#Long Short Term Memory (LSTM)

import tensorflow as tf
from tensorflow.keras.layers import Dense, LSTM, Dropout
from tensorflow.keras.models import Sequential


In [None]:
# Stacked LSTM binary classifier. Each averaged word2vec vector is treated as
# a sequence of length avgword2vec_df_train.shape[1] with one feature per step.
model = Sequential()

model.add(LSTM(128, input_shape = (avgword2vec_df_train.shape[1], 1), activation = 'relu',
               return_sequences = True))
model.add(Dropout(0.2))

model.add(LSTM(128, activation = 'relu', return_sequences = True))
model.add(Dropout(0.2))

model.add(LSTM(64, activation = 'relu', return_sequences = True))
model.add(Dropout(0.2))

# The last recurrent layer must return only its final state; with
# return_sequences = True the model emitted one probability per step instead
# of one per sample.
model.add(LSTM(32, activation = 'relu', return_sequences = False))
model.add(Dropout(0.2))

model.add(Dense(16, activation = 'relu'))
model.add(Dropout(0.1))

model.add(Dense(1, activation = 'sigmoid'))

In [None]:
opt = tf.keras.optimizers.Adam(learning_rate = 1e-3, epsilon = 1e-07)


In [None]:
model.compile(optimizer = opt, loss = 'BinaryCrossentropy',
             metrics = ['accuracy'])

In [None]:
avgword2vec_df_train.head()

In [None]:
numpy_matrix = avgword2vec_df_train.to_numpy()


In [None]:
# The LSTM was declared with input_shape = (avgword2vec_df_train.shape[1], 1),
# so it has to be trained on the averaged word2vec features, not on X_train
# (a DataFrame of strings). The labels collected alongside those features
# (y_train_transformed) are used so rows and targets stay aligned.
X_train_w2v_seq = np.asarray(avgword2vec_df_train, dtype = 'float32')[..., np.newaxis]
y_train_w2v = np.asarray(y_train_transformed, dtype = 'float32').reshape((-1, 1))
model.fit(X_train_w2v_seq, y_train_w2v, epochs = 2)


In [None]:
#Bidirectional LSTM

from tensorflow.keras.layers import Bidirectional

# Stacked bidirectional LSTM binary classifier over the averaged word2vec
# vectors, read as sequences with one feature per step.
model = Sequential()

model.add(Bidirectional(LSTM(128, input_shape = (avgword2vec_df_train.shape[1], 1), 
                       activation = 'relu', return_sequences = True)))
model.add(Dropout(0.2))

model.add(Bidirectional(LSTM(64, activation = 'relu', return_sequences = True)))
model.add(Dropout(0.2))

# The last recurrent layer must return only its final state; with
# return_sequences = True the model emitted one probability per step instead
# of one per sample.
model.add(Bidirectional(LSTM(32, activation = 'relu', return_sequences = False)))
model.add(Dropout(0.1))

model.add(Dense(16, activation = 'relu'))
model.add(Dropout(0.1))

model.add(Dense(1, activation = 'sigmoid'))


In [None]:
opt = tf.keras.optimizers.Adam(learning_rate = 1e-3, epsilon = 1e-7)


In [None]:
model.compile(optimizer = opt, metrics = ['accuracy'], loss = 'BinaryCrossentropy')


In [None]:
# The bidirectional LSTM was declared for inputs of shape
# (avgword2vec_df_train.shape[1], 1), so it has to be trained on the averaged
# word2vec features, not on X_train (a DataFrame of strings). The labels
# collected alongside those features (y_train_transformed) keep rows and
# targets aligned.
X_train_w2v_seq = np.asarray(avgword2vec_df_train, dtype = 'float32')[..., np.newaxis]
y_train_w2v = np.asarray(y_train_transformed, dtype = 'float32').reshape((-1, 1))
model.fit(X_train_w2v_seq, y_train_w2v, epochs = 2)


In [None]:
def model_training(train_input, train_output, classifier = None):
    """Fit one of the supported scikit-learn classifiers and return it.

    Parameters
    ----------
    train_input : array-like or sparse matrix
        Training features, passed unchanged to the estimator's ``fit``.
    train_output : array-like
        Training labels, passed unchanged to the estimator's ``fit``.
    classifier : str
        Display name of the estimator to build. One of
        "Logistic Regression", "Decision Tree Classifier",
        "Random Forest Classifier", "Gradient Boosting Classifier",
        "Support Vector Classifier", "Naive Bayes Classifier",
        "K Neighbors Classifier".

    Returns
    -------
    The fitted estimator (default hyper-parameters).

    Raises
    ------
    ValueError
        If ``classifier`` is None or not one of the names above. Previously
        this case fell through to ``return model`` and failed with
        UnboundLocalError.
    """
    # SVC and GaussianNB are not imported anywhere else in the notebook, so
    # they are imported here, and only when that estimator is requested.
    def _make_svc():
        from sklearn.svm import SVC
        return SVC()

    def _make_gaussian_nb():
        from sklearn.naive_bayes import GaussianNB
        return GaussianNB()

    # Lambdas defer the name lookup until the chosen estimator is built.
    estimator_factories = {
        "Logistic Regression": lambda: LogisticRegression(),
        "Decision Tree Classifier": lambda: DecisionTreeClassifier(),
        "Random Forest Classifier": lambda: RandomForestClassifier(),
        "Gradient Boosting Classifier": lambda: GradientBoostingClassifier(),
        "Support Vector Classifier": _make_svc,
        "Naive Bayes Classifier": _make_gaussian_nb,
        "K Neighbors Classifier": lambda: KNeighborsClassifier(),
    }

    if classifier not in estimator_factories:
        raise ValueError(
            "Unknown classifier {!r}; expected one of: {}".format(
                classifier, ", ".join(estimator_factories)
            )
        )

    model = estimator_factories[classifier]()
    model.fit(train_input, train_output)
    return model

In [None]:
def model_testing(test_input, model, print_results = True, test_output = None):
    """Predict with a fitted model and optionally print evaluation metrics.

    Parameters
    ----------
    test_input : array-like or sparse matrix
        Features passed to ``model.predict``.
    model : object
        Any fitted estimator exposing ``predict``.
    print_results : bool, default True
        When true, print accuracy, confusion matrix and classification report.
    test_output : array-like, optional
        True labels to score against. When omitted, the notebook-level
        ``y_test`` is used, which keeps existing three-argument calls working.

    Returns
    -------
    The predictions returned by ``model.predict(test_input)``.
    """
    y_predictions = model.predict(test_input)

    if print_results:
        y_true = y_test if test_output is None else test_output
        # scikit-learn metrics take (y_true, y_pred); the reversed order
        # transposed the confusion matrix and swapped precision with recall.
        print(accuracy_score(y_true, y_predictions))
        print(confusion_matrix(y_true, y_predictions))
        print(classification_report(y_true, y_predictions))

    return y_predictions