#### Import Dependencies

In [47]:
import pandas as pd
from io import StringIO
import string
import requests
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

#### Load Data into DataFrames

In [48]:
# Download the fake-news CSV from GitHub and parse it into a DataFrame.
fake_url="https://raw.githubusercontent.com/Shannon-Watts/fake_news_ML/main/data/kaggle/Fake.csv"
fake_response = requests.get(fake_url, timeout=60)
# Fail loudly on an HTTP error instead of parsing an error page as CSV.
fake_response.raise_for_status()
fake_request = fake_response.text
fake_df = pd.read_csv(StringIO(fake_request))
fake_df.head()

Unnamed: 0,title,text,subject,date
0,Donald Trump Sends Out Embarrassing New Year’...,Donald Trump just couldn t wish all Americans ...,News,"December 31, 2017"
1,Drunk Bragging Trump Staffer Started Russian ...,House Intelligence Committee Chairman Devin Nu...,News,"December 31, 2017"
2,Sheriff David Clarke Becomes An Internet Joke...,"On Friday, it was revealed that former Milwauk...",News,"December 30, 2017"
3,Trump Is So Obsessed He Even Has Obama’s Name...,"On Christmas day, Donald Trump announced that ...",News,"December 29, 2017"
4,Pope Francis Just Called Out Donald Trump Dur...,Pope Francis used his annual Christmas Day mes...,News,"December 25, 2017"


In [49]:
# Download the true-news CSV from GitHub and parse it into a DataFrame.
true_url= "https://raw.githubusercontent.com/Shannon-Watts/fake_news_ML/main/data/kaggle/True.csv"
true_response = requests.get(true_url, timeout=60)
# Fail loudly on an HTTP error instead of parsing an error page as CSV.
true_response.raise_for_status()
true_request = true_response.text
true_df = pd.read_csv(StringIO(true_request))
true_df.head()

Unnamed: 0,title,text,subject,date
0,"As U.S. budget fight looms, Republicans flip t...",WASHINGTON (Reuters) - The head of a conservat...,politicsNews,"December 31, 2017"
1,U.S. military to accept transgender recruits o...,WASHINGTON (Reuters) - Transgender people will...,politicsNews,"December 29, 2017"
2,Senior U.S. Republican senator: 'Let Mr. Muell...,WASHINGTON (Reuters) - The special counsel inv...,politicsNews,"December 31, 2017"
3,FBI Russia probe helped by Australian diplomat...,WASHINGTON (Reuters) - Trump campaign adviser ...,politicsNews,"December 30, 2017"
4,Trump wants Postal Service to charge 'much mor...,SEATTLE/WASHINGTON (Reuters) - President Donal...,politicsNews,"December 29, 2017"


#### Data Exploration

In [50]:
# Number of distinct values in each column of the fake-news frame.
fake_df.nunique()

title      17903
text       17455
subject        6
date        1681
dtype: int64

In [51]:

# Number of distinct values in each column of the true-news frame.
true_df.nunique()

title      20826
text       21192
subject        2
date         716
dtype: int64

In [52]:
# Row count per subject label in the fake-news frame.
fake_df['subject'].value_counts()

News               9050
politics           6841
left-news          4459
Government News    1570
US_News             783
Middle-east         778
Name: subject, dtype: int64

In [53]:
# Row count per subject label in the true-news frame.
true_df['subject'].value_counts()

politicsNews    11272
worldnews       10145
Name: subject, dtype: int64

In [54]:
# Missing-value count per column in the fake-news frame.
fake_df.isnull().sum()

title      0
text       0
subject    0
date       0
dtype: int64

In [55]:
# Missing-value count per column in the true-news frame.
true_df.isnull().sum()

title      0
text       0
subject    0
date       0
dtype: int64

In [56]:
# Number of fake-news rows that exactly repeat an earlier row (all columns equal).
fake_df.duplicated().sum()

3

In [57]:
# Number of true-news rows that exactly repeat an earlier row (all columns equal).
true_df.duplicated().sum()

206

In [58]:
# Column dtypes, non-null counts and memory footprint of the fake-news frame.
fake_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 23481 entries, 0 to 23480
Data columns (total 4 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   title    23481 non-null  object
 1   text     23481 non-null  object
 2   subject  23481 non-null  object
 3   date     23481 non-null  object
dtypes: object(4)
memory usage: 733.9+ KB


In [59]:
# Column dtypes, non-null counts and memory footprint of the true-news frame.
true_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 21417 entries, 0 to 21416
Data columns (total 4 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   title    21417 non-null  object
 1   text     21417 non-null  object
 2   subject  21417 non-null  object
 3   date     21417 non-null  object
dtypes: object(4)
memory usage: 669.4+ KB


In [60]:
# (rows, columns) of the fake-news frame.
fake_df.shape

(23481, 4)

In [61]:
# (rows, columns) of the true-news frame.
true_df.shape

(21417, 4)

#### Add `class` Column, Merge DataFrames, Clean Data

In [62]:
# Label each source frame: 1 for rows from true_df, 0 for rows from fake_df.
true_df['class'] = 1
fake_df['class'] = 0

# Stack the two labelled frames (true rows first) under a fresh 0..n-1 index.
labelled_frames = [true_df, fake_df]
merged_df = pd.concat(labelled_frames, ignore_index=True, sort=False)

In [63]:
import copy  # kept so any later cell that references `copy` still runs

# Work on an independent deep copy so the cleaning steps never touch merged_df.
# DataFrame.copy(deep=True) is the pandas-native equivalent of copy.deepcopy here.
news_df = merged_df.copy(deep=True)

In [64]:
import re

# Pattern for an http:// or https:// URL: the scheme followed by a run of non-whitespace.
URLS = r'(https?://\S+)'

# Count URL matches in each row's date field and store the count as a new column.
url_matches = news_df['date'].map(lambda value: re.findall(URLS, value))
news_df['urlcount'] = url_matches.map(len)

# Show the rows whose date field contains at least one URL.
news_df[news_df['urlcount'] > 0]

Unnamed: 0,title,text,subject,date,class,urlcount
30775,https://100percentfedup.com/served-roy-moore-v...,https://100percentfedup.com/served-roy-moore-v...,politics,https://100percentfedup.com/served-roy-moore-v...,0,1
36924,https://100percentfedup.com/video-hillary-aske...,https://100percentfedup.com/video-hillary-aske...,politics,https://100percentfedup.com/video-hillary-aske...,0,1
36925,https://100percentfedup.com/12-yr-old-black-co...,https://100percentfedup.com/12-yr-old-black-co...,politics,https://100percentfedup.com/12-yr-old-black-co...,0,1
37256,https://fedup.wpengine.com/wp-content/uploads/...,https://fedup.wpengine.com/wp-content/uploads/...,politics,https://fedup.wpengine.com/wp-content/uploads/...,0,1
37257,https://fedup.wpengine.com/wp-content/uploads/...,https://fedup.wpengine.com/wp-content/uploads/...,politics,https://fedup.wpengine.com/wp-content/uploads/...,0,1
38849,https://fedup.wpengine.com/wp-content/uploads/...,https://fedup.wpengine.com/wp-content/uploads/...,Government News,https://fedup.wpengine.com/wp-content/uploads/...,0,1
38850,https://fedup.wpengine.com/wp-content/uploads/...,https://fedup.wpengine.com/wp-content/uploads/...,Government News,https://fedup.wpengine.com/wp-content/uploads/...,0,1
43286,https://fedup.wpengine.com/wp-content/uploads/...,https://fedup.wpengine.com/wp-content/uploads/...,left-news,https://fedup.wpengine.com/wp-content/uploads/...,0,1
43287,https://fedup.wpengine.com/wp-content/uploads/...,https://fedup.wpengine.com/wp-content/uploads/...,left-news,https://fedup.wpengine.com/wp-content/uploads/...,0,1


In [65]:
# Total number of URL matches found across all date fields.
news_df['urlcount'].sum()

9

In [66]:
# Remove, in place, every row whose date field was flagged as containing a URL.
has_url = news_df['urlcount'] > 0
news_df.drop(index=news_df[has_url].index, inplace=True)

In [67]:
# Sanity check after the drop: the total URL count should now be zero.
news_df['urlcount'].sum()

0

In [68]:
# Number of rows in the merged frame that exactly repeat an earlier row.
news_df.duplicated().sum()

209

In [69]:
# Remove exact duplicate rows, modifying news_df in place.
news_df.drop_duplicates(inplace=True)

In [70]:
# Sanity check after de-duplication: no duplicate rows should remain.
news_df.duplicated().sum()

0

In [72]:
# Column labels of the merged frame at this stage.
news_df.columns

Index(['title', 'text', 'subject', 'date', 'class', 'urlcount'], dtype='object')

In [73]:
# Full per-column summary (dtype, non-null count) plus memory usage after cleaning.
news_df.info(memory_usage = True, verbose = True)

<class 'pandas.core.frame.DataFrame'>
Int64Index: 44680 entries, 0 to 44897
Data columns (total 6 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   title     44680 non-null  object
 1   text      44680 non-null  object
 2   subject   44680 non-null  object
 3   date      44680 non-null  object
 4   class     44680 non-null  int64 
 5   urlcount  44680 non-null  int64 
dtypes: int64(2), object(4)
memory usage: 2.4+ MB


In [74]:
# (rows, columns) of the cleaned, merged frame.
news_df.shape

(44680, 6)

In [75]:
# Class balance after cleaning (0 = from fake_df, 1 = from true_df).
news_df['class'].value_counts()

0    23469
1    21211
Name: class, dtype: int64

In [76]:
# Row count per subject label across the merged frame.
news_df['subject'].value_counts()

politicsNews       11220
worldnews           9991
News                9050
politics            6833
left-news           4457
Government News     1568
US_News              783
Middle-east          778
Name: subject, dtype: int64

In [77]:
# Keep only the headline and its label; every other column is dropped in place.
unused_columns = ['text', 'date', 'subject', 'urlcount']
news_df.drop(columns=unused_columns, inplace=True)
news_df

Unnamed: 0,title,class
0,"As U.S. budget fight looms, Republicans flip t...",1
1,U.S. military to accept transgender recruits o...,1
2,Senior U.S. Republican senator: 'Let Mr. Muell...,1
3,FBI Russia probe helped by Australian diplomat...,1
4,Trump wants Postal Service to charge 'much mor...,1
...,...,...
44893,McPain: John McCain Furious That Iran Treated ...,0
44894,JUSTICE? Yahoo Settles E-mail Privacy Class-ac...,0
44895,Sunnistan: US and Allied ‘Safe Zone’ Plan to T...,0
44896,How to Blow $700 Million: Al Jazeera America F...,0


In [78]:
# (rows, columns) after dropping the unused columns.
news_df.shape

(44680, 2)

In [79]:
# Display the title/class frame (same content as the preceding display).
news_df

Unnamed: 0,title,class
0,"As U.S. budget fight looms, Republicans flip t...",1
1,U.S. military to accept transgender recruits o...,1
2,Senior U.S. Republican senator: 'Let Mr. Muell...,1
3,FBI Russia probe helped by Australian diplomat...,1
4,Trump wants Postal Service to charge 'much mor...,1
...,...,...
44893,McPain: John McCain Furious That Iran Treated ...,0
44894,JUSTICE? Yahoo Settles E-mail Privacy Class-ac...,0
44895,Sunnistan: US and Allied ‘Safe Zone’ Plan to T...,0
44896,How to Blow $700 Million: Al Jazeera America F...,0


In [80]:
# Confirm that only the title and class columns remain.
news_df.columns

Index(['title', 'class'], dtype='object')

In [81]:
# Shuffle the rows randomly: frac=1 draws every row once, in random order.
# A fixed seed keeps the row order reproducible across kernel restarts.
news_df = news_df.sample(frac = 1, random_state = 42)

In [82]:
from collections import Counter

# Tally the labels after shuffling to confirm the class counts are unchanged.
class_counts = Counter(news_df['class'].to_numpy())
print(class_counts)

Counter({0: 23469, 1: 21211})


# Data Processing

In [83]:
import nltk

# Download the NLTK data packages this notebook requests, in the original order.
for nltk_package in ('stopwords', 'punkt', 'wordnet', 'omw-1.4'):
    nltk.download(nltk_package)

from nltk.stem import WordNetLemmatizer

# scikit-learn helpers for splitting the data and scoring predictions.
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report



[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


In [84]:
# Display the frame after shuffling; the index labels now appear in random order.
news_df


Unnamed: 0,title,class
8324,Senators press Mylan on 'exorbitantly expensiv...,1
15729,U.S. carries out first strikes against Islamic...,1
7860,Trump tax plan would aid Wall St. despite clos...,1
16099,One body found near wreckage of Russian helico...,1
25693,Elizabeth Warren Slapped The Hell Out Of Trum...,0
...,...,...
16873,France gets serious over sexual harassment aft...,1
35372,“I Think My Dog’s A Democrat” [VIDEO],0
41304,NEW YORK TIMES Publishes Trump Tax Return From...,0
32684,TREY GOWDY ON SPYING ON AMERICAN CITIZENS…Like...,0


In [85]:
def lower_title(x):
    """Return ``x`` lower-cased, or ``x`` unchanged if lower-casing fails.

    Args:
        x: the value to lower-case; expected to provide a ``lower()`` method.

    Returns:
        The result of ``x.lower()``. If that call raises, the error and the
        offending value are printed and ``x`` is returned as-is.
    """
    try:
        lowered = x.lower()
    except Exception as err:
        # Best-effort: report the problem value and pass it through untouched.
        print('Error in lower_title function', str(err))
        print("Error", x)
        return x
    return lowered

In [86]:
# Tokenization
from nltk.tokenize import word_tokenize
# Stop words removing
from nltk.corpus import stopwords
# Stemming
from nltk.stem import PorterStemmer
from nltk.tokenize import RegexpTokenizer

# Porter stemmer, and a tokenizer that keeps runs of word characters (punctuation is dropped).
porter = PorterStemmer()
tokenizer = RegexpTokenizer(r'\w+')

eng_stopwords = stopwords.words('english')
# Set copy of the stop-word list: constant-time membership tests in the filter below.
eng_stopword_set = set(eng_stopwords)

# 1. Lower-case every title.
titles = news_df['title'].apply(lower_title)

# 2. Strip http... and www... tokens. Only the title column holds text, so the
#    regex replacement is applied to that column rather than the whole frame.
#    The two patterns are applied one after the other, as before.
titles = titles.replace(r'http\S+', '', regex=True).replace(r'www\S+', '', regex=True)

# 3. Split each title into word tokens.
titles = titles.apply(tokenizer.tokenize)

# 4. Drop English stop words.
titles = titles.apply(lambda words: [word for word in words if word not in eng_stopword_set])

# 5. Stem each remaining token and join the stems back into one space-separated string.
news_df['title'] = titles.apply(lambda tokens: ' '.join(porter.stem(token) for token in tokens))

In [87]:
# Display the frame after preprocessing: titles are lower-cased, stop-word-free and stemmed.
news_df

Unnamed: 0,title,class
8324,senat press mylan exorbitantli expens epipen,1
15729,u carri first strike islam state somalia,1
7860,trump tax plan would aid wall st despit close ...,1
16099,one bodi found near wreckag russian helicopt s...,1
25693,elizabeth warren slap hell trump busi lose cli...,0
...,...,...
16873,franc get seriou sexual harass weinstein scand...,1
35372,think dog democrat video,0
41304,new york time publish trump tax return 20 year...,0
32684,trey gowdi spi american citizen like presid tr...,0


In [88]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score
from sklearn.feature_extraction.text import CountVectorizer

import tensorflow as tf
import tensorflow.keras as keras

In [89]:
# Features are the cleaned titles; the target is the 0/1 class label.
X, y = news_df['title'], news_df['class']

# Hold out 25% of the rows for testing, stratified on the label so both splits
# keep the same class ratio; the seed fixes the split.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    stratify=y,
    random_state=42,
)

In [90]:
# Vectorize the cleaned titles into bag-of-words count features.
# The vocabulary is fitted on the training split only and then reused to
# transform the test split; .toarray() converts each result to a dense array.
# NOTE(review): this rebinds X_train / X_test from text to arrays, so the cell
# cannot be re-run on its own -- re-run the split cell first.
vec = CountVectorizer(stop_words='english')
X_train = vec.fit_transform(X_train).toarray()
X_test = vec.transform(X_test).toarray()

In [91]:
# Shape of the vectorized training data: (training titles, vocabulary terms).
X_train.shape

(33510, 12523)

In [98]:
# Number of input features = number of columns in the vectorized training matrix.
# create_model reads this notebook-level variable for its first layer's input_dim.
input_features = X_train.shape[1]

In [92]:
!pip install keras-tuner


Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting keras-tuner
  Downloading keras_tuner-1.1.3-py3-none-any.whl (135 kB)
[K     |████████████████████████████████| 135 kB 5.4 MB/s 
Collecting kt-legacy
  Downloading kt_legacy-1.0.4-py3-none-any.whl (9.6 kB)
Collecting jedi>=0.10
  Downloading jedi-0.18.1-py2.py3-none-any.whl (1.6 MB)
[K     |████████████████████████████████| 1.6 MB 52.4 MB/s 
Installing collected packages: jedi, kt-legacy, keras-tuner
Successfully installed jedi-0.18.1 keras-tuner-1.1.3 kt-legacy-1.0.4


In [95]:
# Model-building function handed to the tuner; each call yields one candidate network
def create_model(hp):
    """Build and compile a binary classifier from tuner-chosen hyperparameters.

    Args:
        hp: hyperparameter object supplied by the tuner; only its ``Choice``
            and ``Int`` methods are used.

    Returns:
        A compiled ``tf.keras`` Sequential model whose last layer is a single
        sigmoid unit.

    Reads the notebook-level ``input_features`` for the first layer's input width.
    """
    nn_model = tf.keras.models.Sequential()

    # One activation function, chosen by the tuner, shared by all hidden layers
    hidden_activation = hp.Choice('activation', ['relu', 'tanh', 'sigmoid'])

    # First layer: width searched from 1 up to 90 in steps of 5
    first_units = hp.Int('first_units', min_value=1, max_value=90, step=5)
    nn_model.add(
        tf.keras.layers.Dense(
            units=first_units,
            activation=hidden_activation,
            input_dim=input_features,
        )
    )

    # 1 to 5 further hidden layers, each with its own width (1 up to 30 in steps of 5)
    hidden_layer_count = hp.Int('num_layers', 1, 5)
    for layer_idx in range(hidden_layer_count):
        layer_units = hp.Int(f'units_{layer_idx}', min_value=1, max_value=30, step=5)
        nn_model.add(tf.keras.layers.Dense(units=layer_units, activation=hidden_activation))

    # Output layer: one sigmoid unit for the binary label
    nn_model.add(tf.keras.layers.Dense(units=1, activation="sigmoid"))

    # Binary cross-entropy loss, Adam optimizer, accuracy as the tracked metric
    nn_model.compile(loss="binary_crossentropy", optimizer='adam', metrics=["accuracy"])

    return nn_model

In [96]:
# Import the kerastuner library
import keras_tuner as kt

# Hyperband tuner that builds candidates with create_model and ranks trials by
# validation accuracy. Trials train for at most 20 epochs, and the full
# Hyperband schedule is repeated twice (hyperband_iterations=2).
tuner = kt.Hyperband(
    create_model,
    objective="val_accuracy",
    max_epochs=20,
    hyperband_iterations=2)

In [None]:
# Run the kerastuner search for best hyperparameters.
# Trials are scored on a 20% slice held out from the training data, so the test
# set stays unseen until the final evaluation. Scoring trials on (X_test, y_test)
# would let hyperparameter selection leak test information into the reported accuracy.
# y_train is converted to a NumPy array so both inputs are plain arrays for the split.
tuner.search(X_train, np.asarray(y_train), epochs=20, validation_split=0.2)

Trial 46 Complete [00h 01m 10s]
val_accuracy: 0.9452103972434998

Best val_accuracy So Far: 0.9512085914611816
Total elapsed time: 01h 18m 53s

Search: Running Trial #47

Value             |Best Value So Far |Hyperparameter
sigmoid           |sigmoid           |activation
21                |21                |first_units
2                 |2                 |num_layers
21                |21                |units_0
21                |21                |units_1
6                 |6                 |units_2
6                 |6                 |units_3
6                 |6                 |units_4
20                |7                 |tuner/epochs
7                 |3                 |tuner/initial_epoch
2                 |2                 |tuner/bracket
2                 |1                 |tuner/round
0042              |0035              |tuner/trial_id

Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20

In [None]:
# Print the hyperparameter values of the three best-scoring trials
top_hyper = tuner.get_best_hyperparameters(3)
for hyper_set in top_hyper:
    print(hyper_set.values)

In [None]:
# Evaluate each of the three best models on the test split and report loss / accuracy
top_models = tuner.get_best_models(3)
for candidate_model in top_models:
    candidate_metrics = candidate_model.evaluate(X_test, y_test, verbose=2)
    model_loss, model_accuracy = candidate_metrics
    print(f'Loss: {model_loss}, Accuracy: {model_accuracy}')

In [None]:
# Get best model hyperparameters
best_hyper = tuner.get_best_hyperparameters()[0]
best_hyper.values

In [None]:
# Evaluate best model against full test data
best_model = tuner.get_best_models(1)[0]
model_loss, model_accuracy = best_model.evaluate(X_test,y_test,verbose=2)
print(f"Loss: {model_loss}, Accuracy: {model_accuracy}")