# Fake News Detector



In [2]:

# Import libraries.
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
from PIL import Image
import chart_studio.plotly as py
import plotly.graph_objects as go
import plotly.express as px
from plotly.subplots import make_subplots
import plotly.io as pio

import nltk
nltk.download('stopwords')
nltk.download('punkt')
from nltk.corpus import stopwords
from nltk.stem.snowball import SnowballStemmer
from sklearn.feature_extraction.text import CountVectorizer
from wordcloud import WordCloud

import re
from bs4 import BeautifulSoup
from tqdm import tqdm
from collections import Counter

# Make "gridon" the default Plotly template for figures created after this point.
pio.templates.default = "gridon"

# Seed for randomised steps (passed to DataFrame.sample when the data is shuffled),
# so that repeated runs produce the same row order.
RANDOM_STATE = 5

ModuleNotFoundError: No module named 'chart_studio'

In [3]:
# Load the two source datasets: one CSV of real articles, one of fake articles.
true, fake = (pd.read_csv(path) for path in ("data/True.csv", "data/Fake.csv"))

# Preview the first rows of each dataset.
display(true.head())
display(fake.head())

# Report how many articles each dataset contains.
n_true, n_fake = true.shape[0], fake.shape[0]
print('\nThere are {} real and {} fake news'.format(n_true, n_fake))

FileNotFoundError: [Errno 2] No such file or directory: 'data/True.csv'

In [None]:
# Tag every article with its class before the two sets are merged.
true = true.assign(label='True')
fake = fake.assign(label='Fake')

# Stack both sets into a single frame.
df = pd.concat([true, fake])

# The separate frames are no longer needed; drop the references to free memory.
true = None
fake = None

# Shuffle the rows reproducibly, then renumber the index from 0.
df = df.sample(frac=1, random_state=RANDOM_STATE)
df = df.reset_index(drop=True)

# Preview the merged, shuffled frame.
df.head()

In [None]:
# Sanity-check the merged frame: prints pandas' summary of columns,
# non-null counts, dtypes and memory usage.
df.info()

In [None]:
# Bar chart of the number of articles per class (counted via non-null titles).
label_counts = df.groupby('label').count().reset_index()

fig = px.bar(label_counts, x='label', y='title', text='title', opacity=0.6)
fig.update_layout(title_text='Distribution of News')
# Strip grid lines and axis titles; the bar labels already carry the values,
# so the y tick labels are hidden as well.
fig.update_xaxes(showgrid=False, title_text=None)
fig.update_yaxes(showgrid=False, title_text=None, showticklabels=False)
fig.show()

In [None]:
# Bar chart of the number of articles per subject, smallest subject first.
subject_counts = df.groupby('subject')['title'].count().reset_index()
subject_counts = subject_counts.sort_values(by='title')

fig = px.bar(subject_counts, x='subject', y='title', text='title', opacity=0.6)
fig.update_layout(title_text='Distribution of News Subjects')
# Strip grid lines and axis titles; the bar labels already carry the values,
# so the y tick labels are hidden as well.
fig.update_xaxes(showgrid=False, title_text=None)
fig.update_yaxes(showgrid=False, title_text=None, showticklabels=False)
fig.show()

In [None]:
# Article counts per (class, subject) pair, drawn as one stacked bar per class.
df_sum = df.groupby(['label', 'subject']).count()
df_sum = df_sum.reset_index()

fig = px.bar(df_sum, x='label', y='title', color='subject', text='title', opacity=0.6)
# Strip grid lines, axis titles and y tick labels; the bar labels carry the values.
fig.update_yaxes(showgrid=False, title_text=None, showticklabels=False)
fig.update_xaxes(showgrid=False, title_text=None)
fig.show()

In [None]:
# Parse the raw date strings. Unparseable values become NaT instead of raising.
# NOTE(review): if the column mixes several date formats, newer pandas versions may
# coerce the non-matching ones to NaT as well -- confirm against the data.
df['date'] = pd.to_datetime(df['date'], errors='coerce')

# Number of articles published per class on each date.
per_day = df.groupby(['label', 'date'])['title'].count()
df_date = per_day.reset_index()

# One line per class over time, without axis or legend titles.
fig = px.line(df_date, x='date', y='title', color='label')
fig.update_layout(legend_title_text=None)
fig.update_xaxes(title_text=None)
fig.update_yaxes(title_text=None)
fig.show()

In [None]:
# Keep only the articles dated strictly between the two bounds. Rows whose date
# is NaT fail both comparisons and are therefore dropped here as well.
in_window = (df['date'] < '2017-08-31') & (df['date'] > '2016-02-01')
df_filtered = df[in_window].copy()

# Derive calendar features from the publication date.
df_filtered['weekday'] = df_filtered['date'].dt.dayofweek
# `Series.dt.weekofyear` was deprecated in pandas 1.1 and removed in 2.0;
# `dt.isocalendar().week` is its replacement. It returns a nullable UInt32
# column, so cast back to int64 to match the dtype the old accessor produced
# (safe because NaT rows were filtered out above).
df_filtered['week'] = df_filtered['date'].dt.isocalendar().week.astype('int64')
df_filtered['month'] = df_filtered['date'].dt.month
df_filtered['quarter'] = df_filtered['date'].dt.quarter

# Number of articles per class for each day of the week.
df_weekday = df_filtered.groupby(['label', 'weekday']).count()['title'].reset_index()

# One line per class, without axis or legend titles.
fig = px.line(df_weekday, x='weekday', y='title', color='label')
fig.update_layout(title_text='Day of Week')
fig.update_xaxes(title_text=None)
fig.update_yaxes(title_text=None)
fig.update_layout(legend_title_text=None)
fig.show()

In [None]:
# Number of articles per class for each week of the year.
df_week = df_filtered.groupby(['label', 'week'])['title'].count().reset_index()

# One line per class, without axis or legend titles.
fig = px.line(df_week, x='week', y='title', color='label')
fig.update_layout(title_text='Week of the Year', legend_title_text=None)
fig.update_xaxes(title_text=None)
fig.update_yaxes(title_text=None)
fig.show()

In [None]:
# Number of articles per class for each calendar month.
df_month = df_filtered.groupby(['label', 'month'])['title'].count().reset_index()

# One line per class, without axis or legend titles.
fig = px.line(df_month, x='month', y='title', color='label')
fig.update_layout(title_text='Monthly', legend_title_text=None)
fig.update_xaxes(title_text=None)
fig.update_yaxes(title_text=None)
fig.show()

In [None]:
# Number of articles per class for each quarter.
df_quarter = df_filtered.groupby(['label', 'quarter'])['title'].count().reset_index()

# One line per class, without axis or legend titles.
fig = px.line(df_quarter, x='quarter', y='title', color='label')
fig.update_layout(title_text='Quarterly', legend_title_text=None)
fig.update_xaxes(title_text=None)
fig.update_yaxes(title_text=None)
fig.show()

In [None]:
# Print a couple of raw article bodies from each class for a visual comparison.
fake_texts = df.loc[df.label == 'Fake', 'text']
true_texts = df.loc[df.label == 'True', 'text']

print('Fake News\n')
# sep='\n\n' leaves one blank line between the two samples.
print(fake_texts.iloc[3], fake_texts.iloc[5], sep='\n\n')
print()
print('\n\nTrue News\n')
print(true_texts.iloc[0], true_texts.iloc[2], sep='\n\n')

# Drop the temporary selections so they do not keep the raw texts alive.
del fake_texts, true_texts

In [None]:
def make_wordcloud(text, mask, color):
    """Build a word cloud from *text* and display it with matplotlib.

    Args:
        text: The string whose words are drawn.
        mask: Passed to ``WordCloud`` as ``mask`` (the shape to fill).
        color: Passed to ``WordCloud`` as ``contour_color``.
    """
    cloud = WordCloud(
        max_words=200,
        mask=mask,
        background_color='white',
        contour_width=2,
        contour_color=color,
    )
    cloud.generate(text)

    # Show the rendered cloud on a large, axis-free canvas.
    plt.figure(figsize=(17, 12))
    plt.imshow(cloud, interpolation='bilinear')
    plt.axis('off')
    plt.show()

# Load the images whose pixel arrays are used as the word-cloud shapes.
fake_mask = np.array(Image.open("data/fake.png"))
true_mask = np.array(Image.open("data/true.png"))

# Join all article bodies of each class into one long string.
fake_text = " ".join(df.loc[df.label == 'Fake', 'text'])
true_text = " ".join(df.loc[df.label == 'True', 'text'])

# Draw one cloud per class, each with its own contour colour.
for cloud_text, cloud_mask, contour in (
    (fake_text, fake_mask, 'blue'),
    (true_text, true_mask, 'orange'),
):
    make_wordcloud(cloud_text, cloud_mask, contour)

# Drop the loop variables so they do not keep extra references alive.
del cloud_text, cloud_mask, contour

In [None]:
# Register tqdm's `progress_apply` on pandas objects so that long-running
# applies show a progress bar.
tqdm.pandas()

# Patterns are compiled once at module level (raw strings, so the backslash
# escapes are passed to the regex engine as written).
_URL_RE = re.compile(
    r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+')
# NOTE(review): this pattern matches any whitespace-free run containing '@',
# so it may also catch things like social-media handles -- confirm that is acceptable.
_EMAIL_RE = re.compile(r'\S+@\S+')
_NUMBER_RE = re.compile(r'\d+')
_PUNCTUATION_RE = re.compile(r'[^\w\s]')

# Build the stop-word set and the stemmer once instead of on every call.
# The original looked up `stopwords.words('english')` for every single token,
# which repeats the lookup and does a linear list scan each time.
_STOPWORDS = frozenset(stopwords.words('english'))
_STEMMER = SnowballStemmer('english')


def clean_text(text):
    """Normalise one raw article body into a list of stemmed tokens.

    Order matters: HTML tags, links and e-mail addresses are handled while the
    punctuation they rely on (``<``, ``>``, ``://``, ``@``) is still present.
    Punctuation is stripped only afterwards.

    Args:
        text: Raw article text.

    Returns:
        A list of lower-cased, stemmed words with English stop words removed.
        Links, e-mail addresses and digit runs are replaced by the placeholder
        words ``link``, ``email`` and ``number`` before tokenisation.
    """
    # Remove HTML tags first, while '<' and '>' are still in the text.
    text = BeautifulSoup(text, "html.parser").get_text()

    # Replace links with the str 'link' (must precede digit/punctuation handling).
    text = _URL_RE.sub('link', text)

    # Replace emails with the str 'email' (needs the '@' to still be present).
    text = _EMAIL_RE.sub('email', text)

    # Replace numbers with the str 'number'.
    text = _NUMBER_RE.sub('number', text)

    # Replace punctuation with spaces.
    text = _PUNCTUATION_RE.sub(' ', text)

    # Lower-case and split on any whitespace. `split()` also splits on newlines,
    # so no separate newline replacement is required.
    words = text.lower().split()

    # Drop stop words, then stem what remains.
    return [_STEMMER.stem(w) for w in words if w not in _STOPWORDS]


# Apply the cleaning function to the dataset.
df.text = df.text.progress_apply(clean_text)

In [None]:
def frequent_words(label, max_words, data=None):
    """Return the most frequent tokens among the articles of one class.

    Tokens are counted straight from the per-article token lists. No large
    intermediate NumPy string array is built, and a label with no matching
    rows yields an empty result instead of raising.

    Args:
        label: Value of the ``label`` column selecting the articles to count.
        max_words: Maximum number of words to return.
        data: Frame with a ``label`` column and a ``text`` column holding an
            iterable of tokens per row. Defaults to the module-level ``df``.

    Returns:
        A DataFrame indexed by word with a single ``counts`` column, limited to
        the ``max_words`` most frequent words and sorted by ascending count
        (so the most frequent word comes last, e.g. at the top of a horizontal
        bar chart).
    """
    frame = df if data is None else data

    # Accumulate token frequencies one article at a time.
    counts = Counter()
    for tokens in frame.loc[frame['label'] == label, 'text']:
        counts.update(tokens)

    # most_common() returns the words in descending order; reverse to ascending.
    top = counts.most_common(max_words)
    top.reverse()

    return pd.DataFrame(
        {'counts': [count for _, count in top]},
        index=[word for word, _ in top],
        dtype='int64',
    )

# The 50 most frequent tokens of each class.
df_fake_counts = frequent_words(label='Fake', max_words=50)
df_true_counts = frequent_words(label='True', max_words=50)

# Two side-by-side panels with horizontal bars: fake news left, true news right.
fig = make_subplots(rows=1, cols=2, subplot_titles=("Fake News", "True News"))

for panel, word_counts in enumerate((df_fake_counts, df_true_counts), start=1):
    bars = go.Bar(
        x=word_counts['counts'].tolist(),
        y=word_counts.index.tolist(),
        orientation='h',
        opacity=0.6,
    )
    fig.add_trace(bars, row=1, col=panel)

fig.update_layout(height=900, width=900, title_text="Most Frequent Words", showlegend=False)
# Strip grid lines and axis titles from every panel.
fig.update_yaxes(showgrid=False, title_text=None)
fig.update_xaxes(showgrid=False, title_text=None)
fig.show()