In [2]:
import pandas as pd
from textblob import TextBlob
import nltk
from nltk import FreqDist
from nltk.corpus import stopwords
import string
import re
from nltk.tokenize import RegexpTokenizer, word_tokenize
from spellchecker import SpellChecker


In [3]:
# Fetch the Punkt tokenizer data that word_tokenize (used in remove_stopwords) relies on.
# NOTE(review): recent NLTK releases look up 'punkt_tab' instead of 'punkt' -- confirm
# against the installed NLTK version if word_tokenize raises a LookupError.
nltk.download('punkt')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\USER\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [4]:
# Input locations. DATA_PATH is a relative path and keeps its trailing slash because
# the file names are appended to it by plain string concatenation when loading.
DATA_PATH = '../data/'
FAKE_DATA_FILE = 'fake.csv'
TRUE_DATA_FILE = 'true.csv'

In [5]:
# Load each source CSV and tag every row with its class in a 'label' column,
# so the origin of a row is still known after the two frames are combined.
fake_df = pd.read_csv(DATA_PATH + FAKE_DATA_FILE).assign(label='fake')
true_df = pd.read_csv(DATA_PATH + TRUE_DATA_FILE).assign(label='real')

In [6]:
# Stack the fake rows on top of the real rows; ignore_index=True discards the
# per-file row numbers and gives the combined frame a fresh 0..n-1 index.
news_df = pd.concat([fake_df, true_df], ignore_index=True)

In [7]:
# Preview the first rows of the combined frame.
news_df.head()

Unnamed: 0,title,text,subject,date,label
0,Donald Trump Sends Out Embarrassing New Year’...,Donald Trump just couldn t wish all Americans ...,News,"December 31, 2017",fake
1,Drunk Bragging Trump Staffer Started Russian ...,House Intelligence Committee Chairman Devin Nu...,News,"December 31, 2017",fake
2,Sheriff David Clarke Becomes An Internet Joke...,"On Friday, it was revealed that former Milwauk...",News,"December 30, 2017",fake
3,Trump Is So Obsessed He Even Has Obama’s Name...,"On Christmas day, Donald Trump announced that ...",News,"December 29, 2017",fake
4,Pope Francis Just Called Out Donald Trump Dur...,Pope Francis used his annual Christmas Day mes...,News,"December 25, 2017",fake


In [8]:
# (rows, columns) of the combined frame.
news_df.shape

(44898, 5)

In [9]:
# Column dtypes and non-null counts -- used here to check for missing values.
news_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 44898 entries, 0 to 44897
Data columns (total 5 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   title    44898 non-null  object
 1   text     44898 non-null  object
 2   subject  44898 non-null  object
 3   date     44898 non-null  object
 4   label    44898 non-null  object
dtypes: object(5)
memory usage: 1.7+ MB


In [10]:
# Number of articles per 'subject' category.
news_df['subject'].value_counts()

subject
politicsNews       11272
worldnews          10145
News                9050
politics            6841
left-news           4459
Government News     1570
US_News              783
Middle-east          778
Name: count, dtype: int64

# Clean Title and Text Data

In [11]:
def clean_text(text):
    """Strip every character that is not an ASCII letter, a digit, or whitespace.

    Characters are deleted rather than replaced, so text on either side of a
    removed character is joined together (e.g. ``"a_b"`` becomes ``"ab"``).
    Whitespace, including tabs and newlines, is left untouched.

    Args:
        text: The string to clean.

    Returns:
        A new string containing only ``A-Z``, ``a-z``, ``0-9`` and whitespace.
    """
    disallowed = re.compile(r'[^A-Za-z0-9\s]')
    return disallowed.sub('', text)

In [12]:
# Derive a cleaned copy of the article body; the original 'text' column is kept as-is.
news_df['cleaned_text'] = news_df['text'].apply(clean_text)

In [13]:
# Display the frame to compare 'text' against the new 'cleaned_text' column.
# NOTE(review): this renders the whole frame (pandas truncates the middle rows);
# news_df.head() would show the same comparison with less output.
news_df

Unnamed: 0,title,text,subject,date,label,cleaned_text
0,Donald Trump Sends Out Embarrassing New Year’...,Donald Trump just couldn t wish all Americans ...,News,"December 31, 2017",fake,Donald Trump just couldn t wish all Americans ...
1,Drunk Bragging Trump Staffer Started Russian ...,House Intelligence Committee Chairman Devin Nu...,News,"December 31, 2017",fake,House Intelligence Committee Chairman Devin Nu...
2,Sheriff David Clarke Becomes An Internet Joke...,"On Friday, it was revealed that former Milwauk...",News,"December 30, 2017",fake,On Friday it was revealed that former Milwauke...
3,Trump Is So Obsessed He Even Has Obama’s Name...,"On Christmas day, Donald Trump announced that ...",News,"December 29, 2017",fake,On Christmas day Donald Trump announced that h...
4,Pope Francis Just Called Out Donald Trump Dur...,Pope Francis used his annual Christmas Day mes...,News,"December 25, 2017",fake,Pope Francis used his annual Christmas Day mes...
...,...,...,...,...,...,...
44893,'Fully committed' NATO backs new U.S. approach...,BRUSSELS (Reuters) - NATO allies on Tuesday we...,worldnews,"August 22, 2017",real,BRUSSELS Reuters NATO allies on Tuesday welco...
44894,LexisNexis withdrew two products from Chinese ...,"LONDON (Reuters) - LexisNexis, a provider of l...",worldnews,"August 22, 2017",real,LONDON Reuters LexisNexis a provider of legal...
44895,Minsk cultural hub becomes haven from authorities,MINSK (Reuters) - In the shadow of disused Sov...,worldnews,"August 22, 2017",real,MINSK Reuters In the shadow of disused Soviet...
44896,Vatican upbeat on possibility of Pope Francis ...,MOSCOW (Reuters) - Vatican Secretary of State ...,worldnews,"August 22, 2017",real,MOSCOW Reuters Vatican Secretary of State Car...


In [14]:
# Fetch the NLTK stopword corpus that stopwords.words('english') reads from.
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\USER\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [15]:
def remove_single_characters(word_list, exception_list):
    """Drop tokens of length 0 or 1 unless they are explicitly allowed.

    Args:
        word_list: Iterable of tokens to filter.
        exception_list: Container of short tokens that must be kept
            even though their length is not greater than one.

    Returns:
        A new list with the surviving tokens, in their original order.
    """
    kept = []
    for word in word_list:
        # Only short tokens are checked against the exception list.
        if len(word) <= 1 and word not in exception_list:
            continue
        kept.append(word)
    return kept

## Remove Punctuation from Text



In [16]:
# Show the exact set of ASCII punctuation characters that remove_punctuations strips.
string.punctuation

'!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'

In [17]:
def remove_punctuations(text):
    """Delete every character listed in ``string.punctuation`` from ``text``.

    Only the ASCII punctuation characters in ``string.punctuation`` are
    removed; any other character is left in place.

    Args:
        text: The string to strip punctuation from.

    Returns:
        A new string with the punctuation characters removed.
    """
    # Mapping a code point to None makes str.translate drop that character.
    drop_table = {ord(ch): None for ch in string.punctuation}
    return text.translate(drop_table)

In [18]:
# Strip ASCII punctuation from the cleaned text, overwriting the column in place.
# NOTE(review): clean_text has already removed everything except letters, digits and
# whitespace, so no string.punctuation characters should remain at this point and
# this pass is expected to leave the column unchanged.
news_df['cleaned_text'] = news_df['cleaned_text'].apply(remove_punctuations)
news_df.head()

Unnamed: 0,title,text,subject,date,label,cleaned_text
0,Donald Trump Sends Out Embarrassing New Year’...,Donald Trump just couldn t wish all Americans ...,News,"December 31, 2017",fake,Donald Trump just couldn t wish all Americans ...
1,Drunk Bragging Trump Staffer Started Russian ...,House Intelligence Committee Chairman Devin Nu...,News,"December 31, 2017",fake,House Intelligence Committee Chairman Devin Nu...
2,Sheriff David Clarke Becomes An Internet Joke...,"On Friday, it was revealed that former Milwauk...",News,"December 30, 2017",fake,On Friday it was revealed that former Milwauke...
3,Trump Is So Obsessed He Even Has Obama’s Name...,"On Christmas day, Donald Trump announced that ...",News,"December 29, 2017",fake,On Christmas day Donald Trump announced that h...
4,Pope Francis Just Called Out Donald Trump Dur...,Pope Francis used his annual Christmas Day mes...,News,"December 25, 2017",fake,Pope Francis used his annual Christmas Day mes...


## Remove stopwords from Text



In [19]:
# Build the English stopword collection once, as a set, so the per-token
# membership test inside remove_stopwords is a hash lookup.
stop_words = set(stopwords.words('english'))

def remove_stopwords(text):
    """Tokenize ``text`` and return the tokens that are not English stopwords.

    The stopword comparison is case-insensitive (each token is lowercased for
    the lookup), but the returned tokens keep their original casing.

    Args:
        text: The string to tokenize and filter.

    Returns:
        A list of the remaining tokens, in their original order.
    """
    kept = []
    for token in word_tokenize(text):
        if token.lower() in stop_words:
            continue
        kept.append(token)
    return kept

In [20]:
# Tokenize every article and drop stopwords; each cell holds a list of tokens.
news_df['tokens_no_stopwords'] = news_df['cleaned_text'].apply(remove_stopwords)
# Flatten the per-article token lists into one corpus-wide list.
nostopWords = [token for sublist in news_df['tokens_no_stopwords'] for token in sublist]
# Count token occurrences across the corpus. Tokens keep their original casing,
# so differently-cased spellings of a word are counted as separate entries.
fdist_no_stopwords = FreqDist(nostopWords)

In [21]:
# Show the ten most frequent non-stopword tokens as (token, count) pairs.
print(fdist_no_stopwords.most_common(10))

[('said', 129910), ('Trump', 115190), ('would', 54660), ('US', 53625), ('people', 39002), ('President', 33643), ('one', 31064), ('also', 30507), ('Reuters', 28708), ('Donald', 27492)]


## Saving the File



In [22]:
# DATA_PATH = '../data/'  
# FILENAME = 'cleaned_news.csv' 

In [23]:
# file_path = DATA_PATH + FILENAME

In [24]:
# news_df.to_csv(file_path, index=False)