In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

import nltk
from nltk.corpus import stopwords

import re

import warnings
warnings.filterwarnings('ignore')

In [None]:
# NLTK setup: download the resources this notebook uses and build the stop-word list.
nltk.download('stopwords')
nltk.download('wordnet')
# Register an extra directory on NLTK's data search path.
# NOTE(review): this absolute /root path presumably targets a hosted (Colab-style) runtime — confirm it is still needed.
nltk.data.path.append('/root/nltk_data/corpora/')
# `stop_words` is a plain Python list; lemmatization_and_stopwords() reads it as a global.
stop_words = stopwords.words('english')
print(stop_words) # inspect the defaults: some words I want to remove are not included
# Extra words to filter out on top of the NLTK English defaults.
new_words = ['said','like','year','would','house','also','sends']
stop_words.extend(new_words)
# Tokenizer models for nltk.word_tokenize; being the last expression, the call's return value is shown as the cell output.
nltk.download('punkt')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...


['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', '

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [None]:
#Helper functions

# Helper that lemmatizes a document and removes stop words and short tokens in one pass.
def lemmatization_and_stopwords(text, min_token_len=4):
    """Lower-case, tokenize, filter and lemmatize a single document.

    Parameters
    ----------
    text : str
        Raw document text.
    min_token_len : int, default 4
        Tokens shorter than this are discarded. The default keeps only
        tokens longer than three characters.

    Returns
    -------
    str
        The surviving lemmas joined by single spaces; an empty string when
        no token survives the filters.

    Notes
    -----
    Reads the module-level ``stop_words`` list at call time, so words added
    to that list after this function is defined are still honoured.
    """
    # Snapshot the global list as a set: membership becomes O(1) per token
    # instead of a linear scan over the whole stop-word list.
    stop_set = set(stop_words)
    lemmatizer = nltk.WordNetLemmatizer()

    # Lower-case before tokenizing so tokens match the lower-case stop words.
    tokens = nltk.word_tokenize(text.lower())

    # Filter on the raw token first, then lemmatize only what is kept.
    kept = [
        lemmatizer.lemmatize(token)
        for token in tokens
        if len(token) >= min_token_len and token not in stop_set
    ]

    return " ".join(kept)

In [None]:
# Load the two raw CSV files of the first dataset from the project repository.

fake = "https://raw.githubusercontent.com/Shacham-R/fake_news_detector/main/data/Fake.csv"
true = "https://raw.githubusercontent.com/Shacham-R/fake_news_detector/main/data/True.csv"

# Fake.csv -> df_fake, True.csv -> df_real (read in that order)
df_fake, df_real = (pd.read_csv(source_url) for source_url in (fake, true))

# First dataset processing

## Data cleaning of df_fake

In [None]:
# Preview the first rows of the raw fake-news frame (title, text, subject, date).
df_fake.head()

Unnamed: 0,title,text,subject,date
0,Donald Trump Sends Out Embarrassing New Year’...,Donald Trump just couldn t wish all Americans ...,News,"December 31, 2017"
1,Drunk Bragging Trump Staffer Started Russian ...,House Intelligence Committee Chairman Devin Nu...,News,"December 31, 2017"
2,Sheriff David Clarke Becomes An Internet Joke...,"On Friday, it was revealed that former Milwauk...",News,"December 30, 2017"
3,Trump Is So Obsessed He Even Has Obama’s Name...,"On Christmas day, Donald Trump announced that ...",News,"December 29, 2017"
4,Pope Francis Just Called Out Donald Trump Dur...,Pope Francis used his annual Christmas Day mes...,News,"December 25, 2017"


In [None]:
df_fake['subject'].unique()  # every value is just a news category, so this column adds nothing

array(['News', 'politics', 'Government News', 'left-news', 'US_News',
       'Middle-east'], dtype=object)

In [None]:
# Keep only the columns used downstream. The explicit .copy() makes this an
# independent frame, so adding the 'title_text' column later does not write
# into a slice of the original (SettingWithCopyWarning).
df_fake = df_fake[['title','text']].copy()

In [None]:
# Confirm that only 'title' and 'text' remain.
df_fake.head()

Unnamed: 0,title,text
0,Donald Trump Sends Out Embarrassing New Year’...,Donald Trump just couldn t wish all Americans ...
1,Drunk Bragging Trump Staffer Started Russian ...,House Intelligence Committee Chairman Devin Nu...
2,Sheriff David Clarke Becomes An Internet Joke...,"On Friday, it was revealed that former Milwauk..."
3,Trump Is So Obsessed He Even Has Obama’s Name...,"On Christmas day, Donald Trump announced that ..."
4,Pope Francis Just Called Out Donald Trump Dur...,Pope Francis used his annual Christmas Day mes...


In [None]:
# Merge headline and body into one field. Missing values are replaced by ''
# first: otherwise a NaN in either column makes the whole concatenation NaN
# and the other field is silently lost.
df_fake['title_text'] = df_fake['title'].fillna('') + ' ' + df_fake['text'].fillna('')

In [None]:
# Independent single-column frame; .copy() lets the label column be added
# afterwards without assigning into a slice of df_fake.
df_fake_clean = df_fake[['title_text']].copy()

In [None]:
# Label every row of this frame as fake (1).
df_fake_clean = df_fake_clean.assign(fake=1)

In [None]:
# Preview the cleaned fake-news frame: merged text plus the 'fake' label.
df_fake_clean.head()

Unnamed: 0,title_text,fake
0,Donald Trump Sends Out Embarrassing New Year’...,1
1,Drunk Bragging Trump Staffer Started Russian ...,1
2,Sheriff David Clarke Becomes An Internet Joke...,1
3,Trump Is So Obsessed He Even Has Obama’s Name...,1
4,Pope Francis Just Called Out Donald Trump Dur...,1


## Data cleaning of df_real

In [None]:
# Preview the first rows of the raw real-news frame; note the "CITY (Reuters) - " prefix in 'text'.
df_real.head()

Unnamed: 0,title,text,subject,date
0,"As U.S. budget fight looms, Republicans flip t...",WASHINGTON (Reuters) - The head of a conservat...,politicsNews,"December 31, 2017"
1,U.S. military to accept transgender recruits o...,WASHINGTON (Reuters) - Transgender people will...,politicsNews,"December 29, 2017"
2,Senior U.S. Republican senator: 'Let Mr. Muell...,WASHINGTON (Reuters) - The special counsel inv...,politicsNews,"December 31, 2017"
3,FBI Russia probe helped by Australian diplomat...,WASHINGTON (Reuters) - Trump campaign adviser ...,politicsNews,"December 30, 2017"
4,Trump wants Postal Service to charge 'much mor...,SEATTLE/WASHINGTON (Reuters) - President Donal...,politicsNews,"December 29, 2017"


In [None]:
# Remove the leading "CITY (Reuters) - " dateline from df_real['text'].
# The pattern is anchored at the start of the text and limited to a short
# prefix on the first line, so only a genuine dateline is removed:
#   * articles without a dateline are left unchanged (str.extract turned them into NaN),
#   * a "- " later in the article is never used as the cut point,
#   * text after the first line break is kept ('.' in the old pattern stopped at '\n').
# 100 is a generous upper bound on the length of the "CITY[/CITY]" part.
df_real['text'] = df_real['text'].str.replace(
    r"^[^\n]{0,100}?\(Reuters\)\s*-\s*", "", regex=True
)


In [None]:
# Check on the first rows that the dateline prefix is gone.
df_real['text'].head()

0    The head of a conservative Republican faction ...
1    Transgender people will be allowed for the fir...
2    The special counsel investigation of links bet...
3    Trump campaign adviser George Papadopoulos tol...
4    President Donald Trump called on the U.S. Post...
Name: text, dtype: object

In [None]:
# Keep only the columns used downstream. The explicit .copy() makes this an
# independent frame, so adding the 'title_text' column later does not write
# into a slice of the original (SettingWithCopyWarning).
df_real = df_real[['title','text']].copy()

In [None]:
# Confirm that only 'title' and 'text' remain.
df_real.head()

Unnamed: 0,title,text
0,"As U.S. budget fight looms, Republicans flip t...",The head of a conservative Republican faction ...
1,U.S. military to accept transgender recruits o...,Transgender people will be allowed for the fir...
2,Senior U.S. Republican senator: 'Let Mr. Muell...,The special counsel investigation of links bet...
3,FBI Russia probe helped by Australian diplomat...,Trump campaign adviser George Papadopoulos tol...
4,Trump wants Postal Service to charge 'much mor...,President Donald Trump called on the U.S. Post...


In [None]:
# Merge headline and body into one field. Missing values are replaced by ''
# first: otherwise a NaN in either column makes the whole concatenation NaN
# and the other field is silently lost.
df_real['title_text'] = df_real['title'].fillna('') + ' ' + df_real['text'].fillna('')

In [None]:
# Independent single-column frame; .copy() lets the label column be added
# afterwards without assigning into a slice of df_real.
df_real_clean = df_real[['title_text']].copy()

In [None]:
# Label every row of this frame as real (fake = 0).
df_real_clean = df_real_clean.assign(fake=0)

In [None]:
# Preview the cleaned real-news frame: merged text plus the 'fake' label.
df_real_clean.head()

Unnamed: 0,title_text,fake
0,"As U.S. budget fight looms, Republicans flip t...",0
1,U.S. military to accept transgender recruits o...,0
2,Senior U.S. Republican senator: 'Let Mr. Muell...,0
3,FBI Russia probe helped by Australian diplomat...,0
4,Trump wants Postal Service to charge 'much mor...,0


## Dataframes merging

In [None]:
# Stack the real rows on top of the fake rows (row-wise concatenation).
labelled_frames = [df_real_clean, df_fake_clean]
df = pd.concat(labelled_frames, axis=0)

In [None]:
# Shuffle all rows so the two classes are interleaved. A fixed seed makes the
# shuffle (and everything saved from it) reproducible across kernel restarts.
df = df.sample(frac=1, random_state=42)

In [None]:
# Replace the shuffled index with a fresh 0..n-1 range, discarding the old labels.
df = df.reset_index(drop=True)

In [None]:
# Display the merged, shuffled frame (pandas truncates the middle rows in the rendering).
df

Unnamed: 0,title_text,fake
0,BUSTED! FBI Hid Clinton-Lynch Tarmac Meeting D...,1
1,DEMOCRAT SENATOR Makes Up FAKE Anti-Trump Stor...,1
2,Congressman Tells Trump He’s In ‘Deep S**t’ I...,1
3,WATCH MSNBC “Objective” Host’s Loud Outburst W...,1
4,U.S. lawmakers reach agreement on new North Ko...,0
...,...,...
44893,Trump taps top Sessions attorney to lead civil...,0
44894,Trump Adviser: Trump Should’ve Had A ‘Backdro...,1
44895,Ukrainian police clash with Saakashvili suppor...,0
44896,Cher Just Went Nuclear On Trump In Incredible...,1


# Preprocessing

In [None]:
# Lemmatization

In [None]:
# Make sure every entry of 'title_text' is a Python string before tokenizing.
df = df.astype({'title_text': str})

In [None]:
# Tokenize each document into a list of tokens (temporary column, only used for the vocabulary count).
df['token_text'] = df['title_text'].map(nltk.word_tokenize)
df['token_text']

0        [BUSTED, !, FBI, Hid, Clinton-Lynch, Tarmac, M...
1        [DEMOCRAT, SENATOR, Makes, Up, FAKE, Anti-Trum...
2        [Congressman, Tells, Trump, He, ’, s, In, ‘, D...
3        [WATCH, MSNBC, “, Objective, ”, Host, ’, s, Lo...
4        [U.S., lawmakers, reach, agreement, on, new, N...
                               ...                        
44893    [Trump, taps, top, Sessions, attorney, to, lea...
44894    [Trump, Adviser, :, Trump, Should, ’, ve, Had,...
44895    [Ukrainian, police, clash, with, Saakashvili, ...
44896    [Cher, Just, Went, Nuclear, On, Trump, In, Inc...
44897    [Henningsen, on, CrossTalk, Debating, ‘, Trump...
Name: token_text, Length: 44898, dtype: object

In [None]:
# Flatten the per-document token lists into one long list of tokens.
list_words = [word for tokens in df.token_text for word in tokens]

In [None]:
# Vocabulary size: number of distinct tokens before any cleaning.
num_words = len(set(list_words))
num_words

262302

In [None]:
# Lower-case, drop stop words / short tokens and lemmatize every document.
df['title_text'] = df['title_text'].map(lemmatization_and_stopwords)

In [None]:
# The raw token lists were only needed for the vocabulary count.
df = df.drop(columns=['token_text'])

# ready dataframe

In [None]:
# Preview the preprocessed frame: cleaned 'title_text' plus the 'fake' label.
df.head()

Unnamed: 0,title_text,fake
0,busted clinton-lynch tarmac meeting documents…...,1
1,democrat senator make fake anti-trump story in...,1
2,congressman tell trump deep wiretapping conspi...,1
3,watch msnbc objective host loud outburst latin...,1
4,u.s. lawmaker reach agreement north korea sanc...,0


In [None]:
# Save the preprocessed first dataset as CSV (the file name has no .csv extension).
# With to_csv defaults the row index is written as an unnamed first column.
# NOTE(review): consider index=False once every reader of this file is known.
df.to_csv('Kaggle_split_ready_df')

# Second dataset cleaning
Source: https://www.kaggle.com/datasets/rajatkumar30/fake-news/data

In [None]:
# Load the second dataset from the project repository ('%20' is the URL-encoded space in the file name).
file2 = 'https://raw.githubusercontent.com/Shacham-R/fake_news_detector/main/data/2ed%20dataset.csv'
data=pd.read_csv(file2)

In [None]:
# Preview the raw second dataset: title, text and a FAKE/REAL 'label' column.
data.head()

Unnamed: 0.1,Unnamed: 0,title,text,label
0,8476,You Can Smell Hillary’s Fear,"Daniel Greenfield, a Shillman Journalism Fello...",FAKE
1,10294,Watch The Exact Moment Paul Ryan Committed Pol...,Google Pinterest Digg Linkedin Reddit Stumbleu...,FAKE
2,3608,Kerry to go to Paris in gesture of sympathy,U.S. Secretary of State John F. Kerry said Mon...,REAL
3,10142,Bernie supporters on Twitter erupt in anger ag...,"— Kaydee King (@KaydeeKing) November 9, 2016 T...",FAKE
4,875,The Battle of New York: Why This Primary Matters,It's primary day in New York and front-runners...,REAL


In [None]:
# Drop the leftover index column that came with the CSV.
data = data.drop(columns=['Unnamed: 0'])

In [None]:
# Merge headline and body into one field. Missing values are replaced by ''
# first: otherwise a NaN in either column makes the whole concatenation NaN
# and the other field is silently lost.
data['title_text'] = data['title'].fillna('') + ' ' + data['text'].fillna('')

In [None]:
# Encode the label numerically: 'REAL' -> 0, anything else -> 1.
is_not_real = data['label'].ne('REAL')
data['label'] = is_not_real.astype('int64')

In [None]:
# 'title' and 'text' are now merged into 'title_text', so drop the originals.
data = data.drop(columns=['title', 'text'])

In [None]:
# Use the same label column name as the first dataset.
data = data.rename(columns={'label': 'fake'})

In [None]:
# Preview the second dataset after relabelling: 'fake' and 'title_text'.
data.head()

Unnamed: 0,fake,title_text
0,1,You Can Smell Hillary’s Fear Daniel Greenfield...
1,1,Watch The Exact Moment Paul Ryan Committed Pol...
2,0,Kerry to go to Paris in gesture of sympathy U....
3,1,Bernie supporters on Twitter erupt in anger ag...
4,0,The Battle of New York: Why This Primary Matte...


## Preprocessing

In [None]:
# Make sure every entry of 'title_text' is a Python string before cleaning.
data = data.astype({'title_text': str})

In [None]:
# Lower-case, drop stop words / short tokens and lemmatize every document.
data['title_text'] = data['title_text'].map(lemmatization_and_stopwords)

In [None]:
# Preview the preprocessed second dataset.
data.head()

Unnamed: 0,fake,title_text
0,1,smell hillary fear daniel greenfield shillman ...
1,1,watch exact moment paul ryan committed politic...
2,0,kerry paris gesture sympathy u.s. secretary st...
3,1,bernie supporter twitter erupt anger tried war...
4,0,battle york primary matter primary york front-...


In [None]:
# Save the preprocessed second dataset as CSV (the file name has no .csv extension).
# With to_csv defaults the row index is written as an unnamed first column.
data.to_csv('Kaggle_split_ready_df2')

In [None]:
# Combine both preprocessed datasets (columns are aligned by name).
# ignore_index=True gives the result a fresh 0..n-1 index; without it the
# labels of both inputs are kept, so each label from `data` duplicates one from `df`.
main_df = pd.concat([df, data], ignore_index=True)

In [None]:
# Save the combined dataset as CSV (the file name has no .csv extension).
# With to_csv defaults the row index is written as an unnamed first column.
main_df.to_csv('main_df_51k')