In [None]:
import pandas as pd # data processing 
import os # access to dir structure
import matplotlib.pyplot as plt # plotting 
import numpy as np # linear algebra 
import seaborn as sns 

# import kaggle 

Dataset in use 
https://www.kaggle.com/datasets/clmentbisaillon/fake-and-real-news-dataset?select=True.csv

In [None]:
# List the files in the downloaded dataset folder.
# The folder is addressed relative to the notebook (the same "./archive(3)"
# location the CSV files are read from) instead of a machine-specific
# absolute path, so the cell also runs on other machines.

csv_files = os.listdir("./archive(3)")
for file in csv_files:
    print(file)
# `csv_files.count` is the bound list method, not a number; print the real count.
print(len(csv_files))

In [None]:
# Load the fake and the real news articles into two separate frames.
fake_df1, real_df1 = (
    pd.read_csv(csv_path)
    for csv_path in ("./archive(3)/Fake.csv", "./archive(3)/True.csv")
)

In [None]:
# Preview the first ten fake-news rows.
fake_df1.head(n=10)

In [None]:
# Preview the first twenty real-news rows.
real_df1.head(n=20)

In [None]:
# Number of real-news articles per subject.
# The method must be called; the bare attribute only shows the bound method.
real_df1['subject'].value_counts()

In [None]:
# Distinct subjects present in the fake-news frame.
# The method must be called; the bare attribute only shows the bound method.
fake_df1['subject'].unique()

In [None]:
# (rows, columns) of the fake frame, then of the real frame.
for frame in (fake_df1, real_df1):
    print(frame.shape)

In [None]:
# Tag every article with its class in a new 'true' column: 0 = fake, 1 = real.
for frame, label in ((fake_df1, 0), (real_df1, 1)):
    frame['true'] = label

# Print the shapes again; each frame now has one more column.
for frame in (fake_df1, real_df1):
    print(frame.shape)

In [None]:
# Summary statistics of the real-news frame, just to see.
# The method must be called; the bare attribute only shows the bound method.

real_df1.describe()


In [None]:
# Stack the fake and real articles into one frame.
# ignore_index=True gives the result a fresh 0..n-1 index. Without it each
# source frame keeps its own row labels, so the same label occurs twice and
# label-based access such as raw_df['text'][0] matches more than one row.

raw_df = pd.concat([fake_df1, real_df1], ignore_index=True)

In [None]:
# Display the combined frame as the cell output.
raw_df

In [None]:
# Class balance: how many fake (0) and real (1) articles the combined frame holds.
# The method must be called; the bare attribute only shows the bound method.
raw_df['true'].value_counts()

In [None]:
# counting by subject
# Series.items() is used because Series.iteritems() was removed in pandas 2.0.

for key, count in raw_df['subject'].value_counts().items():
    print(f"{key}:\t{count}")

In [None]:
# spread of information sources/ subjects in raw df

plt.figure(figsize=(10,6))
# Pass the column as the keyword `x`: recent seaborn versions accept only
# `data` positionally, so a positional 'subject' collides with data=raw_df.
sns.countplot(x='subject', data=raw_df)
plt.show()

In [None]:
# word cloud of fake vs real df for quick comparison before cleaning and processing
# (this cell: fake news)

from wordcloud import WordCloud, STOPWORDS
import nltk


# Build the corpus with a single join instead of growing a string with `+=`
# inside a loop; str() mirrors the conversion the f-string performed.
text = ' '.join(str(news) for news in fake_df1.text.values)
wordcloud = WordCloud(
    width = 750, height = 400,
    background_color = 'purple',
    stopwords=set(nltk.corpus.stopwords.words("english"))).generate(text)
fig = plt.figure(
    figsize = (20,15),
    facecolor = 'm',
    edgecolor = 'm')
plt.imshow(wordcloud, interpolation = 'bilinear')
plt.axis('off')
plt.show()  # the original referenced plt.show without calling it
del text  # release the large corpus string

In [None]:
# Real news word cloud

# Build the corpus with a single join instead of growing a string with `+=`
# inside a loop; str() mirrors the conversion the f-string performed.
text = ' '.join(str(news) for news in real_df1.text.values)
wordcloud = WordCloud(
    width = 750, height = 400,
    background_color = 'black',
    stopwords=set(nltk.corpus.stopwords.words("english"))).generate(text)
fig = plt.figure(
    figsize = (20,15),
    facecolor = 'm',
    edgecolor = 'm')
plt.imshow(wordcloud, interpolation = 'bilinear')
plt.axis('off')
plt.show()  # the original referenced plt.show without calling it
del text  # release the large corpus string

## Data Cleaning 

The DataFrame needs cleaning. The goals are:  
* Check for null values
* check for empty spaces 
* check for nonsense texts 
* remove urls, https, special characters? 
* remove REUTERS()
* group text by subject? and label


In [None]:
# Count the missing (NaN) values in every column.
raw_df.isna().sum()

In [None]:
# Remove every row that contains at least one missing (NaN) value.

raw_df = raw_df.dropna(axis='index', how='any')
raw_df.shape

# Note: dropna() does not catch every "missing" article, because some rows
# have text that is blank rather than NaN.

In [None]:
# Collect the positions of rows that have no publication info (REUTERS) prefix.
# A row counts as "no publisher" when its text cannot be split on " -" or when
# the part before the first " -" is too long to be a publisher tag.

# Longest prefix (in characters) still accepted as publication info.
MAX_PUBLISHER_LEN = 260

no_publisher = []
for index, row in enumerate(raw_df.text.values):
    # Explicit checks replace the former bare `except` / `assert` combination,
    # which hid unrelated errors and stops working when asserts are disabled.
    # Non-string values cannot be split and are treated as having no publisher.
    record = row.split(" -", maxsplit=1) if isinstance(row, str) else [row]
    if len(record) < 2 or len(record[0]) >= MAX_PUBLISHER_LEN:
        no_publisher.append(index)

In [None]:
# Look at the text of the rows flagged as not starting with publication info (REUTERS).
raw_df["text"].iloc[no_publisher]

In [None]:
# Separate the publication info (i.e. REUTERS()) from the article text.
# Builds two lists aligned with the rows of raw_df:
#   publisher - text before the first " -", or "Unknown" when there is none
#   pt_text   - the remaining article text

publisher = []
pt_text=[]

# Set lookup keeps the membership test constant-time; testing against the
# list inside the loop rescans the list for every row.
no_publisher_positions = set(no_publisher)

for index, row in enumerate(raw_df.text.values):
    if index in no_publisher_positions:

        # no publisher mentioned: keep the text as it is and add "Unknown"
        pt_text.append(row)
        publisher.append("Unknown")
        continue

    record = row.split(" -", maxsplit=1)
    publisher.append(record[0])
    pt_text.append(record[1])

In [None]:
# Overwrite the text column with the publisher-free text and store the
# publication info (i.e. REUTERS()) in a new, separate column.

raw_df["text"] = pt_text
raw_df["publisher"] = publisher

# The helper objects are no longer needed.
del publisher
del pt_text
del record
del no_publisher

In [None]:
# First ten rows after splitting off the publisher.
raw_df.head(n=10)

In [None]:
# Last fifteen rows after splitting off the publisher.
raw_df.tail(n=15)

In [None]:
# Find the positions of rows whose text is empty or whitespace-only.

empty = []
for position, article in enumerate(raw_df.text.values):
    if not str(article).strip():
        empty.append(position)
print(f"Number of empty rows: {len(empty)}")
raw_df.iloc[empty].tail()

In [None]:
# First five rows of the frame.
raw_df.head(n=5)

In [None]:
#  drop these empty row records 

# raw_df = raw_df.drop(empty, axis=0)
# raw_df = raw_df[raw_df['text'].notna()]

In [None]:
# Replace blank text (empty or whitespace-only) with NaN so dropna() can pick it up.
# The result is assigned back to the column: `inplace=True` on a selected
# column is not reliably propagated to raw_df (it is not under pandas
# Copy-on-Write). The regex also covers '' and multi-space strings, which an
# exact ' ' match would miss.

raw_df['text'] = raw_df['text'].replace(r'^\s*$', np.nan, regex=True)

In [None]:
# Drop the rows whose text is now NaN.

raw_df = raw_df.dropna(axis='index', how='any', subset=['text'])

In [None]:
# Display the frame after dropping the rows with missing text.
raw_df

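### Why isn't dropna working?  

`dropna()` only removes real `NaN` values, and the blank articles held a single space (`' '`) rather than `NaN`. It works now that `' '` (with the white space) is replaced by `NaN` before `dropna()` is called.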

In [None]:
# Confirm once more that no empty or whitespace-only rows remain.
# A boolean mask is used instead of DataFrame.drop(): drop() removes rows by
# index *label*, while enumerate() yields *positions*. Once rows have been
# removed the two no longer match, so drop() would delete unrelated rows
# (this is why extra rows were lost).

raw_df = raw_df[raw_df['text'].astype(str).str.strip() != '']

In [None]:
# Display the frame after the blank-text rows were removed.
raw_df

In [None]:
# A non-empty string is truthy, so this yields False wherever the text is empty.
# Strip first: a whitespace-only string such as ' ' is still truthy and would
# otherwise pass as non-empty.

raw_df['text'].str.strip().astype(bool)

In [None]:
# raw_df = raw_df.drop(empty, inplace=True)
# raw_df['text'].str.strip().astype(bool)
# raw_df['text'].astype(bool)
# df.dropna()
# raw_df.iloc[empty].tail()
# raw_df = (raw_df.iloc[empty]).replace('', np.nan)
# print(raw_df)
# print(type(df))
# raw_df.empty

## Let's clean up the DF 

Delete the unnecessary columns - publisher, date, title, subject.  
They won't be needed going forward.


In [None]:
# Remove the columns that are not used from here on, in the same order as before.
for column in ('title', 'subject', 'date', 'publisher'):
    del raw_df[column]


raw_df.head(n=10)

### Lets Clean up the data

* Remove punctuation
* remove special char 
* convert upper to lower 
* remove stopwords
* remove urls
* Lemmatization - The stemming of words without loss of meaning to context


We'll clean the first news article only for now... 

In [None]:
# Take the first article by position.
# `raw_df['text'][0]` is a label lookup: it raises KeyError when the row
# labelled 0 has been dropped and returns several rows when the label occurs
# more than once. `.iloc[0]` always returns exactly the first row's text.
text_1 = raw_df['text'].iloc[0]
type(text_1)

In [None]:
#  use contractions lib for context when expanding contractions (i'd -> i would)
import contractions 

In [None]:
# Expand contractions word by word (i'd -> i would).
# text_1 is expected to be a single article held as a plain str, so use
# str.split(); the `.str` accessor exists only on pandas Series.
text_1 = ' '.join([contractions.fix(word) for word in text_1.split()])
text_1

# will tokenizer work instead?

In [None]:
# Remove special characters and punctuation
import re


# Raw strings keep the regex backslashes literal; '\[' in a normal string
# literal is an invalid escape sequence.
text_1 = re.sub(r'\[[^]]*\]', ' ', text_1)  # drop [bracketed] segments
text_1 = re.sub(r'[^a-zA-Z]', ' ', text_1)  # replace every non-letter with a space

#  convert from upper to lower case
text_1 = text_1.lower()

text_1

In [None]:
#  remove stopwords (step 1): split the cleaned article into word tokens.
#  The stop-word filtering itself is done in the next cell.

from nltk.corpus import stopwords 

# word_tokenize turns the single string into a list of tokens.
text_1 = nltk.word_tokenize(text_1)


In [None]:
# Drop English stop words from the token list.
# The stop-word set is built once; building it inside the comprehension
# re-created it for every single token.
english_stopwords = set(stopwords.words("english"))
text_1 = [word for word in text_1 if word not in english_stopwords]

In [None]:
#  removal of HTML content

from bs4 import BeautifulSoup

# BeautifulSoup parses markup text, but after tokenising and stop-word
# removal text_1 is a list of tokens, so join it back into one string first.
# NOTE(review): the earlier regex step already replaced every non-letter
# (including '<' and '>') with a space, so no tags are left at this point;
# HTML stripping would have to run before that step to have any effect.
if not isinstance(text_1, str):
    text_1 = " ".join(text_1)
soup = BeautifulSoup(text_1, "html.parser")
text_1 = soup.get_text()
text_1

In [None]:
# Check which type text_1 has after the cleaning steps.
type(text_1)

# Ideas / TODO:
# - remove emojis
# - sentiment analysis: says whether the dataset is positive, negative or neutral
# - "top to beck" groups text into categories (topic model?)
#   NOTE(review): tool name unclear, possibly Top2Vec -- confirm
# - look at the stop words being removed and make sure the list is inclusive
# - dashboard for visualisation (Power BI)
# - word cloud for after cleaning
# - pull out a few words when describing
# - use time frames to compare, e.g. the most popular topic in Feb 2020
# - 30% train and 70% test for ML
#   NOTE(review): possibly meant as 70% train / 30% test -- confirm
# - topic model