In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

import datetime
import os
import time
import matplotlib.pyplot as plt
import gc
%matplotlib inline

The corpus is formatted as a CSV and contains the following fields:

* id
* domain
* type
* url
* content
* scraped_at
* inserted_at
* updated_at
* title
* authors
* keywords
* meta_keywords
* meta_description
* tags
* summary
* source (opensources, nytimes, or webhose)

Examples
0,141,awm.com,unreliable,http://awm.com/church-congregation-brings-gift-to-waitresses-working-on-christmas-eve-has-them-crying-video/,"Sometimes the power of Christmas.....the year.",2018-01-25 16:17:44.789555,2018-02-02 01:19:41.756632,2018-02-02 01:19:41.756664,"Church Congregation Brings Gift to Waitresses Working on Christmas Eve, Has Them Crying (video)",Ruth Harris,,[''],,,

# **Splitting Data**
Because the main data file is 6.45 GB, we need to split the data into smaller chunks. This reduces the processing time for our data pipelines and prevents out-of-memory errors. First, we will select only the necessary variables from the main file. The following variables are ignored based on these criteria:
* **Not necessary**
    * scraped_at
    * inserted_at
    * updated_at

* **Unbalanced data (too many empty values)**
    * authors
    * keywords
    * meta_keywords
    * meta_description
    * tags
    * summary
    * source (opensources, nytimes, or webhose)

In [None]:
# Columns kept from the raw corpus; each one is read with the generic 'object' dtype
columns = ['id', 'domain', 'type', 'content', 'title']
dtypes = {column_name: 'object' for column_name in columns}

In [None]:
%%time
# Load only the chosen columns of the corpus CSV, each with the dtype given in `dtypes`.
# engine='python' selects pandas' Python parsing engine and on_bad_lines='skip'
# drops malformed lines instead of raising an error.
df = pd.read_csv('/kaggle/input/fake-news-corpus-smaller/news_cleaned_2018_02_13.csv', usecols=columns, dtype=dtypes, engine='python', on_bad_lines='skip')

In [None]:
# Preview data
# DataFrame.info() prints its summary itself and returns None, so it is called
# directly; wrapping it in print() would add a stray "None" line to the output.
df.info()
df.head()

## **Cleaning null data**
Now we need to check whether there are empty values

In [None]:
# Detect Null and NaN values:
# select every row that has a missing value in at least one column, then display it
df1 = df[df.isna().any(axis=1)]
df1

We can see that there are `NaN` values in the `type` column, making these rows unusable. Therefore, we have to drop them.

In [None]:
# Remove every row that contains a missing value and renumber the index from 0
df1 = df.dropna().reset_index(drop=True)

Let's preview it again.

In [None]:
# DataFrame.info() prints its summary itself and returns None, so it is called
# directly; wrapping it in print() would add a stray "None" line to the output.
df1.info()
df1.tail()

In [None]:
output_dir = '/kaggle/working/'
# Export the NaN-free data to CSV, leaving out the index column
df1.to_csv(f'{output_dir}news_noNull_.csv', index=False)

## **Split files to smaller chunks**
1. Now that the original data looks clean, we need to divide the dataset into smaller chunks; otherwise it will take days to finish preprocessing our data. We will split it into 250,000 rows per chunk.


In [None]:
# Location of the cleaned CSV that is about to be split
main_csv_path = f'{output_dir}news_noNull_.csv'

# Maximum number of data rows written to each chunk file (tune as needed)
rows_per_chunk = 250_000

# Grab the first line of the file and turn it into the list of column names
with open(main_csv_path, 'r') as csv_file:
    first_line = csv_file.readline()
header = first_line.strip().split(',')

Because the file is split by rows, only the first chunk would contain the header row, so we have to add the header to every chunk manually.

In [None]:
header

The code below iterates through the main CSV until there are no more rows. The `Error: EmptyDataError occurred while reading the file.` message can be ignored.

In [None]:
# Counter used for naming the output files (news_file_1.csv, news_file_2.csv, ...)
file_counter = 1

try:
    # Stream the file in a single pass with `chunksize`. Re-opening the file with a
    # growing `skiprows` offset made every iteration re-scan all rows that had already
    # been written, and the loop only terminated by running into an EmptyDataError.
    # skiprows=1 / header=None skip the header line and parse the data rows exactly as
    # before; the column names are re-attached when each chunk is written.
    with pd.read_csv(main_csv_path, skiprows=1, header=None,
                     chunksize=rows_per_chunk) as chunk_reader:
        for chunk in chunk_reader:
            # Check if the chunk is empty
            if chunk.empty:
                print("Warning: Empty chunk encountered. Skipping...")
                break

            # Define the path to the output CSV file
            output_csv_path = f'news_file_{file_counter}.csv'

            # Write the chunk to the output CSV file with headers
            chunk.to_csv(output_csv_path, index=False, header=header)

            # Increment the file counter
            file_counter += 1
except pd.errors.EmptyDataError:
    # Raised when the file holds no data rows after the header line
    print("Error: EmptyDataError occurred while reading the file.")

### **Preview the chunk files**

Here we use datatable to quickly read the tables without exhausting the memory

In [None]:
!pip install datatable

In [None]:
import datatable as dt

%%time
raw_1 = dt.fread("/kaggle/working/news_file_1.csv")
raw_2 = dt.fread("/kaggle/working/news_file_2.csv")
raw_3 = dt.fread("/kaggle/working/news_file_3.csv")
raw_4 = dt.fread("/kaggle/working/news_file_4.csv")
raw_5 = dt.fread("/kaggle/working/news_file_5.csv")
raw_6 = dt.fread("/kaggle/working/news_file_6.csv")
raw_7 = dt.fread("/kaggle/working/news_file_7.csv")

In [None]:
# Print the shape of every chunk, in file order
for raw_chunk in (raw_1, raw_2, raw_3, raw_4, raw_5, raw_6, raw_7):
    print(raw_chunk.shape)

## **Text Preprocessing**
The following steps were used to preprocess the raw data:

1. Removing HTML tags
2. Removing accented characters
3. Expanding Contractions
4. Removing Special Characters
5. Lemmatization
6. Removing Stopwords

The below code will be used to incrementally preprocess all data files

In [None]:
%%time
# ADD PATH TO DATA FILE HERE
file_path = "/kaggle/input/fake-news-chunks/news_file_1.csv" # Edit for each increment (one run per chunk file)

# Load the selected chunk file into a DataFrame for preprocessing
rawdf = pd.read_csv(file_path)

In [None]:
# Preview dataset
rawdf.head()

We can use a sample text to test each function

In [None]:
sample_text = "The Los Angeles Police Department has been denied $3 million in federal aid for law enforcement. While there is no official announcement as to why, it is more than likely that it has everything to do with LA’s “sanctuary city” status for harboring illegal aliens. Donald Trump and Attorney General Jeff Sessions have repeatedly said"

### **spaCy**

[`spaCy`](https://spacy.io/) is a free, open-source Python library that provides advanced capabilities to conduct natural language processing (NLP) on large volumes of text at high speed. These models are the power engines of spaCy. These models enable spaCy to perform several NLP related tasks, such as part-of-speech tagging, named entity recognition, and dependency parsing.

I’ve listed below the different statistical models in spaCy along with their specifications:

* en_core_web_sm: English multi-task CNN trained on OntoNotes. Size – 11 MB
* en_core_web_md: English multi-task CNN trained on OntoNotes, with GloVe vectors trained on Common Crawl. Size – 91 MB
* en_core_web_lg: English multi-task CNN trained on OntoNotes, with GloVe vectors trained on Common Crawl. Size – 789 MB

For this notebook, we used `en_core_web_md`

In [None]:
# Only run if SpaCy package is not installed
# !pip install -U spacy

In [None]:
!python -m spacy download en_core_web_md

In [None]:
import spacy
import nltk
from nltk.tokenize.toktok import ToktokTokenizer
import requests
import re
from bs4 import BeautifulSoup

import unicodedata
import en_core_web_md
# Load the spaCy model once. The previous code loaded it twice (spacy.load(...) followed
# by en_core_web_md.load()), and the first pipeline was discarded immediately.
nlp = en_core_web_md.load()
tokenizer = ToktokTokenizer()
stopword_list = nltk.corpus.stopwords.words('english')
# Keep the negations: 'no' and 'not' are removed from the stopword list so they
# survive stopword filtering.
stopword_list.remove('no')
stopword_list.remove('not')

### **1. Removing HTML tags**

Often, unstructured text contains a lot of noise, especially if you use techniques like web or screen scraping. HTML tags are typically one of these components which don’t add much value towards understanding and analyzing text.

In [None]:
# Remove HTML Tags
def strip_html_tags(text):
    """Return the text content of ``text`` with HTML markup removed.

    The input is parsed with BeautifulSoup's ``html.parser`` and only the text
    extracted by ``get_text()`` is returned.
    """
    return BeautifulSoup(text, "html.parser").get_text()

In [None]:
# Test the function
strip_html_tags(sample_text)

### **2. Removing accented characters**

Usually in any text corpus, you might be dealing with accented characters/letters, especially if you only want to analyze the English language. Hence, we need to make sure that these characters are converted and standardized into ASCII characters. A simple example — converting é to e.

In [None]:
def remove_accented_chars(text):
    """Return ``text`` reduced to ASCII characters.

    The text is NFKD-normalized (which splits accented letters into a base letter
    plus combining marks), encoded to ASCII while ignoring everything that cannot
    be encoded, and decoded back to ``str``.
    """
    decomposed = unicodedata.normalize('NFKD', text)
    ascii_bytes = decomposed.encode('ascii', 'ignore')
    return ascii_bytes.decode('utf-8', 'ignore')

### **3. Expanding Contractions**

Contractions are shortened version of words or syllables. They often exist in either written or spoken forms in the English language. These shortened versions or contractions of words are created by removing specific letters and sounds. In case of English contractions, they are often created by removing one of the vowels from the word. Examples would be, do not to don’t and I would to I’d. Converting each contraction to its expanded, original form helps with text standardization.
We leverage a standard set of `CONTRACTION_MAP` below

In [None]:
# -*- coding: utf-8 -*-
CONTRACTION_MAP = {
"ain't": "is not",
"aren't": "are not",
"can't": "cannot",
"can't've": "cannot have",
"'cause": "because",
"could've": "could have",
"couldn't": "could not",
"couldn't've": "could not have",
"didn't": "did not",
"doesn't": "does not",
"don't": "do not",
"hadn't": "had not",
"hadn't've": "had not have",
"hasn't": "has not",
"haven't": "have not",
"he'd": "he would",
"he'd've": "he would have",
"he'll": "he will",
"he'll've": "he he will have",
"he's": "he is",
"how'd": "how did",
"how'd'y": "how do you",
"how'll": "how will",
"how's": "how is",
"I'd": "I would",
"I'd've": "I would have",
"I'll": "I will",
"I'll've": "I will have",
"I'm": "I am",
"I've": "I have",
"i'd": "i would",
"i'd've": "i would have",
"i'll": "i will",
"i'll've": "i will have",
"i'm": "i am",
"i've": "i have",
"isn't": "is not",
"it'd": "it would",
"it'd've": "it would have",
"it'll": "it will",
"it'll've": "it will have",
"it's": "it is",
"let's": "let us",
"ma'am": "madam",
"mayn't": "may not",
"might've": "might have",
"mightn't": "might not",
"mightn't've": "might not have",
"must've": "must have",
"mustn't": "must not",
"mustn't've": "must not have",
"needn't": "need not",
"needn't've": "need not have",
"o'clock": "of the clock",
"oughtn't": "ought not",
"oughtn't've": "ought not have",
"shan't": "shall not",
"sha'n't": "shall not",
"shan't've": "shall not have",
"she'd": "she would",
"she'd've": "she would have",
"she'll": "she will",
"she'll've": "she will have",
"she's": "she is",
"should've": "should have",
"shouldn't": "should not",
"shouldn't've": "should not have",
"so've": "so have",
"so's": "so as",
"that'd": "that would",
"that'd've": "that would have",
"that's": "that is",
"there'd": "there would",
"there'd've": "there would have",
"there's": "there is",
"they'd": "they would",
"they'd've": "they would have",
"they'll": "they will",
"they'll've": "they will have",
"they're": "they are",
"they've": "they have",
"to've": "to have",
"wasn't": "was not",
"we'd": "we would",
"we'd've": "we would have",
"we'll": "we will",
"we'll've": "we will have",
"we're": "we are",
"we've": "we have",
"weren't": "were not",
"what'll": "what will",
"what'll've": "what will have",
"what're": "what are",
"what's": "what is",
"what've": "what have",
"when's": "when is",
"when've": "when have",
"where'd": "where did",
"where's": "where is",
"where've": "where have",
"who'll": "who will",
"who'll've": "who will have",
"who's": "who is",
"who've": "who have",
"why's": "why is",
"why've": "why have",
"will've": "will have",
"won't": "will not",
"won't've": "will not have",
"would've": "would have",
"wouldn't": "would not",
"wouldn't've": "would not have",
"y'all": "you all",
"y'all'd": "you all would",
"y'all'd've": "you all would have",
"y'all're": "you all are",
"y'all've": "you all have",
"you'd": "you would",
"you'd've": "you would have",
"you'll": "you will",
"you'll've": "you will have",
"you're": "you are",
"you've": "you have"
}

In [None]:
def expand_contractions(text, contraction_mapping=CONTRACTION_MAP):
    """Replace contractions in ``text`` with their expanded forms.

    Args:
        text: The string to process.
        contraction_mapping: Dict mapping a contraction to its expansion.
            Matching is case-insensitive; an exact-case key is preferred and the
            lower-cased key is used as a fallback.

    Returns:
        The text with every known contraction expanded and all remaining
        apostrophes (``'``) removed.
    """
    if contraction_mapping:
        # Regex alternation picks the first alternative that matches, so longer keys
        # must come first; otherwise "can't" wins over "can't've" and leaves a
        # dangling "'ve" behind. Keys are escaped so they always match literally.
        alternatives = sorted(contraction_mapping.keys(), key=len, reverse=True)
        contractions_pattern = re.compile(
            '({})'.format('|'.join(re.escape(key) for key in alternatives)),
            flags=re.IGNORECASE | re.DOTALL)

        def expand_match(contraction):
            match = contraction.group(0)
            expanded_contraction = (contraction_mapping.get(match)
                                    or contraction_mapping.get(match.lower()))
            if expanded_contraction is None:
                # Matched case-insensitively but neither key form exists: leave as is.
                return match
            first_char = match[0]
            # Keep the original casing of the first letter. A leading apostrophe
            # (as in "'cause") must not replace the first letter of the expansion.
            if first_char.isalpha():
                expanded_contraction = first_char + expanded_contraction[1:]
            return expanded_contraction

        text = contractions_pattern.sub(expand_match, text)
    # Drop whatever apostrophes are left (possessives, unknown contractions)
    return re.sub("'", "", text)

### **4. Removing Special Characters**

Special characters and symbols are usually non-alphanumeric characters or even occasionally numeric characters (depending on the problem), which add to the extra noise in unstructured text. Usually, simple regular expressions (regexes) can be used to remove them.

In [None]:
def remove_special_characters(text, remove_digits=False):
    """Remove every character that is not an ASCII letter, a digit or whitespace.

    Args:
        text: The string to clean.
        remove_digits: When True, digits are removed as well.

    Returns:
        The cleaned string.
    """
    # The character range must be A-Z: the former A-z range also covered the
    # characters [ \ ] ^ _ ` that sit between 'Z' and 'a' in ASCII, so they were kept.
    pattern = r'[^a-zA-Z\s]' if remove_digits else r'[^a-zA-Z0-9\s]'
    return re.sub(pattern, '', text)

In [None]:
# Test the function (the trailing ':' that made this cell a SyntaxError is removed)
remove_special_characters(sample_text, remove_digits=False)

### **5. Lemmatization** 
Lemmatization is very similar to stemming, where we remove word affixes to get to the base form of a word. However, the base form in this case is known as the root word, but not the root stem. The difference being that the root word is always a lexicographically correct word (present in the dictionary), but the root stem may not be so. Thus, root word, also known as the lemma, will always be present in the dictionary. Both nltk and spacy have excellent lemmatizers. We will be using spacy here.

*Do note that the lemmatization process is considerably slower than stemming, because an additional step is involved where the root form or lemma is formed by removing the affix from the word if and only if the lemma is present in the dictionary.*

In [None]:
def lemmatize_text(text):
    """Return ``text`` with every token replaced by its lemma, joined by single spaces.

    The text is run through the loaded spaCy pipeline ``nlp``. A token whose lemma
    is the literal placeholder '-PRON-' keeps its original text instead.
    """
    lemmas = []
    for token in nlp(text):
        lemmas.append(token.text if token.lemma_ == '-PRON-' else token.lemma_)
    return ' '.join(lemmas)

In [None]:
lemmatize_text(sample_text)

### **6. Removing Stopwords**
Words which have little or no significance, especially when constructing meaningful features from text, are known as stopwords or stop words. These are usually words that end up having the maximum frequency if you do a simple term or word frequency in a corpus. Typically, these can be articles, conjunctions, prepositions and so on. Some examples of stopwords are **a, an, the, and** the like.

In [None]:
def remove_stopwords(text, is_lower_case=False):
    """Remove stopwords from ``text``.

    Args:
        text: The string to filter.
        is_lower_case: True when ``text`` is already lower-cased; tokens are then
            compared as they are. Otherwise each token is lower-cased for the
            comparison only.

    Returns:
        The remaining tokens joined by single spaces.
    """
    # Build a set once per call: membership is then a hash lookup instead of a
    # scan through the whole stopword list for every token.
    stopwords = set(stopword_list)
    tokens = [token.strip() for token in tokenizer.tokenize(text)]
    if is_lower_case:
        filtered_tokens = [token for token in tokens if token not in stopwords]
    else:
        filtered_tokens = [token for token in tokens if token.lower() not in stopwords]
    return ' '.join(filtered_tokens)

In [None]:
remove_stopwords(sample_text)

## **TextProcessing Pipelines**

Now we put everything together

In [None]:
import math
def normalize_corpus(corpus, html_stripping=True, contraction_expansion=True,
                     accented_char_removal=True, text_lower_case=True, 
                     text_lemmatization=True, special_char_removal=True, 
                     stopword_removal=True, remove_digits=True):
    """Apply the text-cleaning steps to every document in ``corpus``.

    Args:
        corpus: Sized iterable of strings (it must support ``len()``).
        html_stripping: Strip HTML markup with ``strip_html_tags``.
        contraction_expansion: Expand contractions with ``expand_contractions``.
        accented_char_removal: Reduce text to ASCII with ``remove_accented_chars``.
        text_lower_case: Lower-case the text.
        text_lemmatization: Lemmatize with ``lemmatize_text``.
        special_char_removal: Remove special characters with
            ``remove_special_characters``.
        stopword_removal: Remove stopwords with ``remove_stopwords``.
        remove_digits: Passed to ``remove_special_characters``.

    Returns:
        A list with one cleaned string per input document, in input order.
        Progress is printed each time at least 5 more percent of the documents
        have been processed.
    """
    # Compiled once instead of on every loop iteration.
    # Only CR/LF are matched: the former class [\r|\n|\r\n] also contained '|',
    # so pipe characters were turned into spaces as if they were line breaks.
    newline_pattern = re.compile(r'[\r\n]+')
    # NOTE(review): inside this character class "(-)" is a range from '(' to ')',
    # so a literal '-' is not matched - confirm whether that was intended.
    special_char_pattern = re.compile(r'([{.(-)!}])')

    num_docs = len(corpus)
    processed_percent = 0
    percent_increment = 5
    normalized_corpus = []
    # normalize each document in the corpus
    for processed_docs, doc in enumerate(corpus, start=1):
        # strip HTML
        if html_stripping:
            doc = strip_html_tags(doc)
        # Map the typographic apostrophe (U+2019) to the ASCII one before accents are
        # stripped: ASCII conversion deletes U+2019, which turned "don’t" into "dont"
        # so that the contraction could never be expanded.
        if contraction_expansion:
            doc = doc.replace('\u2019', "'")
        # remove accented characters
        if accented_char_removal:
            doc = remove_accented_chars(doc)
        # expand contractions
        if contraction_expansion:
            doc = expand_contractions(doc)
        # lowercase the text
        if text_lower_case:
            doc = doc.lower()
        # replace runs of newlines with a single space
        doc = newline_pattern.sub(' ', doc)
        # lemmatize text
        if text_lemmatization:
            doc = lemmatize_text(doc)
        # remove special characters and/or digits
        if special_char_removal:
            # insert spaces between special characters to isolate them
            doc = special_char_pattern.sub(" \\1 ", doc)
            doc = remove_special_characters(doc, remove_digits=remove_digits)
        # remove extra whitespace
        doc = re.sub(' +', ' ', doc)
        # remove stopwords
        if stopword_removal:
            doc = remove_stopwords(doc, is_lower_case=text_lower_case)
        normalized_corpus.append(doc)

        # Calculate and print the progress at each percentage increment
        percent_done = math.floor((processed_docs / num_docs) * 100)
        if percent_done >= processed_percent + percent_increment:
            processed_percent = percent_done
            print(f'Processed {processed_percent}% of documents')

    return normalized_corpus

In [None]:
# Uncomment this if you want content and title in one column; it joins the title and the content and stores the result in 'full_text'
#rawdf['full_text'] = rawdf["title"].map(str)+ '. ' + rawdf["content"]
#rawdf['full_text']

In [None]:
#Drop content and title column
#news_text_df = rawdf.drop(['content', 'title'], axis=1)

In [None]:
# pre-process content
cleaned_content = normalize_corpus(rawdf['content'])

In [None]:
# pre-process title
cleaned_title = normalize_corpus(rawdf['title'])

In [None]:
# Store to new content and title columns
rawdf['cleaned_content'] = cleaned_content
rawdf = rawdf.assign(cleaned_content = cleaned_content)

rawdf['cleaned_title'] = cleaned_title
rawdf = rawdf.assign(cleaned_title = cleaned_title)

In [None]:
rawdf

## **Detecting Non-English Text**

In [None]:
!pip install langdetect

In [None]:
from langdetect import DetectorFactory, detect

# langdetect's algorithm is non-deterministic by default, so the same text can get
# different labels on different runs; a fixed seed makes the results reproducible.
DetectorFactory.seed = 0

def detect_language(text):
    """Return the language code that langdetect reports for ``text``.

    Any exception raised during detection is printed and ``None`` is returned,
    so one undetectable document does not abort a whole run.
    """
    try:
        return detect(text)
    except Exception as e:
        print("An error occurred:", e)
        return None

In [None]:
# Get certain row
# 'case_content' is not a column of rawdf (AttributeError); the raw 'content' column is
# used because it is also the column the language detection is run on further below.
sample_text = rawdf.iloc[6]['content']

In [None]:
detected_language = detect_language(sample_text)
print(f"Detected language: {detected_language}")

In [None]:
def dropNAN(dataframe):
    """Return a copy of ``dataframe`` without the rows that contain missing values.

    The index of the result is renumbered from 0. The input frame is not modified.
    (The former first statement computed the NaN rows and was overwritten right
    away, so it has been removed.)
    """
    return dataframe.dropna().reset_index(drop=True)

In [None]:
def dropNoise(df):
    """Return the rows of ``df`` whose 'cleaned_content' lacks the boilerplate text.

    Rows whose 'cleaned_content' contains the fixed "report typo ..." sentence
    are treated as corrupted and removed. Rows with a missing 'cleaned_content'
    are kept. The original index labels are preserved.
    """
    # Remove corrupted data
    noise_corpus = "report typo following text simply click send typo report button complete report also include comment"
    # regex=False: the marker is matched as a literal string.
    # na=False: a missing value counts as "not noise"; without it the mask holds NaN
    # and the `~` negation below raises a TypeError.
    noise_mask = df['cleaned_content'].str.contains(noise_corpus, regex=False, na=False)

    # .copy() hands back an independent frame, so adding columns to the result does
    # not trigger pandas' SettingWithCopyWarning.
    return df[~noise_mask].copy()

In [None]:
df1 = dropNAN(rawdf)
# Chain on df1: passing rawdf again would throw away the result of dropNAN
df1 = dropNoise(df1)
df1.head()

In [None]:
import math
# Clean nonenglish text
def detect_nonenglish(corpus):
    """Detect the language of every document in ``corpus``.

    Each document is passed to ``detect_language``; the results are collected in
    input order. Progress is printed each time at least 5 more percent of the
    documents have been processed.

    Returns:
        A list with one ``detect_language`` result per document.
    """
    total_docs = len(corpus)
    step = 5
    last_reported = 0
    languages = []
    for position, document in enumerate(corpus, start=1):
        languages.append(detect_language(document))

        # Report progress whenever another `step` percent has been completed
        current_percent = math.floor((position / total_docs) * 100)
        if current_percent >= last_reported + step:
            last_reported = current_percent
            print(f'Processed {last_reported}% of documents')
    return languages

In [None]:
# Detecting language for content column
lang_col = detect_nonenglish(df1['content'])

In [None]:
# Store new column
df1['language'] = lang_col

In [None]:
# Keep only the rows detected as English, then discard the helper 'language' column
is_english = df1['language'] == 'en'
df1 = df1[is_english].drop(columns=['language'])

# Write the English-only DataFrame to CSV, without the index column
df1.to_csv('news_file_en_1.csv', index=False)

## **Computing Vocabulary and Reduction Rate**
This is the result of the below code

**Report of news file 1**
* Size of vocabulary in news content:  2063986
* Size of vocabulary in cleaned news content:  558106
* Reduction  rate of vocabulary size: 72.96%

**Report of news file 2**
* Size of vocabulary in news content:  2048903
* Size of vocabulary in cleaned news content:  527854
* Reduction  rate of vocabulary size: 74.24%

**Report of news file 3**
* Size of vocabulary in news content:  2021347
* Size of vocabulary in cleaned news content:  539056
* Reduction  rate of vocabulary size: 73.33%

**Report of news file 4**
* Size of vocabulary in news content:  1521905
* Size of vocabulary in cleaned news content:  478514
* Reduction  rate of vocabulary size: 68.56%

**Report of news file 5**
* Size of vocabulary in news content:  1646381
* Size of vocabulary in cleaned news content:  474122
* Reduction  rate of vocabulary size: 71.20%

**Report of news file 6**
* Size of vocabulary in news content:  2041941
* Size of vocabulary in cleaned news content:  552321
* Reduction  rate of vocabulary size: 72.95%

**Report of news file 7**
* Size of vocabulary in news content:  1768009
* Size of vocabulary in cleaned news content:  455136
* Reduction  rate of vocabulary size: 74.26%

In [None]:
# Compute the size of the vocabulary after removing stopwords and lemmatization
# Compute the reduction rate of the vocabulary size after removing stopwords & lemmatization.
def vocab_reduct(dataframe):
    """Print the vocabulary sizes of 'content' and 'cleaned_content' and the reduction.

    The vocabulary of a column is the set of distinct whitespace-separated tokens
    over all of its rows (each value is converted with ``str()`` first).

    Args:
        dataframe: DataFrame with the columns 'content' and 'cleaned_content'.

    Returns:
        None. The three report lines are printed.
    """
    # Feed the tokens straight into sets; materialising every token of the corpus in
    # a list first only to deduplicate it afterwards costs a lot of memory.
    content_vocab = set()
    for line in dataframe['content']:
        content_vocab.update(str(line).split())
    cleaned_content_vocab = set()
    for line in dataframe['cleaned_content']:
        cleaned_content_vocab.update(str(line).split())

    # Compute vocabulary sizes
    content_vocab_size = len(content_vocab)
    cleaned_content_vocab_size = len(cleaned_content_vocab)

    # Compute reduction rate; an empty vocabulary would otherwise divide by zero
    if content_vocab_size:
        reduction_rate = ((content_vocab_size - cleaned_content_vocab_size) / content_vocab_size) * 100
    else:
        reduction_rate = 0.0

    print("Size of vocabulary in news content: ", content_vocab_size)
    print("Size of vocabulary in cleaned news content: ", cleaned_content_vocab_size)
    print("Reduction  rate of vocabulary size: {:.2f}%".format(reduction_rate))
In [None]:
# Print the vocabulary report for each English-only chunk file (files 1 to 7), one at a time
path_name = '/kaggle/input/fake-news-content-and-title'
for i in range(1, 8):
    # Build the path of the i-th file and load it
    file_path = f'{path_name}/news_file_en_{i}.csv'
    chunk_df = pd.read_csv(file_path)

    print(f'Report of news file {i}')
    vocab_reduct(chunk_df)