In [7]:
import re
import pandas as pd
import requests
import nltk
from nltk.tokenize import word_tokenize
from nltk.probability import FreqDist
from nltk.corpus import stopwords
from nltk.stem import *
import matplotlib.pyplot as plt
from cleantext import clean
from io import StringIO
import numpy as np

### Hent data

In [3]:

# URL of the CSV file
url = 'https://raw.githubusercontent.com/several27/FakeNewsCorpus/master/news_sample.csv'

# Fetching the content from the URL
response = requests.get(url)

# Checking if the request was successful (status code 200)
if response.status_code == 200:
    # Reading CSV data using pandas
    csv_data = StringIO(response.text)
    df = pd.read_csv(csv_data)

    # Displaying the first few rows of the DataFrame
    #print(df.head())
else:
    print(f"Failed to fetch data. Status code: {response.status_code}")


In [57]:
def clean_text(text):
    text = text.lower()
    # remove multiple white spaces/tabs or newlines
    mul_uni_whitespaces = re.compile(r'(\s)\s+')
    text = mul_uni_whitespaces.sub(r'\1',text)
    # extract dates with form
    # YY-MM-DD HH-MM:SS
    date = re.compile(r"""
        # YY
        ([0-1][0-9]{3}|20[0-1][0-9]|202[0-3])
        -
        # MM 01-12, DD: 01-31
        (1[0-2]|0[1-9])
        -
        ([1-3]0|[0-2][1-9]|31)
        # whitespace
        \ {1}
        # HH-MM:SS format
        ([0-1][0-9]|2[0-4]):([0-5][0-9]|60):([0-5][0-9].\d*)
        """, re.X)
    text = date.sub(r'<DATE>',text)
    # extract date with alphabetic name, year
    # example: Jan DD, YYYY
    date_2 = re.compile(r"""
        ([A-Z]|[a-z])([a-z]+\ )([1-3]0|[0-2][1-9]|31)
        ,\ {1}
        # YY 0000-2023
        ([0-1][0-9]{3}|20[0-1][0-9]|202[0-3])
        """, re.X)
    text = date_2.sub(r'<DATE>',text)
    
    number = re.compile(r"""
        (\d+[,\.]\d+)|(\d+)  
        (?![\w])             # next character has to be a non-word letter
        """, re.X)
    text = number.sub(r'<NUM>',text)
    
    email = re.compile(r"""
        [\w-]+@[\.\w-]+              # if we have @ we assume it's an email 
        """, re.X)
    text = email.sub(r'<EMAIL>',text)
    
    # website with prefix http and/or www.
    website_http = re.compile(r"""
        (https?://www\.|              # http(s)://www. or www.
        https?://|www\.)              # or http(s)://
        ([^ \t\n\r\f\v,]+)             # capture rest of website    
        """, re.X)
    text = website_http.sub(r'<URL>',text)
    # website with prefix http and/or www.
    website = re.compile(r"""        # no https or www
        [\w-]+\.[\.\w-]+             # assume website if we have a dot .
        (/[^ \t\n\r\f\v,]*)?         # /<address> optional     
        """, re.X)
    text = website.sub(r'<URL>',text)
    return text

def tokenize_content(content):
    news_sample = {
    'content': [],
    'url_count': [],
    'date_count': [],
    'number_count': [],
    'pre_100_freq': [],
    'post_100_freq': [],
    }
    df = pd.DataFrame(news_sample)
    for i in range (len(content)):
        cleaned_news = clean_text(content['content'][i])
        tokens = nltk.word_tokenize(cleaned_news)
        # tokenize all alphabetic words
        tokens = [token for token in tokens if token.isalpha()]

        # count vocab before removal of stop words
        pre_vocab, pre_vocab_freq = np.unique(tokens, return_counts=True)

        pre_vocab = pre_vocab[0:50]
        pre_vocab_freq = pre_vocab_freq[0:50]
        
        # error check
        if np.where(pre_vocab == 'URL')[0].size > 0:
            index = np.where(pre_vocab == 'URL')[0][0]
            url_count = pre_vocab_freq[index]
        else:
            url_count = 0
        
        if np.where(pre_vocab == 'DATE')[0].size > 0:
            index = np.where(pre_vocab == 'DATE')[0][0]
            date_count = pre_vocab_freq[index]
        else:
            date_count = 0
        
        if np.where(pre_vocab == 'NUM')[0].size > 0:
            index = np.where(pre_vocab == 'NUM')[0][0]
            number_count = pre_vocab_freq[index]
        else:
            number_count = 0
        

        pre_vocab_freq_sorted = np.argsort(pre_vocab_freq)[::-1]
        pre_100_freq = pre_vocab[pre_vocab_freq_sorted[0:100]]
        # list of stop words
        stop_words = stopwords.words('english')
        # remove stop words by filtering 
        tokens_no_stop = [token for token in tokens if not token in stop_words]

        # stem words using Porterstemmer
        stemmer = PorterStemmer()
        tokens_stem = [stemmer.stem(token) for token in tokens_no_stop]

        # count vocab after removal of stop words
        post_vocab, post_vocab_freq = np.unique(tokens_stem, return_counts=True)
        post_vocab_freq_sorted = np.argsort(post_vocab_freq)[::-1]
        post_100_freq = post_vocab[post_vocab_freq_sorted[0:100]]

        df.loc[len(df.index)] = [content['content'][i],
                                url_count, 
                                date_count,
                                number_count,
                                str(pre_100_freq),
                                str(post_100_freq),
                                ]

    return df

In [58]:
tokenize_content(df)

Unnamed: 0,content,url_count,date_count,number_count,pre_100_freq,post_100_freq
0,Sometimes the power of Christmas will make you...,0,0,1,['a' 'and' 'at' 'congregation' 'about' 'act' '...,['sermon' 'congreg' 'act' 'ladi' 'waffl' 'hous...
1,AWAKENING OF 12 STRANDS of DNA – “Reconnecting...,5,1,7,['NUM' 'URL' 'as' 'a' 'awakening' 'dna' 'is' '...,['num' 'strand' 'url' 'dna' 'awaken' 'strang' ...
2,Never Hike Alone: A Friday the 13th Fan Film U...,3,0,7,['by' 'and' 'a' 'NUM' 'as' 'alone' 'camp' 'URL...,['friday' 'num' 'film' 'fan' 'disanti' 'kyle' ...
3,"When a rare shark was caught, scientists were ...",0,0,6,['a' 'and' 'NUM' 'but' 'can' 'because' 'creatu...,['shark' 'num' 'fish' 'found' 'viper' 'first' ...
4,Donald Trump has the unnerving ability to abil...,0,1,7,['a' 'NUM' 'and' 'ability' 'americans' 'abc' '...,['poll' 'num' 'trump' 'interview' 'presid' 'ne...
...,...,...,...,...,...,...
245,"Prison for Rahm, God’s Work And Many Others\n\...",2,0,20,['and' 'a' 'NUM' 'as' 'about' 'all' 'be' 'beca...,['num' 'kid' 'would' 'peopl' 'right' 'law' 'me...
246,4 Useful Items for Your Tiny Home\n\nHeadline:...,0,0,1,['a' 'are' 'and' 'as' 'but' 'can' 'also' 'beca...,['home' 'tini' 'space' 'use' 'without' 'refrig...
247,Former CIA Director Michael Hayden said Thursd...,0,0,0,['a' 'all' 'be' 'bit' 'collectively' 'countrie...,['presid' 'hayden' 'bit' 'collect' 'trump' 'im...
248,Antonio Sabato Jr. says Hollywood's liberal el...,0,0,9,['and' 'a' 'NUM' 'but' 'because' 'been' 'addre...,['num' 'tv' 'think' 'newsmax' 'winfrey' 'said'...
