In [1]:
import pandas as pd
import numpy as np
import regex as re
import spacy
from spacy.lang.en.stop_words import STOP_WORDS
from spacy.lang.en import English
from tqdm import tqdm

In [2]:
import os
# The XLA flag is set before TensorFlow is imported on the next line
# (presumably so it is picked up at import time - TODO confirm).
os.environ['TF_XLA_FLAGS'] = '--tf_xla_enable_xla_devices'
import tensorflow as tf
# Report how many GPU devices TensorFlow lists, once through tf.config and
# once through the tf.config.experimental namespace.
print("Num GPUs Available: ", len(tf.config.list_physical_devices('GPU')))
print("GPUs: ", len(tf.config.experimental.list_physical_devices('GPU')))

Num GPUs Available:  0
GPUs:  0


In [2]:
# Read the raw article dump and convert the date column to datetime
df = pd.read_csv('./Data/covid19_articles_20201231.csv')
df['date'] = pd.to_datetime(df['date'])

# De-duplicate on the article body, then renumber the rows from zero
num_articles_pre_drop = df.shape[0]
df = df.drop_duplicates(subset='content').reset_index(drop=True)
print('no. articles dropped: ', num_articles_pre_drop - df.shape[0])
print('no. of articles: ', df.shape[0])

no. articles dropped:  127
no. of articles:  368920


In [3]:
# Keep only articles whose body is shorter than 100,000 characters
num_articles_pre_drop = df.shape[0]
df = df.loc[df['content'].str.len() < 100000].reset_index(drop=True)
print('no. articles dropped: ', num_articles_pre_drop - df.shape[0])
print('no. of articles: ', df.shape[0])

no. articles dropped:  801
no. of articles:  368119


In [4]:
# Discard articles whose title contains "Annual Report" (rows with a missing title are kept)
num_articles_pre_drop = df.shape[0]
df = df.loc[~df['title'].str.contains("Annual Report", na=False)].reset_index(drop=True)
print('no. articles dropped: ', num_articles_pre_drop - df.shape[0])
print('no. of articles: ', df.shape[0])

no. articles dropped:  172
no. of articles:  367947


In [5]:
# Preview the first three rows of the filtered frame
df.head(3)

Unnamed: 0,author,date,domain,title,url,content,topic_area
0,Thomas Hughes,2020-01-02,marketbeat,Three Industrial Giants You Should Own In 2020,https://www.marketbeat.com/originals/three-ind...,With the end of the year just around the corne...,business
1,Thomas Hughes,2020-01-03,marketbeat,Labor Stocks Are Going To Break Out In 2020,https://www.marketbeat.com/originals/labor-sto...,The labor markets were one of the most closely...,business
2,Steve Anderson,2020-01-03,marketbeat,"Tesla (TSLA) Breaks Shipment Record, Beats Est...",https://www.marketbeat.com/originals/teal-brea...,"It could be forgiven, that some might think th...",business


### Cleaning Functions

In [6]:
def remove_emails_tags(text):
    """
    : remove e-mail addresses and tags (e.g. @sleepingbeauty) from text

    A space is inserted before every comma first, so a trailing comma does
    not stay attached to the preceding word; that spacing is kept in the
    returned text. The text is then split on single spaces and every token
    that exactly equals one of the regex matches is dropped. Tokens that
    only contain a match (e.g. an address followed by two punctuation
    characters) are left untouched.

    :param text: raw text (str)
    :return: text with the matching tokens removed, re-joined with single spaces
    """
    # Add a space before commas to separate them from the preceding word
    text = re.sub(",", " ,", text)

    # Each pattern is kept under its own entry: previously the bare-tag pattern
    # was overwritten by the wrapped-tag pattern and therefore never applied.
    patterns = (
        r'[\w\.-]+@[\w\.-]+.',  # e-mail plus one trailing character
        r'[\w\.-]+@[\w\.-]+',   # e-mail with no trailing character
        r'@[\w\.-]+',           # bare tag, e.g. @sleepingbeauty
        r'\S+@[\w\.-]+.',       # tag plus surrounding characters, e.g. (@sleepingbeauty)
    )

    # A set gives constant-time membership tests for the token filter below
    word_removals = set()
    for pattern in patterns:
        word_removals.update(re.findall(pattern, text))

    tokens_filtered = [word for word in text.split(" ") if word not in word_removals]

    return " ".join(tokens_filtered)


def is_number(text):
    """
    :utility function - True when text parses as a number, False otherwise

    Text containing a '.' is parsed as a float, anything else as an int.
    """
    parser = float if '.' in text else int
    try:
        parser(text)
    except ValueError:
        return False
    return True

def remove_numbers(text):
    """
    :remove numbers, percentages, dollar values

    The text is split on single spaces. A token is dropped when it either
    parses as an int/float (see is_number) or exactly equals a regex match
    for a percentage, a number followed by a full stop, or a dollar amount.
    Tokens that merely contain such a match are kept.

    :param text: text to clean (str)
    :return: the remaining tokens re-joined with single spaces
    """
    patterns = (
        r'[\-\+]?[\d]+\.?\d*[%]+\.?',  # e.g. 3%, -3.8%
        r'\d+[\.]',                    # e.g. 2019.
        r'[\$]+[\d,]+\.?\d*',          # e.g. $2 or $2.4567
    )

    # Collect all matches in a set so each token lookup is constant time
    # (a list made the filter cost tokens x matches on long articles).
    word_removals = set()
    for pattern in patterns:
        word_removals.update(re.findall(pattern, text))

    # Single pass: drop plain integers/floats and any token equal to a match
    tokens_filtered = [
        word for word in text.split(" ")
        if not is_number(word) and word not in word_removals
    ]

    return " ".join(tokens_filtered)

def remove_contraction_possesive_apostrophes(text):
    """
    :strip apostrophes from contractions and drop possessive 's

    Contractions lose only the apostrophe (can't -> cant, she'll -> shell);
    possessives lose the apostrophe and the s (president's -> president).
    Both the straight (') and the curly (’) apostrophe are handled.

    Notes:
    - "that's" style contractions are treated like possessives and lose the
      's as well; this is accepted.
    - The text is split on single spaces and only tokens that exactly equal
      a regex match are rewritten, so a word with punctuation attached
      (e.g. "can't.") is left unchanged.

    :param text: text to clean (str)
    :return: the tokens re-joined with single spaces
    """
    # Sets give constant-time membership tests in the token loop below
    contraction = set(
        re.findall(r"[a-zA-Z]+'[a-rt-zA-RT-Z]+|[a-zA-Z]+’[a-rt-zA-RT-Z]+", text)
    )
    possessive = set(
        re.findall(r"[a-zA-Z]+'[sS]+|[a-zA-Z]+’[sS]+", text)
    )

    tokens_filtered = []
    for word in text.split(" "):
        # contraction: drop the apostrophe only
        if word in contraction:
            word = word.replace("'", "").replace("’", "")
        # possessive: drop the apostrophe together with the s
        if word in possessive:
            word = word.replace("'s", "").replace("’s", "").replace("'S", "").replace("’S", "")
        tokens_filtered.append(word)

    return " ".join(tokens_filtered)
    

In [7]:
# Try the e-mail/tag and apostrophe cleaners on a single article (row 2)
test = remove_contraction_possesive_apostrophes(remove_emails_tags(df['content'][2]))
#test

In [8]:
def split_sentences(text):
    """
    Break text into a list of fragments at common separators.

    Separators: . ! ? , ; : tab, backslash, double quote, parentheses,
    straight and curly apostrophes, the en dash, and a hyphen that has
    whitespace on both sides. Empty fragments are kept in the result.
    """
    return re.split(u'[.!?,;:\t\\\\"\\(\\)\\\'\u2019\u2013]|\\s\\-\\s', text)

In [9]:
# Split the cleaned sample article into fragments. `test` is overwritten with
# the resulting list, so re-running this cell on its own passes a list (not a
# string) to split_sentences.
test = split_sentences(test)
#test

In [10]:
def build_stop_word_regex():
    """
    :used in function generate_phrases to remove stopwords before
    compiling multi word phrases

    Builds one case-insensitive alternation over spaCy's English stop words.
    Each alternative starts at a word boundary and must not be followed by
    a word character or a hyphen.

    :return: compiled regex pattern
    """
    stop_word_list = spacy.lang.en.stop_words.STOP_WORDS
    # Escape every word so any regex metacharacter in the list is matched
    # literally, and sort so the pattern text is the same on every run
    # (the stop words are iterated from a set in the original order).
    stop_word_regex_list = [
        r'\b' + re.escape(word) + r'(?![\w-])'
        for word in sorted(stop_word_list)
    ]
    stop_word_pattern = re.compile('|'.join(stop_word_regex_list), re.IGNORECASE)
    return stop_word_pattern

def generate_phrases(text, stopword_regex):
    """
    :build a flat list of phrases by cutting each sentence at its stopwords

    :param text: iterable of sentence strings
    :param stopword_regex: pattern matching the stopwords to cut on
    :return: list of lower-cased phrases with whitespace collapsed; only
             phrases containing at least one word character are kept
    """
    phrase_list = []
    for sentence in text:
        # every stopword becomes a '|' marker, which is then the cut point
        marked = re.sub(stopword_regex, '|', sentence.strip())
        for chunk in marked.split("|"):
            # lower-case and squeeze any run of whitespace into one space
            normalised = ' '.join(chunk.strip().lower().split())
            # a phrase with a word character is necessarily non-empty
            if re.search(r'\w+', normalised):
                phrase_list.append(normalised)

    return phrase_list

def remove_starting_symbol(phrase_list):
    """
    :strip leading non-letter characters (symbols, digits, spaces) from each phrase,
    e.g. '“whilst rewearing' -> 'whilst rewearing', '-based website' -> 'based website'

    A phrase without any letter becomes an empty string; it is still returned.
    """
    phrase_list_cleaned = []

    for phrase in phrase_list:
        # advance to the first alphabetic character (or to the end of the phrase)
        start = 0
        while start < len(phrase) and not phrase[start].isalpha():
            start += 1
        phrase_list_cleaned.append(phrase[start:])

    return phrase_list_cleaned

def remove_single_character_phrase(phrase_list):
    """
    :keep only the phrases that are at least two characters long
    """
    return list(filter(lambda phrase: len(phrase) > 1, phrase_list))

In [11]:
# Build the stopword pattern once, then cut the sample fragments into phrases.
# `test` is overwritten again: it now holds the phrase list instead of the fragments.
stopword_regex = build_stop_word_regex()
test = generate_phrases(test, stopword_regex)
#test

### Combined Function

In [12]:
def data_preprocessing(df):
    """
    Run the full cleaning pipeline over every article.

    For each value of df['content'] the steps are, in order: remove
    e-mails/tags, remove numbers, strip contraction/possessive apostrophes,
    split into fragments, cut the fragments into phrases at stopwords,
    strip leading symbols and drop single-character phrases.

    :param df: DataFrame with a 'content' column of article text
    :return: a copy of df with an extra 'content_processed' column holding
             the list of phrases for each row; df itself is not modified
    """
    df_new = df.copy()

    stopword_regex = build_stop_word_regex()

    processed = []
    for idx, content in enumerate(df_new['content']):
        text = remove_emails_tags(content)
        text = remove_numbers(text)
        text = remove_contraction_possesive_apostrophes(text)
        text = split_sentences(text)
        text = generate_phrases(text, stopword_regex)
        text = remove_starting_symbol(text)
        text = remove_single_character_phrase(text)
        processed.append(text)
        if idx % 25000 == 0:
            print(f'{idx} records processed')

    # Attach all results at once, aligned to the frame's own index. Writing
    # row by row with .at[idx, ...] used the loop counter as an index label,
    # which is only correct when the frame has a default 0..n-1 index.
    df_new['content_processed'] = pd.Series(processed, index=df_new.index, dtype=object)

    return df_new
    

# Complete Preprocessing

In [13]:
# ~ 1 hour to run
# Run the cleaning pipeline over every remaining article, then write the
# resulting frame to a pickle file.
df_processed = data_preprocessing(df)
df_processed.to_pickle('./Data/df_processed.pickle')

0 records processed
25000 records processed
50000 records processed
75000 records processed
100000 records processed
125000 records processed
150000 records processed
175000 records processed
200000 records processed
225000 records processed
250000 records processed
275000 records processed
300000 records processed
325000 records processed
350000 records processed


# TO DO

### Strip company names out of text
An idea to come back to: if company names are left in the text, they may end up being treated as significant terms for a topic.

In [255]:
# Print the full raw text of the first article (row 0) for inspection
print(df['content'][0])

With the end of the year just around the corner, it’s past time to think about positioning for2020. When it comes to earnings power in 2020, the Industrial sector is going to be the market leader and that is where I like my money to be. To be clear, when I say Industrial Sector I mean the S&P 500 Industrial Sector as represented by the ETF (XLI). Yes, the Energy Sector is expected to post EPS growth double that of the Industrials but investors should take that news with a grain of salt. The Energy Sector’s (XLE) consensus EPS growth estimate for 2020 is 21% but only after falling -28% this year. The Industrial Sector is expected to grow by 15% next year (2nd fastest pace for the S&P 500) after contracting only -3% this year. That means the Energy Sector’s earnings will still be down on a two-year basis while the Industrial’s will rise. Energy may yet turn out to be a good investment for 2020 but, on an earnings basis, the Industrials are a much better choice. Don’t Bet On Boeing Boeing