## _Text Pre-Processing Script_

**Script to pre-process text: it cleans the text (converts it to lower case, removes stop words and punctuation) and then lemmatizes it.**

### _Import Libraries_

In [26]:
# Standard library
import ast
import re
import string

# Third-party
import dask.dataframe as dd
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import tensorflow
from dask.diagnostics import ProgressBar
from dask.multiprocessing import get
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize

# Module-level text-processing resources
stop_words = stopwords.words("english")
wordnet_lemmatizer = WordNetLemmatizer()
punctuations = string.punctuation

### _Read in 8-K documents data and fix data types_

In [27]:
# Load the 8-K documents table. `df` and `df_1` are two names for the same
# DataFrame object (no copy is made).
df = df_1 = pd.read_csv("./docs_fin_default.csv")

# Show (rows, columns) as the cell output.
df.shape

(80, 17)

In [28]:
# Parse the release dates into datetime values, replacing the column in place.
df['release_date'] = pd.to_datetime(arg=df['release_date'])

# Show the resulting column dtypes as the cell output.
df.dtypes

Unnamed: 0                    int64
ticker                       object
cik                         float64
txt_link                     object
doc_name                     object
GICS Sector                  object
GICS Sub Industry            object
text                         object
release_date         datetime64[ns]
items                        object
price_change                float64
vix                         float64
rm_week                     float64
rm_month                    float64
rm_qtr                      float64
rm_year                     float64
signal                       object
dtype: object

### _Filter data based on dates needed_

In [29]:
# filter_train = (df['release_date'] <= '2014-01-01')
# filter_test = (df['release_date'] > '2014-01-01')

# train_data = df[filter_train]
# test_data = df[filter_test]

# train_data.tail()

In [30]:
# Vectorized datetime conversion; this is a no-op when the column has already
# been converted, so the cell can be re-run safely.
df['release_date'] = pd.to_datetime(df['release_date'])

# 'items' holds string representations of Python lists (e.g. "['Item 2.02']").
# Only strings are parsed: values that are already lists (cell re-run) or that
# are missing (NaN) are passed through unchanged instead of raising.
df['items'] = df['items'].map(
    lambda value: ast.literal_eval(value) if isinstance(value, str) else value
)

### _Pre-Process Text_

In [32]:
def nltk_tokenizer(text):
    """
    Clean, tokenize and lemmatize a piece of text.

    The text is tokenized; tokens are kept only if they are purely alphabetic,
    contain at least one ASCII letter and are not English stop words (compared
    case-insensitively). The surviving tokens are lower-cased and lemmatized.

    Args:
        text (str): Text to be processed.

    Returns:
        list: Lower-cased, lemmatized tokens in their original order. An empty
            list is returned when ``text`` is not a string (for example
            ``None``, or the NaN pandas uses for an empty CSV cell).
    """
    # Missing cells arrive as None/NaN rather than str; they contain no tokens.
    if not isinstance(text, str):
        return []

    # `stop_words` is a list; a set makes each membership test O(1).
    stop_word_set = set(stop_words)

    cleaned_tokens = []
    for token in word_tokenize(text):
        # isalpha() already rejects punctuation and numbers, so no separate
        # punctuation filter is needed.
        if not token.isalpha():
            continue
        lowered = token.lower()
        if lowered in stop_word_set:
            continue
        # isalpha() accepts any Unicode letter; tokens without a single ASCII
        # letter are dropped.
        if not re.search('[a-zA-Z]', token):
            continue
        cleaned_tokens.append(wordnet_lemmatizer.lemmatize(lowered))
    return cleaned_tokens

def dask_tokenizer(df):
    """
    Add pre-processed text columns to a DataFrame (or DataFrame partition).

    Args:
        df (DataFrame): DataFrame with a 'text' column to be pre-processed.

    Returns:
        DataFrame: A new DataFrame holding the input columns plus
            'processed_text' (the token list produced by ``nltk_tokenizer``
            for each row) and 'text_len' (the number of tokens in that list).
            The input DataFrame is left unmodified.
    """
    processed_text = df['text'].map(nltk_tokenizer)
    # assign() returns a new frame rather than mutating the partition that was
    # passed in.
    return df.assign(
        processed_text=processed_text,
        text_len=processed_text.map(len),
    )

### _Run Functions to Pre-Process Text_

In [33]:
# Split the frame into 20 partitions so the tokenizer can be applied to each
# partition separately.
ddata = dd.from_pandas(df, npartitions=20)

# Scope the progress bar to this computation. Calling register() adds another
# global callback every time the cell is re-run, which prints one duplicate
# progress bar per earlier run.
pbar = ProgressBar()
with pbar:
    # scheduler="processes" selects the multiprocessing scheduler; it replaces
    # the `get=` keyword, which dask deprecated and later removed.
    df = ddata.map_partitions(dask_tokenizer).compute(scheduler="processes")

[########################################] | 100% Completed |  2min 13.5s
[########################################] | 100% Completed |  2min 13.6s
[########################################] | 100% Completed |  2min 13.7s
[########################################] | 100% Completed |  2min 13.6s
[########################################] | 100% Completed |  2min 13.7s


In [34]:
# Preview the first five rows, including the new processed_text / text_len columns.
df.head(n=5)

Unnamed: 0.1,Unnamed: 0,ticker,cik,txt_link,doc_name,GICS Sector,GICS Sub Industry,text,release_date,items,price_change,vix,rm_week,rm_month,rm_qtr,rm_year,signal,processed_text,text_len
0,1962,Apple Inc.,320193.0,https://www.sec.gov/Archives/edgar/data/320193...,0001193125-18-045761.txt,Information Technology,Information Technology,0001193125-18-045761.txt : 20180214 0001193125...,2018-02-14 16:54:21,[],2.0,2698.629883,4.96,-8.67,-12.19,21.31,up,"[accession, number, conformed, submission, typ...",914
1,1963,Apple Inc.,320193.0,https://www.sec.gov/Archives/edgar/data/320193...,0000320193-18-000005.txt,Information Technology,Information Technology,0000320193-18-000005.txt : 20180201 0000320193...,2018-02-01 16:30:17,"[Item 2.02, Item 9.01]",0.88,2821.97998,-2.26,-4.93,3.1,36.79,stay,"[accession, number, conformed, submission, typ...",993
2,1964,Apple Inc.,320193.0,https://www.sec.gov/Archives/edgar/data/320193...,0001193125-17-341015.txt,Information Technology,Information Technology,0001193125-17-341015.txt : 20171113 0001193125...,2017-11-13 16:44:57,[],1.23,2584.840088,0.7,11.58,8.45,39.04,up,"[accession, number, conformed, submission, typ...",15829
3,1965,Apple Inc.,320193.0,https://www.sec.gov/Archives/edgar/data/320193...,0000320193-17-000067.txt,Information Technology,Information Technology,0000320193-17-000067.txt : 20171102 0000320193...,2017-11-02 16:30:16,[],5.36,2579.850098,5.5,6.52,5.45,28.88,up,"[accession, number, conformed, submission, typ...",994
4,1966,Apple Inc.,320193.0,https://www.sec.gov/Archives/edgar/data/320193...,0001193125-17-308859.txt,Information Technology,Information Technology,0001193125-17-308859.txt : 20171012 0001193125...,2017-10-12 16:31:33,[],2.15,2550.929932,0.82,-4.53,5.59,28.49,up,"[accession, number, conformed, submission, typ...",305


### _Save Pre-Processed Text to CSV_

In [35]:
# Write the pre-processed frame to CSV.
df.to_csv(path_or_buf="Data/lemmatized_text_default.csv")