# Capstone Project - Data Preprocessing

_Author: Yifei Tong_

---
## Goal

The goal of this notebook is to preprocess the previously collected financial news dataset. The main steps include removing stopwords, stemming words, and vectorizing news articles.

## Steps

1. Read collected data from a pickle file in the S3 bucket
2. Preprocess data
    - Split dataset into training, validation, and test datasets
    - Turn characters to lower case
    - Remove punctuation
    - Remove stopwords
    - Stem all words
    - Create a word dictionary
    - Vectorize texts
3. Save data in a file in proper format

## Step 1: Read Data

In [1]:
import pickle
import boto3
import sagemaker

In [2]:
# Create a SageMaker session and look up the execution role.
session = sagemaker.Session()
role = sagemaker.get_execution_role()
# The session's default bucket is the one the dataset pickle is fetched from below.
bucket_name = session.default_bucket()
print(bucket_name)

sagemaker-us-east-2-989457217313


In [3]:
import os

# Local directory for the files this notebook writes (word dictionaries, train/valid CSVs).
data_dir = '../data'
os.makedirs(data_dir, exist_ok=True)

dataset_prefix = 'financial-news-dataset'
pkl_file_name = 'news_dataset.pickle'
# S3 object keys use '/' as the separator regardless of the local OS, so build the key
# explicitly instead of with os.path.join (which yields '\\' on Windows).
pkl_file_key = '{}/{}'.format(dataset_prefix, pkl_file_name)

s3_client = boto3.client('s3')
pickle_file = s3_client.get_object(Bucket=bucket_name, Key=pkl_file_key)['Body'].read()
# NOTE(review): pickle.loads can execute arbitrary code embedded in the payload; this is
# only acceptable if the object in the bucket is trusted -- confirm.
dataset = pickle.loads(pickle_file)

In [4]:
def _texts_and_targets(articles):
    """Split a list of article records into parallel text and target lists.

    Each record is indexed by 'text' and 'percentage_change'; the target is the
    percentage change multiplied by 100.
    """
    texts = [record['text'] for record in articles]
    targets = [record['percentage_change']*100 for record in articles]
    return texts, targets


google_X, google_y = _texts_and_targets(dataset['google'])
amazon_X, amazon_y = _texts_and_targets(dataset['amazon'])
facebook_X, facebook_y = _texts_and_targets(dataset['facebook'])
microsoft_X, microsoft_y = _texts_and_targets(dataset['microsoft'])

In [5]:
# Peek at the first five Google article texts.
google_X[:5]

['u.s. news texas serial bomber made video confession before blowing himself up: police the serial bomber whose deadly attacks terrorized austin, texas, for weeks left a 25-minute video "confession" on a cell phone. the suspect blew himself up on wednesday as officers closed in to make an arrest. the video failed to reveal a coherent motive for the attacks. published 6 hours ago reuters source: sinclair broadcast group mark anthony conditt, suspect in the austin bombings scene on security video at a fedex facility. \nthe serial bomber whose deadly attacks terrorized austin, texas, for weeks left a 25-minute video "confession" on a cell phone found after he blew himself up on wednesday as officers closed in to make an arrest, police said. \nmark conditt, 23, an unemployed man from the suburb of pflugerville, detailed how he made all seven bombs that have been accounted for — five that exploded, one that was recovered before it went off and a seventh that he detonated as officers rushed 

In [6]:
# First five Google targets (percentage_change * 100).
google_y[:5]

[-3.0317640292906467,
 -0.7433071978199797,
 0.40603849693958644,
 -1.33516004603624,
 -2.6165929254508913]

## Step 2: Preprocess Data

In [7]:
import nltk
from nltk.corpus import stopwords
from nltk.stem.porter import *
import re

In [8]:
def convert_news_to_words(news):
    """Turn one news article into a list of stemmed, stopword-free tokens.

    The text is lower-cased, every character that is not an ASCII letter or digit
    is replaced by a space, the result is split on whitespace, English stopwords
    are dropped, and each remaining token is Porter-stemmed.

    Args:
        news: The article text as a string.

    Returns:
        A list of stemmed tokens in their original order.
    """
    # Only invoke the NLTK downloader when the corpus is actually missing; calling
    # it unconditionally repeats the check for every article.
    try:
        english_stopwords = set(stopwords.words("english"))
    except LookupError:
        nltk.download("stopwords", quiet=True)
        english_stopwords = set(stopwords.words("english"))

    # One stemmer for the whole article instead of a new instance per word.
    stemmer = PorterStemmer()

    news = re.sub(r"[^a-zA-Z0-9]", " ", news.lower())
    # Set membership replaces rebuilding and scanning the stopword list per token.
    return [stemmer.stem(w) for w in news.split() if w not in english_stopwords]

In [9]:
# Sanity-check the tokenizer on the first Google article.
convert_news_to_words(google_X[0])

['u',
 'news',
 'texa',
 'serial',
 'bomber',
 'made',
 'video',
 'confess',
 'blow',
 'polic',
 'serial',
 'bomber',
 'whose',
 'deadli',
 'attack',
 'terror',
 'austin',
 'texa',
 'week',
 'left',
 '25',
 'minut',
 'video',
 'confess',
 'cell',
 'phone',
 'suspect',
 'blew',
 'wednesday',
 'offic',
 'close',
 'make',
 'arrest',
 'video',
 'fail',
 'reveal',
 'coher',
 'motiv',
 'attack',
 'publish',
 '6',
 'hour',
 'ago',
 'reuter',
 'sourc',
 'sinclair',
 'broadcast',
 'group',
 'mark',
 'anthoni',
 'conditt',
 'suspect',
 'austin',
 'bomb',
 'scene',
 'secur',
 'video',
 'fedex',
 'facil',
 'serial',
 'bomber',
 'whose',
 'deadli',
 'attack',
 'terror',
 'austin',
 'texa',
 'week',
 'left',
 '25',
 'minut',
 'video',
 'confess',
 'cell',
 'phone',
 'found',
 'blew',
 'wednesday',
 'offic',
 'close',
 'make',
 'arrest',
 'polic',
 'said',
 'mark',
 'conditt',
 '23',
 'unemploy',
 'man',
 'suburb',
 'pflugervil',
 'detail',
 'made',
 'seven',
 'bomb',
 'account',
 'five',
 'explod',


In [11]:
import pickle
from sklearn.model_selection import train_test_split

# Directory where preprocess_data reads and writes its pickled splits.
cache_dir = '../cache'
os.makedirs(cache_dir, exist_ok=True)

def preprocess_data(X, y, cache_file, train_size=0.8, random_state=None):
    """Tokenize articles and split them into train / validation / test sets, with caching.

    If ``cache_file`` exists in ``cache_dir`` and can be unpickled, its contents are
    returned as-is and ``X``, ``y``, ``train_size`` and ``random_state`` are ignored.
    Otherwise every article is run through ``convert_news_to_words``, the data is
    split, and the result is pickled to the cache.

    ``train_size`` is applied twice: first to separate test data from the rest,
    then to separate validation data from the remaining training data (so the
    default 0.8 gives roughly 64% / 16% / 20% train / valid / test).

    Args:
        X: Iterable of article texts.
        y: Targets aligned with ``X``.
        cache_file: File name of the cache inside ``cache_dir``.
        train_size: Fraction kept on the training side of each split.
        random_state: Optional seed forwarded to both ``train_test_split`` calls so
            a cache rebuild reproduces the same split. ``None`` keeps the previous
            unseeded behaviour.

    Returns:
        Tuple ``(train_words, test_words, valid_words, train_y, test_y, valid_y)``.
    """
    cache_path = os.path.join(cache_dir, cache_file)

    cache_data = None
    try:
        with open(cache_path, 'rb') as f:
            cache_data = pickle.load(f)
        print("Cache data read from {}".format(cache_file))
    except FileNotFoundError:
        # No cache yet: fall through and build it.
        pass
    except Exception as err:
        # Still best-effort (an unreadable cache is rebuilt), but say so instead of
        # failing silently, and let KeyboardInterrupt/SystemExit propagate.
        print("Ignoring unreadable cache {}: {!r}".format(cache_file, err))
        cache_data = None

    if cache_data is None:
        words = [convert_news_to_words(news) for news in X]

        train_valid_words, test_words, train_valid_y, test_y = train_test_split(
            words, y, train_size=train_size, random_state=random_state)
        train_words, valid_words, train_y, valid_y = train_test_split(
            train_valid_words, train_valid_y, train_size=train_size, random_state=random_state)
        cache_data = dict(train_words=train_words, test_words=test_words, valid_words=valid_words,
                          train_y=train_y, test_y=test_y, valid_y=valid_y)
        with open(cache_path, 'wb') as f:
            pickle.dump(cache_data, f)
    else:
        train_words, test_words, valid_words, train_y, test_y, valid_y = (
            cache_data['train_words'], cache_data['test_words'], cache_data['valid_words'],
            cache_data['train_y'], cache_data['test_y'], cache_data['valid_y'])

    return train_words, test_words, valid_words, train_y, test_y, valid_y

In [12]:
# Tokenize and split the Google articles (or reload the split from google_cache.pickle).
google_train_words, google_test_words, google_valid_words, google_train_y, google_test_y, google_valid_y = preprocess_data(google_X, google_y, 'google_cache.pickle')



In [13]:
# Same preprocessing for the remaining companies, each with its own cache file.
amazon_train_words, amazon_test_words, amazon_valid_words, amazon_train_y, amazon_test_y, amazon_valid_y = preprocess_data(amazon_X, amazon_y, 'amazon_cache.pickle')
fb_train_words, fb_test_words, fb_valid_words, fb_train_y, fb_test_y, fb_valid_y = preprocess_data(facebook_X, facebook_y, 'fb_cache.pickle')
msft_train_words, msft_test_words, msft_valid_words, msft_train_y, msft_test_y, msft_valid_y = preprocess_data(microsoft_X, microsoft_y, 'msft_cache.pickle')



In [14]:
# First ten training targets for Google after the split.
google_train_y[:10]

[-0.05382035347906716,
 0.8707537156320021,
 -1.0610991005164927,
 0.4273119966757732,
 -0.6338152112165558,
 -1.1118349408332278,
 0.6046072976905025,
 0.1488370332861797,
 0.6573143846286322,
 0.7724915003146318]

In [15]:
import numpy as np

def build_dict(words, file_name, vocab_size=5000, out_dir=None):
    """Build a word -> integer id mapping from tokenized articles and pickle it.

    Words are ranked by how often they occur across all articles (ties keep
    first-seen order). The most frequent word gets id 2, the next id 3, and so
    on; ids 0 and 1 are left unassigned here, which is why only
    ``vocab_size - 2`` words are kept.

    Args:
        words: Iterable of token lists, one per article.
        file_name: Name of the pickle file to write.
        vocab_size: Total vocabulary size including the two unassigned ids.
            Values below 2 yield an empty dictionary.
        out_dir: Directory to write the pickle into. Defaults to the
            notebook-level ``data_dir``.

    Returns:
        The ``dict`` mapping each kept word to its integer id.
    """
    # Local import so this cell does not depend on another cell's imports.
    from collections import Counter

    if out_dir is None:
        out_dir = data_dir

    word_count = Counter()
    for news in words:
        word_count.update(news)

    # sorted() is stable, so equally frequent words keep first-seen order.
    sorted_words = sorted(word_count, key=word_count.get, reverse=True)

    # Clamp at 0: a negative slice bound would keep almost the whole vocabulary
    # instead of nothing.
    keep = max(vocab_size - 2, 0)
    word_dict = {word: idx + 2 for idx, word in enumerate(sorted_words[:keep])}

    with open(os.path.join(out_dir, file_name), "wb") as f:
        pickle.dump(word_dict, f)

    return word_dict

In [16]:
google_dict = build_dict(google_train_words, "google_dict.pickle")
amazon_dict = build_dict(amazon_train_words, "amazon_dict.pickle")
fb_dict = build_dict(fb_train_words, "fb_dict.pickle")
msft_dict = build_dict(msft_train_words, "msft_dict.pickle")

In [17]:
def vectorize_and_pad(word_dict, sentence, pad_length=500):
    """Encode a token list as a fixed-length list of integer ids.

    Tokens beyond ``pad_length`` are dropped. Tokens found in ``word_dict`` map
    to their id, unknown tokens map to 1, and unused trailing positions are
    filled with 0.

    Returns:
        ``(ids, length)`` where ``ids`` has ``pad_length`` entries and ``length``
        is the number of tokens actually encoded.
    """
    PAD_ID = 0      # filler for positions past the end of the sentence
    UNKNOWN_ID = 1  # stand-in for tokens missing from word_dict

    clipped = sentence[:pad_length]
    encoded = [word_dict[tok] if tok in word_dict else UNKNOWN_ID for tok in clipped]
    encoded.extend([PAD_ID] * (pad_length - len(encoded)))

    return encoded, min(pad_length, len(sentence))

In [18]:
def vectorize_and_pad_sentences(sentences, word_dict, pad_length=500):
    """Encode every token list with ``vectorize_and_pad`` and stack the results.

    Args:
        sentences: Iterable of token lists.
        word_dict: Mapping from token to integer id.
        pad_length: Fixed length of each encoded row, forwarded to
            ``vectorize_and_pad``. The default matches that function's default,
            so existing two-argument calls behave as before.

    Returns:
        ``(vectors, lengths)`` as NumPy arrays: one encoded row per sentence and
        the number of tokens encoded in each row.
    """
    vectorized_sentences = []
    lengths = []

    for sentence in sentences:
        vectorized_sentence, length = vectorize_and_pad(word_dict, sentence, pad_length)
        vectorized_sentences.append(vectorized_sentence)
        lengths.append(length)

    return np.array(vectorized_sentences), np.array(lengths)

In [19]:
# Encode the Google training articles with the Google dictionary.
google_vectorized_sentences, google_lengths = vectorize_and_pad_sentences(google_train_words, google_dict)

In [20]:
# Inspect one encoded training article (row 15).
google_vectorized_sentences[15]

array([ 796,  123,  255, 1746,  492, 1149,   70,  368,  915,  158, 1105,
        162,  729,  124,   49, 2139, 3563, 2004,  255, 1746,  796,  136,
       2782, 4598,  218,   20, 1789,  953,    5,  668,  255, 2748, 1651,
       4087,    1, 1944,  552,  863,    1,  411,  267, 1450, 1283, 3031,
        261,  863, 1944, 1352,  915,  574, 2380, 1105, 4598,    3,  566,
        796,  208, 4404,  162,  555, 1105, 1944, 1176,  329,  814,  162,
          1,    1, 2783,  154,  249,  138,  154,   27, 1944,  162,  570,
        182,  869,  198, 1145,  751, 1660, 1105,  503, 3280,  486,  112,
        153, 4598,    3, 4598,    3, 1944,  560, 1208,  612,  313, 1567,
        162, 1894,  356,  172,  400, 1208,  612, 1944,  671,  468, 1726,
          1,    1,    1,    1,   16, 1105, 4598,  208,  427, 4808,    3,
        255, 1746,  796,  568, 1944,  480,  908,  261, 2711,  162, 1894,
       1105, 1925,   62,  283,  242, 2711, 3417, 1237,    1,  862,  342,
         20,  779, 1944,    1,  269, 3635,  555, 11

In [21]:
# Encode the other companies' training articles, each with its own dictionary.
amazon_vectorized_sentences, amazon_lengths = vectorize_and_pad_sentences(amazon_train_words, amazon_dict)
fb_vectorized_sentences, fb_lengths = vectorize_and_pad_sentences(fb_train_words, fb_dict)
msft_vectorized_sentences, msft_lengths = vectorize_and_pad_sentences(msft_train_words, msft_dict)

## Step 3: Save Data

In [22]:
import pandas as pd

# One headerless CSV per company; columns are target, sequence length, then token ids.
for _csv_name, _targets, _lengths, _vectors in [
    ('google_train.csv', google_train_y, google_lengths, google_vectorized_sentences),
    ('amazon_train.csv', amazon_train_y, amazon_lengths, amazon_vectorized_sentences),
    ('fb_train.csv', fb_train_y, fb_lengths, fb_vectorized_sentences),
    ('msft_train.csv', msft_train_y, msft_lengths, msft_vectorized_sentences),
]:
    _frame = pd.concat(
        [pd.DataFrame(_targets), pd.DataFrame(_lengths), pd.DataFrame(_vectors)],
        axis=1,
    )
    _frame.to_csv(os.path.join(data_dir, _csv_name), header=False, index=False)

## Step 4: Process and Save Test Data for Testing

In [23]:
# Encode the test articles with the dictionaries built from the training data.
google_test_vectorized_sentences, google_test_lengths = vectorize_and_pad_sentences(google_test_words, google_dict)
amazon_test_vectorized_sentences, amazon_test_lengths = vectorize_and_pad_sentences(amazon_test_words, amazon_dict)
fb_test_vectorized_sentences, fb_test_lengths = vectorize_and_pad_sentences(fb_test_words, fb_dict)
msft_test_vectorized_sentences, msft_test_lengths = vectorize_and_pad_sentences(msft_test_words, msft_dict)

In [24]:
test_data_dir = '../test_data'
os.makedirs(test_data_dir, exist_ok=True)

# Test features only (sequence length followed by token ids); targets are saved separately.
# NOTE(review): unlike the train/valid files these are written with pandas' default
# header row -- confirm the consumer expects that.
for _csv_name, _lengths, _vectors in [
    ('google_test.csv', google_test_lengths, google_test_vectorized_sentences),
    ('amazon_test.csv', amazon_test_lengths, amazon_test_vectorized_sentences),
    ('fb_test.csv', fb_test_lengths, fb_test_vectorized_sentences),
    ('msft_test.csv', msft_test_lengths, msft_test_vectorized_sentences),
]:
    _frame = pd.concat([pd.DataFrame(_lengths), pd.DataFrame(_vectors)], axis=1)
    _frame.to_csv(os.path.join(test_data_dir, _csv_name), index=False)

In [25]:
# Write each company's test targets to its own single-column CSV.
for _csv_name, _targets in [
    ('google_test_y.csv', google_test_y),
    ('amazon_test_y.csv', amazon_test_y),
    ('fb_test_y.csv', fb_test_y),
    ('msft_test_y.csv', msft_test_y),
]:
    _target_path = os.path.join(test_data_dir, _csv_name)
    pd.DataFrame(_targets).to_csv(_target_path, index=False)

## Step 5: Process and Save Validation Data

In [26]:
# Encode the validation articles with the dictionaries built from the training data.
google_valid_vectorized_sentences, google_valid_lengths = vectorize_and_pad_sentences(google_valid_words, google_dict)
amazon_valid_vectorized_sentences, amazon_valid_lengths = vectorize_and_pad_sentences(amazon_valid_words, amazon_dict)
fb_valid_vectorized_sentences, fb_valid_lengths = vectorize_and_pad_sentences(fb_valid_words, fb_dict)
msft_valid_vectorized_sentences, msft_valid_lengths = vectorize_and_pad_sentences(msft_valid_words, msft_dict)

In [27]:
# Validation files use the same headerless layout as the training files:
# target, sequence length, then token ids.
for _csv_name, _targets, _lengths, _vectors in [
    ('google_valid.csv', google_valid_y, google_valid_lengths, google_valid_vectorized_sentences),
    ('amazon_valid.csv', amazon_valid_y, amazon_valid_lengths, amazon_valid_vectorized_sentences),
    ('fb_valid.csv', fb_valid_y, fb_valid_lengths, fb_valid_vectorized_sentences),
    ('msft_valid.csv', msft_valid_y, msft_valid_lengths, msft_valid_vectorized_sentences),
]:
    _frame = pd.concat(
        [pd.DataFrame(_targets), pd.DataFrame(_lengths), pd.DataFrame(_vectors)],
        axis=1,
    )
    _frame.to_csv(os.path.join(data_dir, _csv_name), header=False, index=False)