In [79]:


import numpy as np
import pandas as pd
import re
import string
import spacy
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.feature_extraction.text import CountVectorizer
from collections import Counter
from itertools import chain
from datetime import datetime

pd.set_option("display.max_rows",10)

In [80]:
data = pd.read_csv('data_scrapy.csv', encoding='ISO-8859-1')
data.drop(['Unnamed: 0'], axis=1, inplace=True)
data

Unnamed: 0,Date,Title
0,"2:43 PM ET Wed, 31 Jan 2018","WRAPUP 3-Fed keeps interest rates steady, sees..."
1,"6:14 PM ET Wed, 31 Jan 2018",Fed leaves key rate unchanged at Yellen's fina...
2,"5:22 PM ET Wed, 31 Jan 2018",Former Wells Fargo CEO: Interest rates have be...
3,"1:18 PM ET Wed, 31 Jan 2018","US STOCKS-Boeing ends Wall St's slide, Fed cau..."
4,"12:00 PM ET Wed, 31 Jan 2018",EU to put banks through Brexit mill in toughes...
...,...,...
54412,"3:08 PM ET Wed, 13 Jan 2016",Here's what's really needed to stop oil's fall
54413,"12:46 PM ET Wed, 13 Jan 2016",'Great shakeout' leading to big gains: Piper
54414,"2:16 PM ET Wed, 13 Jan 2016",Next activist investor target: Wall St banks?
54415,"11:18 AM ET Wed, 13 Jan 2016",Why 'lower for longer' is hurting commodity st...


In [81]:
import spacy
nlp = spacy.load('en_core_web_lg') 

# update punctuation in spacy
nlp.vocab[u"$"].is_punct = True
nlp.vocab[u"|"].is_punct = True
nlp.vocab[u"+"].is_punct = True
nlp.vocab[u"<"].is_punct = True
nlp.vocab[u">"].is_punct = True
nlp.vocab[u"="].is_punct = True
nlp.vocab[u"^"].is_punct = True
nlp.vocab[u"`"].is_punct = True
nlp.vocab[u"~"].is_punct = True

In [82]:
def nlp_to_remove(token, remove_stop_words, remove_punct):
    to_remove = token.is_space
    if remove_stop_words:
        to_remove = to_remove or (token.lemma_ in nlp.Defaults.stop_words)
    if remove_punct:
        to_remove = to_remove or token.is_punct
    return(to_remove)

def Grace_nlp(Grace_data, n_words=100, remove_punct=True, remove_stop_words=True):

    res = Grace_data.copy()
    processed_text = []
    print('Starting processing text.....')
    for doc in nlp.pipe(res.Title):
        processed_text.append(' '.join([token.lemma_ for token in doc 
                                        if not nlp_to_remove(token, remove_stop_words, remove_punct)])) 
    res['processed_text'] = processed_text
    
    print('-'*50)
    print('Starting generating features with most frequent words...')
    word_count = Counter(list(chain(*[text.split() for text in res.processed_text])))
    for word, count in word_count.most_common(n_words):
        res[word] = res.processed_text.apply(lambda x: x.count(word))
    return(res)

In [86]:
def Date_parser(date_column, date=None):
    
    parsed_date = []
    print('-'*50)
    print('Starting processing date...')
    for i in range(len(date_column)):
        if re.search('Ago', date_column[i]):
            if not date:
                parsed_date.append(datetime.strftime(datetime.today(), "%d %b %Y"))
            else:
                parsed_date.append(datetime.strftime(datetime(*date), '%d %b %Y'))
        else:
            parsed_date.append(re.sub('^[\w\:\s]+\, ','',date_column[i]))
    return(parsed_date)

In [87]:
%%time 
result = Grace_nlp(data)
result['date_processed'] = Date_parser(result.Date)

Starting processing text.....
--------------------------------------------------
Starting generating features with most frequent words...
--------------------------------------------------
Starting processing date...
CPU times: user 4min 1s, sys: 22.9 s, total: 4min 24s
Wall time: 2min 42s


In [89]:
result

Unnamed: 0,Date,Title,processed_text,-PRON-,'s,fed,stock,market,trump,bank,...,watch,drop,gain,2017,10,ecb,day,yield,warn,date_processed
0,"2:43 PM ET Wed, 31 Jan 2018","WRAPUP 3-Fed keeps interest rates steady, sees...",wrapup 3-fed interest rate steady inflation ri...,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,31 Jan 2018
1,"6:14 PM ET Wed, 31 Jan 2018",Fed leaves key rate unchanged at Yellen's fina...,fed leave key rate unchanged yellen 's final m...,0,1,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,31 Jan 2018
2,"5:22 PM ET Wed, 31 Jan 2018",Former Wells Fargo CEO: Interest rates have be...,wells fargo ceo interest rate manipulate fed long,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,31 Jan 2018
3,"1:18 PM ET Wed, 31 Jan 2018","US STOCKS-Boeing ends Wall St's slide, Fed cau...",stocks boeing end wall st 's slide fed caution...,0,1,1,1,0,0,0,...,0,0,1,0,0,0,0,0,0,31 Jan 2018
4,"12:00 PM ET Wed, 31 Jan 2018",EU to put banks through Brexit mill in toughes...,eu bank brexit mill tough stress test,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,31 Jan 2018
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
54412,"3:08 PM ET Wed, 13 Jan 2016",Here's what's really needed to stop oil's fall,need stop oil 's fall,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,13 Jan 2016
54413,"12:46 PM ET Wed, 13 Jan 2016",'Great shakeout' leading to big gains: Piper,great shakeout lead big gain piper,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,13 Jan 2016
54414,"2:16 PM ET Wed, 13 Jan 2016",Next activist investor target: Wall St banks?,activist investor target wall st bank,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,13 Jan 2016
54415,"11:18 AM ET Wed, 13 Jan 2016",Why 'lower for longer' is hurting commodity st...,lower longer hurt commodity stock,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,13 Jan 2016


In [78]:
result.to_csv('CNBC_with_features.csv', index=False,encoding='ISO-8859-1')