In [1]:
import pandas as pd
import numpy as np

## Read in Our Data

In [2]:
aaplNewsStock = pd.read_csv('AppleNewsStock.csv')
msftNewsStock = pd.read_csv('MicrosoftNewsStock.csv')
redditNews    = pd.read_csv('RedditNews.csv')
djia          = pd.read_csv('upload_DJIA_table.csv')

## Combine redditNews and DJIA so that we can get a news column for every date that contains a string of all the news on a given day

In [3]:
uniqueDates = redditNews.groupby('Date')
newsDic = {}
for date,group in uniqueDates:
    for news in (group['News'].to_list()):
        if date in newsDic:
            newsDic[date][0] += news
        else:
            newsDic[date] = [news]
redditNewsByDate = pd.DataFrame.from_dict(newsDic,'index')
redditNewsByDate.columns = ['News']
redditNewsByDate['Date'] = redditNewsByDate.index
redditNewsByDate = redditNewsByDate.reindex(columns = ['Date', 'News'])
redditNewsByDate = redditNewsByDate.reset_index(drop=True)

In [4]:
djiaNewsStock = pd.merge(djia, redditNewsByDate, how ='inner', on ='Date')

## View and Add Labels to the NewsStock Dataframes

In [5]:
aaplNewsStock = aaplNewsStock.reindex(columns = ['Date', 'Open', 'Adj Close', 'Close', 'High', 'Low', 'Volume', 'News']).dropna(axis = 0).reset_index(drop=True).sort_values(by='Date')
print(aaplNewsStock.shape)
labelLst = []
nrows = aaplNewsStock.shape[0]
for i in range(nrows):
    if aaplNewsStock.iloc[i,1] > aaplNewsStock.iloc[i,2]:
        labelLst.append(0)
    else:
        labelLst.append(1)
aaplNewsStock['Label'] = labelLst
aaplNewsStock.head()

(2323, 8)


Unnamed: 0,Date,Open,Adj Close,Close,High,Low,Volume,News,Label
0,2006-12-01,13.114285,13.045714,91.32,13.19,12.871428,198769900,WHAT'S ON TONIGHT : 8 P.M. (TLC) ASHLEY JUDD A...,0
1,2006-12-04,13.125714,13.017143,91.120003,13.15,12.928572,177384200,More on Housing Prices : The broadest governme...,0
2,2006-12-06,12.948571,12.832857,89.830002,13.055715,12.81,159546100,Honoring R.W. Apple in Words and Food : About ...,0
3,2006-12-07,12.861428,12.434286,87.040001,12.928572,12.414286,251206900,"Homebuilders, and Worries Over Jobs, Lead a De...",0
4,2006-12-08,12.461429,12.608571,88.259995,12.77,12.428572,196069300,"Homebuilders, and Worries Over Jobs, Lead a De...",1


In [6]:
msftNewsStock = msftNewsStock.drop(columns=['Unnamed: 0']).dropna(axis = 0).reset_index(drop=True).sort_values(by='Date')
msftNewsStock = msftNewsStock.reindex(columns = ['Date', 'Open', 'Adj Close', 'Close', 'High', 'Low', 'Volume', 'News'])
print(msftNewsStock.shape)
labelLst = []
nrows = msftNewsStock.shape[0]
for i in range(nrows):
    if msftNewsStock.iloc[i,1] > msftNewsStock.iloc[i,2]:
        labelLst.append(0)
    else:
        labelLst.append(1)
msftNewsStock['Label'] = labelLst
msftNewsStock.head()

(1341, 8)


Unnamed: 0,Date,Open,Adj Close,Close,High,Low,Volume,News,Label
0,2006-12-04,29.23,29.33,29.33,29.52,29.17,55123400,The Retooling of a Search Engine : Ask.com is ...,1
1,2006-12-06,29.1,28.99,28.99,29.129999,28.870001,48564100,Combat as Usual? Not With These Games : A few ...,0
2,2006-12-07,28.959999,28.85,28.85,29.07,28.809999,46831100,Vista Is Ready. Are You? : Why it might be a b...,0
3,2006-12-12,29.559999,29.43,29.43,29.629999,29.219999,68529400,Take the Hotel Room Home : Hotels have become ...,0
4,2006-12-13,29.6,29.549999,29.549999,29.6,29.32,46002500,Google to Offer Variation on Stock Options : G...,0


In [7]:
djiaNewsStock = djiaNewsStock.reindex(columns = ['Date', 'Open', 'Adj Close', 'Close', 'High', 'Low', 'Volume', 'News']).dropna(axis = 0).sort_values(by='Date').reset_index(drop=True)
print(djiaNewsStock.shape)
labelLst = []
nrows = djiaNewsStock.shape[0]
for i in range(nrows):
    if djiaNewsStock.iloc[i,1] > djiaNewsStock.iloc[i,2]:
        labelLst.append(0)
    else:
        labelLst.append(1)
djiaNewsStock['Label'] = labelLst
djiaNewsStock.head()

(1989, 8)


Unnamed: 0,Date,Open,Adj Close,Close,High,Low,Volume,News,Label
0,2008-08-08,11432.089844,11734.320312,11734.320312,11759.959961,11388.040039,212830000,"b""Georgia 'downs two Russian warplanes' as cou...",1
1,2008-08-11,11729.669922,11782.349609,11782.349609,11867.110352,11675.530273,183190000,b'Why wont America and Nato help us? If they w...,1
2,2008-08-12,11781.700195,11642.469727,11642.469727,11782.349609,11601.519531,173590000,b'Remember that adorable 9-year-old who sang a...,0
3,2008-08-13,11632.80957,11532.959961,11532.959961,11633.780273,11453.339844,182550000,b' U.S. refuses Israel weapons to attack Iran:...,0
4,2008-08-14,11532.070312,11615.929688,11615.929688,11718.280273,11450.889648,159790000,b'All the experts admit that we should legalis...,1


## Add Polarity Scores for Each News Using NLTK

In [8]:
import nltk
nltk.download('vader_lexicon') # one time only (Valence Aware Dictionary and sEntiment Reasoner)
from nltk.sentiment.vader import SentimentIntensityAnalyzer
vader = SentimentIntensityAnalyzer()

[nltk_data] Downloading package vader_lexicon to
[nltk_data]     C:\Users\Rodrigo\AppData\Roaming\nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


In [9]:
negLst = []
neuLst = []
posLst = []
comLst = []

nrows = djiaNewsStock.shape[0]
for i in range(nrows):
    s = str(djiaNewsStock.iloc[i,7])
    pScores = vader.polarity_scores(s)
    neg = pScores['neg']
    negLst.append(neg)
    neu = pScores['neu']
    neuLst.append(neu)
    pos = pScores['pos']
    posLst.append(pos)
    com = pScores['compound']
    comLst.append(com)

djiaNewsStock['neg'] = negLst
djiaNewsStock['neu'] = neuLst
djiaNewsStock['pos'] = posLst
djiaNewsStock['compound'] = comLst

djiaNewsStock = djiaNewsStock.reindex(columns = ['Date', 'Open', 'Adj Close', 'Close', 'High', 'Low', 'Volume', 'News', 'neg', 'neu', 'pos', 'compound', 'Label'])
djiaNewsStock

Unnamed: 0,Date,Open,Adj Close,Close,High,Low,Volume,News,neg,neu,pos,compound,Label
0,2008-08-08,11432.089844,11734.320312,11734.320312,11759.959961,11388.040039,212830000,"b""Georgia 'downs two Russian warplanes' as cou...",0.214,0.724,0.062,-0.9966,1
1,2008-08-11,11729.669922,11782.349609,11782.349609,11867.110352,11675.530273,183190000,b'Why wont America and Nato help us? If they w...,0.135,0.773,0.092,-0.9075,1
2,2008-08-12,11781.700195,11642.469727,11642.469727,11782.349609,11601.519531,173590000,b'Remember that adorable 9-year-old who sang a...,0.139,0.805,0.056,-0.9739,0
3,2008-08-13,11632.809570,11532.959961,11532.959961,11633.780273,11453.339844,182550000,b' U.S. refuses Israel weapons to attack Iran:...,0.146,0.806,0.048,-0.9842,0
4,2008-08-14,11532.070312,11615.929688,11615.929688,11718.280273,11450.889648,159790000,b'All the experts admit that we should legalis...,0.174,0.733,0.093,-0.9774,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...
1984,2016-06-27,17355.210938,17140.240234,17140.240234,17355.210938,17063.080078,138740000,Barclays and RBS shares suspended from trading...,0.151,0.759,0.090,-0.9683,0
1985,2016-06-28,17190.509766,17409.720703,17409.720703,17409.720703,17190.509766,112190000,"2,500 Scientists To Australia: If You Want To ...",0.137,0.763,0.100,-0.9633,1
1986,2016-06-29,17456.019531,17694.679688,17694.679688,17704.509766,17456.019531,106380000,Explosion At Airport In IstanbulYemeni former ...,0.209,0.715,0.076,-0.9968,1
1987,2016-06-30,17712.759766,17929.990234,17929.990234,17930.609375,17711.800781,133030000,Jamaica proposes marijuana dispensers for tour...,0.211,0.735,0.053,-0.9978,1


In [10]:
negLst = []
neuLst = []
posLst = []
comLst = []

nrows = msftNewsStock.shape[0]
for i in range(nrows):
    s = str(msftNewsStock.iloc[i,7])
    pScores = vader.polarity_scores(s)
    neg = pScores['neg']
    negLst.append(neg)
    neu = pScores['neu']
    neuLst.append(neu)
    pos = pScores['pos']
    posLst.append(pos)
    com = pScores['compound']
    comLst.append(com)

msftNewsStock['neg'] = negLst
msftNewsStock['neu'] = neuLst
msftNewsStock['pos'] = posLst
msftNewsStock['compound'] = comLst

msftNewsStock = msftNewsStock.reindex(columns = ['Date', 'Open', 'Adj Close', 'Close', 'High', 'Low', 'Volume', 'News', 'neg', 'neu', 'pos', 'compound', 'Label'])
msftNewsStock

Unnamed: 0,Date,Open,Adj Close,Close,High,Low,Volume,News,neg,neu,pos,compound,Label
0,2006-12-04,29.230000,29.330000,29.330000,29.520000,29.170000,55123400,The Retooling of a Search Engine : Ask.com is ...,0.137,0.863,0.000,-0.7783,1
1,2006-12-06,29.100000,28.990000,28.990000,29.129999,28.870001,48564100,Combat as Usual? Not With These Games : A few ...,0.051,0.847,0.102,0.9003,0
2,2006-12-07,28.959999,28.850000,28.850000,29.070000,28.809999,46831100,Vista Is Ready. Are You? : Why it might be a b...,0.000,0.810,0.190,0.6597,0
3,2006-12-12,29.559999,29.430000,29.430000,29.629999,29.219999,68529400,Take the Hotel Room Home : Hotels have become ...,0.000,1.000,0.000,0.0000,0
4,2006-12-13,29.600000,29.549999,29.549999,29.600000,29.320000,46002500,Google to Offer Variation on Stock Options : G...,0.000,0.833,0.167,0.5574,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
1336,2016-11-14,59.020000,58.119999,58.119999,59.080002,57.279999,41328400,"Just How Powerful Was the ‘Mule Lobby’? (Yes, ...",0.061,0.757,0.181,0.7351,0
1337,2016-11-15,58.330002,58.869999,58.869999,59.490002,58.320000,35904100,Rising Oil Prices Lead to Market Gains : Bench...,0.133,0.681,0.186,0.1280,1
1338,2016-11-18,60.779999,60.349998,60.349998,61.139999,60.299999,27686300,L.G.B.T. Gamers Huddle for a Postelection Real...,0.113,0.887,0.000,-0.4215,0
1339,2016-11-22,60.980000,61.119999,61.119999,61.259998,60.810001,23206700,"Donald Trump, Japan, Mosul: Your Wednesday Bri...",0.000,1.000,0.000,0.0000,1


In [11]:
negLst = []
neuLst = []
posLst = []
comLst = []

nrows = aaplNewsStock.shape[0]
for i in range(nrows):
    s = str(aaplNewsStock.iloc[i,7])
    pScores = vader.polarity_scores(s)
    neg = pScores['neg']
    negLst.append(neg)
    neu = pScores['neu']
    neuLst.append(neu)
    pos = pScores['pos']
    posLst.append(pos)
    com = pScores['compound']
    comLst.append(com)

aaplNewsStock['neg'] = negLst
aaplNewsStock['neu'] = neuLst
aaplNewsStock['pos'] = posLst
aaplNewsStock['compound'] = comLst

aaplNewsStock = aaplNewsStock.reindex(columns = ['Date', 'Open', 'Adj Close', 'Close', 'High', 'Low', 'Volume', 'News', 'neg', 'neu', 'pos', 'compound', 'Label'])
aaplNewsStock

Unnamed: 0,Date,Open,Adj Close,Close,High,Low,Volume,News,neg,neu,pos,compound,Label
0,2006-12-01,13.114285,13.045714,91.320000,13.190000,12.871428,198769900,WHAT'S ON TONIGHT : 8 P.M. (TLC) ASHLEY JUDD A...,0.032,0.905,0.063,0.7707,0
1,2006-12-04,13.125714,13.017143,91.120003,13.150000,12.928572,177384200,More on Housing Prices : The broadest governme...,0.011,0.904,0.085,0.8720,0
2,2006-12-06,12.948571,12.832857,89.830002,13.055715,12.810000,159546100,Honoring R.W. Apple in Words and Food : About ...,0.029,0.878,0.093,0.6858,0
3,2006-12-07,12.861428,12.434286,87.040001,12.928572,12.414286,251206900,"Homebuilders, and Worries Over Jobs, Lead a De...",0.091,0.869,0.040,-0.6712,0
4,2006-12-08,12.461429,12.608571,88.259995,12.770000,12.428572,196069300,"Homebuilders, and Worries Over Jobs, Lead a De...",0.084,0.848,0.069,-0.1796,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...
2318,2016-11-17,109.809998,109.949997,109.949997,110.349998,108.830002,27632000,Turn an iPad-Made Movie Into a DVD : Home vide...,0.031,0.882,0.086,0.9231,1
2319,2016-11-18,109.720001,110.059998,110.059998,110.540001,109.660004,28428900,When Eve and Eve Bit the Apple : A Christian w...,0.052,0.847,0.101,0.7932,1
2320,2016-11-21,110.120003,111.730003,111.730003,111.989998,110.010002,29264600,"Daily Report: At Apple, U.S. Jobs That Go Beyo...",0.106,0.797,0.097,-0.6908,1
2321,2016-11-22,111.949997,111.800003,111.800003,112.419998,111.400002,25965500,A Trade War Against China Might Be a Fight Tru...,0.130,0.786,0.084,-0.8885,0


In [12]:
aaplNewsStock.to_csv('aaplNewsStock.csv') 
msftNewsStock.to_csv('msftNewsStock.csv')
djiaNewsStock.to_csv('djiaNewsStock.csv')