In [72]:
%matplotlib inline

import os
import datetime

import IPython
import IPython.display
import matplotlib as mpl
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import tensorflow as tf

mpl.rcParams['figure.figsize'] = (8, 6)
mpl.rcParams['axes.grid'] = False

In [73]:
pd. set_option('display.max_columns', 500)
pd.set_option('display.max_colwidth', 0)

# Alpha Vantage

This is the main dataframe with stock market technical indicators

In [74]:
alphavantage_csv_path = "/app/StockPricePredictions/data/alphavantage/time_series_daily_adjusted/AAPL.csv"

In [75]:
df = pd.read_csv(alphavantage_csv_path, low_memory=False)
df.sort_values(by=["date"], ascending=True, inplace=True)

df = df[df["date"]>='2010-01-01']

df["date"] = pd.to_datetime(df['date'])

df.set_index("date", inplace=True)

In [76]:
df.shape[0]

3064

In [77]:
df.info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 3064 entries, 2010-01-04 to 2022-03-04
Data columns (total 8 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   1. open               3064 non-null   float64
 1   2. high               3064 non-null   float64
 2   3. low                3064 non-null   float64
 3   4. close              3064 non-null   float64
 4   5. adjusted close     3064 non-null   float64
 5   6. volume             3064 non-null   float64
 6   7. dividend amount    3064 non-null   float64
 7   8. split coefficient  3064 non-null   float64
dtypes: float64(8)
memory usage: 215.4 KB


In [78]:
df.head()

Unnamed: 0_level_0,1. open,2. high,3. low,4. close,5. adjusted close,6. volume,7. dividend amount,8. split coefficient
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
2010-01-04,213.43,214.5,212.38,214.01,6.543876,17633200.0,0.0,1.0
2010-01-05,214.6,215.59,213.25,214.38,6.55519,21496600.0,0.0,1.0
2010-01-06,214.38,215.23,210.75,210.97,6.450921,19720000.0,0.0,1.0
2010-01-07,211.75,212.0,209.05,210.58,6.438996,17040400.0,0.0,1.0
2010-01-08,210.3,212.0,209.06,211.98,6.481804,15986100.0,0.0,1.0


# Benzinga

In [79]:
benzinga_csv_path = "/app/StockPricePredictions/data/benzinga/aapl_non_dupes.csv"

In [80]:
df_benzinga = pd.read_csv(benzinga_csv_path, low_memory=False)

In [81]:
df_benzinga['date'] = pd.to_datetime(df_benzinga['date'])

In [82]:
df_benzinga.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 29077 entries, 0 to 29076
Data columns (total 19 columns):
 #   Column    Non-Null Count  Dtype         
---  ------    --------------  -----         
 0   date      29077 non-null  datetime64[ns]
 1   author    29077 non-null  object        
 2   created   29077 non-null  object        
 3   updated   29077 non-null  object        
 4   title     29077 non-null  object        
 5   teaser    4824 non-null   object        
 6   body      23916 non-null  object        
 7   url       29077 non-null  object        
 8   image     29077 non-null  object        
 9   channels  29077 non-null  object        
 10  stocks    29077 non-null  object        
 11  tags      29077 non-null  object        
 12  AAPL      29077 non-null  bool          
 13  MSFT      29077 non-null  bool          
 14  AMZN      29077 non-null  bool          
 15  FB        29077 non-null  bool          
 16  NVDA      29077 non-null  bool          
 17  INTC      29

In [83]:
df_benzinga.columns

Index(['date', 'author', 'created', 'updated', 'title', 'teaser', 'body',
       'url', 'image', 'channels', 'stocks', 'tags', 'AAPL', 'MSFT', 'AMZN',
       'FB', 'NVDA', 'INTC', 'NFLX'],
      dtype='object')

In [84]:
df_benzinga["text"] = df_benzinga["title"] + " " + df_benzinga["body"]

In [85]:
df_benzinga['text'] = np.where(df_benzinga["text"], df_benzinga["title"], df_benzinga["text"])

In [86]:
df_benzinga = df_benzinga[[ "date", "text"]]

In [87]:
df_benzinga.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 29077 entries, 0 to 29076
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype         
---  ------  --------------  -----         
 0   date    29077 non-null  datetime64[ns]
 1   text    29077 non-null  object        
dtypes: datetime64[ns](1), object(1)
memory usage: 454.5+ KB


In [88]:
df_benzinga.head()

Unnamed: 0,date,text
0,2010-01-02,Time to Sell Apple Puts
1,2010-01-04,"Nexus – Will It Change Telecom Industry Dynamics? (GOOG, AAPL, RIMM, ATT, VZ)"
2,2010-01-04,"Company News for January 04, 2010 - Corporate Summary"
3,2010-01-04,"Technology Industry Update (DELL, AAPL, AMD, INTC, NVD)"
4,2010-01-04,Apple Defies Gravity


# Tweets

Data collected from twint api

In [89]:
twint_csv_path = "/app/StockPricePredictions/data/twint/AAPL_20100101_to_20220304.csv"

In [90]:
df_twint = pd.read_csv(twint_csv_path, low_memory=False, lineterminator='\n')

In [91]:
df_twint['date'] = pd.to_datetime(df_twint['date'])

In [92]:
df_twint = df_twint[[ "id", "date", "created_at", "tweet", "hashtags", "cashtags", "day", "hour", "nlikes", "nretweets"]]

In [93]:
df_twint.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 41056 entries, 0 to 41055
Data columns (total 10 columns):
 #   Column      Non-Null Count  Dtype         
---  ------      --------------  -----         
 0   id          41056 non-null  int64         
 1   date        41056 non-null  datetime64[ns]
 2   created_at  41056 non-null  float64       
 3   tweet       41056 non-null  object        
 4   hashtags    41056 non-null  object        
 5   cashtags    41056 non-null  object        
 6   day         41056 non-null  int64         
 7   hour        41056 non-null  int64         
 8   nlikes      41056 non-null  int64         
 9   nretweets   41056 non-null  int64         
dtypes: datetime64[ns](1), float64(1), int64(5), object(3)
memory usage: 3.1+ MB


In [94]:
def clean_tweet(df, source):
    # replace URLs with a whitespace

    if source == "benzinga":
        df['text'] = df['text'].str.replace('http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+', ' ')
    elif source == "twint":
        df['tweet'] = df['tweet'].str.replace('http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+', ' ')
    
    return df

In [95]:
df_benzinga = clean_tweet(df_benzinga, "benzinga")

  df['text'] = df['text'].str.replace('http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+', ' ')


In [96]:
df_twint = clean_tweet(df_twint, "twint")

  df['tweet'] = df['tweet'].str.replace('http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+', ' ')


In [97]:
df_benzinga.head()

Unnamed: 0,date,text
0,2010-01-02,Time to Sell Apple Puts
1,2010-01-04,"Nexus – Will It Change Telecom Industry Dynamics? (GOOG, AAPL, RIMM, ATT, VZ)"
2,2010-01-04,"Company News for January 04, 2010 - Corporate Summary"
3,2010-01-04,"Technology Industry Update (DELL, AAPL, AMD, INTC, NVD)"
4,2010-01-04,Apple Defies Gravity


# Vader Sentiment

In [98]:
idx = 0

In [99]:
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

analyzer = SentimentIntensityAnalyzer()
sentence = df_benzinga["text"][idx] 
vs = analyzer.polarity_scores(sentence)
print("{:-<65} {}".format(sentence, str(vs)))

Time to Sell Apple Puts------------------------------------------ {'neg': 0.0, 'neu': 1.0, 'pos': 0.0, 'compound': 0.0}


In [100]:
def getSIA(text):
    sia = SentimentIntensityAnalyzer()
    sentiment = sia.polarity_scores(text)
    return sentiment

In [101]:
negative = []
neutral = []
positive = []
compound = []

for i in range(0, len(df_benzinga["text"])):
    sia = getSIA(df_benzinga["text"][i])
    negative.append(sia["neg"])
    neutral.append(sia["neu"])
    positive.append(sia["pos"])
    compound.append(sia["compound"])

df_benzinga["negative"] = negative
df_benzinga["neutral"] = neutral
df_benzinga["positive"] = positive
df_benzinga["compound"] = compound

In [103]:
negative = []
neutral = []
positive = []
compound = []

for i in range(0, len(df_twint["tweet"])):
    sia = getSIA(df_twint["tweet"][i])
    negative.append(sia["neg"])
    neutral.append(sia["neu"])
    positive.append(sia["pos"])
    compound.append(sia["compound"])

df_twint["negative"] = negative
df_twint["neutral"] = neutral
df_twint["positive"] = positive
df_twint["compound"] = compound

## Benzinga Group By Date

In [104]:
df_benzinga.set_index("date", inplace=True)

In [105]:
df_benzinga.head()

Unnamed: 0_level_0,text,negative,neutral,positive,compound
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2010-01-02,Time to Sell Apple Puts,0.0,1.0,0.0,0.0
2010-01-04,"Nexus – Will It Change Telecom Industry Dynamics? (GOOG, AAPL, RIMM, ATT, VZ)",0.0,0.851,0.149,0.2732
2010-01-04,"Company News for January 04, 2010 - Corporate Summary",0.0,1.0,0.0,0.0
2010-01-04,"Technology Industry Update (DELL, AAPL, AMD, INTC, NVD)",0.0,1.0,0.0,0.0
2010-01-04,Apple Defies Gravity,0.0,1.0,0.0,0.0


In [106]:
df_benzinga_duplicated_index = df_benzinga[df_benzinga.index.duplicated(keep=False)]

In [107]:
df_benzinga_duplicated_index.shape[0]

28562

In [108]:
df_benzinga_nonduplicated_index = df_benzinga[~df_benzinga.index.duplicated(keep=False)]

In [109]:
df_benzinga_nonduplicated_index.shape[0]

515

In [110]:
df_benzinga_groupby = df_benzinga.groupby("date").agg(
     negative = ("negative", "mean"),
     nuetral = ("neutral", "mean"),
     positive = ("positive", "mean"),
     compound = ("compound", "mean"),
     )


In [111]:
df_benzinga_groupby

Unnamed: 0_level_0,negative,nuetral,positive,compound
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2010-01-02,0.000000,1.000000,0.000000,0.000000
2010-01-04,0.023300,0.946500,0.030200,0.012850
2010-01-05,0.039714,0.917929,0.042357,0.019529
2010-01-06,0.000000,0.929800,0.070200,0.090833
2010-01-07,0.000000,0.862250,0.137750,0.279775
...,...,...,...,...
2022-02-28,0.090125,0.785000,0.124875,-0.053988
2022-03-01,0.104500,0.841750,0.053750,-0.057088
2022-03-02,0.039500,0.894333,0.066167,0.014058
2022-03-03,0.054462,0.930231,0.015308,-0.082885


In [112]:
df_merge = pd.merge(df, df_benzinga_groupby, how="left", left_index=True, right_index=True)

In [116]:
df_merge.drop(["negative", "nuetral", "positive"], axis=1, inplace=True)

In [118]:
df_merge.rename(columns={"compound": "compound_financial_news"}, inplace=True)

In [119]:
df_merge

Unnamed: 0_level_0,1. open,2. high,3. low,4. close,5. adjusted close,6. volume,7. dividend amount,8. split coefficient,compound_financial_news
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
2010-01-04,213.430,214.50,212.38,214.01,6.543876,17633200.0,0.0,1.0,0.012850
2010-01-05,214.600,215.59,213.25,214.38,6.555190,21496600.0,0.0,1.0,0.019529
2010-01-06,214.380,215.23,210.75,210.97,6.450921,19720000.0,0.0,1.0,0.090833
2010-01-07,211.750,212.00,209.05,210.58,6.438996,17040400.0,0.0,1.0,0.279775
2010-01-08,210.300,212.00,209.06,211.98,6.481804,15986100.0,0.0,1.0,0.070060
...,...,...,...,...,...,...,...,...,...
2022-02-28,163.060,165.42,162.43,165.12,165.120000,95056629.0,0.0,1.0,-0.053988
2022-03-01,164.695,166.60,161.97,163.20,163.200000,83474425.0,0.0,1.0,-0.057088
2022-03-02,164.390,167.36,162.95,166.56,166.560000,79724750.0,0.0,1.0,0.014058
2022-03-03,168.470,168.91,165.55,166.23,166.230000,76678441.0,0.0,1.0,-0.082885


In [114]:
df_merge[df_merge.compound.isnull()].shape

(20, 12)

## Twint Group by Date

In [120]:
df_twint.set_index("date", inplace=True)

In [121]:
df_twint.head()

Unnamed: 0_level_0,id,created_at,tweet,hashtags,cashtags,day,hour,nlikes,nretweets,negative,neutral,positive,compound
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
2010-01-01,7273036289,1262365000000.0,Apple 2.0: Tablet: Big iPhone or thin MacBook? $AAPL,[],['aapl'],5,16,2,7,0.0,1.0,0.0,0.0
2010-01-04,7382848163,1262647000000.0,"Apple 2.0: Apple tablet to ship in March, sources say $AAPL",[],['aapl'],1,23,1,6,0.0,1.0,0.0,0.0
2010-01-04,7380526126,1262643000000.0,"Apple Tablet Will Be 10-11 Inches, Ships In March, Says WSJ $AAPL by @jwyarow",[],['aapl'],1,22,4,9,0.0,1.0,0.0,0.0
2010-01-04,7377981409,1262637000000.0,CHART OF THE DAY: Android Taking Wind Out Of iPhone's Sails $GOOG $AAPL $RIMM,[],"['goog', 'aapl', 'rimm']",1,20,11,7,0.0,1.0,0.0,0.0
2010-01-04,7350195565,1262568000000.0,Apple 2.0: How many iPhones did Apple sell? $AAPL,[],['aapl'],1,1,6,10,0.0,1.0,0.0,0.0


In [122]:
df_twint_duplicated_index = df_twint[df_twint.index.duplicated(keep=False)]

In [123]:
df_twint_duplicated_index.shape[0]

40779

In [124]:
df_twint_nonduplicated_index = df_twint[~df_twint.index.duplicated(keep=False)]

In [125]:
df_twint_nonduplicated_index.shape[0]

277

In [126]:
df_twint_groupby = df_twint.groupby("date").agg(
     negative = ("negative", "mean"),
     nuetral = ("neutral", "mean"),
     positive = ("positive", "mean"),
     compound = ("compound", "mean"),
     nlikes = ("nlikes", "sum"),
     nretweets = ("nretweets", "sum"),
     )

In [127]:
df_twint_groupby

Unnamed: 0_level_0,negative,nuetral,positive,compound,nlikes,nretweets
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2010-01-01,0.000000,1.000000,0.000000,0.000000,2,7
2010-01-04,0.000000,1.000000,0.000000,0.000000,22,32
2010-01-05,0.100000,0.900000,0.000000,-0.229400,5,11
2010-01-06,0.154000,0.846000,0.000000,-0.476700,4,6
2010-01-07,0.000000,0.875500,0.124500,0.371500,12,13
...,...,...,...,...,...,...
2022-02-28,0.045889,0.892222,0.061833,0.138461,3813,442
2022-03-01,0.044458,0.939958,0.015583,-0.085358,4824,684
2022-03-02,0.044267,0.895467,0.060400,0.060020,4161,470
2022-03-03,0.030533,0.933133,0.036333,0.025033,1987,248


In [128]:
df_merge_two = pd.merge(df_merge, df_twint_groupby, how="left", left_index=True, right_index=True)

In [130]:
df_merge_two.drop(["negative", "nuetral", "positive"], axis=1, inplace=True)

In [131]:
df_merge_two.rename(columns={"compound": "compound_financial_tweets"}, inplace=True)

In [132]:
df_merge_two.head()

Unnamed: 0_level_0,1. open,2. high,3. low,4. close,5. adjusted close,6. volume,7. dividend amount,8. split coefficient,compound_financial_news,compound_financial_tweets,nlikes,nretweets
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
2010-01-04,213.43,214.5,212.38,214.01,6.543876,17633200.0,0.0,1.0,0.01285,0.0,22.0,32.0
2010-01-05,214.6,215.59,213.25,214.38,6.55519,21496600.0,0.0,1.0,0.019529,-0.2294,5.0,11.0
2010-01-06,214.38,215.23,210.75,210.97,6.450921,19720000.0,0.0,1.0,0.090833,-0.4767,4.0,6.0
2010-01-07,211.75,212.0,209.05,210.58,6.438996,17040400.0,0.0,1.0,0.279775,0.3715,12.0,13.0
2010-01-08,210.3,212.0,209.06,211.98,6.481804,15986100.0,0.0,1.0,0.07006,0.0,18.0,19.0


In [133]:
df_benzinga_groupby.shape

(3846, 4)

In [135]:
df_twint_groupby.shape

(4175, 6)

In [136]:
df.shape

(3064, 8)

In [138]:
df_merge_two[df_merge_two.compound_financial_news.isnull()].shape

(20, 12)

In [139]:
df_merge_two[df_merge_two.compound_financial_tweets.isnull()].shape

(122, 12)

In [140]:
df_merge_two.fillna(method="ffill", inplace=True)

In [141]:
df_merge_two[df_merge_two.compound_financial_news.isnull()].shape

(0, 12)

In [142]:
df_merge_two[df_merge_two.compound_financial_tweets.isnull()].shape

(0, 12)

In [145]:
# df_merge_two.to_csv("/app/StockPricePredictions/data/alphavantage/time_series_daily_adjusted/AAPL_WITH_FINANCIAL_NEWS_AND_TWEETS.csv")
df_merge_two.head()

Unnamed: 0_level_0,1. open,2. high,3. low,4. close,5. adjusted close,6. volume,7. dividend amount,8. split coefficient,compound_financial_news,compound_financial_tweets,nlikes,nretweets
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
2010-01-04,213.43,214.5,212.38,214.01,6.543876,17633200.0,0.0,1.0,0.01285,0.0,22.0,32.0
2010-01-05,214.6,215.59,213.25,214.38,6.55519,21496600.0,0.0,1.0,0.019529,-0.2294,5.0,11.0
2010-01-06,214.38,215.23,210.75,210.97,6.450921,19720000.0,0.0,1.0,0.090833,-0.4767,4.0,6.0
2010-01-07,211.75,212.0,209.05,210.58,6.438996,17040400.0,0.0,1.0,0.279775,0.3715,12.0,13.0
2010-01-08,210.3,212.0,209.06,211.98,6.481804,15986100.0,0.0,1.0,0.07006,0.0,18.0,19.0


# Spacy TextBlob

In [119]:
import spacy
from spacytextblob.spacytextblob import SpacyTextBlob

nlp = spacy.load('en_core_web_md')
nlp.add_pipe('spacytextblob')
text = 'Nexus – Will It Change Telecom Industry Dynamics? '
doc = nlp(text)

In [120]:
print(doc._.blob.polarity)
# -0.125

print(doc._.blob.subjectivity)
# 0.9

0.0
0.0


# TextBlob

In [122]:
from textblob import TextBlob

def sentiment_analysis(tweet):

 def getSubjectivity(text):
   return TextBlob(text).sentiment.subjectivity
  
 #Create a function to get the polarity
 def getPolarity(text):
   return TextBlob(text).sentiment.polarity
  
 #Create two new columns ‘Subjectivity’ & ‘Polarity’

 subjectivity = getSubjectivity(tweet)
 polarity = getPolarity(tweet)

 def getAnalysis(score):
  if score < 0:
    return "Negative"
  elif score == 0:
    return "Neutral"
  else:
    return "Positive"

 # tweet [‘TextBlob_Analysis’] = tweet  [‘TextBlob_Polarity’].apply(getAnalysis )

 result = getAnalysis(polarity)

 print('Subjectivity:', subjectivity)
 print('Polarity:', polarity)
 print('Analysis:', result)
 # print('Sentiment:', sentiment)

idx = 5

sentiment_analysis(df_twint["tweet"][idx])

df_twint["tweet"][idx]

Subjectivity: 0.0
Polarity: 0.0
Analysis: Neutral


'Why Apple Bought Quattro Wireless And Is Getting Into Advertising $AAPL $GOOG by @fromedome   '