Combined Sentiment Analysis and Price Data

In [2]:
import pandas as pd
import numpy as np
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots

In [17]:
timeSplitDataNetScore = {'01/2021': 0.1386357657917504, '02/2021': 0.14664245056920375, '03/2021': 0.14013468072642057, '04/2021': 0.12663854166666683, '05/2021': 0.1500673718852843, '06/2021': 0.1208334491746305, '07/2021': 0.17671752650176703, '08/2021': 0.2005629290617843, '09/2021': 0.08912335834896816, '10/2021': 0.12653711267605602, '11/2021': 0.11554539835164827, '12/2021': 0.1660256097560975, '01/2022': 0.17248775510204092, '02/2022': 0.16341621093749978}
df = pd.DataFrame({'date': timeSplitDataNetScore.keys(), 'sentiment': timeSplitDataNetScore.values()})
fig = go.Figure()
fig.add_trace(go.Scatter(x=df['date'], y=df['sentiment'], mode='lines+markers'))

In [2]:
# change search term then run all
# aave,  comp, crv, mkr, sushi, uni,  btc and eth lacking tweet data
search_term = "sushi"
tweet_path = 'Tweets\{}.xlsx'.format(search_term)
price_path = 'Price Data\{}.xlsx'.format(search_term)

In [3]:
df = pd.read_excel(tweet_path, sheet_name= "WorkingCopy")

In [4]:
# sentiment analysis
import re
from nltk.corpus import stopwords

stop = stopwords.words('english')

def clean_content(contentInput):
    if type(contentInput) == np.float:
        return ""
    contentInput = contentInput.lower()
    contentInput = re.sub("'", "", contentInput) # to avoid removing contractions in english
    contentInput = re.sub("@[A-Za-z0-9_]+","", contentInput)
    contentInput = re.sub("#[A-Za-z0-9_]+","", contentInput)
    contentInput = re.sub(r'http\S+', '', contentInput)
    contentInput = re.sub('[()!?]', ' ', contentInput)
    contentInput = re.sub('\[.*?\]',' ', contentInput)
    contentInput = re.sub("[^a-z0-9]"," ", contentInput)
    contentInput = contentInput.strip()
    contentInput = contentInput.split()
    contentInput = [w for w in contentInput if not w in stop]
    contentInput = " ".join(word for word in contentInput)
    return contentInput

In [5]:
monthList = pd.date_range('2021-09-01','2022-03-07', 
            freq='MS').strftime("%m/%Y").tolist()

timeSplitData={}
timeSplitDataSentiment = {}

for month in monthList: 
    timeSplitData[month] = []
    timeSplitDataSentiment[month] = [] 

In [6]:
for row in df.iterrows():
    #extracting month data from row
    dateData = row[1]['Date']
    tmp = dateData.find('/')
    monthData = dateData[tmp+1:] #removing day e.g '12/'
    if len(monthData) == 6:
        monthData = "0"+monthData
    
    if monthData in timeSplitData: # check if data is in range of date we want to analyse
        content = row[1]['Content']
        #cleaning Data
        cleaned = clean_content(content)
        #Storing into time split dict
        timeSplitData[monthData].append(cleaned)

Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  if type(contentInput) == np.float:


In [7]:
import nltk
# nltk.download('vader_lexicon')
from nltk.sentiment.vader import SentimentIntensityAnalyzer

sid = SentimentIntensityAnalyzer()

In [8]:
for month in timeSplitData:
    for content in timeSplitData[month]:
        output = sid.polarity_scores(content)
        timeSplitDataSentiment[month].append(output)

In [9]:
timeSplitDataNetScore = {}
for month in monthList:
    timeSplitDataNetScore[month] = 0 
    
for month in timeSplitDataSentiment:
    for output in timeSplitDataSentiment[month]:
        compound = output['compound']
        compound = compound / len(timeSplitDataSentiment[month])
        timeSplitDataNetScore[month] +=compound

In [10]:
df2 = pd.read_excel(price_path, sheet_name="working")

In [11]:
time = df2["time"].to_numpy()
price = df2["PriceUSD"].to_numpy()

In [38]:
fig = go.Figure()

fig.add_trace(go.Line(x=time, y=price, name="Price"))

fig.update_layout(title_text=('6 Month Price History - {}').format(search_term))
# plt.plot(time, price, color="blue", linestyle="solid")  #individual plot for Price History

# plt.title(('6 Month Price History - {}').format(search_term))
# plt.xlabel('Month')
# months = ["1/09/2021","1/10/2021","1/11/2021","1/12/2021","1/01/2022", "1/02/2022", "1/03/2022"]
# plt.xticks(ticks = months, rotation = 45)
# plt.ylabel('Price in USD')


plotly.graph_objs.Line is deprecated.
Please replace it with one of the following more specific types
  - plotly.graph_objs.scatter.Line
  - plotly.graph_objs.layout.shape.Line
  - etc.




In [39]:
#individual plot of sentinment analysis
xAxis = []
yAxis = []

for date in timeSplitDataNetScore:
    xAxis.append("01/" + date)
    yAxis.append(timeSplitDataNetScore[date])

fig2 = go.Figure()

fig2.add_trace(go.Line(x=xAxis, y=yAxis, name=('Twitter {} CryptoCurrency Sentiment Analysis').format(search_term)))

fig2.update_layout(title_text=('6 Month Sentiment - {}').format(search_term))

# plt.plot(xAxis,yAxis, color='red', marker='o')
# plt.title(('Twitter {} CryptoCurrency Sentiment Analysis').format(search_term))
# plt.xlabel('Month')
# plt.xticks(rotation=45)
# plt.ylabel('Net Sentiment')
# plt.show()



plotly.graph_objs.Line is deprecated.
Please replace it with one of the following more specific types
  - plotly.graph_objs.scatter.Line
  - plotly.graph_objs.layout.shape.Line
  - etc.




In [44]:
fig = make_subplots(rows=1, cols=2, subplot_titles=(('6 Month Price History - {}').format(search_term), ('Sentiment Analysis - {}').format(search_term)))

fig.add_trace(go.Line(x=time, y=price), row=1, col=1)

fig.add_trace(go.Line(x=xAxis, y=yAxis), row=1, col=2)

fig.update_layout(title_text="Side By Side Plots")


# fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(16, 4))
# ax1.plot(time, price, color="blue", linestyle="solid")
# ax1.set_xticks(ticks = months)
# ax1.set_title(('6 Month Price History - {}').format(search_term))
# ax1.set_ylabel('Price in USD')
# ax1.set_xlabel('Month')
# ax2.plot(xAxis,yAxis, color='red', marker='o')
# ax2.set_title(('Sentiment Analysis - {}').format(search_term))
# ax2.set_ylabel('Net Sentiment')
# ax2.set_xlabel('Month')


plotly.graph_objs.Line is deprecated.
Please replace it with one of the following more specific types
  - plotly.graph_objs.scatter.Line
  - plotly.graph_objs.layout.shape.Line
  - etc.




In [15]:
tmp = pd.read_excel(price_path, sheet_name="working")
df3 = pd.DataFrame()
target = ["1/09/2021","1/10/2021","1/11/2021","1/12/2021","1/01/2022", "1/02/2022", "1/03/2022"]

for i in target:
    j =(tmp.loc[tmp['time'] == i])
    df3 = df3.append(j)

update = pd.Series(["01/09/2021","01/10/2021","01/11/2021","01/12/2021","01/01/2022", "01/02/2022", "01/03/2022"], name='time', index=[0,30,61,91,122,153,181])
df3.update(update)

In [24]:
time2 = df3["time"].to_numpy()
price2 = df3["PriceUSD"].to_numpy()

fig2 = make_subplots(specs=[[{"secondary_y": True}]])

fig2.add_trace(go.Line(x=time2, y=price2, name="Price"), secondary_y=False)

fig2.add_trace(go.Line(x=xAxis, y=yAxis, name="Sentiment"), secondary_y=True)

fig2.update_layout(title_text=("Price x Sentiment Analysis - {}").format(search_term))

In [25]:
df4 = pd.read_excel(price_path, sheet_name="AvgMthPrice")

month1 = df4["month"].to_numpy()
avgPrice = df4["avgPriceUSD"].to_numpy()

fig3 = make_subplots(specs=[[{"secondary_y": True}]])

fig3.add_trace(go.Line(x=month1, y=avgPrice, name="Price"), secondary_y=False)

fig3.update_layout(title_text=("Average Monthly Price x Sentiment Analysis - {}").format(search_term))

xA = []
yA = []

for date in timeSplitDataNetScore:
    xA.append(date)
    yA.append(timeSplitDataNetScore[date])

fig3.add_trace(go.Line(x=xA, y=yA, name="Sentiment"), secondary_y=True)





plotly.graph_objs.Line is deprecated.
Please replace it with one of the following more specific types
  - plotly.graph_objs.scatter.Line
  - plotly.graph_objs.layout.shape.Line
  - etc.


