In [1]:
import credentials
import settings
import mysql.connector
import pandas as pd
import time
import textblob
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib as mpl
import plotly.express as px
import nltk
import re
from nltk.probability import FreqDist
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from sqlalchemy import create_engine
import nbformat
from plotly.subplots import make_subplots
import plotly.graph_objs as go
import numpy as np
nltk.download('punkt')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\VladD\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\VladD\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [2]:
# LOAD DATA FROM MYSQL
db_connection = mysql.connector.connect(
    host="localhost",
    user="root",
    passwd=credentials.MYSQLPASSWORD,
    database="twitterdb",
    auth_plugin='mysql_native_password',
    charset = 'utf8'
)
engine = create_engine('mysql+mysqlconnector://root:credentials.MYSQLPASSWORD@localhost:3306/twitterdb')  #CREATING CONNECTION TO MYSQL DATABASE
df = pd.read_sql('SELECT id_str, text, created_at, polarity FROM {}'.format(settings.TABLE_NAME), con=db_connection) #READ MYSQL DATA
# DATE TIME
df['created_at']=pd.to_datetime(df['created_at'])
for index,tweets in df[df['polarity']== -1].iterrows():
    print(" "+str(tweets[2])+ " "+tweets[1]) #

 2023-04-03 18:04:51 Just when #crypto twitter gets a bit boring, Elon musk adds the dogecoin logo to twitter
 2023-04-03 18:05:03 Just when #crypto twitter gets a bit boring, Elon Musk adds the dogecoin logo to twitter
 2023-04-03 18:33:54 Are you unable to withdraw from these scam platforms SEND A MESSAGE NOW!!!! -#Kicurency- #Robecoins- #monnore- #Fasbitra - #Omitra - #exbill- #RHK  #okx                   #btc                   #Crypto #1000x #opensoil
 2023-04-03 18:42:37 One hundred million #Ethereum to anyone who solves the problem of evil this side of death.
 2023-04-03 18:53:43 #Bitcoin #Cryptonews #Ethereum #BTC #Crypto #BlockchainShiba Inu (SHIB) Makes Surprisingly Awful Move


  df = pd.read_sql('SELECT id_str, text, created_at, polarity FROM {}'.format(settings.TABLE_NAME), con=db_connection) #READ MYSQL DATA


In [14]:
#SENTIMENT ANALYSIS WITH TEXTBLOB - NEGATIVE TWEETS ARE -1, POSITIVE TWEETS ARE +1, AND NEUTRAL TWEETS ARE 0. DONE BY THE PREPROCESSING FOR BETTER DATA STORAGE
# from textblob import TextBlob
# sentiment=TextBlob(tweet_text).sentiment
# polarity=sentiment.polarity
# subjectivity=sentiment.subjectivity

In [12]:
### FUNCTIONS ###
def clean_content(content):
    content = re.sub(r"http\S+", "", content)
    content = content.replace('&amp;', 'and')
    content = re.sub('[^A-Za-z0-9]+', ' ', content)
    content = content.lower()
    return content

def create_filtered_sen(content):
    tokenwords = word_tokenize(content)
    stop_words = set(stopwords.words('english'))
    filtered_sen = [word for word in tokenwords if word not in stop_words]
    return filtered_sen

def create_frequency_df(filtered_sen):
    fdist = FreqDist(filtered_sen)
    fd = pd.DataFrame(fdist.most_common(20),
                    columns=["Word", "Frequency"]).drop([0]).reindex()
    return fd

In [3]:
#CLEAN AND TRANSFORM DATA TO ENABLE TIME SERIES
result=df.groupby(
    [pd.Grouper(key='created_at',freq='30s'),'polarity']).count().unstack(fill_value=0).stack().reset_index() #CREATING RESULT DATAFRAME,
result=result.rename(columns=
                     {"id_str":"Num of '{}' mentions".format("Ethereum"),"created_at":"Time in UTC"}) #RENAMING THE DATAFRAME
time_series=result["Time in UTC"][result['polarity']==0].reset_index(drop=True)

fig=px.line(result,x='Time in UTC',
            y="Num of '{}' mentions".format("Ethereum"),color='polarity')
fig.show()

In [19]:
#TRACKING MOST FREQUENT VOCABULARY USED IN TWEETS

content = clean_content(' '.join(df["text"]))
filtered_sen = create_filtered_sen(content)
fd = create_frequency_df(filtered_sen)

fig = px.bar(fd, x="Word", y="Frequency")
fig.update_traces(marker_color='rgb(38,122,217)',          \
    marker_line_color='rgb(0,0,0)',                      \
    marker_line_width=1.5, opacity=0.8)
fig.show()
     

In [6]:
########################################
## DASHBOARD-LINE CHART AND BAR GRAPH ##
########################################

#PLOTTING THE LINE CHART

fig = make_subplots(
rows=2, cols=1,
row_heights=[0.6, 0.4], 
specs=[[{"type": "scatter"}],
        [{"type": "bar"}]])

fig.add_trace(go.Scatter(
x=time_series,
y=result["Num of '{}' mentions".format(settings.TRACK_WORDS[0])][result['polarity']==0].reset_index(drop=True),
name="Neutral",
line=dict(color='rgb(0,143,211)'),
opacity=0.8), row=1, col=1)   

fig.add_trace(go.Scatter(
x=time_series,
y=result["Num of '{}' mentions".format(settings.TRACK_WORDS[0])][result['polarity']==-1].reset_index(drop=True),
name="Negative",
line=dict(color='rgb(255,127,0)'),
opacity=0.8), row=1, col=1)

fig.add_trace(go.Scatter(
x=time_series,
y=result["Num of '{}' mentions".format(settings.TRACK_WORDS[0])][result['polarity']==1].reset_index(drop=True),
name="Positive",
line=dict(color='rgb(0,211,202)'),
opacity=0.8), row=1, col=1)

fig.add_trace(go.Bar(x=fd["Word"], y=fd["Frequency"], name="Freq Dist"), row=2, col=1)
fig.update_traces(marker_color='rgb(17,159,249)', marker_line_color='rgb(0,0,0)', \
        marker_line_width=0.5, opacity=0.7, row=2, col=1)

fig.update_layout(
height=1500,
title_text="Real-time Sentiment Analysis of Ethereum from Twitter",
showlegend=True,
font=dict(
family="Courier New, monospace",
size=14,
color="#0b0d0a"
),
margin=dict(t=100,  b=100,  
),
)

fig.update_xaxes(title_text="Time in UTC", row=1, col=1)
fig.update_yaxes(title_text="Num of 'Ethereum' mentions", row=1, col=1)

fig.update_xaxes(title_text="Words", row=2, col=1)
fig.update_yaxes(title_text="Frequency", row=2, col=1)

fig.show()


In [7]:
# WORDS MOST FREQUENTLY USED IN NEGATIVE TWEETS

negative_tweets = df[df['polarity'] < -0.1]
negative_content = clean_content(' '.join(negative_tweets["text"]))
negative_filtered_sen = create_filtered_sen(negative_content)
negative_fd = create_frequency_df(negative_filtered_sen)

#PLOT THE BAR GRAPH
fig = px.bar(negative_fd, x="Word", y="Frequency")
fig.update_traces(marker_color='rgb(38,122,217)',          \
    marker_line_color='rgb(0,0,0)',                      \
    marker_line_width=1.5, opacity=0.8)
fig.show()


In [20]:
# WORDS MOST FREQUENTLY USED IN POSITIVE TWEETS

positive_tweets = df[df['polarity'] > 0.1]
positive_content = clean_content(' '.join(positive_tweets["text"]))
positive_filtered_sen = create_filtered_sen(positive_content)
positive_fd = create_frequency_df(positive_filtered_sen)

#PLOT THE BAR GRAPH
fig = px.bar(positive_fd, x="Word", y="Frequency")
fig.update_traces(marker_color='rgb(38,122,217)',          \
    marker_line_color='rgb(0,0,0)',                      \
    marker_line_width=1.5, opacity=0.8)
fig.show()


In [21]:
fig = make_subplots(
    rows=3, cols=1,
    row_heights=[0.3, 0.3, 0.4],  # Corrected values here
    specs=[[{"type": "bar"}],
           [{"type": "bar"}],
           [{"type": "bar"}]])

fig.add_trace(go.Bar(x=fd["Word"], y=fd["Frequency"], name="Freq Dist"), row=1, col=1)
fig.update_traces(marker_color='rgb(17,159,249)', marker_line_color='rgb(0,0,0)', \
        marker_line_width=0.5, opacity=0.7, row=1, col=1)

fig.add_trace(go.Bar(x=negative_fd["Word"], y=negative_fd["Frequency"], name="Negative Sentiment Top Words"), row=2, col=1)
fig.update_traces(marker_color='rgb(255, 0, 0)', marker_line_color='rgb(0,0,0)', marker_line_width=0.5, opacity=0.7, row=2, col=1)

fig.add_trace(go.Bar(x=positive_fd["Word"], y=positive_fd["Frequency"], name="Positive Sentiment Top Words"), row=3, col=1)
fig.update_traces(marker_color='rgb(0, 255, 0)', marker_line_color='rgb(0,0,0)', marker_line_width=0.5, opacity=0.7, row=3, col=1)

fig.update_layout(height=2000)

fig.update_xaxes(title_text="Words", row=1, col=1)
fig.update_yaxes(title_text="Frequency", row=1, col=1)

fig.update_xaxes(title_text="Words", row=2, col=1)
fig.update_yaxes(title_text="Frequency", row=2, col=1)

fig.update_xaxes(title_text="Words", row=3, col=1)
fig.update_yaxes(title_text="Frequency", row=3, col=1)

fig.show()


In [22]:
fig = make_subplots(
    rows=4, cols=1,
    subplot_titles=("Ethereum Sentiment on Twitter","Frequently Used Words in Tweets related to Ethereum","Top Words Used in Negative Sentiment Tweets", "Top Words Used in Positive Sentiment Tweets"),
    row_heights=[0.3, 0.2, 0.2, 0.3], 
    specs=[[{"type": "scatter"}],
           [{"type": "bar"}],
           [{"type": "bar"}],
           [{"type": "bar"}]])
# Time Series
fig.add_trace(go.Scatter(
    x=time_series,
    y=result["Num of '{}' mentions".format(settings.TRACK_WORDS[0])][result['polarity']==0].reset_index(drop=True),
    name="Neutral",
    line=dict(color='rgb(0,143,211)'),
    opacity=0.8), row=1, col=1)
fig.add_trace(go.Scatter(
    x=time_series,
    y=result["Num of '{}' mentions".format(settings.TRACK_WORDS[0])][result['polarity']==-1].reset_index(drop=True),
    name="Negative",
    line=dict(color='rgb(255,127,0)'),
    opacity=0.8), row=1, col=1)
fig.add_trace(go.Scatter(
    x=time_series,
    y=result["Num of '{}' mentions".format(settings.TRACK_WORDS[0])][result['polarity']==1].reset_index(drop=True),
    name="Positive",
    line=dict(color='rgb(0,211,202)'),
    opacity=0.8), row=1, col=1)

fig.add_trace(go.Bar(x=fd["Word"], y=fd["Frequency"], name="Freq Dist"), row=2, col=1)
fig.update_traces(marker_color='rgb(17,159,249)', marker_line_color='rgb(0,0,0)', \
        marker_line_width=0.5, opacity=0.7, row=2, col=1)

fig.add_trace(go.Bar(x=negative_fd["Word"], y=negative_fd["Frequency"], name="Negative Sentiment Top Words"), row=3, col=1)
fig.update_traces(marker_color='rgb(255,127,0)', marker_line_color='rgb(0,0,0)', marker_line_width=0.5, opacity=0.7, row=3, col=1)

fig.add_trace(go.Bar(x=positive_fd["Word"], y=positive_fd["Frequency"], name="Positive Sentiment Top Words"), row=4, col=1)
fig.update_traces(marker_color='rgb(0,211,202)', marker_line_color='rgb(0,0,0)', marker_line_width=0.5, opacity=0.7, row=4, col=1)

fig.update_layout(height=2000)

fig.update_xaxes(title_text="Time in UTC", row=1, col=1)
fig.update_yaxes(title_text="Num of 'Ethereum' mentions", row=1, col=1)

fig.update_xaxes(title_text="Words", row=2, col=1)
fig.update_yaxes(title_text="Frequency", row=2, col=1)

fig.update_xaxes(title_text="Words", row=3, col=1)
fig.update_yaxes(title_text="Frequency", row=4, col=1)

fig.update_xaxes(title_text="Words", row=4, col=1)
fig.update_yaxes(title_text="Frequency", row=4, col=1)

fig.show()
