## Importing necessary libraries

In [1]:
import json

import afinn
import numpy as np
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
from langdetect import DetectorFactory
from langdetect import detect
from textblob import TextBlob
from tqdm import tqdm
from tqdm.notebook import tqdm_notebook

## Loading the Telegram data in JSON

In [2]:
# Read the Telegram export. The context manager closes the file handle as
# soon as the JSON has been parsed instead of leaving it open for the whole
# kernel session.
with open('result.json', encoding='utf8') as file:
    data = json.load(file)

### Extracting only the messages component of data

In [3]:
# Keep only the value stored under the 'messages' key of the loaded export.
messages = data['messages']

### Converting JSON messages into pandas DataFrame

In [4]:
# Build the working DataFrame from the extracted messages.
messagesData = pd.DataFrame(messages)

### A look at the dataframe

In [5]:
# Preview the first five rows.
messagesData.head(5)

Unnamed: 0,id,type,date,from,from_id,text,reply_to_message_id,forwarded_from,actor,actor_id,...,message_id,file,thumbnail,media_type,sticker_emoji,contact_information,contact_vcard,mime_type,duration_seconds,via_bot
0,1903819,message,2021-05-01T00:00:12,,user1650688285,hi,,,,,...,,,,,,,,,,
1,1903842,message,2021-05-01T00:01:13,,user1650688285,do indicators work?,,,,,...,,,,,,,,,,
2,1903855,message,2021-05-01T00:01:50,Social Ch4in,user484605980,If you trade true gbp for gbp Fiat or vice Ver...,,,,,...,,,,,,,,,,
3,1903856,message,2021-05-01T00:02:05,Social Ch4in,user484605980,And how much is that fee?,,,,,...,,,,,,,,,,
4,1903857,message,2021-05-01T00:02:12,Social Ch4in,user484605980,If you’re a silver card holder,,,,,...,,,,,,,,,,


### Dropping unnecessary features like ids of message, fromPerson, toPerson etc.

In [6]:
# Label-based column slice: keeps every column from the first one up to and
# including 'text', so the result depends on the column order of the frame.
messagesData = messagesData.loc[:, :'text']

In [7]:
# Remove the type/identifier/sender columns; only 'date' and 'text' are
# used from here on.
unusedColumns = ['type', 'id', 'from', 'from_id']
messagesData = messagesData.drop(columns = unusedColumns)

### See size of data and the only features remaining are date and text messages.

In [8]:
# Display the frame to check its size and remaining columns.
messagesData

Unnamed: 0,date,text
0,2021-05-01T00:00:12,hi
1,2021-05-01T00:01:13,do indicators work?
2,2021-05-01T00:01:50,If you trade true gbp for gbp Fiat or vice Ver...
3,2021-05-01T00:02:05,And how much is that fee?
4,2021-05-01T00:02:12,If you’re a silver card holder
...,...,...
49431,2021-05-15T23:56:33,Yea I do
49432,2021-05-15T23:57:02,I beleive it will tank.
49433,2021-05-15T23:57:16,Are you worry that too many see that URL is a ...
49434,2021-05-15T23:58:56,Dammit i want some fake giveaways too. Where i...


### The isEnglish function takes in a string/text/message and returns True if it is in English and False otherwise
#### Made use of detect from langdetect library - https://pypi.org/project/langdetect/

In [9]:
def isEnglish(text):
    """Return True when langdetect classifies ``text`` as English.

    Parameters
    ----------
    text : object
        The message to classify. Anything that is not a ``str`` is treated
        as non-English without calling the detector.

    Returns
    -------
    bool
        True if ``detect(text)`` returns ``'en'``; False if it returns
        another language code or raises an ordinary exception.

    Notes
    -----
    Only ``Exception`` subclasses are swallowed. ``KeyboardInterrupt`` and
    ``SystemExit`` propagate, so a long filtering loop can still be stopped.
    """
    if not isinstance(text, str):
        return False
    try:
        return detect(text) == 'en'
    except Exception:
        # Best-effort classification: a detector failure means "not English".
        return False

### Dropping all the non english text rows from dataframe using isEnglish function defined above
#### Takes a few minutes to get executed, Progress is measured using tqdm

In [10]:
# langdetect is non-deterministic unless its seed is fixed; setting it before
# the first detection makes the English/non-English split reproducible.
DetectorFactory.seed = 0

# Classify every message once and filter with a boolean mask, instead of
# dropping rows one at a time from the frame that is being iterated over.
# The surviving rows keep their original index labels, as before.
englishMask = [
    isEnglish(text)
    for text in tqdm(messagesData['text'], total = len(messagesData), desc = 'Filtering non English chats')
]
messagesData = messagesData[englishMask]

Filtering non English chats: 100%|███████| 49436/49436 [02:59<00:00, 274.72it/s]


### We need to reset the index every time we drop rows from the DataFrame

In [11]:
# Replace the gappy index left by the row filtering with a fresh 0..n-1
# index (the old labels are discarded), then display the result.
messagesData = messagesData.reset_index(drop = True)
messagesData

Unnamed: 0,date,text
0,2021-05-01T00:01:13,do indicators work?
1,2021-05-01T00:01:50,If you trade true gbp for gbp Fiat or vice Ver...
2,2021-05-01T00:02:05,And how much is that fee?
3,2021-05-01T00:02:12,If you’re a silver card holder
4,2021-05-01T00:02:14,Make CKB withdrawable !
...,...,...
32749,2021-05-15T23:55:27,Any one belives in shiba?
32750,2021-05-15T23:55:58,That's a meme coin
32751,2021-05-15T23:56:01,Just talk dirty with them.
32752,2021-05-15T23:57:16,Are you worry that too many see that URL is a ...


### The filterOnLetters function returns True if the text/message/string contains "SHIB" or "DOGE" as a whitespace-separated word, and False otherwise

In [12]:
def filterOnLetters(text, word1 = "SHIB", word2="DOGE"):
    """Tell whether ``text`` contains ``word1`` or ``word2`` as a whole token.

    The text is split on whitespace and each keyword is compared against the
    resulting tokens exactly, so the match is case-sensitive and a keyword
    with punctuation attached (e.g. ``"DOGE,"``) does not count.

    Returns True if either keyword is among the tokens, False otherwise.
    """
    tokens = text.split()
    return any(keyword in tokens for keyword in (word1, word2))

### Applying filterOnLetters function on the text column of our dataframe, dropping all the rows whose text doesn't contain "SHIB" or "DOGE"

In [13]:
# Boolean mask of the messages that mention SHIB or DOGE as a whole word;
# only those rows are kept.
mentionsCoin = messagesData['text'].apply(filterOnLetters)
messagesData = messagesData[mentionsCoin]

### We need to reset the index every time we drop rows from the DataFrame

In [14]:
# Fresh 0..n-1 index after the keyword filter (old labels discarded), then
# display the remaining messages.
messagesData = messagesData.reset_index(drop = True)
messagesData

Unnamed: 0,date,text
0,2021-05-01T16:40:46,"If DOGE is more worth than CRO for now, who kn..."
1,2021-05-03T16:44:13,DOGE coin is up by +16.39%🚀🚀🚀
2,2021-05-03T19:50:58,Your did not receive your DOGE from the Superc...
3,2021-05-04T12:45:03,Invest in a coin with fundamentals. DOGE doesn...
4,2021-05-04T18:45:12,How DOGE points work ?
...,...,...
190,2021-05-14T18:17:44,"Hi, what is the email for support? I didn't re..."
191,2021-05-14T23:27:04,new coin SHIB is good or not
192,2021-05-15T21:04:06,"From where to buy SHIB in India, any leads ple..."
193,2021-05-15T21:08:02,SHIB is listed through the Cryptocom App 🙏


### Observation:
#### Initially, we had 49436 rows/messages in our dataframe originally taken from telegram
#### After removing non-english sentences, we had 32802 messages left with us in the dataframe
#### Further, on applying the "DOGE" and "SHIB" filter on the dataFrame, we are left with only 197 messages

### Unsupervised Sentiment Analysis -
#### Using TextBlob lexicon to calculate the sentiment score of each message and further categorizing each message into "positive", "negative" and "neutral" according to the scores assigned by TextBlob

In [15]:
# Polarity score assigned by TextBlob to each message.
messagesData['Polarity_TextBlob'] = messagesData['text'].map(lambda text: TextBlob(text).sentiment.polarity)

# Label each score by its sign. Series.map keeps messagesData's own index,
# so the labels always land on the right rows; assigning a DataFrame built
# from a plain list only lines up when the frame has a pristine 0..n-1 index.
Category_TextBlob = messagesData['Polarity_TextBlob'].map(
    lambda score: 'positive' if score > 0 else 'negative' if score < 0 else 'neutral'
)
messagesData['Category_TextBlob'] = Category_TextBlob

#### Using Afinn lexicon to calculate the sentiment score of each message and further categorizing each message into "positive", "negative" and "neutral" according to the scores assigned by Afinn

In [16]:
# Polarity score assigned by the Afinn lexicon to each message.
anf = afinn.Afinn()
messagesData['Polarity_Afinn'] = messagesData['text'].map(lambda text: anf.score(text))

# Label each score by its sign. Series.map keeps messagesData's own index,
# so the labels always land on the right rows; assigning a DataFrame built
# from a plain list only lines up when the frame has a pristine 0..n-1 index.
Category_Afinn = messagesData['Polarity_Afinn'].map(
    lambda score: 'positive' if score > 0 else 'negative' if score < 0 else 'neutral'
)
messagesData['Category_Afinn'] = Category_Afinn

#### Above two lexicons are used in order to first compare which will suit better to our dataset.

In [17]:
# Display the frame with the TextBlob and Afinn score/category columns.
messagesData

Unnamed: 0,date,text,Polarity_TextBlob,Category_TextBlob,Polarity_Afinn,Category_Afinn
0,2021-05-01T16:40:46,"If DOGE is more worth than CRO for now, who kn...",0.266667,positive,2.0,positive
1,2021-05-03T16:44:13,DOGE coin is up by +16.39%🚀🚀🚀,0.000000,neutral,0.0,neutral
2,2021-05-03T19:50:58,Your did not receive your DOGE from the Superc...,0.000000,neutral,0.0,neutral
3,2021-05-04T12:45:03,Invest in a coin with fundamentals. DOGE doesn...,-0.071429,negative,0.0,neutral
4,2021-05-04T18:45:12,How DOGE points work ?,0.000000,neutral,0.0,neutral
...,...,...,...,...,...,...
190,2021-05-14T18:17:44,"Hi, what is the email for support? I didn't re...",0.000000,neutral,4.0,positive
191,2021-05-14T23:27:04,new coin SHIB is good or not,0.418182,positive,3.0,positive
192,2021-05-15T21:04:06,"From where to buy SHIB in India, any leads ple...",0.000000,neutral,1.0,positive
193,2021-05-15T21:08:02,SHIB is listed through the Cryptocom App 🙏,0.000000,neutral,0.0,neutral


### Adding one more column to our dataset, categorizing each message by whether it contains only "DOGE", only "SHIB", or both.
#### categorizeOnDogeOrShib is defined in order to check in a text/message/string if it contains "DOGE" or "SHIB" or both.

In [18]:
def categorizeOnDogeOrShib(text):
    """Label ``text`` by which coin keywords appear among its whitespace tokens.

    Returns "DOGESHIB" when both "DOGE" and "SHIB" are tokens, "DOGE" when
    only "DOGE" is, and "SHIB" in every other case (including text that
    contains neither keyword). Matching is exact and case-sensitive.
    """
    tokens = text.split()
    if "DOGE" not in tokens:
        return "SHIB"
    return "DOGESHIB" if "SHIB" in tokens else "DOGE"

In [19]:
# Label every message with categorizeOnDogeOrShib so the decision is made on
# whole whitespace-separated tokens, the same rule filterOnLetters used to
# select the rows. Testing `"DOGE" in text` on the raw string is a substring
# match and would, for example, count "DOGECOIN" as a DOGE mention.
messagesData['DOGE OR SHIB'] = messagesData['text'].map(categorizeOnDogeOrShib)

### Grouping the data according to the new feature for comparison of the two lexicons being used

In [20]:
# Summary statistics of the polarity scores, one row per coin label.
coinGroups = messagesData.groupby('DOGE OR SHIB')
coinGroups.describe()

Unnamed: 0_level_0,Polarity_TextBlob,Polarity_TextBlob,Polarity_TextBlob,Polarity_TextBlob,Polarity_TextBlob,Polarity_TextBlob,Polarity_TextBlob,Polarity_TextBlob,Polarity_Afinn,Polarity_Afinn,Polarity_Afinn,Polarity_Afinn,Polarity_Afinn,Polarity_Afinn,Polarity_Afinn,Polarity_Afinn
Unnamed: 0_level_1,count,mean,std,min,25%,50%,75%,max,count,mean,std,min,25%,50%,75%,max
DOGE OR SHIB,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2
DOGE,55.0,0.071495,0.248532,-0.8,0.0,0.0,0.2,0.625,55.0,0.436364,2.043632,-6.0,0.0,0.0,1.5,7.0
DOGESHIB,7.0,0.012857,0.034017,0.0,0.0,0.0,0.0,0.09,7.0,0.285714,1.704336,-3.0,0.0,0.0,1.5,2.0
SHIB,133.0,0.029578,0.228623,-1.0,0.0,0.0,0.0,1.0,133.0,-0.398496,2.279486,-7.0,-1.0,0.0,0.0,5.0


## Comparison of Afinn and TextBlob

In [21]:
# Picking the message index with most Afinn polarity i.e. most positive sentiment message according to Afinn.
# idxmax returns the label of the first row holding the maximum score.
pos_idx = messagesData['Polarity_Afinn'].idxmax()

# Picking the message index with least Afinn polarity i.e. most negative sentiment message according to Afinn.
neg_idx = messagesData['Polarity_Afinn'].idxmin()

# The extremes are looked up from the data rather than hard-coded, so the
# cell keeps working if the maximum/minimum score changes. idxmax/idxmin
# return index labels, hence the label-based .loc lookup.
print(messagesData.loc[pos_idx, 'text'])
print(messagesData.loc[neg_idx, 'text'])

Lol now that's funny. I can't convert DOGE to CRO.
The supercharger with SHIB was a real dissappointment. SHIB is another useless nonsense meme coin and worse it's an ERC20. 

I would much prefer to see coins on supercharger with real potential and utility. ENJ was alright. But what about coin launches on supercharger or having more than one coin at a time???


### Observation:
#### Most positive sentiment according to Afinn doesn't really seem like a positive message as the user is disappointed with the SHIB coin.
#### Most negative sentiment according to Afinn doesn't really sound like a negative message as the user is being neutral

In [22]:
# Picking the message index with most TextBlob polarity i.e. most positive sentiment message according to TextBlob.
# idxmax returns the label of the first row holding the maximum score.
pos_idx = messagesData['Polarity_TextBlob'].idxmax()

# Picking the message index with least TextBlob polarity i.e. most negative sentiment message according to TextBlob.
neg_idx = messagesData['Polarity_TextBlob'].idxmin()

# The extremes are looked up from the data instead of testing the float
# scores for equality with +1 / -1. idxmax/idxmin return index labels,
# hence the label-based .loc lookup.
print(messagesData.loc[pos_idx, 'text'])
print(messagesData.loc[neg_idx, 'text'])


SHIB is win!
listing SHIB is really disgusting for CDC.


## Observation:
### While looking at most positive and most negative messages by TextBlob and Afinn, we observe that TextBlob is indeed actually better scoring the text in terms of sentiments.
### Therefore, I will go with TextBlob and consider its scores to be the more reliable and accurate of the two.

## Graphical Differences in Afinn and TextBlob scorings

In [24]:
# Count the messages per sentiment label and read the counts by label.
# value_counts() orders its result by frequency, so reading it by position
# only matches the Positive/Negative/Neutral axis when the labels happen to
# rank in one particular order. reindex pins the order and reports 0 for a
# label that never occurs.
sentimentLabels = ['positive', 'negative', 'neutral']
afinnCounts = messagesData['Category_Afinn'].value_counts().reindex(sentimentLabels, fill_value=0)
textBlobCounts = messagesData['Category_TextBlob'].value_counts().reindex(sentimentLabels, fill_value=0)

fig = go.Figure()
x = ['Positive', 'Negative', 'Neutral']
fig.add_trace(go.Bar(
    x= x,
    y= afinnCounts.tolist(),
    name='Afinn',
    marker_color='indianred'
))
fig.add_trace(go.Bar(
    x=x,
    y= textBlobCounts.tolist(),
    name='TextBlob',
    marker_color='green'
))

fig.update_layout(barmode='group',
                 title="Afinn v/s TextBlob polarity differences",
                xaxis_title="Sentiment",
                yaxis_title="Number of messages")
fig.show()
fig.write_image("AfinnVSTextBlob.png")

### Since, we are only observing dates 1-15 of May 2021, we don't really need month, year and time in the date column, we therefore slice our date to what's required from it.

In [25]:
# Characters 8-9 of the timestamp string are the day of the month.
# NOTE: this overwrites 'date', so the cell must not be run twice on the
# same frame.
dayOfMonth = messagesData['date'].str[8:10]
messagesData['date'] = dayOfMonth

In [26]:
# Display the frame now that 'date' holds only the day of the month.
messagesData

Unnamed: 0,date,text,Polarity_TextBlob,Category_TextBlob,Polarity_Afinn,Category_Afinn,DOGE OR SHIB
0,01,"If DOGE is more worth than CRO for now, who kn...",0.266667,positive,2.0,positive,DOGE
1,03,DOGE coin is up by +16.39%🚀🚀🚀,0.000000,neutral,0.0,neutral,DOGE
2,03,Your did not receive your DOGE from the Superc...,0.000000,neutral,0.0,neutral,DOGE
3,04,Invest in a coin with fundamentals. DOGE doesn...,-0.071429,negative,0.0,neutral,DOGE
4,04,How DOGE points work ?,0.000000,neutral,0.0,neutral,DOGE
...,...,...,...,...,...,...,...
190,14,"Hi, what is the email for support? I didn't re...",0.000000,neutral,4.0,positive,DOGE
191,14,new coin SHIB is good or not,0.418182,positive,3.0,positive,SHIB
192,15,"From where to buy SHIB in India, any leads ple...",0.000000,neutral,1.0,positive,SHIB
193,15,SHIB is listed through the Cryptocom App 🙏,0.000000,neutral,0.0,neutral,SHIB


In [27]:
# Number of messages per day, most active day first.
messagesData['date'].value_counts()

10    69
08    34
11    32
09    18
13    11
12     7
14     5
06     4
07     4
04     3
15     3
03     2
05     2
01     1
Name: date, dtype: int64

In [28]:
# Bar chart of the message volume per day, built from the 'date' column.
axisLabels = {"x": "Date (May 2021)"}
fig = px.bar(x=messagesData['date'], title="Total messages per day", labels=axisLabels)
fig.update_traces(marker_color='green')
fig.show()
fig.write_image("TotalMessagesPerDate.png")

### Observations:
#### Number of messages increase suddenly on 8th and decrease post 11th, this might be an indicator of price change in either "DOGE" or "SHIB" or both, as people start discussing more about these in the given date range
#### We will later observe if these messages indicate a positive or a negative sentiment towards these coins.

In [39]:
# Number of messages for every (date, TextBlob category) combination that
# occurs in the data.
p = messagesData.groupby(['date','Category_TextBlob']).size()

def numberOfSentimentsPerDay(kindOfSentiment, y, p):
    """Append the per-day count of one sentiment category for days 01-15.

    Parameters
    ----------
    kindOfSentiment : str
        Category to look up, e.g. 'positive', 'negative' or 'neutral'.
    y : list
        List that receives the counts. It is modified in place: exactly 15
        values are appended, one per day from "01" to "15". Pass a fresh
        empty list unless accumulating is intended.
    p : pandas.Series
        Counts indexed first by two-digit day string, then by category.

    Returns
    -------
    list
        The same ``y`` object, after the 15 values have been appended. A
        day or category missing from ``p`` contributes 0.

    Notes
    -----
    Only ``KeyError`` (the missing day/category case) is turned into a zero.
    Any other failure, such as passing something that is not indexable,
    propagates instead of silently producing a list of zeros.
    """
    for i in range(1,16):
        try:
            y.append(p["{:02d}".format(i)][kindOfSentiment])
        except KeyError:
            y.append(0)
    return y

# One fresh list per sentiment: the helper appends its 15 daily counts to
# the list it is given and returns that same list.
yPositive = numberOfSentimentsPerDay('positive', [], p)
yNegative = numberOfSentimentsPerDay('negative', [], p)
yNeutral = numberOfSentimentsPerDay('neutral', [], p)


In [48]:
fig = go.Figure()
# The count lists hold one value for every day 01..15, so the x axis must
# list those same 15 days. Using the unique dates found in the data drops
# the days without messages, which shifts every later bar onto the wrong day.
x = ["{:02d}".format(day) for day in range(1, 16)]
yPos = yPositive
yNeg = yNegative
yNeut = yNeutral

fig.add_trace(go.Bar(
    x= x,
    y= yPos,
    name='Positive',
    marker_color='green'
))

fig.add_trace(go.Bar(
    x=x,
    y= yNeg,
    name='Negative',
    marker_color='indianred'
))
fig.add_trace(go.Bar(
    x=x,
    y= yNeut,
    name='Neutral',
    marker_color='orange'
))

fig.update_layout(
    title="Per Day Sentiment Analysis",
    xaxis_title="Dates - May 2021",
    yaxis_title="Number of messages",
)

fig.show()
fig.write_image("averageSentimentPerDayPlot.png")

### Observations:
#### In the date range [9-13], there are more positive sentiment messages in the group, which might be an indicator of people benefitting of either "DOGE" or "SHIB", it might also be the case that people are neutral for either one of "DOGE" or "SHIB" but benefitting from something else as there are comparable negative sentiments too.
#### There's a high number of neutral messages on the 11th; it might be the case that prices remained as expected, i.e. no profit and no loss, for one of the coins.

In [41]:
# Per-coin subsets: messages labelled with exactly one coin. Rows labelled
# "DOGESHIB" (both coins mentioned) end up in neither subset.
dogeData = messagesData[messagesData['DOGE OR SHIB'] == "DOGE"]
shibData = messagesData[messagesData['DOGE OR SHIB'] == "SHIB"]

In [42]:
# Message counts per (date, TextBlob category) for the DOGE-only messages.
p = dogeData.groupby(['date','Category_TextBlob']).size()

# One fresh list per sentiment: the helper appends its 15 daily counts to
# the list it is given and returns that same list.
yDogePositive = numberOfSentimentsPerDay('positive', [], p)
yDogeNegative = numberOfSentimentsPerDay('negative', [], p)
yDogeNeutral = numberOfSentimentsPerDay('neutral', [], p)


In [49]:
fig = go.Figure()
# The count lists hold one value for every day 01..15, so the x axis must
# list those same 15 days. Using only the dates that have DOGE messages
# would pair the counts with the wrong days.
x = ["{:02d}".format(day) for day in range(1, 16)]
yPos = yDogePositive
yNeg = yDogeNegative
yNeut = yDogeNeutral

fig.add_trace(go.Bar(
    x= x,
    y= yPos,
    name='Positive',
    marker_color='green'
))

fig.add_trace(go.Bar(
    x=x,
    y= yNeg,
    name='Negative',
    marker_color='indianred'
))
fig.add_trace(go.Bar(
    x=x,
    y= yNeut,
    name='Neutral',
    marker_color='orange'
))

fig.update_layout(
    title="Per Day Sentiment Analysis - DOGE",
    xaxis_title="Dates - May 2021",
    yaxis_title="Number of messages",
)

fig.show()
fig.write_image("averageSentimentPerDayPlot-DOGE.png")

### Observations : 
#### The price of DOGE must have risen on the 11th or 12th, as there are positive sentiments for DOGE on these dates; on prior days there are comparable negative sentiments, indicating that prices might have been falling before the 11th.

In [45]:
# Message counts per (date, TextBlob category) for the SHIB-only messages.
p = shibData.groupby(['date','Category_TextBlob']).size()

# Each SHIB series starts from its own empty list. numberOfSentimentsPerDay
# appends to the list it receives, so handing it the DOGE lists would tack
# the SHIB counts onto the DOGE counts (30 values instead of 15) and corrupt
# the DOGE series as well.
yShibPositive = numberOfSentimentsPerDay('positive', [], p)
yShibNegative = numberOfSentimentsPerDay('negative', [], p)
yShibNeutral = numberOfSentimentsPerDay('neutral', [], p)

In [47]:
fig = go.Figure()
# The count lists are built for the days 01..15 in order, so the x axis
# lists those same days. Using only the dates that have SHIB messages
# would pair the counts with the wrong days.
x = ["{:02d}".format(day) for day in range(1, 16)]
yPos = yShibPositive
yNeg = yShibNegative
yNeut = yShibNeutral

fig.add_trace(go.Bar(
    x= x,
    y= yPos,
    name='Positive',
    marker_color='green'
))

fig.add_trace(go.Bar(
    x=x,
    y= yNeg,
    name='Negative',
    marker_color='indianred'
))
fig.add_trace(go.Bar(
    x=x,
    y= yNeut,
    name='Neutral',
    marker_color='orange'
))

fig.update_layout(
    title="Per Day Sentiment Analysis - SHIB",
    xaxis_title="Dates - May 2021",
    yaxis_title="Number of messages",
)

fig.show()
fig.write_image("averageSentimentPerDayPlot-SHIB.png")