# Sentiment analysis using existing toolkits like VADER and TextBlob

In [1]:
import csv
import re

import numpy as np
import pandas as pd
import plotly.express as px
from plotly.offline import init_notebook_mode
from plotly.offline import plot
from textblob import TextBlob
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

from pre import clean_text, remove_stopwords

[nltk_data] Downloading package stopwords to C:\Users\SHIVAM
[nltk_data]     GUPTA\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [2]:

# Read the collected tweets (path is relative to this notebook) and show the frame.
# NOTE(review): the displayed frame has an 'Unnamed: 0' column, which suggests the
# CSV was saved together with its index -- consider index_col=0 if that is the case.
tweets_csv_path = "../data_collection/tweets_data.csv"
df = pd.read_csv(tweets_csv_path)
df

Unnamed: 0,tweet_id_str,date_time,location,tweet_text,media_urls
0,1844078887871058378,Wed Oct 09 18:14:02 +0000 2024,No location provided,bharat lost real ratan. end era om shanti #rat...,
1,1843186838787526796,Mon Oct 07 07:09:21 +0000 2024,No location provided,thank thinking,
2,1844294321736122741,Thu Oct 10 08:30:06 +0000 2024,No location provided,om shanti ... #ratantata,
3,1844092588506349768,Wed Oct 09 19:08:29 +0000 2024,No location provided,"india lost ratan ! institution , remembered ge...",
4,1844189159876968623,Thu Oct 10 01:32:13 +0000 2024,No location provided,deepest condolences ' bharat ratna ' true sens...,
...,...,...,...,...,...
394,1844089092931453247,Wed Oct 09 18:54:35 +0000 2024,No location provided,india bows silence tonight. #ratantata,
395,1844727757290299825,Fri Oct 11 13:12:25 +0000 2024,No location provided,#ratantata #tatatrust #tatagroup,
396,1844094311715176510,Wed Oct 09 19:15:20 +0000 2024,No location provided,big loss end era #ratantata,
397,1844080814415282460,Wed Oct 09 18:21:42 +0000 2024,No location provided,chairman emeritus ratan tata guided tata group...,


In [3]:
# Preview the tweet text column. The final row (index 398) is NaN; it is replaced
# by an empty string before the tweets are scored further down.
df['tweet_text']

0      bharat lost real ratan. end era om shanti #rat...
1                                         thank thinking
2                               om shanti ... #ratantata
3      india lost ratan ! institution , remembered ge...
4      deepest condolences ' bharat ratna ' true sens...
                             ...                        
394               india bows silence tonight. #ratantata
395                     #ratantata #tatatrust #tatagroup
396                          big loss end era #ratantata
397    chairman emeritus ratan tata guided tata group...
398                                                  NaN
Name: tweet_text, Length: 399, dtype: object

<hr>

## 1. Sentiment Analysis with TextBlob

In [4]:
def getSubjectivity(text):
    """Return TextBlob's subjectivity score for ``text``.

    Args:
        text: The tweet text to score. Values that are not strings (for
            example the NaN float pandas uses for a missing tweet) are
            accepted and scored as 0.0.

    Returns:
        float: ``TextBlob(text).sentiment.subjectivity`` for string input,
        otherwise 0.0.
    """
    # The tweet column contains a missing value, and TextBlob only accepts
    # strings. Score such entries as 0.0 -- the value obtained when the gap
    # is filled with an empty string -- instead of raising.
    if not isinstance(text, str):
        return 0.0
    return TextBlob(text).sentiment.subjectivity

def getPolarity(text):
    """Return TextBlob's polarity score for ``text``.

    Args:
        text: The tweet text to score. Values that are not strings (for
            example the NaN float pandas uses for a missing tweet) are
            accepted and scored as 0.0.

    Returns:
        float: ``TextBlob(text).sentiment.polarity`` for string input,
        otherwise 0.0.
    """
    # The tweet column contains a missing value, and TextBlob only accepts
    # strings. Score such entries as 0.0 -- the value obtained when the gap
    # is filled with an empty string -- instead of raising.
    if not isinstance(text, str):
        return 0.0
    return TextBlob(text).sentiment.polarity

In [5]:

# Sanity check on one hand-written sentence: shows (polarity, subjectivity).
your_text = 'I like this movie'
tuple(score(your_text) for score in (getPolarity, getSubjectivity))

(0.0, 0.0)

We now compute subjectivity and polarity scores for each of our tweets and add them to our dataframe. Missing tweets are first replaced with an empty string so that TextBlob can process them.

In [6]:
# Replace missing tweets (NaN) with an empty string so every row can be scored.
df['tweet_text'] = df['tweet_text'].fillna('')

# getSubjectivity and getPolarity are defined in an earlier cell; they are reused
# here rather than redefined, so there is a single definition of each helper.
df['subjectivity'] = df['tweet_text'].apply(getSubjectivity)
df['polarity'] = df['tweet_text'].apply(getPolarity)

df

Unnamed: 0,tweet_id_str,date_time,location,tweet_text,media_urls,subjectivity,polarity
0,1844078887871058378,Wed Oct 09 18:14:02 +0000 2024,No location provided,bharat lost real ratan. end era om shanti #rat...,,0.300000,0.200000
1,1843186838787526796,Mon Oct 07 07:09:21 +0000 2024,No location provided,thank thinking,,0.000000,0.000000
2,1844294321736122741,Thu Oct 10 08:30:06 +0000 2024,No location provided,om shanti ... #ratantata,,0.000000,0.000000
3,1844092588506349768,Wed Oct 09 19:08:29 +0000 2024,No location provided,"india lost ratan ! institution , remembered ge...",,0.000000,0.000000
4,1844189159876968623,Thu Oct 10 01:32:13 +0000 2024,No location provided,deepest condolences ' bharat ratna ' true sens...,,0.470000,0.010000
...,...,...,...,...,...,...,...
394,1844089092931453247,Wed Oct 09 18:54:35 +0000 2024,No location provided,india bows silence tonight. #ratantata,,0.000000,0.000000
395,1844727757290299825,Fri Oct 11 13:12:25 +0000 2024,No location provided,#ratantata #tatatrust #tatagroup,,0.000000,0.000000
396,1844094311715176510,Wed Oct 09 19:15:20 +0000 2024,No location provided,big loss end era #ratantata,,0.100000,0.000000
397,1844080814415282460,Wed Oct 09 18:21:42 +0000 2024,No location provided,chairman emeritus ratan tata guided tata group...,,0.454545,0.136364


Creating a function to add a sentiment label to each tweet, based on its polarity score.

In [7]:
def get_sentiment_label(score, threshold=0.0):
    """Map a numeric sentiment score to 'Negative', 'Neutral' or 'Positive'.

    Args:
        score: Sentiment score where negative values mean negative sentiment
            and positive values mean positive sentiment.
        threshold: Half-width of the neutral band around zero. Scores in
            ``[-threshold, threshold]`` are labelled 'Neutral'. The default
            of 0.0 keeps the original rule (only an exact 0 is neutral); a
            small positive value such as 0.05 can be used to avoid labelling
            near-zero scores as positive or negative.

    Returns:
        str: 'Negative' if ``score < -threshold``, 'Positive' if
        ``score > threshold``, otherwise 'Neutral'.
    """
    if score < -threshold:
        return 'Negative'
    if score > threshold:
        return 'Positive'
    # Reached for scores inside the neutral band and for NaN, which fails both
    # comparisons above and must not be reported as 'Positive'.
    return 'Neutral'

In [8]:
# Label every tweet from its TextBlob polarity score and store the labels in a
# new 'TBsentiment' column of the dataframe.

df['TBsentiment'] = df['polarity'].map(get_sentiment_label)
df

Unnamed: 0,tweet_id_str,date_time,location,tweet_text,media_urls,subjectivity,polarity,TBsentiment
0,1844078887871058378,Wed Oct 09 18:14:02 +0000 2024,No location provided,bharat lost real ratan. end era om shanti #rat...,,0.300000,0.200000,Positive
1,1843186838787526796,Mon Oct 07 07:09:21 +0000 2024,No location provided,thank thinking,,0.000000,0.000000,Neutral
2,1844294321736122741,Thu Oct 10 08:30:06 +0000 2024,No location provided,om shanti ... #ratantata,,0.000000,0.000000,Neutral
3,1844092588506349768,Wed Oct 09 19:08:29 +0000 2024,No location provided,"india lost ratan ! institution , remembered ge...",,0.000000,0.000000,Neutral
4,1844189159876968623,Thu Oct 10 01:32:13 +0000 2024,No location provided,deepest condolences ' bharat ratna ' true sens...,,0.470000,0.010000,Positive
...,...,...,...,...,...,...,...,...
394,1844089092931453247,Wed Oct 09 18:54:35 +0000 2024,No location provided,india bows silence tonight. #ratantata,,0.000000,0.000000,Neutral
395,1844727757290299825,Fri Oct 11 13:12:25 +0000 2024,No location provided,#ratantata #tatatrust #tatagroup,,0.000000,0.000000,Neutral
396,1844094311715176510,Wed Oct 09 19:15:20 +0000 2024,No location provided,big loss end era #ratantata,,0.100000,0.000000,Neutral
397,1844080814415282460,Wed Oct 09 18:21:42 +0000 2024,No location provided,chairman emeritus ratan tata guided tata group...,,0.454545,0.136364,Positive


We can have a quick look at the sentiment distribution of the tweets as follows:

In [9]:
# Count how many tweets TextBlob placed in each sentiment class.
df['TBsentiment'].value_counts()

TBsentiment
Neutral     219
Positive    156
Negative     24
Name: count, dtype: int64

In [10]:
# Keep only the rows TextBlob labelled 'Negative', then print their text next to
# the polarity score that produced the label.
negative_tweets = df.loc[df['TBsentiment'].eq('Negative')]
print("\nNegative Sentiment Tweets:")
print(negative_tweets.loc[:, ['tweet_text', 'polarity']])


Negative Sentiment Tweets:
                                            tweet_text  polarity
7    say gone .. ' hard bear loss .. hard .. farewe... -0.291667
18                       proved world wrong #ratantata -0.500000
28   varanasi , uttar pradesh tribute #ratantata he... -0.125000
51   india ' titan ratan tata , credited transformi... -0.150000
62   saddened hear passing ratan tata sir. unmatche... -0.250000
68   monsoons , lot stray cats dogs take shelter ca... -0.033333
81   1st picture marine drive india world cup 2nd p... -0.250000
113  sushant singh rajput died , remained silent. r... -0.042857
122  ratan tata longer us. ratan tata , aged 86 , e... -0.100000
140  0 posts ratan tata sir sushant singh rajput sk... -0.287879
164  ' never spoke ill anyone , much learn ' diljit... -0.150000
176  sad announce ratan tata departed heavenly abod... -0.138889
195  news expected morning , life. though never met... -0.033333
212  #statecraftinthenews 1 8 ratan tata , 86 , pas... -0.1333

In [11]:
# Order the tweets from the highest to the lowest TextBlob polarity.
sorted_df = df.sort_values('polarity', ascending=False)

Top 15 most _positive_ tweets, which are now the first 15 tweets in the new dataframe.

In [12]:
# Print the first 15 tweets of the polarity-sorted frame, numbered from 1 and
# each followed by a blank line.
for rank, tweet in enumerate(sorted_df['tweet_text'].head(15), start=1):
    print(rank, tweet, '\n')

1 today , remember legendary patron , mr ratan tata. wisdom , kindness , leadership continue serve beacon , reminding us work integrity innovation , , throughout illustrious journey. #ratantata #ratannavaltata #thetitan #visionary #icon 

2 great soul thank inspiration .. rip sir ratan tata ! ! #ratantatasir #ratantatapassesaway #ratantatarip #ratantataliveson 

3 best speech #ratantata . 

4 forget airport. read billboard ... priceless ratan tata 

5 titan , icon , greatest #ratantata passed away .... rip sir 

6 ratan tata renowned indian businessman philanthropist , best known leading tata group 1991 2012. ' remarkable life journey 

7 great great great great wonderful man shri ratan tata g #ratantatasir 

8 memory ratan tata iim mumbai pays tribute great leader philanthropist whose impact forever inspire us. rest peace. #iim #iimmumbai #legend #ratantata #rip #riplegend #willmissyou #youwillbemissed 

9 ' great loss #ratantata #ripratantata 

10 ratan tata man made india proud. leg

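Top 15 most _negative_ tweets, which are now the last 15 tweets in the new dataframe. Because the dataframe is sorted from highest to lowest polarity, they are printed from least to most negative, so the most negative tweet appears last.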

In [13]:
# Print the last 15 tweets of the polarity-sorted frame, numbered from 1 and
# each followed by a blank line.
for rank, tweet in enumerate(sorted_df['tweet_text'].tail(15), start=1):
    print(rank, tweet, '\n')

1 sad announce ratan tata departed heavenly abode. served chairman tata group tata sons 1990 2012 tenure , tata motors acquired jaguar land rover. mercedes benz also brought india tenure , joint venture telco previous name tata motors first owner w124 e220 fondly remembered everyone , efforts indian automotive sector. may soul rest peace #ratantata #ripratantata 

2 ' never spoke ill anyone , much learn ' diljit dosanjh halts germany concert pay tribute ratan tata meanwhile , mumbai , garba dandiya night also saw tribute ratan tata , crowd paused celebrations honour memory. #ratantata #tatagroup #diljitdosanjh 

3 india ' titan ratan tata , credited transforming tata group globally renowned conglomerate , passes away age 86 , company said late wednesday. ratan naval tata 28.12.1937 09.10.2024. may soul rest peace. #ratantata #tatagroup 

4 curious language ! ! ! #ratantata 

5 honouring late shri ratan tata , whose legacy generosity tata trusts gave karunashraya early hope support. kin

Now let's visualise the distribution of the polarity and subjectivity assignments from TextBlob. To do this, we will make use of interactive plots from [Plotly](https://plotly.com/python/).

Interactive Plotly plots make use of JavaScript behind the scenes. To connect our Jupyter notebook with JavaScript, we need to execute the following line of code:

In [14]:
# Initialise Plotly's notebook mode so that figures can be displayed in Jupyter.

init_notebook_mode(connected=True)

Below, we use [Plotly Express](https://plotly.com/python/plotly-express/) to create a simple scatter plot of the polarity and subjectivity data. As this is an interactive plot, you will be able to hover your mouse over a point to view its properties.

Note how plotly express automatically labels our axes for us according to our dataframe column names.

In [15]:
# Scatter TextBlob polarity against subjectivity; hovering over a point also
# shows the tweet text. (`plot` is imported in the notebook's import cell.)
fig = px.scatter(df, x="polarity", y="subjectivity", hover_data=['tweet_text'],
                 title="TextBlob Sentiment Analysis")

# Offline mode: the figure is written to an HTML file ('temp-plot.html' when no
# filename is given) and the file name is returned as the cell output.
plot(fig)


'temp-plot.html'

<hr>

## 2. Sentiment Analysis with VADER

In [16]:
# Create a single VADER analyser; it is reused for every text scored below.
analyser = SentimentIntensityAnalyzer()

In [17]:
# Try VADER on one example sentence. The result is a dict with 'neg', 'neu' and
# 'pos' scores plus an overall 'compound' score.

your_text = 'i like this movie'
analyser.polarity_scores(your_text)

{'neg': 0.0, 'neu': 0.545, 'pos': 0.455, 'compound': 0.3612}

Let us now use VADER to retrieve the compound sentiment score for all tweets and add this information to our original (unsorted) dataframe.

In [18]:
# Helper returning VADER's overall ('compound') score for a single text

def get_vaderCompoundPolarity(text):
    """Return the 'compound' entry of the analyser's polarity scores for ``text``."""
    scores = analyser.polarity_scores(text)
    return scores['compound']
    
# Score every tweet with VADER and keep the compound score in a new column.
df['vader_compound'] = df['tweet_text'].map(get_vaderCompoundPolarity)
df

Unnamed: 0,tweet_id_str,date_time,location,tweet_text,media_urls,subjectivity,polarity,TBsentiment,vader_compound
0,1844078887871058378,Wed Oct 09 18:14:02 +0000 2024,No location provided,bharat lost real ratan. end era om shanti #rat...,,0.300000,0.200000,Positive,-0.3182
1,1843186838787526796,Mon Oct 07 07:09:21 +0000 2024,No location provided,thank thinking,,0.000000,0.000000,Neutral,0.3612
2,1844294321736122741,Thu Oct 10 08:30:06 +0000 2024,No location provided,om shanti ... #ratantata,,0.000000,0.000000,Neutral,0.0000
3,1844092588506349768,Wed Oct 09 19:08:29 +0000 2024,No location provided,"india lost ratan ! institution , remembered ge...",,0.000000,0.000000,Neutral,0.3595
4,1844189159876968623,Thu Oct 10 01:32:13 +0000 2024,No location provided,deepest condolences ' bharat ratna ' true sens...,,0.470000,0.010000,Positive,0.5994
...,...,...,...,...,...,...,...,...,...
394,1844089092931453247,Wed Oct 09 18:54:35 +0000 2024,No location provided,india bows silence tonight. #ratantata,,0.000000,0.000000,Neutral,0.0000
395,1844727757290299825,Fri Oct 11 13:12:25 +0000 2024,No location provided,#ratantata #tatatrust #tatagroup,,0.000000,0.000000,Neutral,0.0000
396,1844094311715176510,Wed Oct 09 19:15:20 +0000 2024,No location provided,big loss end era #ratantata,,0.100000,0.000000,Neutral,-0.3182
397,1844080814415282460,Wed Oct 09 18:21:42 +0000 2024,No location provided,chairman emeritus ratan tata guided tata group...,,0.454545,0.136364,Positive,0.0000


Let us once again apply the 'get_sentiment_label' function to assign the VADER sentiment of each tweet given the compound score.

In [19]:

# Label every tweet from its VADER compound score and store the labels in a
# new 'VADERsentiment' column of the dataframe.

df['VADERsentiment'] = df['vader_compound'].map(get_sentiment_label)
df

Unnamed: 0,tweet_id_str,date_time,location,tweet_text,media_urls,subjectivity,polarity,TBsentiment,vader_compound,VADERsentiment
0,1844078887871058378,Wed Oct 09 18:14:02 +0000 2024,No location provided,bharat lost real ratan. end era om shanti #rat...,,0.300000,0.200000,Positive,-0.3182,Negative
1,1843186838787526796,Mon Oct 07 07:09:21 +0000 2024,No location provided,thank thinking,,0.000000,0.000000,Neutral,0.3612,Positive
2,1844294321736122741,Thu Oct 10 08:30:06 +0000 2024,No location provided,om shanti ... #ratantata,,0.000000,0.000000,Neutral,0.0000,Neutral
3,1844092588506349768,Wed Oct 09 19:08:29 +0000 2024,No location provided,"india lost ratan ! institution , remembered ge...",,0.000000,0.000000,Neutral,0.3595,Positive
4,1844189159876968623,Thu Oct 10 01:32:13 +0000 2024,No location provided,deepest condolences ' bharat ratna ' true sens...,,0.470000,0.010000,Positive,0.5994,Positive
...,...,...,...,...,...,...,...,...,...,...
394,1844089092931453247,Wed Oct 09 18:54:35 +0000 2024,No location provided,india bows silence tonight. #ratantata,,0.000000,0.000000,Neutral,0.0000,Neutral
395,1844727757290299825,Fri Oct 11 13:12:25 +0000 2024,No location provided,#ratantata #tatatrust #tatagroup,,0.000000,0.000000,Neutral,0.0000,Neutral
396,1844094311715176510,Wed Oct 09 19:15:20 +0000 2024,No location provided,big loss end era #ratantata,,0.100000,0.000000,Neutral,-0.3182,Negative
397,1844080814415282460,Wed Oct 09 18:21:42 +0000 2024,No location provided,chairman emeritus ratan tata guided tata group...,,0.454545,0.136364,Positive,0.0000,Neutral


Let's have a look at what VADER has classified as the 15 most positive and the 15 most negative tweets, using the same method shown in the TextBlob example.

In [20]:
# Order the tweets from the highest to the lowest VADER compound score.
sorted_df2 = df.sort_values('vader_compound', ascending=False)

In [21]:

# Print the first 15 tweets of the VADER-sorted frame, numbered from 1 and each
# followed by a blank line.
for rank, tweet in enumerate(sorted_df2['tweet_text'].head(15), start=1):
    print(rank, tweet, '\n')

1 ratan tata , personal hero loss ratan tata sir passing. loss feels personal me. hero growing , like many others. consider blessed gotten opportunity know personal hero deeply last decade. meeting interacting many times time , learning lessons life. relationship mr. tata started 2008. graduating iit bombay guest speaker convocation. young kid words day stayed serve one country. 2015 , got chance meet mr tata decided invest ola. interactions end there. begun ! mr tata like business leader ever met. took personal interest journey. every interaction grace , humility , curiosity world learning me. 2016 , invited come bengaluru address whole company. thought person stature would politely decline actually took full day , flew bengaluru spent entire day team company ! one story like share today key role founding second company ola electric. one day 2017 got call asking come mumbai. said bhavish want take somewhere show something exciting . flew plane coimbatore see personal project making el

In [22]:
# Walk the final 15 rows of the VADER-sorted frame and print each tweet with a
# running number, followed by a blank line.

for rank, tweet in enumerate(sorted_df2['tweet_text'].tail(15), start=1):
    print(rank, tweet, '\n')

1 ratan tata final rites ! ! given state funeral ! ! salute man departure ! ! 

2 understand media really getting much asking every latest thing celebs. media asked #aliabhatt ratan tata sir reaction sorry say , shame alia ! #ripratantata #rip ratantata #ratantatasir #ratantatapassedaway 

3 deeply saddened passing mr. ratan tata . 

4 people turned streets colaba say one last goodbye ratan tata. tears sadness around. 

5 0 controversies 0 scams 0 attitude 0 arrogance 0 cringe events 0 display wealth 100 dedication country people. titan lived others country left vultures like ambanis amp adanis om shanti #ratantata ji 

6 #ratantata ' death , responsibility carry forward legacy falls shoulders nephew nieces leah , maya neville tata. need know details 

7 shantanu naidu ko ratan tata ke funeral dekh ke pata nahi kyo iron man funeral ke scene ki yaad aa gai . 

8 #ratantata world ' biggest donor. donated 829,734 crore. built multiple free hospitals , schools amp saved millions lives. tod

Compare the sentiment assignments of TextBlob and VADER by plotting another Plotly interactive scatter plot.

In [23]:
# Compare the two toolkits: TextBlob polarity on the x-axis, VADER compound score
# on the y-axis. `px` and `plot` come from the notebook's import cell, so they
# are not imported again here.
fig = px.scatter(df, x="polarity", y="vader_compound", hover_data=['tweet_text'],
                 title="TextBlob vs VADER")

# Write the figure to 'textblob_vs_vader.html'; the file name is the cell output.
plot(fig, filename='textblob_vs_vader.html')

'textblob_vs_vader.html'