<a href="https://colab.research.google.com/github/dzanahmed/welcome-ideathon-lshtm/blob/main/code/sentiment_analysis_pipeline.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Setup

## Install transformers datasets

In [1]:
! pip install transformers datasets
! pip install nltk
! pip install tqdm

Collecting transformers
  Downloading transformers-4.30.2-py3-none-any.whl (7.2 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.2/7.2 MB[0m [31m43.9 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting datasets
  Downloading datasets-2.13.1-py3-none-any.whl (486 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m486.2/486.2 kB[0m [31m23.6 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.14.1 (from transformers)
  Downloading huggingface_hub-0.16.4-py3-none-any.whl (268 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m268.8/268.8 kB[0m [31m24.1 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1 (from transformers)
  Downloading tokenizers-0.13.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.8/7.8 MB[0m [31m47.7 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting safetensors>=0.3.1 (from transformers)
  Download

## Load packages

In [2]:
from transformers import pipeline, Pipeline
import pandas as pd

from transformers import TFAutoModelForSequenceClassification
from transformers import AutoTokenizer, AutoConfig
from datasets import Dataset
import numpy as np
from scipy.special import softmax

from tqdm import tqdm

## Load data from github into dataframe

In [7]:
url = "https://raw.githubusercontent.com/dzanahmed/welcome-ideathon-lshtm/main/data/interim/vax_tweets_v0.csv?token=GHSAT0AAAAAACB5CGEQI2F23FT66VPBXBMSZFK2JPQ"
df = pd.read_csv(url)

# coerce all columnns to the correct data type.
df['user_followers'] = pd.to_numeric(df['user_followers'], errors='coerce').astype('Int64')
df['user_friends'] = pd.to_numeric(df['user_friends'], errors='coerce').astype('Int64')
df['user_favourites'] = pd.to_numeric(df['user_favourites'], errors='coerce').astype('Int64')
df['text'] = df['text'].astype(str)
df['user_verified'] = df['user_verified'].astype(bool)
df['is_retweet'] = df['is_retweet'].astype(bool)

df.dtypes

tweet_id                  int64
user_location            object
user_description         object
user_followers            Int64
user_friends              Int64
user_favourites           Int64
user_verified              bool
date                     object
text                     object
hashtags                 object
is_retweet                 bool
roberta_loc_score       float64
roberta_loc_guess        object
distilBERT_sentiment     object
distilBERT_score        float64
dtype: object

## Load dataframe into dataset

#### Load dataframe into huggingface dataset object

In [None]:
df = df[['tweet_id', 'text']]
print(len(df))

df = df.dropna()
print(len(df))


# save dataframe as a dataset
dataset = Dataset.from_pandas(df)
print(len(dataset))

99997
99997
99997


## Load RoBERTa model


In [None]:
model_path = f"cardiffnlp/twitter-roberta-base-sentiment-latest"
tokenizer = AutoTokenizer.from_pretrained(model_path)
config = AutoConfig.from_pretrained(model_path)
model = TFAutoModelForSequenceClassification.from_pretrained(model_path)

Downloading (…)lve/main/config.json:   0%|          | 0.00/929 [00:00<?, ?B/s]

Downloading (…)olve/main/vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

Downloading (…)olve/main/merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

Downloading tf_model.h5:   0%|          | 0.00/499M [00:00<?, ?B/s]

All model checkpoint layers were used when initializing TFRobertaForSequenceClassification.

Some layers of TFRobertaForSequenceClassification were not initialized from the model checkpoint at cardiffnlp/twitter-roberta-base-sentiment-latest and are newly initialized: ['classifier']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


# Get Sentiment Analysis Predictions from RoBERTa Model

In [None]:
batch_size = 100
max_batches = int(len(dataset["text"]) / batch_size) + 1
n_batches = max_batches

df["negative_score"] = None
df["neutral_score"] = None
df["positive_score"] = None

for batch in range(n_batches):

  # store batch of tweets
  if batch < max_batches - 1:
    tweets_batch = dataset["text"][(batch * batch_size) : ((batch + 1) * batch_size)]
  else:
    tweets_batch = dataset["text"][(batch * batch_size) : len(dataset["text"])]


  # tokenize batch
  tokens = tokenizer(tweets_batch, padding=True, truncation=True, return_tensors="tf")

  # run batch through model
  output = model(**tokens)

  # extract the output into a dataframe
  for output_row in range(len(output.logits)):
    # convert from logit to softmax (probability) output
    probabilities = softmax(output.logits.numpy()[output_row])

    # store probabilities in correct column of data frame
    df.loc[batch * batch_size + output_row, "negative_score"] = probabilities[0]
    df.loc[batch * batch_size + output_row, "neutral_score"] = probabilities[1]
    df.loc[batch * batch_size + output_row, "positive_score"] = probabilities[2]

  print(batch)

## Save Results to csv file

In [None]:
from google.colab import files

df.to_csv('sentiment_analysis.csv', encoding = 'utf-8-sig')
files.download('sentiment_analysis.csv')

# Get VADER sentiment analysis predictions

## Setup dataframe to save VADER predictions

In [None]:
VADER_df = df[['tweet_id', 'text']].copy()
VADER_df['VADER_label'] = None
VADER_df['VADER_compound_score'] = None

# # check setup worked
# VADER_df.head()

##Import lexicon dictionary for VADER via prompt

In [None]:
from nltk.sentiment.vader import SentimentIntensityAnalyzer
import nltk
nltk.download()
# type "d vader_lexicon" into the prompt, then type "q" to quit.

NLTK Downloader
---------------------------------------------------------------------------
    d) Download   l) List    u) Update   c) Config   h) Help   q) Quit
---------------------------------------------------------------------------
Downloader> d vader_lexicon


    Downloading package vader_lexicon to /root/nltk_data...
      Package vader_lexicon is already up-to-date!



---------------------------------------------------------------------------
    d) Download   l) List    u) Update   c) Config   h) Help   q) Quit
---------------------------------------------------------------------------
Downloader> q


True

## Run VADER on tweet dataframe

In [None]:
# initialize VADER sentiment analyzer object from SentimentIntensityAnalyzer class
sent_analyzer = SentimentIntensityAnalyzer()


# run sentiment analysis using VADER on every row's text entry in VADER_df
for index in tqdm(range(len(VADER_df))):
  sentiment_dict = sent_analyzer.polarity_scores(VADER_df.loc[index, "text"])

  # assign label to row based on recommended thresholds.
  if sentiment_dict['compound'] >= 0.05 :
      VADER_df.loc[index, 'VADER_label'] = "Positive"

  elif sentiment_dict['compound'] <= - 0.05 :
      VADER_df.loc[index, 'VADER_label'] = "Negative"

  else :
      VADER_df.loc[index, 'VADER_label'] = "Neutral"

  # save the compound score used to assign the label.
  VADER_df.loc[index, 'VADER_compound_score'] = sentiment_dict['compound']



100%|██████████| 99997/99997 [02:33<00:00, 653.04it/s]


## Save results to csv file

In [None]:
from google.colab import files

VADER_df.to_csv('VADER_sentiment_analysis.csv', encoding = 'utf-8-sig')
files.download('VADER_sentiment_analysis.csv')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

# Get TextBlob sentiment analysis predictions

## Setup dataframe to save VADER predictions

In [9]:
textblob_df = df[['tweet_id', 'text']].copy()
textblob_df['textblob_sentiment_label'] = None
textblob_df['textblob_sentiment_score'] = None
textblob_df['textblob_subjectivity_label'] = None
textblob_df['textblob_subjectivity_score'] = None

# check setup worked
textblob_df.head()

Unnamed: 0,tweet_id,text,textblob_sentiment_label,textblob_sentiment_score,textblob_subjectivity_label,textblob_subjectivity_score
0,1,We asked our coworkers why they're getting a C...,,,,
1,2,45+ #RURAL #Bengaluru #CovidVaccine Availabili...,,,,
2,3,@JoyAnnReid @NIH 👿Questions: Could the vacci...,,,,
3,4,Next question is how do you find out where you...,,,,
4,5,"If you told your child to get a Covid vaccine,...",,,,


##Import required packages

In [10]:
from textblob import TextBlob

## Run textblob on tweet dataframe

In [13]:
# run sentiment analysis using VADER on every row's text entry in VADER_df
for index in tqdm(range(len(textblob_df))):
  sentiment = TextBlob(textblob_df.loc[index, "text"]).sentiment


  # assign label to row based on recommended thresholds.
  if sentiment.polarity >= 0.05 :
      textblob_df.loc[index, 'textblob_sentiment_label'] = "Positive"

  elif sentiment.polarity <= - 0.05 :
      textblob_df.loc[index, 'textblob_sentiment_label'] = "Negative"

  else :
      textblob_df.loc[index, 'textblob_sentiment_label'] = "Neutral"

  # assign label to row based on arbitrary 0.5 threshold.
  if sentiment.subjectivity >= 0.5:
    textblob_df.loc[index, 'textblob_subjectivity_label'] = "Subjective"
  else:
    textblob_df.loc[index, 'textblob_subjectivity_label'] = "Objective"


  # save the compound score used to assign the label.
  textblob_df.loc[index, 'textblob_sentiment_score'] = sentiment.polarity
  textblob_df.loc[index, 'textblob_subjectivity_score'] = sentiment.subjectivity

100%|██████████| 99997/99997 [02:59<00:00, 558.40it/s] 


## Save results to csv file

In [14]:
from google.colab import files

textblob_df.loc[:, ['tweet_id', 'textblob_sentiment_label', 'textblob_subjectivity_label', 'textblob_sentiment_score', 'textblob_subjectivity_score']].to_csv('textblob_sentiment_analysis.csv', encoding = 'utf-8-sig', index = False)
files.download('textblob_sentiment_analysis.csv')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>