<a href="https://colab.research.google.com/github/aiscience-22/UA_War/blob/twitter_data_cleanup_0.04/TwitterUA_sentiment_analysis.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# Importing the libraries needed
import pandas as pd
import numpy as np
import json
import scipy 
import torch
from torch.utils.data import Dataset, DataLoader
import logging
import os
# Find the latest version of spark 3.0 from http://www.apache.org/dist/spark/ and enter as the spark version
# For example:
# spark_version = 'spark-3.0.3'
spark_version = 'spark-3.2.2'
os.environ['SPARK_VERSION']=spark_version

# Install Spark and Java
!apt-get update
!apt-get install openjdk-11-jdk-headless -qq > /dev/null
!wget -q http://www.apache.org/dist/spark/$SPARK_VERSION/$SPARK_VERSION-bin-hadoop2.7.tgz
!tar xf $SPARK_VERSION-bin-hadoop2.7.tgz
!pip install -q findspark

# Set Environment Variables
import os
os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-11-openjdk-amd64"
os.environ["SPARK_HOME"] = f"/content/{spark_version}-bin-hadoop2.7"

# Start a SparkSession
import findspark
findspark.init()



0% [Working]            Hit:1 http://ppa.launchpad.net/c2d4u.team/c2d4u4.0+/ubuntu bionic InRelease
0% [Connecting to archive.ubuntu.com (91.189.91.39)] [Connecting to security.ub                                                                               Hit:2 http://ppa.launchpad.net/cran/libgit2/ubuntu bionic InRelease
0% [Connecting to archive.ubuntu.com (91.189.91.39)] [Connecting to security.ub0% [1 InRelease gpgv 15.9 kB] [Connecting to archive.ubuntu.com (91.189.91.39)]                                                                               Hit:3 https://cloud.r-project.org/bin/linux/ubuntu bionic-cran40/ InRelease
0% [1 InRelease gpgv 15.9 kB] [Connecting to archive.ubuntu.com (91.189.91.39)]                                                                               Hit:4 http://ppa.launchpad.net/deadsnakes/ppa/ubuntu bionic InRelease
Hit:5 http://ppa.launchpad.net/graphics-drivers/ppa/ubuntu bionic InRelease
Hit:6 http://security.ubuntu.com/ubuntu bion

In [2]:
from pyspark.sql import SparkSession

# Build (or reuse) the Spark session for this notebook; the PostgreSQL JDBC
# driver jar is put on the driver class path.
spark = (
    SparkSession.builder
    .appName("UA_War")
    .config("spark.driver.extraClassPath", "/content/postgresql-42.2.16.jar")
    .getOrCreate()
)

In [3]:
# Data loading
from pyspark import SparkFiles

url = "https://s3.amazonaws.com/uaresources/time_twitter_data.csv"
# SparkFiles.get() must be given the name of the file registered by addFile();
# derive it from the URL so the two always agree.
file_name = url.rsplit("/", 1)[-1]

# Register the remote file with Spark, then read the local copy as a
# tab-separated file with a header row and inferred column types.
spark.sparkContext.addFile(url)
df = spark.read.option("encoding", "UTF-8").csv(SparkFiles.get(file_name), sep="\t", header=True, inferSchema=True)
df.show(5)


Py4JJavaError: ignored

In [None]:
# Preview the top five rows of the August tweets DataFrame
text_df.head(n=5)

Unnamed: 0,tweetcreatedts,cleaned_text
0,2022-08-01 00:00:00,the conflict is being cast in binaries making ...
1,2022-08-01 00:00:00,remember when smashed into seven fragments and...
2,2022-08-01 00:00:01,hi you have to understand that we are billion...
3,2022-08-01 00:00:01,the world is in dire straits as it is not equi...
4,2022-08-01 00:00:01,will the imposed on cause a significant shift ...


In [None]:
# Report the dimensions of the DataFrame
row_count, column_count = text_df.shape
print(f"{row_count} rows and {column_count} columns")

1050085 rows and 2 columns


In [None]:
# Check the data type and non-null count of every column
text_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1050085 entries, 0 to 1050084
Data columns (total 2 columns):
 #   Column          Non-Null Count    Dtype 
---  ------          --------------    ----- 
 0   tweetcreatedts  1050085 non-null  object
 1   cleaned_text    1050004 non-null  object
dtypes: object(2)
memory usage: 16.0+ MB


The column `tweetcreatedts` was loaded as an object (string) rather than a datetime, so we convert it below.

In [None]:
# Parse the timestamp column into proper datetime values
created_at = pd.to_datetime(text_df["tweetcreatedts"])
text_df["tweetcreatedts"] = created_at

# Sentiment Analysis Using RoBERTa

For each tweet, the RoBERTa model generates a score for each of the negative, neutral, and positive sentiments.

In [None]:
import torch

# Run on the first GPU when CUDA is available, otherwise fall back to the CPU
device = "cpu"
if torch.cuda.is_available():
    device = "cuda:0"
print(f"Number of GPUs: {torch.cuda.device_count()}")

Number of GPUs: 0


In [None]:
from transformers import AutoTokenizer

# Tokenizer that matches the pretrained Twitter sentiment checkpoint
tokenizer_checkpoint = "cardiffnlp/twitter-roberta-base-sentiment"
tokenizer = AutoTokenizer.from_pretrained(tokenizer_checkpoint)

In [None]:
from transformers import AutoModelForSequenceClassification

# Load the pretrained twitter-roberta-base-sentiment checkpoint with its
# sequence-classification head
model_checkpoint = "cardiffnlp/twitter-roberta-base-sentiment"
model = AutoModelForSequenceClassification.from_pretrained(model_checkpoint)

In [None]:
# Move the model to the selected device ("cuda:0" when a GPU is available, otherwise "cpu")
model = model.to(device)

In [None]:
import csv
# Import the submodule explicitly: a bare `import urllib` does not guarantee
# that `urllib.request` is loaded.
import urllib.request

task = 'sentiment' # our task is sentiment analysis
mapping_link = f"https://raw.githubusercontent.com/cardiffnlp/tweeteval/main/datasets/{task}/mapping.txt"

# Download the tab-separated "<id>\t<label>" mapping file and split it into lines
with urllib.request.urlopen(mapping_link) as f:
    mapping_lines = f.read().decode('utf-8').split("\n")

# Keep the label name (second field) of every well-formed row; blank or
# malformed lines have fewer than two fields and are skipped.
labels = [row[1] for row in csv.reader(mapping_lines, delimiter='\t') if len(row) > 1]

In [None]:
# Display the label names parsed from the mapping file
labels

['negative', 'neutral', 'positive']

In [None]:
from scipy.special import softmax
from tqdm import tqdm

BATCH_SIZE = 100 # number of tweets in a batch that will be passed into tokenizer

# Drop rows with missing values and renumber the index 0..n-1, so that row i of
# the score matrix lines up with index label i of text_df when the scores are
# attached to the frame (pandas aligns on index labels, not on position).
text_df = text_df.dropna().reset_index(drop=True)
# create list of all the tweets in the dataset
text_all = text_df['cleaned_text'].tolist()

n = len(text_all) # same as number of tweets
# Per-batch score arrays are collected in a list and stacked once at the end;
# concatenating onto a growing array inside the loop would copy every earlier
# result on each iteration.
score_batches = []
try:
    with torch.no_grad():
        for start_idx in tqdm(range(0, n, BATCH_SIZE)):
            end_idx = min(start_idx + BATCH_SIZE, n)
            # reference: https://huggingface.co/docs/transformers/preprocessing
            # tokenize the tweets in the batch, return pytorch ('pt') tensors
            # padding=True pads shorter tweets with the tokenizer's padding token up to the longest tweet in the batch
            # truncation=True cuts tweets that exceed the maximum length accepted by the model
            encoded_input = tokenizer(text_all[start_idx:end_idx], return_tensors='pt', padding=True, truncation=True).to(device)
            # unpack the encoded fields as keyword arguments of the model call
            output = model(**encoded_input)
            # first element of the model output holds the raw scores; convert pytorch tensor to numpy
            logits = output[0].detach().cpu().numpy()
            # normalize each row so that the scores of one tweet sum to 1
            score_batches.append(softmax(logits, axis=1))

            # delete per-batch objects before the next batch
            del encoded_input, output, logits
            # release all unoccupied cached mem
            torch.cuda.empty_cache()
finally:
    # Runs on normal completion and on interruption alike, so the scores of the
    # batches that did finish are always available in scores_all. The leading
    # empty float64 array covers the zero-batch case and keeps a float64 result.
    scores_all = np.concatenate([np.empty((0, len(labels)))] + score_batches, axis=0)

 12%|█▏        | 1245/10501 [5:59:00<44:29:06, 17.30s/it]


KeyboardInterrupt: 

It looks like the dataset is too big for our project: the estimated time of the analysis (for only 12 days of August!) is 59 hours. I stopped the sentiment analysis after 359 minutes. However, we still have some data.

The output below shows what `scores_all` looks like for the 12% of the analysis that finished. Each row contains the scores for the negative, neutral, and positive sentiments. The higher the score, the more likely the tweet has that sentiment.

In [None]:
# Display the accumulated score matrix (one row per processed tweet, one column per label)
scores_all

array([[0.60271168, 0.38163382, 0.01565443],
       [0.21225016, 0.7508949 , 0.03685491],
       [0.4901295 , 0.46605185, 0.04381879],
       ...,
       [0.58497429, 0.38556179, 0.02946388],
       [0.58497429, 0.38556179, 0.02946388],
       [0.58497429, 0.38556179, 0.02946388]])

In [None]:
# Let's combine the scores with the existing DataFrame.
# Column assignment aligns on index labels, not on row position. text_df may
# have gaps in its index (rows removed by dropna), so the scores are given the
# index labels of the first len(scores_all) rows of text_df: row i of the score
# matrix belongs to the i-th tweet that was passed to the model. Tweets that
# were never scored (interrupted run) get NaN.
scores_df = pd.DataFrame(scores_all, columns=labels, index=text_df.index[:len(scores_all)])
text_df[labels] = scores_df
text_df.head()

Unnamed: 0,tweetcreatedts,cleaned_text,negative,neutral,positive
0,2022-08-01 00:00:00,the conflict is being cast in binaries making ...,0.602712,0.381634,0.015654
1,2022-08-01 00:00:00,remember when smashed into seven fragments and...,0.21225,0.750895,0.036855
2,2022-08-01 00:00:01,hi you have to understand that we are billion...,0.49013,0.466052,0.043819
3,2022-08-01 00:00:01,the world is in dire straits as it is not equi...,0.876023,0.116907,0.00707
4,2022-08-01 00:00:01,will the imposed on cause a significant shift ...,0.044509,0.910851,0.04464


In [None]:
# Limit root-logger output to ERROR and above (basicConfig does nothing if the
# root logger already has handlers configured)
logging.basicConfig(level=logging.ERROR)

In [None]:
# Save this DataFrame so that we don't have to run the model again, which takes a long time.
import os

# Write to a project-relative directory instead of an absolute path in one
# user's home folder, so the cell works on any machine (including Colab).
# Set the UA_WAR_RESOURCES_DIR environment variable to choose another location.
output_dir = os.environ.get("UA_WAR_RESOURCES_DIR", "resources")
os.makedirs(output_dir, exist_ok=True)
file_path = os.path.join(output_dir, "aug_twitter_sentiments.csv")
text_df.to_csv(file_path, index=False)