In [1]:
import pandas as pd
import numpy as np
import tensorflow as tf
from transformers import RobertaTokenizer, TFRobertaForSequenceClassification # for tweet sentiment analysis
from deep_translator import GoogleTranslator # translate location
from geopy.geocoders import Nominatim # get country for city name

In [2]:
df = pd.read_csv('245k_tweets.csv')

In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 243185 entries, 0 to 243184
Data columns (total 7 columns):
 #   Column              Non-Null Count   Dtype 
---  ------              --------------   ----- 
 0   content             243185 non-null  object
 1   date                243185 non-null  object
 2   location            226918 non-null  object
 3   like_count          243185 non-null  int64 
 4   source              243185 non-null  object
 5   total_media_shared  243185 non-null  int64 
 6   retweet_count       243185 non-null  int64 
dtypes: int64(3), object(4)
memory usage: 13.0+ MB


In [4]:
df.head(10)

Unnamed: 0,content,date,location,like_count,source,total_media_shared,retweet_count
0,1/ *NEW PAPERS ALERT*\n\nVery proud to share t...,2020-01-01 19:51:05+00:00,"Stanford, CA",15,Twitter Web App,0,4
1,"Drawing from actual events, #Event201 identifi...",2020-01-02 15:00:00+00:00,Baltimore,1,TweetDeck,0,0
2,Interesting article @PNAS on whether the Justi...,2020-01-05 22:46:12+00:00,Kumeyaay unceded land.,3,Twitter Web Client,0,1
3,@arambaut @WHO But batteries of tests have bee...,2020-01-05 19:05:20+00:00,"New York, NY",1,Twitter Web App,0,0
4,#Pandemic players: what do you do if you draw ...,2020-01-06 15:32:57+00:00,Kingdom of Fife,3,Twitter for Android,0,0
5,Preparing for the Next #Pandemic — The @WHO’s ...,2020-01-07 19:05:05+00:00,"Boston, MA",1,Hootsuite Inc.,1,0
6,It’s not known whether a #SARS-like “#coronavi...,2020-01-07 14:38:19+00:00,Worldwide,21,Hootsuite Inc.,0,13
7,Concern that a novel #coronavirus has caused t...,2020-01-08 23:55:29+00:00,Melbourne,2,Twitter Web Client,0,3
8,Given that both #Wuhan and #HongKong's numbers...,2020-01-08 17:32:46+00:00,"Maine, USA",8,Twitter Web App,0,3
9,Authorities in #HongKong are now reporting 9 p...,2020-01-08 17:29:40+00:00,"Maine, USA",5,Twitter Web App,0,1


In [5]:
df.isnull().sum()

content                   0
date                      0
location              16267
like_count                0
source                    0
total_media_shared        0
retweet_count             0
dtype: int64

In [6]:
df.query('location.isnull()')

Unnamed: 0,content,date,location,like_count,source,total_media_shared,retweet_count
24,Middle East Respiratory Syndrome #Coronavirus ...,2020-01-09 16:32:09+00:00,,0,Twitter Web App,0,0
43,"Wuhan pneumonia update. 41 cases, one death. S...",2020-01-10 23:57:30+00:00,,8,TweetDeck,0,1
57,"As someone who has lived, studied, and worked ...",2020-01-10 00:22:36+00:00,,16,Twitter for iPhone,0,3
123,WHO Director-General @DrTedros reportedly cons...,2020-01-13 14:35:02+00:00,,18,Twitter Web App,0,11
212,#Japan confirms case of #Wuhan #coronavirus\n\...,2020-01-16 19:02:22+00:00,,0,TweetDeck,1,0
...,...,...,...,...,...,...,...
243061,#ICYMI. Everyone 6 months and older should get...,2022-06-29 18:44:08+00:00,,10,Sprinklr,1,5
243079,Norway🇳🇴Was a Pandemic Success. Then It Spent ...,2022-06-29 18:13:35+00:00,,21,Twitter Web App,0,12
243084,2+ years into the #COVID19 pandemic and Black ...,2022-06-29 18:05:04+00:00,,5,Hootsuite Inc.,0,2
243130,'Body aches' #FernBritton shares update on #He...,2022-06-29 16:48:18+00:00,,1,TweetDeck,0,3


In [7]:
# filling all the missing values with "Not Specified" from location column
df['location'].fillna("not_specified", inplace = True)

In [8]:
df['date'] = pd.to_datetime(df['date']) # convert into datetime format

df['year'] = df['date'].dt.year
df['month_name'] = df['date'].dt.month_name()
df['month_day'] = df['date'].dt.day

In [9]:
df['full_date'] = [d.date() for d in df['date']]
df['time'] = [d.time() for d in df['date']]

In [10]:
df.drop('date',axis=1,inplace=True)
df.head()

Unnamed: 0,content,location,like_count,source,total_media_shared,retweet_count,year,month_name,month_day,full_date,time
0,1/ *NEW PAPERS ALERT*\n\nVery proud to share t...,"Stanford, CA",15,Twitter Web App,0,4,2020,January,1,2020-01-01,19:51:05
1,"Drawing from actual events, #Event201 identifi...",Baltimore,1,TweetDeck,0,0,2020,January,2,2020-01-02,15:00:00
2,Interesting article @PNAS on whether the Justi...,Kumeyaay unceded land.,3,Twitter Web Client,0,1,2020,January,5,2020-01-05,22:46:12
3,@arambaut @WHO But batteries of tests have bee...,"New York, NY",1,Twitter Web App,0,0,2020,January,5,2020-01-05,19:05:20
4,#Pandemic players: what do you do if you draw ...,Kingdom of Fife,3,Twitter for Android,0,0,2020,January,6,2020-01-06,15:32:57


In [11]:
# converts city name, state into country name
geolocator = Nominatim(user_agent = "http")
loc = geolocator.geocode('Nigeria')
print(loc.address.split(",")[-1].strip())

Nigeria


In [13]:
translate_tweet = GoogleTranslator(source='auto', target='en')
translate_tweet.translate("대한민국 서울")

'Korea, Seoul'

In [14]:
def get_country(place):
    '''
    returns country name given city but city in country like saudi arabia, china output like above
    so we need to translate in the end.
    '''
    try:
        loc = geolocator.geocode(place.lower())
        address = loc.address.split(",")[-1].strip()
    except Exception:
        return "multiple_address"
    return translate_tweet.translate(address) # may contain some chinese words like "北京"

In [15]:
import re

def format_link(tweet):
    '''
    replace all the link with "http" for sentiment analysis
    '''
    pattern = r'((www|http\:\/\/|https\:\/\/)?.[\w]*.(com|co))+(\/?[\w]?)*'
    match = re.compile(pattern)
    return match.sub(" http", tweet)
    
def format_text(tweet):
    '''
    replace all the \n with space from the tweet
    '''
    pattern = r'\n'
    match = re.compile(pattern)
    return match.sub(" ", tweet)

def format_mention(tweet):
    '''
    replace all the @username mention to @user for sentiment analysis
    '''
    pattern = r'@[\w]+'
    match = re.compile(pattern)
    return match.sub("@user", tweet)

In [16]:
get_country("united states")

'United States'

In [16]:
format_text(format_link(df['content'][32435]))

'Did you miss Friday’s live stream #Doctorpedia Frontline Webcast?  @Exp_Mark Ph.D. in Business Strategy and http, @Harvard professor and author of the best-selling novel #TheAIRepublic, spoke on #COVID19’s impact on the healthcare http.  http'

In [27]:
location_index = df['location'].value_counts().index

In [28]:
len(location_index)

7280

In [21]:
key_value = dict()

In [34]:
for country in location_index:
    if country not in key_value:
        key_value[country] = get_country(country)

In [35]:
len(key_value), len(location_index)

(7280, 7280)

In [42]:
df['location'] = df['location'].replace(key_value)

In [43]:
df['location'].value_counts()[:20]

United States                     106882
multiple_address                   43636
Canada                             26086
India                              13811
Australia                          11827
United Kingdom                      6310
Philippines                         5560
China                               3146
The United Arab Emirates            2618
Schweiz/Suisse/Svizzera/Svizra      2323
Ireland / Ireland                   1434
Israel                              1410
Saudi Arabia                        1319
South Africa                        1180
Nigeria                              911
Dominican Republic                   909
Turkey                               851
Qatar                                775
Japan                                704
Viti                                 620
Name: location, dtype: int64

In [44]:
# convert anything which is not posted from webapp, iphone, android, ipad etc. to social media management platform
df['source'] = np.where((df['source'] != "Twitter Web App") & 
                        (df['source'] != "Twitter for iPhone") & 
                        (df['source'] != "Twitter for Android") & 
                        (df['source'] != "Twitter for iPad") & 
                        (df['source'] != "Twitter Media Studio") & 
                        (df['source'] != "Twitter for Advertisers"), 
                        "Social Media Management Platform", df['source'])

In [96]:
df.head(10)

Unnamed: 0,content,location,like_count,source,total_media_shared,retweet_count,year,month_name,month_day,full_date,time
0,1/ *NEW PAPERS ALERT*\n\nVery proud to share t...,United States,15,Twitter Web App,0,4,2020,January,1,2020-01-01,19:51:05
1,"Drawing from actual events, #Event201 identifi...",United States,1,Social Media Management Platform,0,0,2020,January,2,2020-01-02,15:00:00
2,Interesting article @PNAS on whether the Justi...,multiple_address,3,Social Media Management Platform,0,1,2020,January,5,2020-01-05,22:46:12
3,@arambaut @WHO But batteries of tests have bee...,United States,1,Twitter Web App,0,0,2020,January,5,2020-01-05,19:05:20
4,#Pandemic players: what do you do if you draw ...,Kingdom of Fife,3,Twitter for Android,0,0,2020,January,6,2020-01-06,15:32:57
5,Preparing for the Next #Pandemic — The @WHO’s ...,United States,1,Social Media Management Platform,1,0,2020,January,7,2020-01-07,19:05:05
6,It’s not known whether a #SARS-like “#coronavi...,Dominican Republic,21,Social Media Management Platform,0,13,2020,January,7,2020-01-07,14:38:19
7,Concern that a novel #coronavirus has caused t...,Australia,2,Social Media Management Platform,0,3,2020,January,8,2020-01-08,23:55:29
8,Given that both #Wuhan and #HongKong's numbers...,United States,8,Twitter Web App,0,3,2020,January,8,2020-01-08,17:32:46
9,Authorities in #HongKong are now reporting 9 p...,United States,5,Twitter Web App,0,1,2020,January,8,2020-01-08,17:29:40


In [36]:
df['source'].value_counts()

Social Media Management Platform    108608
Twitter Web App                      76869
Twitter for iPhone                   40563
Twitter for Android                   8873
Twitter Media Studio                  5272
Twitter for iPad                      2757
Twitter for Advertisers                243
Name: source, dtype: int64

In [45]:
df['cleaned_tweet'] = df['content'].apply(lambda x: format_text(format_link(format_mention(x))))

In [98]:
df.head(10)

Unnamed: 0,content,location,like_count,source,total_media_shared,retweet_count,year,month_name,month_day,full_date,time,cleaned_tweet
0,1/ *NEW PAPERS ALERT*\n\nVery proud to share t...,United States,15,Twitter Web App,0,4,2020,January,1,2020-01-01,19:51:05,1/ *NEW PAPERS ALERT* Very proud to share the...
1,"Drawing from actual events, #Event201 identifi...",United States,1,Social Media Management Platform,0,0,2020,January,2,2020-01-02,15:00:00,"Drawing from actual events, #Event201 identifi..."
2,Interesting article @PNAS on whether the Justi...,multiple_address,3,Social Media Management Platform,0,1,2020,January,5,2020-01-05,22:46:12,Interesting article @user on whether the Justi...
3,@arambaut @WHO But batteries of tests have bee...,United States,1,Twitter Web App,0,0,2020,January,5,2020-01-05,19:05:20,@user @user But batteries of tests have been p...
4,#Pandemic players: what do you do if you draw ...,Kingdom of Fife,3,Twitter for Android,0,0,2020,January,6,2020-01-06,15:32:57,#Pandemic players: what do you do if you draw ...
5,Preparing for the Next #Pandemic — The @WHO’s ...,United States,1,Social Media Management Platform,1,0,2020,January,7,2020-01-07,19:05:05,Preparing for the Next #Pandemic — The @user’s...
6,It’s not known whether a #SARS-like “#coronavi...,Dominican Republic,21,Social Media Management Platform,0,13,2020,January,7,2020-01-07,14:38:19,It’s not known whether a #SARS-like http” has...
7,Concern that a novel #coronavirus has caused t...,Australia,2,Social Media Management Platform,0,3,2020,January,8,2020-01-08,23:55:29,Concern that a novel http has caused the #wuha...
8,Given that both #Wuhan and #HongKong's numbers...,United States,8,Twitter Web App,0,3,2020,January,8,2020-01-08,17:32:46,Given that both #Wuhan and #HongKong's http to...
9,Authorities in #HongKong are now reporting 9 p...,United States,5,Twitter Web App,0,1,2020,January,8,2020-01-08,17:29:40,Authorities in #HongKong are now reporting 9 p...


In [46]:
df.to_csv('halka_cleaned.csv', index=False)

In [47]:
# loading the model
roberta = 'cardiffnlp/twitter-roberta-base-sentiment'
model = TFRobertaForSequenceClassification.from_pretrained(roberta)
tokenizer = RobertaTokenizer.from_pretrained(roberta)
labels = ["Negative", "Neutral", "Positive"]

All model checkpoint layers were used when initializing TFRobertaForSequenceClassification.

All the layers of TFRobertaForSequenceClassification were initialized from the model checkpoint at cardiffnlp/twitter-roberta-base-sentiment.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFRobertaForSequenceClassification for predictions without further training.


In [None]:
tweet = format_link(format_mention(format_text(df.cleaned_tweet[990])))
print(tweet)

In [48]:
def get_sentiment(tweet):
    encoded_tweet = tokenizer(tweet, return_tensors='tf')
    output = model(encoded_tweet, training=False).logits
    return np.argmax(tf.nn.softmax(output))

In [49]:
get_sentiment(tweet)

NameError: name 'tweet' is not defined

In [None]:
labels[get_sentiment(tweet)]

In [50]:
df

Unnamed: 0,content,location,like_count,source,total_media_shared,retweet_count,year,month_name,month_day,full_date,time,cleaned_tweet
0,1/ *NEW PAPERS ALERT*\n\nVery proud to share t...,United States,15,Twitter Web App,0,4,2020,January,1,2020-01-01,19:51:05,1/ *NEW PAPERS ALERT* Very proud to share the...
1,"Drawing from actual events, #Event201 identifi...",United States,1,Social Media Management Platform,0,0,2020,January,2,2020-01-02,15:00:00,"Drawing from actual events, #Event201 identifi..."
2,Interesting article @PNAS on whether the Justi...,multiple_address,3,Social Media Management Platform,0,1,2020,January,5,2020-01-05,22:46:12,Interesting article @user on whether the Justi...
3,@arambaut @WHO But batteries of tests have bee...,United States,1,Twitter Web App,0,0,2020,January,5,2020-01-05,19:05:20,@user @user But batteries of tests have been p...
4,#Pandemic players: what do you do if you draw ...,multiple_address,3,Twitter for Android,0,0,2020,January,6,2020-01-06,15:32:57,#Pandemic players: what do you do if you draw ...
...,...,...,...,...,...,...,...,...,...,...,...,...
243180,The Italian Ambassador Antonino Maggiore was h...,multiple_address,20,Twitter for Android,1,1,2022,June,29,2022-06-29,15:38:31,The Italian Ambassador Antonino Maggiore was h...
243181,PAHO is recommending a fourth dose of the vacc...,multiple_address,4,Twitter Web App,0,2,2022,June,29,2022-06-29,15:38:24,PAHO is http a fourth dose of the vaccine for ...
243182,#elmo breaks down the #covid19 vaccine for kid...,United States,0,Social Media Management Platform,1,0,2022,June,29,2022-06-29,15:36:26,#elmo breaks down the http vaccine for kids an...
243183,"#Greece confirmed 16,115 new #coronavirus infe...",Hellas,0,Twitter Web App,1,0,2022,June,29,2022-06-29,15:35:46,"http 16,115 new http infections in the last 2..."


In [132]:
df2 = df1[:100]

In [51]:
half_tweets = df.cleaned_tweet[:150000]

In [57]:
sentiment = pd.DataFrame()

In [None]:
sentiment['sentiment'] = (df['cleaned_tweet'][150000:].apply(lambda x: labels[get_sentiment(x)]))

In [55]:
sentiment

Unnamed: 0,sentiment
0,Positive
1,Positive
2,Positive
3,Neutral
4,Negative
...,...
149995,Neutral
149996,Negative
149997,Neutral
149998,Neutral


In [56]:
sentiment.to_csv('sentiment_2.csv')

In [148]:
len(sentiment)

100

In [147]:
import time

start_time = time.time()

sentiment['sentiment'] = (df2['content'].apply(lambda x: labels[get_sentiment(x)]))
end_time = time.time()

total_time = end_time - start_time

print("Time taken to run the code: ", total_time, "seconds")

Time taken to run the code:  36.50439095497131 seconds


In [102]:
get_sentiment_vectorized(df.cleaned_tweet[2324])[0]

0

In [80]:
def get_sentiment_vectorized(tweets):
    encoded_tweets = tokenizer(tweets, return_tensors='tf')
    outputs = model(encoded_tweets, training=False).logits
    softmax_outputs = tf.nn.softmax(outputs)
    return np.argmax(softmax_outputs, axis=1)


In [None]:
df.to_csv('cleaned_tweets', index=False)