In [1]:
import pandas as pd
import numpy as np
import tensorflow as tf
from transformers import RobertaTokenizer, TFRobertaForSequenceClassification # for tweet sentiment analysis
from deep_translator import GoogleTranslator # translate location
from geopy.geocoders import Nominatim # get country for city name

In [3]:
# Load the tweet dataset (one row per tweet) into a DataFrame.
# NOTE(review): the file name suggests ~245k rows, but df.info() reports 15,276 entries - confirm this is the intended file.
df = pd.read_csv('245k_tweets.csv')

In [4]:
# Column dtypes and non-null counts; a quick way to spot columns with missing values.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 15276 entries, 0 to 15275
Data columns (total 7 columns):
 #   Column              Non-Null Count  Dtype 
---  ------              --------------  ----- 
 0   content             15276 non-null  object
 1   date                15276 non-null  object
 2   location            14212 non-null  object
 3   like_count          15276 non-null  int64 
 4   source              15276 non-null  object
 5   total_media_shared  15276 non-null  int64 
 6   retweet_count       15276 non-null  int64 
dtypes: int64(3), object(4)
memory usage: 835.5+ KB


In [5]:
# Preview the first 10 rows.
df.head(10)

Unnamed: 0,content,date,location,like_count,source,total_media_shared,retweet_count
0,#DeSantis and #GOP delivered legislative hando...,2022-06-28 23:55:00+00:00,"Sacramento, CA",3,Twitter Web App,0,2
1,3) We should all be annoyed that the FDA kept ...,2022-06-28 23:50:13+00:00,Washington DC | Virginia,230,Twitter for iPhone,0,55
2,The #pandemic has brought out greater fault li...,2022-06-28 23:50:00+00:00,India,0,TweetDeck,0,0
3,"""California Department of Public Health Issues...",2022-06-28 23:50:00+00:00,"Chicago, IL & Denver, CO",0,Semrush Social Media Tool,0,0
4,.@TheAtlantic's @edyong209 takes an honest loo...,2022-06-28 23:46:48+00:00,"Phoenix, AZ",3,Twitter Web App,4,1
5,2) I‚Äôve been trying to warn about rebound üèÄ #C...,2022-06-28 23:46:33+00:00,Washington DC | Virginia,293,Twitter for iPhone,0,82
6,"@shishiluo, our Head of Infectious Diseases, d...",2022-06-28 23:45:59+00:00,"San Mateo, CA",1,HubSpot,0,0
7,The first US-donated pediatric #COVID19 vaccin...,2022-06-28 23:45:39+00:00,"Washington, D.C.",131,Twitter Web App,3,28
8,Wowzers ‚ÄîFauci now has #Paxlovid rebound üèÄ #CO...,2022-06-28 23:41:28+00:00,Washington DC | Virginia,1521,Twitter for iPhone,0,450
9,As the world‚Äôs economy has come under the grip...,2022-06-28 23:35:00+00:00,India,0,TweetDeck,0,0


In [26]:
# parse the timestamp strings first, then derive the calendar parts from the parsed column
df['date'] = pd.to_datetime(df['date'])
df['year'], df['month_name'], df['month_day'] = (
    df['date'].dt.year,
    df['date'].dt.month_name(),
    df['date'].dt.day,
)

In [6]:
# shared Nominatim client; get_country() uses this same object
geolocator = Nominatim(user_agent="http")
loc = geolocator.geocode('Saudi Arabia')
# the country is taken to be the text after the last comma of the full address
print(loc.address.rpartition(",")[-1].strip())

السعودية


In [8]:
# second probe: a city name instead of a country name
loc = geolocator.geocode("shanghai")
# keep only the text after the last comma of the full address
address = loc.address.rpartition(",")[-1].strip()
print(address)

中国


In [11]:
def get_country(place):
    '''
    resolve a free-text location to a country name in English

    the place is geocoded with the notebook-level `geolocator`; the text after
    the last comma of the returned address is taken as the country and is
    translated to English, because the geocoder reports it in the local
    language / script

    place: free-text location (str); missing values (NaN / None), other
           non-strings and blank strings are accepted
    returns: the translated country name, or "multiple_address" when the place
             is missing, has no geocoding match, or the geocoder call fails
    '''
    # missing locations arrive from pandas as NaN (a float): answer directly
    # instead of relying on a swallowed AttributeError
    if not isinstance(place, str) or not place.strip():
        return "multiple_address"

    try:
        loc = geolocator.geocode(place.lower())
    except Exception:
        # best-effort lookup: a geocoder error (timeout, rate limit, ...) must
        # not abort a whole-column apply
        return "multiple_address"

    # geocode() returns None when nothing matches the query
    if loc is None:
        return "multiple_address"

    address = loc.address.split(",")[-1].strip()
    # the translation may occasionally leave part of the name untranslated
    return GoogleTranslator(source='auto', target='en').translate(address)

In [10]:
import re

def format_link(tweet):
    '''
    replace every link in the tweet with the token "http" for sentiment analysis

    a link is either
      * anything starting with http://, https:// or www. up to the next whitespace, or
      * a bare host name ending in .com / .co, with an optional /path

    tweet: the tweet text (str)
    returns: the tweet text with each link replaced by "http"
    '''
    # the dots are escaped and the bare-host form needs a literal ".com" / ".co",
    # so ordinary words that merely contain "co" (welcome, second, account)
    # are left untouched
    pattern = (
        r'(?:https?://|www\.)\S+'
        r'|\b[\w-]+(?:\.[\w-]+)*\.(?:com|co)\b(?:/\S*)?'
    )
    return re.sub(pattern, "http", tweet)
    
def format_text(tweet):
    '''
    flatten the tweet onto a single line: every newline character becomes one space
    '''
    return re.sub(r'\n', " ", tweet)

def format_mention(tweet):
    '''
    mask user handles for sentiment analysis: each @username becomes the generic token @user
    '''
    return re.sub(r'@[\w]+', "@user", tweet)

In [12]:
# Sanity check: the country for a city name should come back in English.
get_country("shanghai")

'China'

In [14]:
# Spot-check the link and newline cleaners on a single tweet (row label 234).
format_text(format_link(df['content'][234]))

'The rickshaw men in #Tokyo are adding English-speaking staff, a sure sign #Japan is bracing for a return of tourists from abroad.  Read more: http  #COVID19 #tourism'

In [15]:
# Look up each distinct location string only once: the same string appears on
# many rows, and every get_country() call goes out to remote services.
country_by_location = {
    location: get_country(location)
    for location in df['location'].unique()
    if isinstance(location, str)
}
# Missing (NaN) locations are not in the lookup table; they get the same
# "multiple_address" marker that get_country() returns for unresolvable input.
df['place'] = df['location'].apply(
    lambda location: country_by_location.get(location, "multiple_address")
)
df['place'].unique()[:20]

In [32]:
# convert anything which is not posted from webapp, iphone, android, ipad etc. to social media management platform
df['source'] = np.where((df['source'] != "Twitter Web App") & 
                        (df['source'] != "Twitter for iPhone") & 
                        (df['source'] != "Twitter for Android") & 
                        (df['source'] != "Twitter for iPad") & 
                        (df['source'] != "Twitter Media Studio") & 
                        (df['source'] != "Twitter for Advertisers"), 
                        "Social Media Management Platform", df['source'])

In [33]:
# Number of tweets per source value.
df['source'].value_counts()

Twitter Web App                     6409
Social Media Management Platform    6160
Twitter for iPhone                  1413
Twitter for Android                  882
Twitter Media Studio                 256
Twitter for iPad                      98
Twitter for Advertisers               58
Name: source, dtype: int64

In [22]:
import snscrape.modules.twitter as sntwitter

# how many English tweets to collect before stopping
count = 1
tweets = []
for tweet in sntwitter.TwitterSearchScraper(
    "#COVID19 OR #CoronaVirus OR #Pandemic since:2020-01-01 until:2022-06-29 filter:verified"
).get_items():
    # stop as soon as enough tweets have been collected
    if len(tweets) == count:
        break
    # skip anything that is not tagged as English
    if tweet.lang != 'en':
        continue
    tweets.append(tweet)

In [23]:
# Raw text of the first collected tweet, before any cleaning.
tweets[0].rawContent

'#DeSantis and #GOP delivered legislative handouts to the nursing home industry, like extending protections from #COVID19-related liability lawsuits and reducing the amount of resident care the facilities are required to provide to residents\nhttps://t.co/J4LjWD8RMZ\nvia @readsludge'

In [36]:
# loading the model
roberta = 'cardiffnlp/twitter-roberta-base-sentiment'
model = TFRobertaForSequenceClassification.from_pretrained(roberta)
tokenizer = RobertaTokenizer.from_pretrained(roberta)
labels = ["Negative", "Neutral", "Positive"]

All model checkpoint layers were used when initializing TFRobertaForSequenceClassification.

All the layers of TFRobertaForSequenceClassification were initialized from the model checkpoint at cardiffnlp/twitter-roberta-base-sentiment.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFRobertaForSequenceClassification for predictions without further training.


Downloading (‚Ä¶)olve/main/vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

Downloading (‚Ä¶)olve/main/merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading (‚Ä¶)cial_tokens_map.json:   0%|          | 0.00/150 [00:00<?, ?B/s]

In [25]:
# Clean the sample tweet the way it is passed to get_sentiment(): newlines
# flattened, mentions masked as @user, links masked as http.
tweet = format_link(format_mention(format_text(tweets[0].rawContent)))
# show the cleaned text (the name printed before was never defined at notebook level)
print(tweet)

#DeSantis and #GOP delivered legislative handouts to the nursing home industry, like extending protections from #COVID19-related liability lawsuits and reducing the amount of resident care the facilities are required to provide to residents http via @user


In [45]:
def get_sentiment(tweet):
    '''
    score one tweet with the notebook-level `tokenizer` and `model`

    tweet: the tweet text
    returns: the softmax of the model logits, as a TensorFlow tensor
    '''
    model_inputs = tokenizer(tweet, return_tensors='tf')
    # training=False runs the model in inference mode
    logits = model(model_inputs, training=False).logits
    probabilities = tf.nn.softmax(logits)
    return probabilities

In [50]:
# Position of the largest softmax score for the sample tweet.
get_sentiment(tweet).numpy().argmax()

1

In [52]:
# Map that position to its name in `labels`.
labels[get_sentiment(tweet).numpy().argmax()]

'Neutral'