In [None]:
import pandas as pd
import numpy as np
import tensorflow as tf
from transformers import RobertaTokenizer, TFRobertaForSequenceClassification # for tweet sentiment analysis
from deep_translator import GoogleTranslator # translate location
from geopy.geocoders import Nominatim # get country for city name

In [None]:
# Load the raw tweets; the relative path is resolved against the kernel's working directory.
df = pd.read_csv('tweets.csv')

In [None]:
# Column dtypes and non-null counts of the loaded frame.
df.info()

In [None]:
# Preview the first 10 rows.
df.head(10)

In [None]:
# Number of missing values in each column.
df.isna().sum()

In [None]:
# Rows whose location is missing.
df[df['location'].isna()]

In [None]:
# Replace every missing location with the placeholder "not_specified".
# The result is assigned back to the column rather than calling
# fillna(inplace=True) on the df['location'] selection: that chained in-place
# form emits a FutureWarning on pandas 2.x and leaves df unchanged once
# Copy-on-Write is enabled.
df['location'] = df['location'].fillna("not_specified")

In [None]:
# Parse the raw date column once, then derive the calendar parts from the parsed values.
parsed_dates = pd.to_datetime(df['date'])

df['date'] = parsed_dates
df['year'] = parsed_dates.dt.year
df['month_name'] = parsed_dates.dt.month_name()
df['month_day'] = parsed_dates.dt.day

In [None]:
# Split each timestamp into a calendar date and a time of day.
# The vectorised .dt accessor (already used for year/month/day on this column)
# replaces the Python-level loop over every row.
df['full_date'] = df['date'].dt.date

df['time'] = df['date'].dt.time

In [None]:
# The derived columns stand in for the original timestamp, so remove it and preview the result.
df.drop(columns=['date'], inplace=True)
df.head()

In [None]:
# Geocoder used to resolve a place name to an address. As a quick check, look up
# one place and print the last comma-separated part of the returned address,
# which is what this notebook treats as the country name.
geolocator = Nominatim(user_agent="http")
loc = geolocator.geocode('Nigeria')
print(loc.address.rpartition(",")[2].strip())

In [None]:
# Translator configured to auto-detect the source language and output English.
translate_tweet = GoogleTranslator(source='auto', target='en')
# Quick check on a place name written in Korean script.
translate_tweet.translate("대한민국 서울")

In [None]:
def get_country(place):
    '''
    Return the country for a free-text place name such as a city.

    The country is taken to be the last comma-separated component of the
    address returned by the geocoder. For some places that component comes
    back in the local script (e.g. "대한민국 서울"), so it is passed through
    the translator before being returned. The translated text may still
    contain untranslated words such as "北京".

    Parameters
    ----------
    place : str
        Place name to look up; the lookup is done on the lower-cased text.

    Returns
    -------
    str
        The translated last address component; the untranslated component if
        the translation step fails or yields nothing; or "unknown" when
        `place` is not usable text, the lookup fails, or nothing is found.
    '''
    # Non-string values (e.g. NaN for a missing location) and blank strings
    # cannot be geocoded, so skip the lookup entirely.
    if not isinstance(place, str) or not place.strip():
        return "unknown"

    try:
        loc = geolocator.geocode(place.lower())
    except Exception:
        # Best effort: any lookup failure is reported as "unknown".
        return "unknown"
    if loc is None:  # the geocoder found no match
        return "unknown"

    address = loc.address.split(",")[-1].strip()
    try:
        translated = translate_tweet.translate(address)
    except Exception:
        # Translation is a separate call that can fail on its own; keep the
        # untranslated name rather than losing a successful lookup.
        return address
    return translated or address

In [None]:
import re

# Compiled once at definition time instead of on every call.
# Two alternatives, tried in this order:
#   1. text starting with http://, https:// or www. and running up to the next
#      whitespace, not counting trailing sentence punctuation;
#   2. a bare host name ending in .com or .co (optionally followed by a
#      two-letter suffix such as .uk), with an optional /path.
# The dots are escaped and a scheme or a dotted host is required, so ordinary
# words that merely contain "co"/"com" (e.g. "welcome") are left alone.
_LINK_PATTERN = re.compile(
    r'(?:https?://|www\.)\S*[^\s.,;:!?)]'
    r'|\b[\w-]+(?:\.[\w-]+)*\.(?:com|co(?:\.[a-z]{2})?)\b(?:/\S*)?',
    re.IGNORECASE,
)

def format_link(tweet):
    '''
    Replace every link in the tweet with the token "http" for sentiment analysis.

    Parameters
    ----------
    tweet : str
        Raw tweet text.

    Returns
    -------
    str
        The text with each link replaced by "http"; all other text is unchanged.
    '''
    return _LINK_PATTERN.sub("http", tweet)
    
def format_text(tweet):
    '''
    Put the tweet on a single line by turning each newline character into one space.
    '''
    return re.sub(r'\n', " ", tweet)

def format_mention(tweet):
    '''
    Mask every @username mention as the generic token "@user" for sentiment analysis.
    '''
    return re.sub(r'@[\w]+', "@user", tweet)

In [None]:
# Spot-check the lookup on a single lower-case place name.
get_country("fresno")

In [None]:
# The 20 most frequent location values.
df['location'].value_counts().head(20)

In [None]:
# Sources listed here are kept as they are; every other value in the column is
# relabelled as a social media management platform.
NATIVE_SOURCES = [
    "Twitter Web App",
    "Twitter for iPhone",
    "Twitter for Android",
    "Twitter for iPad",
    "Twitter Media Studio",
    "Twitter for Advertisers",
]
is_native_source = df['source'].isin(NATIVE_SOURCES)
df['source'] = np.where(is_native_source, df['source'], "Social Media Management Platform")

In [None]:
# Frequency of each value in the source column.
df['source'].value_counts()

In [None]:
# Clean the raw text in three passes: mask mentions, then links, then flatten newlines.
mentions_masked = df['content'].apply(format_mention)
links_masked = mentions_masked.apply(format_link)
df['cleaned_tweet'] = links_masked.apply(format_text)

In [None]:
# Preview the first 10 rows, now including the cleaned_tweet column.
df.head(10)

In [None]:
# Load the pretrained sentiment classifier and its tokenizer from the same checkpoint id.
roberta = 'cardiffnlp/twitter-roberta-base-sentiment'
model = TFRobertaForSequenceClassification.from_pretrained(roberta)
tokenizer = RobertaTokenizer.from_pretrained(roberta)
# Position i in this list names the class index i returned by get_sentiment.
# NOTE(review): the order is assumed to match the checkpoint's class ids — confirm against the model card.
labels = [ "Negative", "Neutral", "Positive" ]

In [None]:
# Run the three formatters over one already-cleaned tweet (entry 990) and show the result.
tweet = df.cleaned_tweet[990]
for formatter in (format_text, format_mention, format_link):
    tweet = formatter(tweet)
print(tweet)

In [None]:
def get_sentiment(tweet, max_length=512):
    '''
    Classify one piece of text and return the index of the highest-scoring class.

    Parameters
    ----------
    tweet : str
        Text to classify.
    max_length : int, optional
        Maximum number of tokens passed to the model; longer inputs are
        truncated instead of raising inside the model. 512 is the usual limit
        for RoBERTa-base checkpoints (confirm if a different model is loaded).

    Returns
    -------
    int
        Class index, usable directly as a position in `labels`.
    '''
    # max_length is passed explicitly rather than relying on the tokenizer's
    # own configured limit.
    encoded_tweet = tokenizer(
        tweet,
        return_tensors='tf',
        truncation=True,
        max_length=max_length,
    )
    logits = model(encoded_tweet, training=False).logits
    # Softmax is monotonic, so the argmax of the raw logits is the same class;
    # int() turns the NumPy scalar into a plain Python int.
    return int(np.argmax(logits))

In [None]:
# Score every cleaned tweet, then translate each class index into its label name.
sentiment_ids = df['cleaned_tweet'].apply(get_sentiment)
df['sentiment'] = sentiment_ids.apply(lambda class_id: labels[class_id])

In [None]:
# Preview the first 10 rows, now including the sentiment column.
df.head(10)

In [None]:
# DataFrame.rename returns a new frame, so the result must be assigned back for
# the rename to take effect. With the default errors='ignore' this leaves the
# columns untouched when there is no 'new_column' column.
df = df.rename(columns={'new_column': 'sentiment'})

In [None]:
# Write the processed frame to disk; index=False leaves the row index out of the file.
df.to_csv('cleaned_tweets.csv', index=False)