# Sentiment Analysis


### Install Packages

In [2]:
!pip install transformers
!pip install emoji

Collecting emoji
  Downloading emoji-2.9.0-py2.py3-none-any.whl (397 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m397.5/397.5 kB[0m [31m2.9 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: emoji
Successfully installed emoji-2.9.0


### Import Packages

In [3]:
import pandas as pd
from transformers import AutoModelForSequenceClassification, AutoTokenizer, pipeline
import re
import emoji
from tqdm.notebook import tqdm
tqdm.pandas()

In [4]:
# Cleaned tweet dataset, read straight from the raw GitHub URL.
DATA_URL = "https://raw.githubusercontent.com/Rising-Stars-by-Sunshine/STATS201-PS2-Jenny/main/Data/processed%20data/cleaned_dataset.csv"
df = pd.read_csv(DATA_URL)

### Sentiment Analysis Model init

In [5]:
# Checkpoint shared by the classifier and its tokenizer:
# https://huggingface.co/savasy/bert-base-turkish-sentiment-cased
# NOTE(review): the checkpoint name says "turkish", while the tweets shown in
# this notebook are in English -- confirm this is the intended model.
MODEL_NAME = "savasy/bert-base-turkish-sentiment-cased"

model = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME)
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

# Sentiment-analysis pipeline built from the objects loaded above.
sa = pipeline("sentiment-analysis", model=model, tokenizer=tokenizer)

config.json:   0%|          | 0.00/596 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/442M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/39.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/263k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

### Preprocessing
Preprocessing for sentiment analysis is more detailed than the earlier data cleaning: in addition to the basic cleanup, it deals with emojis and normalizes the individual words within the tweets.

In [40]:
def preprocess_word(word):
    """Normalise a single token.

    Any run of one repeated character is squeezed to exactly two occurrences
    (``funnnnny`` -> ``funny``, ``sooo`` -> ``soo``); afterwards every hyphen
    and apostrophe is dropped.  Other punctuation is left in place.
    """
    repeated_char = re.compile(r'(.)\1+')
    hyphen_or_apostrophe = re.compile(r'(-|\')')

    squeezed = repeated_char.sub(r'\1\1', word)
    return hyphen_or_apostrophe.sub('', squeezed)


def is_valid_word(word):
    """Tell whether ``word`` looks like a plain ASCII token.

    Returns True when the word starts with an ASCII letter and the rest
    consists only of ASCII letters, digits, dots and underscores; False
    otherwise (including for the empty string).
    """
    matched = re.search(r'^[a-zA-Z][a-z0-9A-Z\._]*$', word)
    return matched is not None


def handle_emojis(tweet):
    """Replace text emoticons with the sentiment tokens EMO_POS / EMO_NEG.

    Smile, laugh, love and wink emoticons (plus the 😉 emoji) become
    ``' EMO_POS '``; sad and crying emoticons become ``' EMO_NEG '``.
    Matching is case-sensitive (``:D``, ``xD``/``XD``), so this has to run on
    text that has not been lower-cased.

    Letter-based emoticons only match when they are not part of a word: the
    ``D`` must not be followed by a word character and ``x``/``X`` must not be
    preceded by one, so "Warning: Danger" or "MaxDepth" are left untouched.
    A repeated ``D`` (``:DDD``) is treated as one emoticon.

    Args:
        tweet: Text to scan (``str``).

    Returns:
        The text with every recognised emoticon replaced by a space-padded
        token.
    """
    # Smile -- :), : ), :-), (:, ( :, (-:, :')
    tweet = re.sub(r'(:\s?\)|:-\)|\(\s?:|\(-:|:\'\))', ' EMO_POS ', tweet)
    # Laugh -- :D, : D, :-D, xD, x-D, XD, X-D (D may repeat, e.g. :DDD).
    # The look-arounds keep the pattern from firing inside ordinary words.
    tweet = re.sub(r'(:\s?D+|:-D+|(?<!\w)[xX]-?D+)(?!\w)', ' EMO_POS ', tweet)
    # Love -- <3, :*
    tweet = re.sub(r'(<3|:\*)', ' EMO_POS ', tweet)
    # Wink -- ;-), ;), ;-D, ;D, (;,  (-;  and the 😉 emoji
    tweet = re.sub(r'(;-?\)|;-?D+(?!\w)|\(-?;)|😉', ' EMO_POS ', tweet)
    # Sad -- :-(, : (, :(, ):, )-:
    tweet = re.sub(r'(:\s?\(|:-\(|\)\s?:|\)-:)', ' EMO_NEG ', tweet)
    # Cry -- :,(, :'(, :"(
    tweet = re.sub(r'(:,\(|:\'\(|:"\()', ' EMO_NEG ', tweet)

    return tweet

def remove_emoji(tweet):
    """Return ``tweet`` with every emoji replaced by a single space."""
    without_emoji = emoji.replace_emoji(tweet, replace=" ")
    return without_emoji


def preprocess_tweet(tweet):
    """Clean a raw tweet before it is passed to the sentiment model.

    Steps, in order: lower-case; drop stand-alone numbers; remove URLs,
    @mentions, #hashtags and the retweet marker "rt"; turn runs of dots into
    a space; strip surrounding spaces and quotes; replace emojis with spaces;
    collapse whitespace; normalise every word with ``preprocess_word``.

    Args:
        tweet: Tweet text (``str``).

    Returns:
        The cleaned tweet as one string of space-separated words.
    """
    # Convert to lower case
    tweet = tweet.lower()

    # Drop purely numeric tokens delimited by whitespace.  A single pass
    # consumes the whitespace on both sides of a match, so in "1 2 3" every
    # other number would survive; repeat until a pass changes nothing.
    # Each replacement shortens the string, so the loop terminates.
    standalone_number = re.compile(r'^\d+\s|\s\d+\s|\s\d+$')
    replaced = 1
    while replaced:
        tweet, replaced = standalone_number.subn(' ', tweet)

    # Remove URLs
    tweet = re.sub(r'((www\.[\S]+)|(https?://[\S]+))', '', tweet)

    # Remove @handle mentions
    tweet = re.sub(r'@[\S]+', '', tweet)

    # Remove hashtags entirely (the tag text is dropped as well)
    tweet = re.sub(r'#(\S+)', '', tweet)

    # Remove RT (retweet)
    tweet = re.sub(r'\brt\b', '', tweet)

    # Replace 2+ dots with space
    tweet = re.sub(r'\.{2,}', ' ', tweet)

    # Strip space, " and ' from both ends of the tweet
    tweet = tweet.strip(' "\'')

    # Replace emojis with spaces
    tweet = remove_emoji(tweet)

    # split() without arguments already treats any whitespace run as a single
    # separator, so no separate whitespace-collapsing pass is needed.
    return ' '.join(preprocess_word(word) for word in tweet.split())

In [7]:
# Classify one raw (unprocessed) sample as a quick sanity check.
index = 3200
raw_text = df.loc[index, "Text"]
print(raw_text)
p = sa(raw_text)
print(p)

Global Goals
@GlobalGoalsUN
·
Mar 30
[{'label': 'positive', 'score': 0.9766541123390198}]


In [8]:
# Same sample after preprocessing, to compare with the prediction on raw text.
pre_twit = preprocess_tweet(df.loc[index, "Text"])
print(pre_twit)
p = sa(pre_twit)
print(p)

global goals · mar
[{'label': 'positive', 'score': 0.9897163510322571}]


In [9]:
def get_sentiment_analysis(tweet):
    """Classify ``tweet`` with the ``sa`` pipeline.

    The input is converted with ``str()`` before it is classified.

    Returns:
        ``pd.Series([label, score])`` -- the label of the first prediction
        and its score rounded to 4 decimal places.
    """
    top_prediction = sa(str(tweet))[0]
    label = top_prediction["label"]
    score = round(top_prediction["score"], 4)
    return pd.Series([label, score])

In [10]:
# Try the helper on the preprocessed sample tweet and show the
# (label, score) Series it returns.
s = get_sentiment_analysis(pre_twit)
print(s)

0    positive
1      0.9897
dtype: object


### Sentiment Analysis

Attention: the first step of the sentiment analysis takes more than 15 minutes to run!

In [11]:
# Placeholder values, overwritten below once classification has finished.
df["sentiment_label"] = "-"
df["sentiment_score"] = -1

# Classify every row.  Each call returns a (label, score) Series, so the
# result is a frame whose column 0 holds the labels and column 1 the scores.
# NOTE(review): in the outputs shown in this notebook the `Text` column holds
# the tweet header (name, handle, date) while the tweet body appears in
# `Embedded_text` / `cleaned_text` -- confirm `Text` is the intended input.
sentiment = df["Text"].progress_apply(get_sentiment_analysis)
df["sentiment_label"] = sentiment[0]
df["sentiment_score"] = sentiment[1]

  0%|          | 0/9038 [00:00<?, ?it/s]

In [36]:
# Store the scores as floats.  The converted Series has to be assigned back:
# a bare `.apply(...)` returns a new Series and leaves `df` unchanged.
df["sentiment_score"] = df["sentiment_score"].astype(float)

# NOTE(review): not used in the cells visible here -- presumably a minimum
# score for trusting a prediction; confirm against the cells that use it.
THRESHOLD = 0.95

In [39]:
# Display the frame, now including the sentiment_label / sentiment_score columns.
df

Unnamed: 0,UserScreenName,UserName,Timestamp,Text,Embedded_text,Emojis,Comments,Likes,Retweets,Image link,Tweet URL,mentioned_users,partial_clean,cleaned_text,sentiment_label,sentiment_score
0,Lauren Boebert,@laurenboebert,2022-01-17T23:32:38.000Z,Lauren Boebert\n@laurenboebert\n·\nJan 18,The only solution I’ve ever heard the Left pro...,,1683,2259,11.7K,[],https://twitter.com/laurenboebert/status/14832...,,The only solution I’ve ever heard the Left pro...,the only solution ive ever heard the left prop...,neutral,0.9417
1,Catherine,@catherine___c,2022-01-17T22:54:02.000Z,Catherine\n@catherine___c\n·\nJan 17,Climate change doesn’t cause volcanic eruption...,,158,64,762,[],https://twitter.com/catherine___c/status/14832...,,Climate change doesn’t cause volcanic eruption...,climate change doesnt cause volcanic eruptions,neutral,0.9791
2,king Keith,@KaConfessor,2022-01-17T23:51:41.000Z,king Keith\n@KaConfessor\n·\nJan 18,Vaccinated tennis ball boy collapses in the te...,,24,118,159,['https://pbs.twimg.com/ext_tw_video_thumb/148...,https://twitter.com/KaConfessor/status/1483225...,,Vaccinated tennis ball boy collapses in the te...,vaccinated tennis ball boy collapses in the te...,positive,0.9945
3,PETRIFIED CLIMATE PARENT,@climate_parent,2022-01-17T21:42:04.000Z,PETRIFIED CLIMATE PARENT\n@climate_parent\n·\n...,North America has experienced an average winte...,,15,50,158,[],https://twitter.com/climate_parent/status/1483...,,North America has experienced an average winte...,north america has experienced an average winte...,positive,0.9911
4,Thomas Speight,@Thomas_Sp8,2022-01-17T21:10:40.000Z,Thomas Speight\n@Thomas_Sp8\n·\nJan 17,They're gonna do the same with Climate Change ...,🅾,4,24,127,['https://pbs.twimg.com/profile_images/1544171...,https://twitter.com/Thomas_Sp8/status/14831850...,joeywreck,They're gonna do the same with Climate Change ...,they re gonna do the same with climate change ...,neutral,0.9666
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9033,Dr Srijana Mitra Das,@srijanapiya17,2022-07-18T12:08:28.000Z,Dr Srijana Mitra Das\n@srijanapiya17\n·\nJul 18,#ClimateChange is now the greatest story on Ea...,,2,16,24,['https://pbs.twimg.com/profile_images/5140754...,https://twitter.com/srijanapiya17/status/15490...,"DrSimEvans,DrSimEvans",#ClimateChange is now the greatest story on Ea...,climatechange is now the greatest story on ear...,neutral,0.6030
9034,1%_Better_Every_Day,@jh336405,2022-07-18T00:33:20.000Z,1%_Better_Every_Day\n@jh336405\n·\nJul 18,Replying to \n@jh336405\n @acuna_r\n and 41 ot...,💯 💯 🌏,4,,,['https://pbs.twimg.com/profile_images/1442412...,https://twitter.com/jh336405/status/1548828230...,"jh336405,acuna_r,rahmstorf",Replying to and 41 others And Stefan Rahmstorf...,replying to and others and stefan rahmstorf co...,neutral,0.7370
9035,David Schechter,@DavidSchechter,2022-07-18T21:13:13.000Z,David Schechter\n@DavidSchechter\n·\nJul 18,While Texans are being asked to use less elect...,,3,14,23,['https://pbs.twimg.com/card_img/1549138950475...,https://twitter.com/DavidSchechter/status/1549...,GregAbbott_TX,While Texans are being asked to use less elect...,while texans are being asked to use less elect...,neutral,0.9741
9036,Daily Climate,@TheDailyClimate,2022-07-18T10:15:09.000Z,Daily Climate\n@TheDailyClimate\n·\nJul 18,"Sea levels are rising, and communities are scr...",,,3,,['https://pbs.twimg.com/card_img/1547862999808...,https://twitter.com/TheDailyClimate/status/154...,NHBulletinNews,"Sea levels are rising, and communities are scr...",sea levels are rising and communities are scra...,neutral,0.9789
