> # Setting up the notebook

In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import nltk
from sklearn.model_selection import train_test_split
from wordcloud import WordCloud
from nltk.corpus import stopwords
from PIL import Image
import re
plt.style.use('ggplot')

In [None]:
# Load the challenge's train and test splits.
# NOTE(review): absolute Kaggle input paths - outside that environment these
# two reads need to be pointed at a local copy of the CSV files.
train_df = pd.read_csv('/kaggle/input/eurecom-aml-2022-challenge-3/train.csv')
test_df = pd.read_csv('/kaggle/input/eurecom-aml-2022-challenge-3/test.csv')

> # EDA - Exploratory Data Analysis

In [None]:
# DataFrame.info() writes its report to stdout and returns None, so wrapping
# it in print() only appended a stray "None" line to the output.
train_df.info()

In [None]:
# DataFrame.info() writes its report to stdout and returns None, so wrapping
# it in print() only appended a stray "None" line to the output.
test_df.info()

In [None]:
# Row count and total number of missing cells (summed over every column) per split.
print(f"Train dataset has {len(train_df)} records and {train_df.isna().sum().sum()} null values" )
print(f"Test dataset has {len(test_df)} records and {test_df.isna().sum().sum()} null values" )

In [None]:
# How many unique text IDs ?
len(train_df["textID"].unique())  # 24732 -> no duplicates IDs

In [None]:
# How many unique texts ?
len(train_df["text"].unique())  # 24732 -> no duplicates texts

In [None]:
# Class balance of the training labels.
plt.figure(figsize=(12,6))
# A palette is only applied through a `hue` mapping; without `hue` seaborn
# ignores it. Mapping hue to the same column colours each bar, and the legend
# is dropped because it would just repeat the x tick labels.
sns.histplot(data=train_df, x="sentiment", hue="sentiment", palette="mako", legend=False)
plt.title("Tweet sentiments - training data", fontsize=20, pad=10)
plt.xlabel("Sentiment", fontsize=15, labelpad=10)
plt.ylabel("Count", fontsize=15, labelpad=10)
plt.xticks(fontsize=12)
plt.yticks(fontsize=12)
plt.show()

In [None]:
# Share of each sentiment class in the training set, drawn as a donut chart.
class_df = train_df.groupby('sentiment').count()['text'].reset_index().sort_values(by='text',ascending=False)
percent_class=class_df.text
labels= class_df.sentiment
# Pick each wedge colour from its sentiment label rather than from its rank in
# the count ordering, so the colours stay consistent with the other figures
# (yellow = neutral, green = positive, red = negative) whatever the class sizes.
sentiment_colors = {'neutral': '#FACA0C', 'positive': '#17C37B', 'negative': '#F92969'}
colors = [sentiment_colors[label] for label in labels]
plt.figure(figsize=(12,6))
my_pie,_,_ = plt.pie(percent_class, radius = 1, labels=labels, colors=colors, autopct="%.1f%%", textprops={'fontsize': 16})
# width < radius turns the pie into a ring
plt.setp(my_pie, width=0.7, edgecolor='white')
plt.show()

In [None]:
# Mean tweet length in characters (fixes the misspelt "Avgerage" in the printed label).
print(f"Average tweet's length: {train_df['text'].apply(len).mean():.2f} characters")

In [None]:
def count_words(text):
    '''
    Count the whitespace-separated words in a text.

    Splitting on any run of whitespace (instead of on a single " ") keeps
    leading, trailing and repeated spaces from being counted as empty
    "words", and matches the `.str.split()` word counts used elsewhere in
    this notebook.

    Args:
        text: the string to measure.

    Returns:
        Number of words; 0 for an empty or whitespace-only string.
    '''
    return len(text.split())

# Per-tweet size features (characters and words) next to the sentiment label;
# the raw text itself is not kept.
df_text_sentiment = (
    train_df.loc[:, ["text", "sentiment"]]
    .assign(
        number_of_characters=lambda frame: frame["text"].apply(len),
        number_of_words=lambda frame: frame["text"].apply(count_words),
    )
    .drop(columns=["text"])
)

# Scatter of words vs. characters, one dot per tweet, coloured by sentiment.
cdict = {"positive": "green", "neutral": "yellow", "negative": "red"}
# Note: this changes the axes background for every later figure as well.
plt.rcParams['axes.facecolor'] = 'white'
plt.figure(figsize=(12,6))
plt.scatter(
    x='number_of_words',
    y='number_of_characters',
    c=df_text_sentiment['sentiment'].map(cdict),
    data=df_text_sentiment,
)
plt.xlabel("#words", fontsize=15 , color='black')
plt.ylabel("#characters", fontsize=15, color='black')
plt.xticks(fontsize=10, color='black')
plt.yticks(fontsize=10, color='black')
plt.grid(color='black', linestyle='-.', linewidth=0.7)
plt.show()

In [None]:
# Plot stacked bar plot
# For every word count, the fraction of tweets in each sentiment class
# (normalize="index" makes each row sum to 1).
cross_tab_prop = pd.crosstab(
    index=df_text_sentiment['number_of_words'],
    columns=df_text_sentiment['sentiment'],
    normalize="index",
)
# Colours are positional: they follow the column order of the crosstab
# (presumably negative, neutral, positive - TODO confirm).
stacked_ax = cross_tab_prop.plot(
    kind='bar',
    stacked=True,
    figsize=(15, 8),
    color=['#F92969','#FACA0C','#17C37B'],
)

stacked_ax.legend(loc="upper left", ncol=3)
stacked_ax.set_xlabel("Number of words")
stacked_ax.set_ylabel("Sentiment proportion")
plt.show()

In [None]:
fig,(ax1,ax2,ax3) = plt.subplots(1,3,figsize=(15,5))

# One histogram of tweet length in characters per sentiment, left to right.
char_panels = (
    (ax1, "positive", '#17C37B', 'Positive Sentiments'),
    (ax2, "neutral", '#FACA0C', 'Neutral Sentiments'),
    (ax3, "negative", '#F92969', 'Negative Sentiments'),
)
for panel_ax, panel_sentiment, panel_color, panel_title in char_panels:
    tweet_len = train_df[train_df['sentiment']==panel_sentiment]['text'].str.len()
    panel_ax.hist(tweet_len,color=panel_color)
    panel_ax.set_title(panel_title)

# Shared axis captions: y label on the left panel, x label under the middle one.
ax1.set_ylabel("Count", fontsize=15)
ax2.set_xlabel("Number of characters", fontsize=15)

fig.suptitle('Characters in a tweet', fontsize=20)
plt.show()

In [None]:
fig,(ax1,ax2,ax3)=plt.subplots(1,3,figsize=(15,5))

# One histogram of tweet length in whitespace-separated words per sentiment.
word_panels = (
    (ax1, "positive", '#17C37B', 'Positive Sentiments'),
    (ax2, "neutral", '#FACA0C', 'Neutral Sentiments'),
    (ax3, "negative", '#F92969', 'Negative Sentiments'),
)
for panel_ax, panel_sentiment, panel_color, panel_title in word_panels:
    tweet_len = train_df[train_df['sentiment']==panel_sentiment]['text'].str.split().map(len)
    panel_ax.hist(tweet_len,color=panel_color)
    panel_ax.set_title(panel_title)

# Shared axis captions: y label on the left panel, x label under the middle one.
ax1.set_ylabel("Count", fontsize=15)
ax2.set_xlabel("Number of words", fontsize=15)

fig.suptitle('Words in a tweet', fontsize=20)
plt.show()

In [None]:
def avg_words(sentiment):
    """Mean number of whitespace-separated words per tweet of one sentiment.

    Reads the notebook-level `train_df`; the result is rounded to 2 decimals.
    """
    tweets = train_df.loc[train_df["sentiment"] == sentiment, 'text']
    words_per_tweet = tweets.str.split().map(len)
    return round(words_per_tweet.mean(), 2)

# Average tweet's length (in words) per sentiment, as computed by avg_words
# on the training split.
print(f"Average number of words in positive tweets: {avg_words('positive')}")
print(f"Average number of words in neutral tweets: {avg_words('neutral')}")
print(f"Average number of words in negative tweets: {avg_words('negative')}")

In [None]:
nltk.download("stopwords")

In [None]:
from collections import Counter

def get_most_common_words(df):
    """Words that are among the most frequent in every sentiment class.

    For each of the positive, neutral and negative tweets of `df`, the 25
    most frequent lower-cased tokens (English stopwords excluded) are
    collected; the intersection of the three lists is returned.

    Fixes over the previous version: the `df` argument is actually used
    (the global `train_df` was read instead), each sentiment is counted on
    its own tweets (the helper always iterated the positive ones), every
    tweet contributes to the counts (only the last tweet was kept), and the
    stopword list is built once instead of once per token.

    Args:
        df: DataFrame with "text" and "sentiment" columns.

    Returns:
        set of str.
    """
    # Built once: stopwords.words() returns a list, and rebuilding it for
    # every token made the filter needlessly slow.
    english_stopwords = set(stopwords.words("english"))
    # top 25 common words per sentiment, no stopwords
    th = 25

    def get_mcw_per_sentiment(text):
        # Lower-cased tokens of every tweet in `text`, stopwords removed.
        tokens = (token.lower() for tweet in text for token in tweet.split())
        return [token for token in tokens if token not in english_stopwords]

    top_words_per_sentiment = []
    for sentiment in ("positive", "neutral", "negative"):
        tweets = df[df["sentiment"] == sentiment]["text"]
        counts = Counter(get_mcw_per_sentiment(tweets))
        top_words_per_sentiment.append({word for word, _ in counts.most_common(th)})

    # intersection to see which are the most common words in general
    return set.intersection(*top_words_per_sentiment)

In [None]:
def get_word_cloud(sentiment, color_palette, stopwords_list):
    """Build a Twitter-logo-shaped word cloud for one sentiment class.

    The text of every `train_df` tweet labelled `sentiment` is lower-cased
    and concatenated. Previously the concatenation happened after the loop,
    so the cloud was generated from a single tweet only.

    Args:
        sentiment: label to select in `train_df["sentiment"]`.
        color_palette: matplotlib colormap name passed to WordCloud.
        stopwords_list: words WordCloud must leave out.

    Returns:
        The generated WordCloud object (usable with `imshow`).
    """
    matching_tweets = train_df.loc[train_df["sentiment"] == sentiment, "text"]
    # str() guards against non-string cells; split/join normalises whitespace.
    comment_words = " ".join(
        " ".join(str(val).lower().split()) for val in matching_tweets
    )

    # NOTE(review): the mask image is read from the working directory.
    mask_dir = np.array(Image.open('twitter_logo.png'))
    wordcloud = WordCloud(width = 800, height = 800,
                        background_color ='white',
                        stopwords = stopwords_list,
                        min_font_size = 10,
                        colormap=color_palette,
                        mask=mask_dir).generate(comment_words)

    return wordcloud

# Words to leave out of the clouds: English stopwords, the words common to all
# three sentiments, and a few hand-picked frequent ones.
stopwords_list = (
    set(stopwords.words("english"))
    | set(get_most_common_words(train_df))
    | {"lol", "work", "today"}
)

# One word cloud per sentiment, side by side.
fig, axs = plt.subplots(nrows=1, ncols=3, figsize=(15, 8))
cloud_panels = (
    ("positive", "Greens", "Positive wordcloud"),
    ("neutral", "Wistia", "Neutral wordcloud"),
    ("negative", "Reds", "Negative wordcloud"),
)
for cloud_ax, (cloud_sentiment, cloud_cmap, cloud_title) in zip(axs, cloud_panels):
    cloud_ax.imshow(get_word_cloud(cloud_sentiment, cloud_cmap, stopwords_list))
    cloud_ax.axis("off")
    cloud_ax.set_title(cloud_title, fontsize=20, pad=15)

plt.show()

In [None]:
# Util function to get the tweets corpus of a specific sentiment
def create_corpus(sentiment):
    """Flat list of the whitespace-separated tokens of all `train_df` tweets
    labelled `sentiment`, in tweet order."""
    tokenized_tweets = train_df[train_df['sentiment']==sentiment ]['text'].str.split()
    return [token for tweet_tokens in tokenized_tweets for token in tweet_tokens]

In [None]:
from collections import defaultdict
import string


# Check punctuactions per sentiment
def plot_punctuactions(sentiment, color):
    """Bar plot of how often each punctuation character occurs in the tweets
    of one sentiment.

    Characters are counted one by one. The previous `token in
    string.punctuation` test was a substring check on whole tokens: it only
    counted stand-alone punctuation tokens (plus accidental multi-character
    substrings such as "()"), and missed punctuation attached to words.

    Args:
        sentiment: label passed to `create_corpus`.
        color: bar colour.
    """
    plt.figure(figsize=(10,5))
    corpus=create_corpus(sentiment)
    dic=defaultdict(int)
    special = set(string.punctuation)
    for token in corpus:
        for char in token:
            if char in special:
                dic[char]+=1
    # zip(*...) cannot be unpacked into two names when nothing was counted.
    x,y = zip(*dic.items()) if dic else ((), ())
    plt.title(f"Punctuactions distribution in {sentiment} tweets", fontsize=20, pad=10)
    plt.ylabel("Count", fontsize=15)
    plt.xticks(fontsize=10)
    plt.yticks(fontsize=10)
    plt.bar(x,y,color=color)
    plt.show()

# One punctuation chart per sentiment, in the notebook's usual colours.
for punct_sentiment, punct_color in (
    ("positive", "#17C37B"),
    ("neutral", "#FACA0C"),
    ("negative", "#F92969"),
):
    plot_punctuactions(punct_sentiment, punct_color)

In [None]:
# Check most common hashtags
def find_hashtag(text):
    """Return the hashtags of `text` without their '#', joined by single
    spaces ('' when there is none)."""
    hashtag_pattern = re.compile(r'(?<=#)\w+')
    return " ".join(hashtag_pattern.findall(text))

train_df['hash']=train_df['text'].apply(find_hashtag)
# Tweets without any hashtag yield ''. They are filtered out explicitly
# instead of assuming '' is the most frequent value and slicing it away.
hashtag_counts = train_df.loc[train_df['hash'] != '', 'hash'].value_counts()
# rename_axis + reset_index(name=...) fixes the column names directly; the old
# rename({'index': ..., 'hash': ...}) relied on pre-2.0 pandas naming and
# produces two 'count' columns on pandas >= 2.0.
temp = hashtag_counts.head(12).rename_axis('Hashtag').reset_index(name='count')
plt.figure(figsize=(15,8))
sns.barplot(x="Hashtag",y="count", data = temp)
plt.title("Most popular hashtags", fontsize=20, pad=10)
plt.xlabel("Hashtag", fontsize=15)
plt.ylabel("Count", fontsize=15)
plt.xticks(fontsize=10)
plt.yticks(fontsize=10)
plt.show()

In [None]:
# Check mentions

def find_mentions(text):
    """Return the @-mentioned names of `text` without their '@', joined by
    single spaces ('' when there is none)."""
    mention_pattern = re.compile(r'(?<=@)\w+')
    return " ".join(mention_pattern.findall(text))
train_df['mentions']=train_df['text'].apply(find_mentions)

# Tweets without any mention yield ''. They are filtered out explicitly
# instead of assuming '' is the most frequent value and slicing it away.
mention_counts = train_df.loc[train_df['mentions'] != '', 'mentions'].value_counts()
# rename_axis + reset_index(name=...) fixes the column names directly; the old
# rename({'index': ..., 'mentions': ...}) relied on pre-2.0 pandas naming and
# produces two 'count' columns on pandas >= 2.0.
temp = mention_counts.head(12).rename_axis('Mentions').reset_index(name='count')

plt.figure(figsize=(15,8))
sns.barplot(x="Mentions",y="count", data = temp)
plt.title("Most popular mentions", fontsize=20, pad=10)
plt.xlabel("Mention", fontsize=15)
plt.ylabel("Count", fontsize=15)
plt.xticks(fontsize=10)
plt.yticks(fontsize=10)
plt.show()

In [None]:
# Check tweets with very few characters - Some of them make no sense, check sentiment of those as well
def check_characters_length(df):
    """Print every tweet of at most 5 characters together with its sentiment.

    Text and sentiment are walked in lockstep, so the result does not depend
    on the frame's index. The previous version used the positional
    `enumerate` counter with label-based `.loc`, which is only correct for a
    default 0..n-1 index (e.g. not after a train/validation split).

    Args:
        df: DataFrame with "text" and "sentiment" columns.
    """
    for text, sentiment in zip(df["text"], df["sentiment"]):
        if len(text) <= 5:
            print(f"{text} - {sentiment}")

check_characters_length(train_df)



> # Preprocessing



In [None]:
# Remove URLs and HTML links
def remove_urls(text):
    """Delete every http(s):// or www. link (up to the next whitespace) from `text`."""
    return re.sub(r'https?://\S+|www\.\S+', r'', text)

def remove_html(text):
    """Delete every '<...>' tag (shortest match) from `text`."""
    return re.sub(r'<.*?>', r'', text)

# Remove mentions
def remove_mention(x):
    """Delete every @mention ('@' followed by word characters) from `x`."""
    mention_pattern = re.compile(r'@\w+')
    return mention_pattern.sub('', x)

In [None]:
!pip install wordninja  # library we use to split concatenated words in hashtags

import wordninja

In [None]:
!pip3 install pyenchant
!sudo apt-get install libenchant1c2a

import enchant

In [None]:
# Chat abbreviations that handle_hashtag expands inside hashtags.
# Each key appears once ("B4N" used to be listed twice), and the "ILU"
# expansion no longer carries its own key as a prefix.
abbreviations_dict = {
  "AFAIK":"As Far As I Know",
  "AFK":"Away From Keyboard",
  "ASAP":"As Soon As Possible",
  "ATK":"At The Keyboard",
  "ATM":"At The Moment",
  "A3":"Anytime, Anywhere, Anyplace",
  "BAK":"Back At Keyboard",
  "BBL":"Be Back Later",
  "BBS":"Be Back Soon",
  "BFN":"Bye For Now",
  "B4N":"Bye For Now",
  "BRB":"Be Right Back",
  "BRT":"Be Right There",
  "BTW":"By The Way",
  "B4":"Before",
  "CUL8R":"See You Later",
  "CYA":"See You",
  "FAQ":"Frequently Asked Questions",
  "FC":"Fingers Crossed",
  "FWIW":"For What It's Worth",
  "FYI":"For Your Information",
  "GMTA":"Great Minds Think Alike",
  "GR8":"Great!",
  "G9":"Genius",
  "ICQ":"I Seek you (also a chat program)",
  "ILU":"I Love You",
  "IMHO":"In My Honest Opinion",
  "IOW":"In Other Words",
  "KISS":"Keep It Simple, Stupid",
  "LDR":"Long Distance Relationship",
  "LMAO":"Laugh My fuck Off",
  "LOL":"Laughing Out Loud",
  "LTNS":"Long Time No See",
  "L8R":"Later",
  "MTE":"My Thoughts Exactly",
  "M8":"Mate",
  "NRN":"No Reply Necessary",
  "PITA":"Pain In The fuck",
  "PRT":"Party",
  "PRW":"Parents Are Watching",
  "ROFL":"Rolling On The Floor Laughing",
  "ROFLOL":"Rolling On The Floor Laughing Out Loud",
  "ROTFLMAO":"Rolling On The Floor Laughing My fuck Off",
  "SK8":"Skate",
  "STATS":"Your sex and age",
  "THX":"Thank You",
  "TTYL":"Talk To You Later",
  "U2":"You Too",
  "U4E":"Yours For Ever",
  "WB":"Welcome Back",
  "WTF":"What The fuck",
  "WTG":"Way To Go!",
  "WUF":"Where Are You From?",
  "WKDN":"Week-End",
  "W8":"Wait",
  "7K":"Sick Laugher",
  "****":"fuck"
}

In [None]:
# Lower-case both the abbreviations and their expansions (re-running is harmless).
abbreviations_dict = {abbrev.lower(): expansion.lower() for abbrev, expansion in abbreviations_dict.items()}

In [None]:
d = enchant.Dict("en_US")

def handle_hashtag(x):
    """Turn the hashtags of a tweet into plain words, or drop them.

    Each `#tag` is handled on its own:
      * if the lower-cased tag is an English word (per the enchant
        dictionary `d`), only the '#' is removed;
      * otherwise an abbreviation from `abbreviations_dict` found inside the
        tag is expanded (when several match, the last one in dictionary
        order wins, as before), the tag is split into words with wordninja,
        and it is kept only if every resulting word is in the dictionary;
      * a tag that cannot be converted is deleted.

    Fixes over the previous version: abbreviations are matched
    case-insensitively (they used to be searched in the original-case tag
    although the keys are lower-case); each hashtag is replaced exactly
    where it occurs, so "#foo" no longer clobbers the start of "#foobar";
    and a tag that splits into no words is dropped instead of sending an
    empty string to `d.check`.

    Args:
        x: tweet text.

    Returns:
        The text with every hashtag converted or removed.
    """
    def _convert(match):
        # Replacement for one '#tag' occurrence ('' deletes it).
        hashtag = match.group(1)
        lowered = hashtag.lower()
        # check if the hashtag is a english word, in case just remove the #
        if d.check(lowered):
            return lowered

        # check if there is some abbreviation in the hashtag, in case write it in complete form
        candidate = lowered
        for abbrev, expansion in abbreviations_dict.items():
            idx_start = lowered.find(abbrev.lower())
            if idx_start != -1:
                # Surrounding pieces keep their original case, as before.
                candidate = hashtag[:idx_start] + expansion.replace(" ", "") + hashtag[idx_start+len(abbrev):]

        # split the hashtag with wordninja and check that each splitted word
        # is in the english vocabulary, otherwise just delete the hashtag
        words = wordninja.split(candidate)
        if words and all(d.check(word) for word in words):
            return " ".join(words)
        return ""

    # may have more than one hashtag per tweet
    return re.sub(r"#(\w+)", _convert, x)

In [None]:
# Lowercase
def text_lowercase(x):
    """Return `x` converted to lower case."""
    lowered = x.lower()
    return lowered

In [None]:
!pip install emot

In [None]:
import pickle
from emot.emo_unicode import UNICODE_EMOJI # For emojis

# Function for converting emojis into word - it takes a bit
def convert_emojis(text):
    """Replace every emoji in `text` by its textual description.

    The description comes from `UNICODE_EMOJI`, with commas and colons
    removed and underscores turned into spaces.

    Underscores are converted only inside the inserted description. The
    previous version rewrote every '_' of the whole tweet to a space, once
    per emoji in the table: slow, and it destroyed underscores that belong
    to the tweet itself (e.g. in emoticons handled later). Emojis that do
    not occur in the text are skipped.
    """
    for emot, description in UNICODE_EMOJI.items():
        if emot in text:
            replacement = "_".join(description.replace(",","").replace(":","").split()).replace("_", " ")
            text = text.replace(emot, replacement)
    return text

In [None]:
from emot.emo_unicode import EMOTICONS_EMO # For EMOTICONS

# Function for converting emoticons into word
def convert_emoticons(text):
    """Replace every emoticon in `text` by its textual description.

    The description comes from `EMOTICONS_EMO`, with commas and colons
    removed and its words separated by single spaces.

    Underscores are converted only inside the inserted description. The
    previous version rewrote every '_' of the whole tweet to a space after
    the first table entry, so any emoticon whose key contains '_' could
    never match afterwards (NOTE(review): presumably the table has such
    keys, e.g. "^_^" - confirm). Emoticons that do not occur in the text
    are skipped.
    """
    for emot, description in EMOTICONS_EMO.items():
        if emot in text:
            replacement = "_".join(description.replace(",","").replace(":","").split()).replace("_", " ")
            text = text.replace(emot, replacement)
    return text

In [None]:
import string

# Remove useless spaces, numbers
def rem_spaces_numbers(text):
    """Replace digit runs by a space, then collapse repeated spaces and trim
    the spaces at the beginning and end of the text."""
    substitutions = (
        (r'\d+', ' '),   # numbers -> one space
        (' +', ' '),     # runs of spaces -> one space
        ('^ +', ''),     # leading spaces
        (' +$', ''),     # trailing spaces
    )
    mod_text = text
    for pattern, replacement in substitutions:
        mod_text = re.sub(pattern, replacement, mod_text)

    return mod_text

In [None]:
!pip install textblob

from textblob import TextBlob

def correct_spelling(text):
    """Return `text` with its spelling corrected by TextBlob.

    The corrected TextBlob is converted back to `str`, so the result has
    the same type as the input and can go through the other (string-based)
    preprocessing steps; previously the TextBlob object itself was returned.
    """
    return str(TextBlob(text).correct())

In [None]:
# Convert slang into actual words
# Slang / abbreviation -> expansion table used by convert_abbrev_text, which
# looks words up in lower case.
# Every key is listed once. The table used to repeat 19 keys (e.g. 'ht',
# 'fwiw', 'fb'); Python silently keeps the last value of a repeated key, so
# only that effective value is kept here. The mixed-case key 'tl;DR' could
# never match a lower-cased lookup and is dropped ('tl;dr' is still present).
abbreviations = {
'im': "I am",
"ur":"You are",
'atm': 'At The Moment',
'b/c': 'Because',
'br': 'Best Regards',
'em': 'email',
'ff': 'Follow Friday',
'ffs': "For Fuck's Sake",
'fml': 'Fuck My Life',
'ftf': 'Face To Face',
'fwd': 'Forward',
'hth': 'Hope That Helps',
'jv': 'Joint Venture',
'j/k': 'Just Kidding',
'li': 'LinkedIn',
'lmk': 'Let Me Know',
'mt': 'Modified Tweet',
'oh': 'Overheard',
'omfg': 'Oh My Fucking God',
'prt': 'Partial Retweet',
're': 'Replies',
'rr': 'Re-Run',
'rtf': 'Read The FAQ',
'rtfm': 'Read The Fucking Manual',
'rthx': 'Thanks For The Retweet',
'snafu': 'Situation Normal All Fucked Up',
'sob': 'Son Of a Bitch',
'stfu': 'Shut the Fuck Up',
'tmi': 'Too Much Information',
'ymmv': 'Your Mileage May Vary',
'yw': "You're Welcome",
"$": "Dollar",
"€": "Euro",
"4ao": "For Adults Only",
"a.m": "Before Midday",
"a3": "Anytime Anywhere Anyplace",
"aamof": "As A Matter Of Fact",
"acct": "Account",
"adih": "Another Day In Hell",
"afaic": "As Far As I Am Concerned",
"afaict": "As Far As I Can Tell",
"afaik": "As Far As I Know",
"afair": "As Far As I Remember",
"afk": "Away From Keyboard",
"app": "Application",
"approx": "Approximately",
"apps": "Applications",
"asap": "As Soon As Possible",
"asl": "Age, Sex, Location",
"atk": "At The Keyboard",
"ave.": "Avenue",
"aymm": "Are You My Mother",
"ayor": "At Your Own Risk",
"b&b": "Bed And Breakfast",
"b+b": "Bed And Breakfast",
"b.c": "Before Christ",
"b2b": "Business To Business",
"b2c": "Business To Customer",
"b4": "Before",
"b4n": "Bye For Now",
"b@u": "Back At You",
"bae": "Before Anyone Else",
"bak": "Back At Keyboard",
"bbbg": "Bye Bye Be Good",
"bbc": "British Broadcasting Corporation",
"bbias": "Be Back In A Second",
"bbl": "Be Back Later",
"bbs": "Be Back Soon",
"be4": "Before",
"bfn": "Bye For Now",
"blvd": "Boulevard",
"bout": "About",
"brb": "Be Right Back",
"bros": "Brothers",
"brt": "Be Right There",
"bsaaw": "Big Smile And A Wink",
"btw": "By The Way",
"bwl": "Bursting With Laughter",
"c/o": "Care Of",
"cet": "Central European Time",
"cf": "Compare",
"cia": "Central Intelligence Agency",
"csl": "Can Not Stop Laughing",
"cu": "See You",
"cul8r": "See You Later",
"cv": "Curriculum Vitae",
"cwot": "Complete Waste Of Time",
"cya": "See You",
"cyt": "See You Tomorrow",
"dae": "Does Anyone Else",
"dbmib": "Do Not Bother Me I Am Busy",
"diy": "Do It Yourself",
"dm": "Direct Message",
"dwh": "During Work Hours",
"e123": "Easy As One Two Three",
"eet": "Eastern European Time",
"eg": "Example",
"embm": "Early Morning Business Meeting",
"encl": "Enclosed",
"encl.": "Enclosed",
"etc": "And So On",
"faq": "Frequently Asked Questions",
"fawc": "For Anyone Who Cares",
"fb": "Facebook",
"fc": "Fingers Crossed",
"fig": "Figure",
"fimh": "Forever In My Heart",
"ft.": "Feet",
"ft": "Featuring",
"ftl": "For The Loss",
"ftw": "For The Win",
"fwiw": "For What It Is Worth",
"fyi": "For Your Information",
"g9": "Genius",
"gahoy": "Get A Hold Of Yourself",
"gal": "Get A Life",
"gcse": "General Certificate Of Secondary Education",
"gfn": "Gone For Now",
"gg": "Good Game",
"gl": "Good Luck",
"glhf": "Good Luck Have Fun",
"gmt": "Greenwich Mean Time",
"gmta": "Great Minds Think Alike",
"gn": "Good Night",
"g.o.a.t": "Greatest Of All Time",
"goat": "Greatest Of All Time",
"goi": "Get Over It",
"gps": "Global Positioning System",
"gr8": "Great",
"gratz": "Congratulations",
"gyal": "Girl",
"h&c": "Hot And Cold",
"hp": "Horsepower",
"hr": "Hour",
"hrh": "His Royal Highness",
"ht": "Height",
"ibrb": "I Will Be Right Back",
"ic": "I See",
"icq": "I Seek You",
"icymi": "In Case You Missed It",
"idc": "I Do Not Care",
"idgadf": "I Do Not Give A Damn Fuck",
"idgaf": "I Do Not Give A Fuck",
"idk": "I Do Not Know",
"ie": "That Is",
"i.e": "That Is",
"ifyp": "I Feel Your Pain",
"ig": "Instagram",
"iirc": "If I Remember Correctly",
"ilu": "I Love You",
"ily": "I Love You",
"imho": "In My Humble Opinion",
"imo": "In My Opinion",
"imu": "I Miss You",
"iow": "In Other Words",
"irl": "In Real Life",
"j4f": "Just For Fun",
"jic": "Just In Case",
"jk": "Just Kidding",
"jsyk": "Just So You Know",
"l8r": "Later",
"lb": "Pound",
"lbs": "Pounds",
"ldr": "Long Distance Relationship",
"lmao": "Laugh My Ass Off",
"lmfao": "Laugh My Fucking Ass Off",
"lol": "Laughing Out Loud",
"ltd": "Limited",
"ltns": "Long Time No See",
"m8": "Mate",
"mf": "Motherfucker",
"mfs": "Motherfuckers",
"mfw": "My Face When",
"mofo": "Motherfucker",
"mph": "Miles Per Hour",
"mr": "Mister",
"mrw": "My Reaction When",
"ms": "Miss",
"mte": "My Thoughts Exactly",
"nagi": "Not A Good Idea",
"nbc": "National Broadcasting Company",
"nbd": "Not Big Deal",
"nfs": "Not For Sale",
"ngl": "Not Going To Lie",
"nhs": "National Health Service",
"nrn": "No Reply Necessary",
"nsfl": "Not Safe For Life",
"nsfw": "Not Safe For Work",
"nth": "Nice To Have",
"nvr": "Never",
"nyc": "New York City",
"oc": "Original Content",
"og": "Original",
"ohp": "Overhead Projector",
"oic": "Oh I See",
"omdb": "Over My Dead Body",
"omg": "Oh My God",
"omw": "On My Way",
"p.a": "Per Annum",
"p.m": "After Midday",
"pm": "Prime Minister",
"poc": "People Of Color",
"pov": "Point Of View",
"pp": "Pages",
"ppl": "People",
"prw": "Parents Are Watching",
"ps": "Postscript",
"pt": "Point",
"ptb": "Please Text Back",
"pto": "Please Turn Over",
"qpsa": "What Happens",
"ratchet": "Rude",
"rbtl": "Read Between The Lines",
"rlrt": "Real Life Retweet",
"rofl": "Rolling On The Floor Laughing",
"roflol": "Rolling On The Floor Laughing Out Loud",
"rotflmao": "Rolling On The Floor Laughing My Ass Off",
"rt": "Retweet",
"ruok": "Are You Ok",
"sfw": "Safe For Work",
"sk8": "Skate",
"smh": "Shake My Head",
"sq": "Square",
"srsly": "Seriously",
"ssdd": "Same Stuff Different Day",
"tbh": "To Be Honest",
"tbs": "Tablespooful",
"tbsp": "Tablespooful",
"tfw": "That Feeling When",
"thks": "Thank You",
"tho": "Though",
"thx": "Thank You",
"tia": "Thanks In Advance",
"til": "Today I Learned",
"tl;dr": "Too Long I Did Not Read",
"tldr": "Too Long I Did Not Read",
"tmb": "Tweet Me Back",
"tntl": "Trying Not To Laugh",
"ttyl": "Talk To You Later",
"u": "You",
"u2": "You Too",
"u4e": "Yours For Ever",
"utc": "Coordinated Universal Time",
"w/": "With",
"w/o": "Without",
"w8": "Wait",
"wassup": "What Is Up",
"wb": "Welcome Back",
"wtf": "What The Fuck",
"wtg": "Way To Go",
"wtpa": "Where The Party At",
"wuf": "Where Are You From",
"wuzup": "What Is Up",
"wywh": "Wish You Were Here",
"yd": "Yard",
"ygtr": "You Got That Right",
"ynk": "You Never Know",
"zzz": "Sleeping Bored And Tired"
}


def convert_abbrev_text(text):
    """Expand the slang words of `text` using the `abbreviations` table.

    Words are looked up in lower case; words not in the table are kept as
    they are. The words of the result are joined by single spaces.
    Previously an expansion was appended without a following space, so it
    fused with the next word ("lol that" -> "Laughing Out Loudthat").
    """
    return " ".join(abbreviations.get(word.lower(), word) for word in text.split())

In [None]:
def convert_suspension_points(text):
    """Normalise every run of two or more '.', '?' or '!' to exactly three
    of that character."""
    res = text
    for pattern, replacement in (
        (r'[.]{2,}', "..."),
        (r'[?]{2,}', "???"),
        (r'[!]{2,}', "!!!"),
    ):
        res = re.sub(pattern, replacement, res)

    return res

In [None]:
def preprocess_all_at_once(text):
    """Run the full preprocessing pipeline on one tweet and return the result.

    The steps are applied in the order listed below, each one receiving the
    output of the previous one.
    """
    pipeline = (
        remove_urls,
        remove_html,
        remove_mention,
        handle_hashtag,
        convert_emojis,
        convert_emoticons,
        convert_suspension_points,
        correct_spelling,  # expensive
        convert_abbrev_text,
        rem_spaces_numbers,
        text_lowercase,
    )
    for step in pipeline:
        text = step(text)
    return text

In [None]:
from tqdm.auto import tqdm
tqdm.pandas()

# APPLY PREPROCESSING TO TRAIN
# The cleaned text goes to a new 'preprocessed_text' column; 'text' is kept.
# progress_apply is the tqdm-instrumented variant of apply.
train_df['preprocessed_text'] = train_df['text'].progress_apply(preprocess_all_at_once)

# APPLY PREPROCESSING TO TEST
test_df['preprocessed_text'] = test_df['text'].progress_apply(preprocess_all_at_once)

In [None]:
# Map textual label to categorical ( 0:negative, 1:neutral, 2:positive )

def map_sentiment(sentiment: str):
    """One-hot encode a sentiment label.

    Position 0 is negative, 1 neutral, 2 positive.

    Raises:
        ValueError: if `sentiment` is not one of the three labels.
    """
    encodings = (
        ("positive", (0, 0, 1)),
        ("negative", (1, 0, 0)),
        ("neutral", (0, 1, 0)),
    )
    for name, one_hot in encodings:
        if sentiment == name:
            # fresh list on every call, as callers may mutate it
            return list(one_hot)

    raise ValueError('Value not present')

# Replace the textual label by its one-hot list, in place.
# NOTE(review): not idempotent - on a second run the column holds lists, for
# which map_sentiment raises ValueError.
train_df["sentiment"] = train_df["sentiment"].apply(map_sentiment)

In [None]:
# 80/20 train/validation split with a fixed seed, stratified on the label.
# Note that `train_df` is rebound here to the training part only.
train_df, validation_df = train_test_split(train_df, test_size=0.2, random_state=42, stratify=train_df["sentiment"])

## Sentiment Analysis

In [None]:
!pip install -q transformers
!pip install tensorflow-addons
!pip install emoji==0.6.0

In [None]:
from transformers import BertTokenizer,AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained('cardiffnlp/bertweet-base-sentiment')

In [None]:
import tensorflow_datasets as tfds
import tensorflow as tf

In [None]:
# can be up to 512 for BERT
max_length = 128
batch_size = 32

def convert_example_to_feature(text):
    """Tokenize one text into fixed-length model inputs.

    Every example is padded and truncated to `max_length` tokens so all
    sequences have the same length and can be stacked into one dataset.
    Changes: `padding="max_length"` replaces the deprecated
    `pad_to_max_length=True`; `truncation=True` is added because without it
    a longer tweet produced a longer sequence; `return_token_type_ids=True`
    guarantees the key that encode_examples reads.

    Returns:
        The tokenizer's encoding with "input_ids", "token_type_ids" and
        "attention_mask".
    """
    return tokenizer.encode_plus(text,
                add_special_tokens = True, # add the model's special tokens
                max_length = max_length, # fixed sequence length fed to the model
                padding = "max_length", # pad shorter texts up to max_length
                truncation = True, # cut longer texts down to max_length
                return_attention_mask = True, # add attention mask to not focus on pad tokens
                return_token_type_ids = True, # always provide token_type_ids
    )

In [None]:
def map_example_to_dict(input_ids, attention_masks, token_type_ids, label):
    """Pack the encoded inputs into the feature dict the model expects.

    Returns:
        `(features, label)` when `label` is not None, otherwise `features`
        alone.
    """
    features = {
        "input_ids": input_ids,
        "token_type_ids": token_type_ids,
        "attention_mask": attention_masks,
    }
    if label is None:
        return features
    return features, label

In [None]:
from tqdm import tqdm
import numpy as np

def encode_examples(df, labels=True):
    """Tokenize `df["preprocessed_text"]` and wrap the result in a tf.data.Dataset.

    Fixes over the previous version: the final `return` statements were
    dedented to a level matching no enclosing block (an IndentationError),
    and the unlabeled branch passed `None` as a fourth component to
    `from_tensor_slices`, which cannot be turned into a tensor.

    Args:
        df: DataFrame with a "preprocessed_text" column and, when `labels`
            is true, a "sentiment" column.
        labels: whether to attach the "sentiment" values as targets.

    Returns:
        An unbatched tf.data.Dataset whose elements are produced by
        `map_example_to_dict`: `(features, label)` pairs when `labels` is
        true, otherwise the `features` dict alone.
    """
    # prepare lists, so that we can build up final TensorFlow dataset from slices.
    input_ids_list = []
    token_type_ids_list = []
    attention_mask_list = []
    for text in tqdm(df["preprocessed_text"], total=df.shape[0]):
        bert_input = convert_example_to_feature(text)
        input_ids_list.append(bert_input['input_ids'])
        token_type_ids_list.append(bert_input['token_type_ids'])
        attention_mask_list.append(bert_input['attention_mask'])

    # Component order matches the parameters of map_example_to_dict.
    features = (input_ids_list, attention_mask_list, token_type_ids_list)

    if labels:
        label_list = list(df["sentiment"])
        return tf.data.Dataset.from_tensor_slices(features + (label_list,)).map(map_example_to_dict)

    # No targets: slice the three feature lists only and pass label=None.
    return tf.data.Dataset.from_tensor_slices(features).map(
        lambda input_ids, attention_masks, token_type_ids: map_example_to_dict(
            input_ids, attention_masks, token_type_ids, None
        )
    )

In [None]:
train_df_encoded = encode_examples(train_df).batch(batch_size)
validation_df_encoded = encode_examples(validation_df).batch(batch_size)

In [None]:
from transformers import TFBertForSequenceClassification
from transformers import TFAutoModelForSequenceClassification
import tensorflow as tf
import tensorflow_addons as tfa

# recommended learning rate for Adam 5e-5, 3e-5, 2e-5
learning_rate = 2e-5
# we will do just 1 epoch, though multiple epochs might be better as long as we will not overfit the model
number_of_epochs = 1

# model initialization: pretrained checkpoint with a 3-class classification head
model = TFAutoModelForSequenceClassification.from_pretrained('cardiffnlp/bertweet-base-sentiment', num_labels=3)

# choosing Adam optimizer
optimizer = tf.keras.optimizers.Adam(learning_rate=learning_rate, epsilon=1e-08)
# the labels are one-hot vectors (see map_sentiment), hence the categorical -
# not sparse categorical - cross entropy and accuracy; from_logits=True because
# the loss is given raw scores rather than probabilities
loss = tf.keras.losses.CategoricalCrossentropy(from_logits=True)
metric = tf.keras.metrics.CategoricalAccuracy('accuracy')
model.compile(optimizer=optimizer, loss=loss, metrics=[metric])

In [None]:
bert_history = model.fit(train_df_encoded, epochs=number_of_epochs, validation_data=validation_df_encoded)

In [None]:
test_df_encoded = []

# Encode the *preprocessed* tweets: the model is fine-tuned on the
# 'preprocessed_text' column, so predicting on the raw 'text' column would feed
# it differently distributed inputs. Sequences are capped at the same
# max_length as in training.
for i in tqdm(range(test_df.shape[0])):
    test_df_encoded.append(tokenizer.encode(test_df.iloc[i]["preprocessed_text"],truncation=True,max_length=max_length,padding=True,return_tensors="tf"))

In [None]:
tf_output = []

# One predict() call per encoded tweet; [0] takes the first field of the model
# output - presumably the logits (TODO confirm).
for i in tqdm(range(len(test_df_encoded))):
    tf_output.append(model.predict(test_df_encoded[i])[0])

In [None]:
tf_output_flattened = list(map(lambda x: x.flatten(),tf_output))

In [None]:
tf_output_np = np.array(tf_output_flattened)

In [None]:
# Softmax over the class axis, then the index of the most probable class for
# each tweet, converted to a NumPy array.
tf_prediction = tf.nn.softmax(tf_output_np, axis=1)
labels = tf.argmax(tf_prediction, axis=1)
labels = labels.numpy()

In [None]:
# Predictions are in {0,1,2}, challenge format is {-1,0,1}.
# With the one-hot order of map_sentiment, 0/1/2 stand for
# negative/neutral/positive, so subtracting 1 gives -1/0/1.
test_df["sentiment"] = labels - 1

> # Interpretability

In [None]:
def get_gradients(text, model, tokenizer):
    """Gradient-based token importance for the model's prediction on `text`.

    The token ids are turned into one-hot vectors, multiplied by the
    embedding matrix, and the score of the predicted class is differentiated
    with respect to the one-hot input. The L2 norm of that gradient per
    token, divided by the largest norm, is the token's score.

    Args:
        text: the sentence to explain.
        model: sequence-classification model exposing `roberta.embeddings`
            and accepting `inputs_embeds`.
        tokenizer: tokenizer matching `model`.

    Returns:
        (gradients, token_words, prediction_label): the normalised score of
        every token (list of floats), the corresponding token strings, and
        the index of the predicted class.
    """

    def get_correct_span_mask(correct_index, token_size):
        # 1 x token_size float32 mask with a single 1 at `correct_index`
        span_mask = np.zeros((1, token_size))
        span_mask[0, correct_index] = 1
        span_mask = tf.constant(span_mask, dtype='float32')
        return span_mask

    # NOTE(review): assumes the first embedding weight is the word-embedding
    # matrix (vocab_size x hidden) - confirm for this checkpoint.
    embedding_matrix = model.roberta.embeddings.weights[0]
    encoded_tokens = tokenizer(text, return_tensors="tf")
    token_ids = list(encoded_tokens["input_ids"].numpy()[0])
    vocab_size = embedding_matrix.get_shape()[0]

    # convert token ids to one hot. We can't differentiate wrt to int token ids hence the need for one hot representation
    token_ids_tensor = tf.constant([token_ids], dtype='int32')
    token_ids_tensor_one_hot = tf.one_hot(token_ids_tensor, vocab_size)

    # only the explicitly watched one-hot input is tracked, not the model variables
    with tf.GradientTape(watch_accessed_variables=False) as tape:
        # (i) watch input variable
        tape.watch(token_ids_tensor_one_hot)

        # multiply input model embedding matrix; allows us do backprop wrt one hot input
        inputs_embeds = tf.matmul(token_ids_tensor_one_hot,embedding_matrix)

        # (ii) get prediction
        pred_scores = model({"inputs_embeds": inputs_embeds, "attention_mask": encoded_tokens["attention_mask"] } ).logits
        max_class = tf.argmax(pred_scores, axis=1).numpy()[0]

        # get mask for predicted score class
        score_mask = get_correct_span_mask(max_class, pred_scores.shape[1])

        # zero out all predictions outside of the predicted class; we want to get gradients wrt to just this class
        predict_correct_class = tf.reduce_sum(pred_scores * score_mask )

        # (iii) get gradient of input with respect to prediction class,
        # reduced to one L2 norm per token (norm over the vocabulary axis)
        gradient_non_normalized = tf.norm(
            tape.gradient(predict_correct_class, token_ids_tensor_one_hot),axis=2)

        # (iv) normalize gradient scores and return them as "explanations"
        # NOTE(review): divides by the maximum norm without guarding against 0.
        gradient_tensor = (
            gradient_non_normalized /
            tf.reduce_max(gradient_non_normalized)
        )
        gradients = gradient_tensor[0].numpy().tolist()
        token_words = tokenizer.convert_ids_to_tokens(token_ids)

        prediction_label= max_class
    return gradients, token_words , prediction_label

In [None]:
import matplotlib.pyplot as plt

def plot_gradients(tokens, gradients, title, label):
    """Plot per-token explainability scores as a bar chart.

    Parameters
    ----------
    tokens : sequence of str
        Tokens, shown as the x tick labels (one bar per token).
    gradients : sequence of float
        One score per token. Each score is used both as the bar height and
        as the alpha channel of the bar colour, so it is expected to lie in
        [0, 1].
    title : str
        Figure title.
    label : str
        'Negative' draws red bars, 'Neutral' yellow bars, anything else green.

    Side effects: sets ``plt.rcParams['axes.facecolor']`` to white and shows
    the figure.
    """
    base_rgb = {'Negative': (1, 0, 0), 'Neutral': (1, 1, 0)}.get(label, (0, 1, 0))
    colors = [(*base_rgb, score) for score in gradients]

    # Numeric bar positions: tokens can repeat, so they cannot be used as
    # categorical x values directly; they are attached as tick labels below.
    positions = list(range(len(tokens)))

    plt.figure(figsize=(21,3))
    plt.rcParams['axes.facecolor'] = 'white'
    plt.bar(positions, gradients, color=colors, linewidth=1 )
    plt.xlabel("Token", fontsize=25 , color='black')
    plt.ylabel("Explainability score", fontsize=25, color='black')
    plt.xticks(ticks=positions, labels=tokens, fontsize=20, rotation=90, color='black')
    plt.yticks(fontsize=20, color='black')
    plt.title(title, fontsize=30)
    # Must run after the artists exist, otherwise there is nothing to lay out
    # and the rotated tick labels can be clipped.
    plt.tight_layout()
    plt.show()

In [None]:
texts = test_df["text"]

# Collect, for every tweet, its tokens, the predicted label and the per-token
# gradient scores returned by `get_gradients`.
# Iterate over the values instead of `texts[i]`: indexing a Series with an
# integer is a label lookup, which only matches the i-th row when the index is
# the default 0..n-1 range.
results_interpretability = []
for text in tqdm(texts, total=len(texts)):
    gradients, words, label = get_gradients(text, model, tokenizer)
    results_interpretability.append(
        {
            "sentence": text,
            "words": words,
            "label": label,
            "gradients": gradients,
        }
    )

In [None]:
# Tabulate the collected explanation records (one dict per tweet) as a DataFrame
results_interpretability_df = pd.DataFrame(results_interpretability)

In [None]:
import json
from tqdm.notebook import tqdm
import ast
# Register tqdm's pandas integration; the cells below call `progress_apply`
tqdm.pandas()

def find_indices(lst, condition):
  """Return the positions in `lst` whose element makes `condition` truthy."""
  matches = []
  for position, item in enumerate(lst):
    if condition(item):
      matches.append(position)
  return matches


def unite_tokens(row):
  """Merge sub-word tokens back into whole words, one score per word.

  A token containing "@@" is glued to the token that follows it, with the
  marker stripped (presumably "@@" is the tokenizer's sub-word continuation
  marker - confirm against the tokenizer in use). A merged word is given the
  maximum gradient of its pieces.

  Parameters
  ----------
  row : pd.Series or mapping
      Must provide "words", "sentence", "gradients" and "label". "words" and
      "gradients" may be lists, or the string form of a list as read back
      from a CSV file.

  Returns
  -------
  pd.Series
      Indexed by ['words', 'sentence', 'label', 'gradients'], where 'words'
      and 'gradients' are the merged lists.
  """
  words = row["words"]
  if isinstance(words, str):
    words = ast.literal_eval(words)
  gradients = row["gradients"]
  if isinstance(gradients, str):
    gradients = ast.literal_eval(gradients)
  sentence = row["sentence"]
  label = row["label"]

  new_words = []
  new_gradients = []

  # True when the previous token carried the marker, i.e. the current token
  # continues the word that is being assembled.
  continues_previous = False
  for i, word in enumerate(words):
    piece = word.replace("@@", "")
    if continues_previous:
      new_words[-1] += piece
      new_gradients[-1] = max(gradients[i], new_gradients[-1])
    else:
      new_words.append(piece)
      new_gradients.append(gradients[i])
    # Decided token by token, so that two split words in a row
    # ("he@@", "llo", "wor@@", "ld") stay two words instead of being fused.
    continues_previous = "@@" in word

  new_row = pd.Series(data=[new_words, sentence,  label, new_gradients], index=['words', 'sentence', 'label','gradients'])

  return new_row


# Reload the saved per-token results, merge sub-word tokens row by row, and
# write the processed frame back to the same Drive folder.
results_interpretability_df = pd.read_csv(
    "/content/drive/MyDrive/AML/Challenge_3/results_interpretability_df.csv"
)

results_processed_interpretability_df = results_interpretability_df.progress_apply(
    unite_tokens,
    axis=1,
)

results_processed_interpretability_df.to_csv(
    "/content/drive/MyDrive/AML/Challenge_3/results_processed_interpretability_df.csv",
    index=False,
)

In [None]:
import ast
def get_test_selected_text(row):
    """Parse the string-encoded "words" and "gradients" fields of `row`.

    Both fields are replaced in place by the Python literal they spell out
    (via `ast.literal_eval`), and the same `row` object is returned.
    """
    for column in ("words", "gradients"):
        row[column] = ast.literal_eval(row[column])
    return row

# Load the processed results and turn the string-encoded list columns back
# into Python lists, row by row.
results_processed_interpretability_df = (
    pd.read_csv("results_processed_interpretability_df.csv")
    .progress_apply(get_test_selected_text, axis=1)
)

In [None]:
# Class index stored in the "label" column -> name used in titles and colours
sentiment_dict = {0: "Negative", 1: "Neutral", 2: "Positive"}

# Plot the explanation of the first ten rows (fewer if the frame is shorter)
for i in range(min(10, len(results_processed_interpretability_df))):
    row = results_processed_interpretability_df.iloc[i]
    words, gradients = row["words"], row["gradients"]
    sentence = row["sentence"]
    label = sentiment_dict[row["label"]]

    plot_gradients(words, gradients, f"Prediction: {label} | {sentence} ", label)

In [None]:
from nltk.translate.bleu_score import sentence_bleu
from collections import defaultdict

# BLEU SCORE
# For every threshold, keep the tokens whose explainability score is above it
# and score them against the annotated `selected_text` of the same row.

bleu_scores = defaultdict(list)

thresholds = [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9]
for i in range(len(results_processed_interpretability_df)):
    row = results_processed_interpretability_df.iloc[i,:]

    # Row-level data does not depend on the threshold: build it once per row.
    words = np.array(row["words"])
    gradients = np.array(row["gradients"])
    reference = test_df.iloc[i]["selected_text"].split(" ")

    for threshold in thresholds:
        hypothesis = words[gradients > threshold].tolist()

        # sentence_bleu(references, hypothesis): the ground truth goes first,
        # wrapped in a list of references, and the candidate tokens second.
        bleu_score = sentence_bleu([reference], hypothesis)

        bleu_scores[threshold].append(bleu_score)

In [None]:
import seaborn as sns

# One histogram of BLEU scores per threshold, on a white background
plt.rcParams['axes.facecolor'] = 'white'
axis_label_style = {"fontsize": 15, "color": 'black'}

for threshold in thresholds:
    sns.displot(bleu_scores[threshold])
    plt.title(f'Threshold used: {threshold}', fontsize=20)
    for recolor_ticks in (plt.xticks, plt.yticks):
        recolor_ticks(color="black")
    plt.xlabel("Score", **axis_label_style)
    plt.ylabel("Count", **axis_label_style)
    plt.show()

> # Results analysis

In [None]:
# Encode every validation tweet on its own (no batching), in dataframe order
validation_df_encoded_no_batch = [
    tokenizer.encode(tweet, truncation=True, padding=True, return_tensors="tf")
    for tweet in tqdm(validation_df["text"])
]

In [None]:
# One `predict` call per encoded tweet; element 0 of each result is kept
tf_output_validation = [
    model.predict(encoded_tweet)[0]
    for encoded_tweet in tqdm(validation_df_encoded_no_batch)
]

In [None]:
# Stack the per-tweet outputs into a 2-D array: one flattened row per tweet
tf_output_val_flattened = [output.flatten() for output in tf_output_validation]
tf_output_val_np = np.array(tf_output_val_flattened)

# Softmax over each row, then take the most probable class index and shift it
# down by one.
tf_val_prediction = tf.nn.softmax(tf_output_val_np, axis=1)
labels_val = tf.argmax(tf_val_prediction, axis=1).numpy() - 1

In [None]:
# True labels: argmax of each "sentiment" entry shifted down by one, the same
# shift applied to the predictions in `labels_val`.
# NOTE(review): presumably "sentiment" holds one-hot vectors - confirm.
y_val = validation_df["sentiment"].apply(np.argmax) - 1
y_val_np = y_val.values

data_with_error = []
error_indexes = []

# Record (and print) every tweet whose prediction differs from the true label
for i in range(len(labels_val)):
    if labels_val[i] == y_val_np[i]:
        continue
    misclassified_text = validation_df["text"].iloc[i]
    data_with_error.append([misclassified_text, labels_val[i], y_val_np[i]])
    error_indexes.append(i)
    print(misclassified_text)

count_error = len(error_indexes)
print(f"Accuracy: {(1 - count_error/len(labels_val)):.2f}")

df_data_with_error = pd.DataFrame(data_with_error, columns=["text","y pred","y true"])

In [None]:
from sklearn.metrics import confusion_matrix

# Row/column names for the confusion matrix below.
# NOTE(review): the order is assumed to match the sorted label values used for
# the predictions and true labels - confirm.
class_names = ["negative","neutral","positive"]

def show_confusion_matrix(confusion_matrix):
    """Draw `confusion_matrix` as an annotated heatmap with the "Blues" colormap.

    Y tick labels are kept horizontal and x tick labels are rotated by 30
    degrees; the axes are labelled "True sentiment" (y) and
    "Predicted sentiment" (x). The figure is not shown explicitly.
    """
    hmap = sns.heatmap(confusion_matrix, annot=True, cmap="Blues")
    for axis, angle in ((hmap.yaxis, 0), (hmap.xaxis, 30)):
        axis.set_ticklabels(axis.get_ticklabels(), rotation=angle, ha='right')

    axis_label_style = {"fontsize": 15, "color": 'black'}
    plt.ylabel('True sentiment', **axis_label_style)
    plt.xlabel('Predicted sentiment', **axis_label_style)
    for recolor_ticks in (plt.xticks, plt.yticks):
        recolor_ticks(color='black')
    plt.title('Confusion Matrix', fontsize=20)

# Confusion matrix normalised over the true labels (each row sums to 1),
# labelled with the class names and drawn as a heatmap.
cm = confusion_matrix(y_true=y_val_np, y_pred=labels_val, normalize='true')
df_cm = pd.DataFrame(data=cm, index=class_names, columns=class_names)
show_confusion_matrix(df_cm)