# <a name="p1">Importing Dependencies</a>

In [None]:
# Mount Google Drive at /content/drive (Colab-specific); the CSV inputs and
# the pickled model used in this notebook are addressed under that mount point.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Utilities
import pandas as pd
import numpy as np
import re
import pickle

# Machine Learning
import nltk
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn import naive_bayes
from sklearn.metrics import roc_auc_score
from sklearn.metrics import confusion_matrix, classification_report

# Emoji Library
# Load the emoji sentiment table from Drive into `emoji`.
# NOTE(review): the same file is read into `emoji` again in the
# "Import Emoji Dataset" section below, so one of the two loads is redundant.
emoji= pd.read_csv('/content/drive/MyDrive/NLP(sentiment analysis and emoji analysis)/emoji.csv')

# <a name="p2">Importing Dataset</a>




In [None]:
# Column names to assign to the tweets CSV, in file order.
# presumably the CSV ships without a header row — TODO confirm
DATASET_COLUMNS  = ["sentiment", "ids", "date", "flag", "user", "text"]

# Read the tweets file as latin-1 and label its columns with DATASET_COLUMNS.
tweet = pd.read_csv('/content/drive/MyDrive/NLP(sentiment analysis and emoji analysis)/tweets.csv',names= DATASET_COLUMNS, encoding='latin-1')

In [None]:
# Check that the column names passed to read_csv were applied as expected.

tweet.columns

Index(['sentiment', 'ids', 'date', 'flag', 'user', 'text'], dtype='object')

In [None]:
# Drop unused columns; only `sentiment` and `text` are read by the cells below.

tweet = tweet.drop(columns= ['ids','date','flag','user'])

In [None]:
# First 5 rows of the dataframe (head() previews rows, not columns)

tweet.head()

Unnamed: 0,sentiment,text
0,0,"@switchfoot http://twitpic.com/2y1zl - Awww, t..."
1,0,is upset that he can't update his Facebook by ...
2,0,@Kenichan I dived many times for the ball. Man...
3,0,my whole body feels itchy and like its on fire
4,0,"@nationwideclass no, it's not behaving at all...."


In [None]:
# Recode the sentiment label 4 as 1 so the column is binary
# (0 = negative, 1 = positive, the convention used by the model cells below).

tweet['sentiment'] = tweet['sentiment'].replace(4,1)

In [None]:
# Distinct label values present in the sentiment column after recoding

tweet.sentiment.unique()

array([0, 1])

In [None]:
# Per column: True if the column contains at least one missing value

tweet.isna().any()

sentiment    False
text         False
dtype: bool

In [None]:
# Number of missing values in each column

tweet.isna().sum()

sentiment    0
text         0
dtype: int64

# <a name="p3">Import Emoji Dataset</a>



In [None]:
# Setup the data for emoji
# (Re)load the emoji table and preview its first rows. `emoji` was already
# read from this same path in the imports cell at the top of the notebook.

emoji = pd.read_csv("/content/drive/MyDrive/NLP(sentiment analysis and emoji analysis)/emoji.csv")
emoji.head()

Unnamed: 0,Emoji,Unicode codepoint,Occurrences,Position,Negative,Neutral,Positive,Unicode name,Unicode block
0,😂,0x1f602,14622,0.805101,3614,4163,6845,FACE WITH TEARS OF JOY,Emoticons
1,❤,0x2764,8050,0.746943,355,1334,6361,HEAVY BLACK HEART,Dingbats
2,♥,0x2665,7144,0.753806,252,1942,4950,BLACK HEART SUIT,Miscellaneous Symbols
3,😍,0x1f60d,6359,0.765292,329,1390,4640,SMILING FACE WITH HEART-SHAPED EYES,Emoticons
4,😭,0x1f62d,5526,0.803352,2412,1218,1896,LOUDLY CRYING FACE,Emoticons


# <a name="p4">Determine Emoji Polarity</a>




In [None]:
# Derive a binary polarity for every emoji from its annotation counts.
# 0 = negative, 1 = positive
#
# An emoji is labelled positive when either
#   * it has more Positive than Negative annotations, or
#   * Positive and Negative are tied and the Neutral count is odd
#     (the tie-breaker rule used by this notebook).
# Every other emoji stays negative.
more_positive = emoji['Positive'] > emoji['Negative']
tie_with_odd_neutral = (emoji['Positive'] == emoji['Negative']) & (emoji['Neutral'] % 2 != 0)

# Column-wise comparison replaces the former row-by-row iterrows() loop;
# the result is still exposed as a plain list of ints.
polarity_ls = (more_positive | tie_with_odd_neutral).astype(int).tolist()

# create new emoji dataset: one row per emoji, default 0..n-1 index
# (the previous `new_emoji_df.reset_index()` call discarded its result and
# therefore did nothing, so it has been removed)
new_emoji_df = pd.DataFrame(polarity_ls, columns=['sentiment'])
new_emoji_df['emoji'] = emoji['Emoji'].values
new_emoji_df

Unnamed: 0,sentiment,emoji
0,1,😂
1,1,❤
2,1,♥
3,1,😍
4,0,😭
...,...,...
964,1,➛
965,1,♝
966,1,❋
967,1,✆


# <a name="p5">Split Emojis in two polarities</a>



In [None]:
def sentiment_dataset(df, polarity):
    """Return the rows of ``df`` whose ``sentiment`` equals ``polarity``.

    The result is a new DataFrame whose index is renumbered from 0; the
    input frame is left untouched.
    """
    matches = df['sentiment'] == polarity
    return df[matches].reset_index(drop=True)

In [None]:
# Positive Emojis: rows of new_emoji_df whose sentiment is 1

positive_emoji = sentiment_dataset(new_emoji_df, 1)
positive_emoji

Unnamed: 0,sentiment,emoji
0,1,😂
1,1,❤
2,1,♥
3,1,😍
4,1,😘
...,...,...
790,1,➛
791,1,♝
792,1,❋
793,1,✆


In [None]:
# Negative emojis: rows of new_emoji_df whose sentiment is 0

negative_emoji = sentiment_dataset(new_emoji_df, 0)
negative_emoji

Unnamed: 0,sentiment,emoji
0,0,😭
1,0,😩
2,0,😒
3,0,😔
4,0,█
...,...,...
169,0,🕔
170,0,🈂
171,0,🎰
172,0,҂


# <a name="p6">Preprocess Text</a>





In [None]:
# Start from the raw tweet text column; `posts` is rebound to a cleaned list
# of strings by the next cell.
posts = tweet['text']

In [None]:
# Tokens that start with any of these prefixes are dropped: mentions, links,
# HTML entities and hashtags. Defined once instead of on every iteration.
remove_keys = ('@', 'http://','https://', '&', '#')

# Rebuild every tweet from its whitespace-separated tokens, skipping the
# tokens that start with one of the prefixes above.
posts = [
    ' '.join(word for word in raw_text.split() if not word.startswith(remove_keys))
    for raw_text in posts
]
tweet['text'] = posts

# Preview only: displaying the full list would render every tweet in the
# dataset into the notebook output.
posts[:10]

["- Awww, that's a bummer. You shoulda got David Carr of Third Day to do it. ;D",
 "is upset that he can't update his Facebook by texting it... and might cry as a result School today also. Blah!",
 'I dived many times for the ball. Managed to save 50% The rest go out of bounds',
 'my whole body feels itchy and like its on fire',
 "no, it's not behaving at all. i'm mad. why am i here? because I can't see you all over there.",
 'not the whole crew',
 'Need a hug',
 "hey long time no see! Yes.. Rains a bit ,only a bit LOL , I'm fine thanks , how's you ?",
 "nope they didn't have it",
 'que me muera ?',
 "spring break in plain city... it's snowing",
 'I just re-pierced my ears',
 "I couldn't bear to watch it. And I thought the UA loss was embarrassing . . . . .",
 'It it counts, idk why I did either. you never talk to me anymore',
 "i would've been the first, but i didn't have a gun. not really though, zac snyder's just a doucheclown.",
 'I wish I got to watch it with you!! I miss you and 

In [None]:
def _clean_tweet_text(text):
    """Normalise one tweet: lowercase it, strip links/entities, filter characters.

    The text is lowercased first. The final character filter only keeps
    ``a-z``; applied to mixed-case text it deleted every capital letter
    (e.g. "Awww" became "www" and "You" became "ou").
    """
    text = text.lower()
    # remove http/https URLs
    text = re.sub(r'https?:\/\/\S+', '', text)
    # remove bare "www...com" / "<name>.com" style references
    text = re.sub(r"www\.[a-z]?\.?(com)+|[a-z]+\.(com)", '', text)
    # remove the literal "{link}" placeholder
    text = re.sub(r'{link}', '', text)
    # remove HTML entities such as &amp; or &quot;
    text = re.sub(r'&[a-z]+;', '', text)
    # keep only lowercase letters, whitespace and the punctuation listed in
    # the character class; one pass is enough because the filter is idempotent
    text = re.sub(r"[^a-z\s\(\-:\)\\\/\];='#]", '', text)
    return text


tweet.text = tweet.text.apply(_clean_tweet_text)

In [None]:
# Convert the data to Unicode (essentially a **string** in Python3)
# so that every entry of the text column is a str before it is tokenised.

tweet["text"]=tweet["text"].astype('U')

In [None]:
# Text emoticons and the emoji each one is replaced with. The two lists are
# paired by position: txt_emoji[i] is converted to txt_emoji_pic[i].
# NOTE(review): the character filter applied to tweet.text earlier keeps only
# a-z, whitespace and ( - : ) \ / ] ; = ' #, so entries that contain an
# uppercase letter, a digit, '<', '|' or '*' cannot occur in the cleaned text
# and will never be matched there — confirm whether that is intended.
txt_emoji = [
    ':)', ':P', ':D', ':|', ":'(", ':O', ":*", '<3', ':(', ';)',
    'xD', ':/', '=D'
]
txt_emoji_pic =[
    '😊', '😛', '😄', '😐', '😢', '😲', '😘', '😍', '😧', '😉',
    '😁', '😒', '😀'
]

In [None]:
# Function to convert text to emoji icons

def convert_emoji(txt, conv_txt, conv_pic):
    """Replace tokens by their paired emoji and join the result with spaces.

    Parameters
    ----------
    txt : iterable of str
        Tokens of one sentence.
    conv_txt : sequence of str
        Tokens to look for.
    conv_pic : sequence of str
        Replacement for each entry of ``conv_txt``, paired by position.

    Returns
    -------
    str
        The tokens joined by single spaces, with every token found in
        ``conv_txt`` replaced by its paired entry of ``conv_pic``.

    Each token is translated at most once: a replacement value is never
    looked up again, so a replacement that happens to equal another key is
    not remapped a second time. If ``conv_txt`` contains duplicates, the
    first occurrence wins.
    """
    # Build the lookup once so each token costs a single dict access instead
    # of a scan over the whole conversion table.
    lookup = {}
    for key, pic in zip(conv_txt, conv_pic):
        lookup.setdefault(key, pic)
    return ' '.join(lookup.get(token, token) for token in txt)


In [None]:
# Function to split texts into tokens and call convert_emoji to turn text emoticons into emojis.

def conv_emoji_on_data(df_data):
    """Convert text emoticons to emojis in every row of ``df_data['text']``.

    Each text is split on whitespace and passed through ``convert_emoji``
    with the ``txt_emoji`` / ``txt_emoji_pic`` tables.

    Returns a list of converted strings, one per row, in row order.
    """
    # Iterate over the column directly: iterrows() builds a Series per row,
    # which is needlessly slow when only one column is read.
    return [
        convert_emoji(sentence.split(), txt_emoji, txt_emoji_pic)
        for sentence in df_data['text']
    ]

In [None]:
# Convert text-based emoticons in every tweet (the whole `tweet` frame, not
# only positive ones) into emoji symbols.
# NOTE(review): the bare `conv_text` below displays the entire list; consider
# previewing a slice instead.

conv_text = conv_emoji_on_data(tweet)
conv_text

["- www that's a bummer ou shoulda got avid arr of hird ay to do it ;",
 "is upset that he can't update his acebook by texting it and might cry as a result chool today also lah",
 'dived many times for the ball anaged to save he rest go out of bounds',
 'my whole body feels itchy and like its on fire',
 "no it's not behaving at all i'm mad why am i here because can't see you all over there",
 'not the whole crew',
 'eed a hug',
 "hey long time no see es ains a bit only a bit 'm fine thanks how's you",
 "nope they didn't have it",
 'que me muera',
 "spring break in plain city it's snowing",
 'just re-pierced my ears',
 "couldn't bear to watch it nd thought the loss was embarrassing",
 't it counts idk why did either you never talk to me anymore',
 "i would've been the first but i didn't have a gun not really though zac snyder's just a doucheclown",
 'wish got to watch it with you miss you and how was the premiere',
 "ollis' death scene will hurt me severely to watch on film wry is direc

In [None]:
# Convert the following words into these emoji icons.
# The lists are paired by position: add_emoji_txt[i] -> add_emoji_pic[i].
# 'crying' and 'smile' previously pointed at each other's emoji
# ('crying' -> 😆, 'smile' -> 😭); the pairing below corrects that.

add_emoji_txt = ['sad', 'unhappy', 'crying', 'smile', 'happy', 'love']
add_emoji_pic =['😔', '😧', '😭', '😆', '😊', '😍']


In [None]:
# Function to convert selected words in a list of sentences into emojis.

def add_emoji_text(df_data):
    """Replace the words listed in ``add_emoji_txt`` by their emoji.

    ``df_data`` is an iterable of strings. Every string is split on
    whitespace and run through ``convert_emoji`` with the
    ``add_emoji_txt`` / ``add_emoji_pic`` tables. Returns the converted
    strings as a list, in input order.
    """
    return [
        convert_emoji(sentence.split(), add_emoji_txt, add_emoji_pic)
        for sentence in df_data
    ]

In [None]:
# Convert the selected words (add_emoji_txt) into emojis in every converted tweet.
# NOTE(review): the bare `text_conv` below displays the entire list; consider
# previewing a slice instead.

text_conv = add_emoji_text(conv_text)
text_conv

["- www that's a bummer ou shoulda got avid arr of hird ay to do it ;",
 "is upset that he can't update his acebook by texting it and might cry as a result chool today also lah",
 'dived many times for the ball anaged to save he rest go out of bounds',
 'my whole body feels itchy and like its on fire',
 "no it's not behaving at all i'm mad why am i here because can't see you all over there",
 'not the whole crew',
 'eed a hug',
 "hey long time no see es ains a bit only a bit 'm fine thanks how's you",
 "nope they didn't have it",
 'que me muera',
 "spring break in plain city it's snowing",
 'just re-pierced my ears',
 "couldn't bear to watch it nd thought the loss was embarrassing",
 't it counts idk why did either you never talk to me anymore',
 "i would've been the first but i didn't have a gun not really though zac snyder's just a doucheclown",
 'wish got to watch it with you miss you and how was the premiere',
 "ollis' death scene will hurt me severely to watch on film wry is direc

In [None]:
# TFIDF vectorizer
# Downloads NLTK's stop-word corpus and configures a TfidfVectorizer that
# lowercases, strips accents to ASCII and drops English stop words.
# NOTE(review): `vectorizer` is assigned again with a default TfidfVectorizer()
# in the feature-extraction cell further down, before anything is fitted, so
# this configuration is not the one the model is trained with — confirm which
# is intended.
import nltk
nltk.download('stopwords')
stopset = set(stopwords.words('english'))
vectorizer = TfidfVectorizer(use_idf=True, lowercase=True,
                            strip_accents='ascii', stop_words=stopset)

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [None]:
# Print every emoji next to its binary sentiment, then record the totals:
# e_c = number of emojis listed, p = number of those with a positive label.
for symbol, label in zip(new_emoji_df['emoji'], new_emoji_df['sentiment']):
    print(f"{symbol} = {label}")

e_c = len(new_emoji_df)
p = sum(1 for label in new_emoji_df['sentiment'] if label)

😂 = 1
❤ = 1
♥ = 1
😍 = 1
😭 = 0
😘 = 1
😊 = 1
👌 = 1
💕 = 1
👏 = 1
😁 = 1
☺ = 1
♡ = 1
👍 = 1
😩 = 0
🙏 = 1
✌ = 1
😏 = 1
😉 = 1
🙌 = 1
🙈 = 1
💪 = 1
😄 = 1
😒 = 0
💃 = 1
💖 = 1
😃 = 1
😔 = 0
😱 = 1
🎉 = 1
😜 = 1
☯ = 1
🌸 = 1
💜 = 1
💙 = 1
✨ = 1
😳 = 1
💗 = 1
★ = 1
█ = 0
☀ = 1
😡 = 0
😎 = 1
😢 = 1
💋 = 1
😋 = 1
🙊 = 1
😴 = 0
🎶 = 1
💞 = 1
😌 = 1
🔥 = 1
💯 = 1
🔫 = 0
💛 = 1
💁 = 1
💚 = 1
♫ = 1
😞 = 0
😆 = 1
😝 = 1
😪 = 0
� = 1
😫 = 0
😅 = 1
👊 = 1
💀 = 0
😀 = 1
😚 = 1
😻 = 1
© = 1
👀 = 1
💘 = 1
🐓 = 1
☕ = 1
👋 = 1
✋ = 1
🎊 = 1
🍕 = 1
❄ = 1
😥 = 1
😕 = 0
💥 = 1
💔 = 0
😤 = 0
😈 = 1
► = 1
✈ = 1
🔝 = 1
😰 = 0
⚽ = 1
😑 = 0
👑 = 1
😹 = 1
👉 = 1
🍃 = 1
🎁 = 1
😠 = 0
🐧 = 1
☆ = 1
🍀 = 1
🎈 = 1
🎅 = 1
😓 = 0
😣 = 0
😐 = 0
✊ = 1
😨 = 0
😖 = 0
💤 = 1
💓 = 1
👎 = 0
💦 = 1
✔ = 1
😷 = 0
⚡ = 1
🙋 = 1
🎄 = 1
💩 = 0
🎵 = 1
➡ = 1
😛 = 1
😬 = 1
👯 = 1
💎 = 1
🌿 = 1
🎂 = 1
🌟 = 1
🔮 = 1
❗ = 1
👫 = 1
🏆 = 1
✖ = 1
☝ = 1
😙 = 1
⛄ = 1
👅 = 1
♪ = 1
🍂 = 1
💏 = 1
🔪 = 1
🌴 = 1
👈 = 1
🌹 = 1
🙆 = 1
➜ = 1
👻 = 1
💰 = 1
🍻 = 1
🙅 = 0
🌞 = 1
🍁 = 1
⭐ = 1
▪ = 1
🎀 = 1
━ = 1
☷ = 1
🐷 = 1
🙉 = 1
🌺 = 1
💅 = 1
🐶 = 1
🌚 = 1
👽 = 1
🎤 = 1
👭 = 1
🎧 = 

In [None]:
# Check the percentage of Positive emojis in the dataset
# (p = positive count, e_c = total count, both computed in the previous cell)

print(f'Total Positive Emojis are ({p}:{e_c}) or {round(p / e_c * 100)}%')

Total Positive Emojis are (795:969) or 82%


In [None]:
# dependent variable will be linked as: 0 = negative, 1 = positive
y = tweet.sentiment
# convert 'sentence' from text to features
# NOTE(review): this rebinds `vectorizer` to a default TfidfVectorizer(),
# replacing the stop-word-configured instance created in the TFIDF cell above
# — confirm which configuration is intended.
# NOTE(review): the features are built from tweet.text; the emoji-converted
# lists `conv_text` / `text_conv` computed earlier are not used here.
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(tweet.text)

# sanity check: one label per row of the feature matrix
print(y.shape)
print(X.shape)
print(f'{X.shape[0]} observations X {X.shape[1]} unique words')


(1600000,)
(1600000, 363118)
1600000 observations X 363118 unique words


# <a name="p10">Splitting the Data</a>

In [None]:
# Test Train Split
# Hold out 25% of the rows for evaluation. The seed is fixed so that the
# split, and therefore the score printed below, is the same on every run
# (random_state=None produced a different split each time).
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)

# we will train a naive bayes classifier
clf = naive_bayes.MultinomialNB()

clf.fit(X_train, y_train)

# evaluate on the held-out rows: ROC AUC of the predicted probability of
# class 1 (note this is a ranking metric, not accuracy)
roc_auc_score(y_test, clf.predict_proba(X_test)[:,1])


0.8486703258792468

# <a name="p11">Save the Model</a>



In [None]:
# Persist the trained classifier. The context manager guarantees the file is
# closed even if pickling raises.
# NOTE(review): only `clf` is saved; predictions also need the fitted
# `vectorizer` (see get_sentiment), which is not persisted here.
with open('/content/drive/MyDrive/NLP(sentiment analysis and emoji analysis)/Sentiment-NB.pickle','wb') as file:
    pickle.dump(clf, file)

# <a name="p12">Process inputs - Text and Emojis</a>



In [None]:
# Function that separates an input string into its plain text and its emojis

text = "Hello how are you? 💗😍 How is your day going?"

def extract_emoji_text(text = text):
    """Split ``text`` into (clean_text, clean_emoji).

    Parameters
    ----------
    text : str
        Raw user input; defaults to the module-level sample sentence.

    Returns
    -------
    tuple of (str, str)
        ``clean_text``: the whitespace-separated tokens of ``text`` joined by
        single spaces, without tokens that start with '@', 'http://',
        'https://', '&' or '#', and with every known emoji character removed
        from the remaining tokens.
        ``clean_emoji``: every character of ``text`` that appears in
        ``emoji.Emoji``, concatenated in order of appearance.

    Side effects
    ------------
    Rebinds the module-level names ``allchars`` (all characters of ``text``)
    and ``emoji_list`` (the emoji characters found); the demo below prints
    them.

    A token such as "great😍" now contributes "great" to the text and "😍" to
    the emojis. Previously the whole token was dropped from the text and
    returned as "emoji", letters included.
    """
    global allchars, emoji_list
    # remove all tagging and links, not needed for sentiments
    remove_keys = ('@', 'http://', 'https://', '&', '#')
    known_emoji = set(emoji.Emoji.values)

    # setup the input, get the characters and the emoji lists
    allchars = list(text)
    emoji_list = [char for char in allchars if char in known_emoji]

    # extract text: drop tagged/link tokens, strip emoji characters from the rest
    kept_tokens = []
    for token in text.split():
        if token.startswith(remove_keys):
            continue
        plain = ''.join(char for char in token if char not in known_emoji)
        if plain:
            kept_tokens.append(plain)
    clean_text = ' '.join(kept_tokens)

    # extract emoji: only the emoji characters themselves
    clean_emoji = ''.join(emoji_list)
    return (clean_text, clean_emoji)

allchars, emoji_list = 0, 0
(ct, ce) = extract_emoji_text()
print('\nAll Char:', allchars)
print('\nAll Emoji:',emoji_list)
print('\n', ct)
print('\n',ce)


All Char: ['H', 'e', 'l', 'l', 'o', ' ', 'h', 'o', 'w', ' ', 'a', 'r', 'e', ' ', 'y', 'o', 'u', '?', ' ', '💗', '😍', ' ', 'H', 'o', 'w', ' ', 'i', 's', ' ', 'y', 'o', 'u', 'r', ' ', 'd', 'a', 'y', ' ', 'g', 'o', 'i', 'n', 'g', '?']

All Emoji: ['💗', '😍']

 Hello how are you? How is your day going?

 💗😍


In [None]:
# Function to predict the sentiment of a text input; it does not handle emojis.

def get_sentiment(s_input = 'Happy'):
    """Return the classifier's predicted label for the string ``s_input``.

    The input is wrapped in a one-element array, vectorised with the fitted
    ``vectorizer`` and passed to ``clf.predict``; the single predicted label
    is returned.
    """
    features = vectorizer.transform(np.array([s_input]))
    return clf.predict(features)[0]
print(get_sentiment())

1


In [None]:
# Function to look up emoji sentiments; it returns a list. It does not apply to text inputs.

def get_emoji_sentiment(emoji_ls = '❤❤❤😭', emoji_df = new_emoji_df):
    """Return the sentiment value of each known emoji in ``emoji_ls``.

    Parameters
    ----------
    emoji_ls : iterable of str
        Emoji characters to look up (a string is iterated per character).
    emoji_df : DataFrame
        Table with an ``emoji`` column and a ``sentiment`` column.

    Returns
    -------
    list of int
        One sentiment value per character of ``emoji_ls`` that is present in
        ``emoji_df``, in input order. Characters that are not in the table
        are skipped; previously they raised IndexError.
    """
    # Build the lookup once instead of scanning the whole table with
    # iterrows() for every character. setdefault keeps the first row when an
    # emoji is listed more than once, as the original scan did.
    lookup = {}
    for symbol, value in zip(emoji_df['emoji'].tolist(), emoji_df['sentiment'].tolist()):
        lookup.setdefault(symbol, value)
    return [lookup[e] for e in emoji_ls if e in lookup]

ges = get_emoji_sentiment()
print('Sentiment value of each emoji:',ges)

Sentiment value of each emoji: [1, 1, 1, 0]


In [None]:
# Function that calculates the final score for our inputs

def get_text_emoji_sentiment(input_test = 'love 😭', threshold = 0.6):
    """Classify an input containing text and/or emojis as positive or negative.

    The text part contributes one vote (the classifier's 0/1 prediction) and
    every recognised emoji contributes one vote (its 0/1 sentiment). The
    votes are averaged and compared with ``threshold``.

    Parameters
    ----------
    input_test : str
        Raw user input.
    threshold : float, default 0.6
        Minimum average vote for the result to be reported as positive.

    Returns
    -------
    str
        "Positive😀" if the average is at least ``threshold``, otherwise
        "Negative😩".

    The intermediate values are printed as a side effect.
    """
    # separate text and emoji
    (ext_text, ext_emoji) = extract_emoji_text(input_test)
    print(f'\tExtracted: "{ext_text}" , {ext_emoji}')

    # get text sentiment
    senti_text = get_sentiment(ext_text)
    print(f'\tText value: {senti_text}')

    # get emoji sentiment; averages use the number of values actually returned
    emoji_values = get_emoji_sentiment(ext_emoji, new_emoji_df)
    senti_emoji_value = sum(emoji_values)
    print_emo_val_avg = 0 if len(emoji_values) == 0 else senti_emoji_value/len(emoji_values)
    print(f'\tEmoji average value: {print_emo_val_avg}')

    # avg the sentiment of emojis and text.
    # The text vote is counted only when there is text to classify, or when
    # there is nothing else to go on. Otherwise an emoji-only input would be
    # averaged with the classifier's prediction for an empty string.
    text_votes = 1 if (ext_text.strip() or not emoji_values) else 0
    senti_avg = (senti_emoji_value + senti_text * text_votes) / (len(emoji_values) + text_votes)
    print(f'\tAverage value: {senti_avg}')

    # set value of avg sentiment to either pos or neg
    senti_truth = "Positive😀" if senti_avg >= threshold else "Negative😩"

    return senti_truth

print(get_text_emoji_sentiment())

	Extracted: "love" , 😭
	Text value: 1
	Emoji average value: 0.0
	Average value: 0.5
Negative😩


In [None]:
# Function to print the Results

def print_status(test):
    """Print a framed report: the input ``test`` followed by its sentiment.

    The sentiment comes from ``get_text_emoji_sentiment``, which prints its
    own intermediate values between the first and second divider.
    """
    divider = '____________________________________________________'
    print(divider)
    print('')
    print(f' Your input is "{test}" \n')
    sentiment = get_text_emoji_sentiment(test)
    print(divider)
    verdict = f'\n Your input is of "{sentiment}" sentiment'
    print(verdict.upper())
    print(divider)

In [None]:
import ipywidgets as widgets

import warnings; warnings.simplefilter('ignore')

In [None]:
# for text area
l = widgets.Layout(padding="0px 0px 0px 4px",width="250px",height="130px" )
post_tweet = widgets.Textarea(placeholder="Enter your text here",value="", layout=l)
# prints the textarea's value, which is the empty string set on the line above
print(post_tweet.value)
# for button
button = widgets.Button(description="Check sentiment", button_style='success')

# output area that the click handler writes its report into
output = widgets.Output()

def on_tweet_clicked(b):
    """Click handler: clear the output area and print the sentiment report
    for the text currently held by ``post_tweet``.

    ``b`` is the argument supplied by the button callback; it is not used.
    """
    output.clear_output()
    with output:
        # everything printed inside this block is captured by `output`
        output.layout={'border': '1px solid blue', 'width':"400px"}
        print_status(post_tweet.value)






In [None]:
# List of all emojis in the emoji table; these are the characters that
# extract_emoji_text recognises as emojis in an input.

emoji.Emoji.values

array(['😂', '❤', '♥', '😍', '😭', '😘', '😊', '👌', '💕', '👏', '😁', '☺', '♡',
       '👍', '😩', '🙏', '✌', '😏', '😉', '🙌', '🙈', '💪', '😄', '😒', '💃', '💖',
       '😃', '😔', '😱', '🎉', '😜', '☯', '🌸', '💜', '💙', '✨', '😳', '💗', '★',
       '█', '☀', '😡', '😎', '😢', '💋', '😋', '🙊', '😴', '🎶', '💞', '😌', '🔥',
       '💯', '🔫', '💛', '💁', '💚', '♫', '😞', '😆', '😝', '😪', '�', '😫', '😅',
       '👊', '💀', '😀', '😚', '😻', '©', '👀', '💘', '🐓', '☕', '👋', '✋', '🎊',
       '🍕', '❄', '😥', '😕', '💥', '💔', '😤', '😈', '►', '✈', '🔝', '😰', '⚽',
       '😑', '👑', '😹', '👉', '🍃', '🎁', '😠', '🐧', '☆', '🍀', '🎈', '🎅', '😓',
       '😣', '😐', '✊', '😨', '😖', '💤', '💓', '👎', '💦', '✔', '😷', '⚡', '🙋',
       '🎄', '💩', '🎵', '➡', '😛', '😬', '👯', '💎', '🌿', '🎂', '🌟', '🔮', '❗',
       '👫', '🏆', '✖', '☝', '😙', '⛄', '👅', '♪', '🍂', '💏', '🔪', '🌴', '👈',
       '🌹', '🙆', '➜', '👻', '💰', '🍻', '🙅', '🌞', '🍁', '⭐', '▪', '🎀', '━',
       '☷', '🐷', '🙉', '🌺', '💅', '🐶', '🌚', '👽', '🎤', '👭', '🎧', '👆', '🍸',
       '🍷', '®', '🍉', '😇', '☑', '🏃', '😿', '│', '💣', '🍺', '▶', '😲

In [None]:


# Run this cell to display the result window
# Shows the textarea, the button and the output area, then registers
# on_tweet_clicked as the button's click handler.

display(post_tweet,button, output)
button.on_click(on_tweet_clicked)




Textarea(value='', layout=Layout(height='130px', padding='0px 0px 0px 4px', width='250px'), placeholder='Enter…

Button(button_style='success', description='Check sentiment', style=ButtonStyle())

Output()