In [1]:
import pandas as pd
import re
import emoji
import os

In [2]:
def get_data_path(file_name):
    """Return the path of ``file_name`` inside the sibling ``data/processed`` directory.

    The location is resolved relative to the parent of the current working
    directory, i.e. ``<cwd>/../data/processed/<file_name>``.

    Args:
        file_name: Name of the file inside ``data/processed``.

    Returns:
        The path as a string, built with the platform's path separator.
    """
    # Let os.path deal with separators: splitting on "/" by hand breaks when
    # os.getcwd() returns a backslash-separated path (Windows).
    project_root = os.path.dirname(os.getcwd())
    return os.path.join(project_root, "data", "processed", file_name)

In [3]:

# Load the tweets; the file is read without a header row, so the column
# names are supplied explicitly.
tweets_path = get_data_path("tweets.txt")
data = pd.read_csv(
    tweets_path,
    sep=',',
    header=None,
    names=["Tweet_id", "Tweet", "Unicode_emoji"],
)

In [4]:
# Preview the first rows of the freshly loaded tweets.
data.head()

Unnamed: 0,Tweet_id,Tweet,Unicode_emoji
0,798577437726666752,so ready for thanksgiving \U0001f60a\U0001f60a...,
1,798577441920978944,This weather makes me wanna stay in bed and wa...,
2,798577441916809216,Okay \U0001f634 https://t.co/JPCJl9gRX7,
3,798577441925169153,LOOK AT THEM ....... \U0001f440 \n#Bts #Vhope ...,
4,798577446115127296,@em_pent @briianalamarii @ashlynnxx17 holy thr...,


In [5]:
def extract_emojis(sentence):
    r"""Return every escaped Unicode code point found in ``sentence``, in order.

    The tweets carry emoji as literal escape text (a backslash followed by
    ``U`` and 8 hex digits, or ``u`` and 4 hex digits), so this searches for
    that text rather than for real emoji characters.

    Args:
        sentence: Tweet text containing literal ``\UXXXXXXXX`` / ``\uXXXX`` escapes.

    Returns:
        A list of the matched escape strings; empty when there are none or
        when ``sentence`` is not a string (e.g. NaN from a blank CSV cell).
    """
    if not isinstance(sentence, str):
        return []
    # Match a fixed number of hex digits. The previous `\w*` pattern also
    # swallowed letters/digits typed directly after the emoji, producing keys
    # such as "\U0001f60alol" that can never match a score row.
    return re.findall(r"\\U[0-9a-fA-F]{8}|\\u[0-9a-fA-F]{4}", sentence)

In [6]:
# Tag each tweet with the first escaped emoji it contains and drop the tweets
# that contain none. One vectorised pass replaces the row-by-row `.loc` writes
# done through iterrows(), and missing values replace the 'None' string sentinel.
# emoji.emojize() is no longer called: it only expands ":shortcode:" text, so it
# returned these backslash escapes unchanged.
first_emoji = data["Tweet"].map(lambda tweet: next(iter(extract_emojis(tweet)), None))
data = data.assign(Unicode_emoji=first_emoji).dropna(subset=["Unicode_emoji"])

In [7]:
# Preview the tweets again now that Unicode_emoji has been filled in.
data.head()

Unnamed: 0,Tweet_id,Tweet,Unicode_emoji
0,798577437726666752,so ready for thanksgiving \U0001f60a\U0001f60a...,\U0001f60a
1,798577441920978944,This weather makes me wanna stay in bed and wa...,\U0001f384
2,798577441916809216,Okay \U0001f634 https://t.co/JPCJl9gRX7,\U0001f634
3,798577441925169153,LOOK AT THEM ....... \U0001f440 \n#Bts #Vhope ...,\U0001f440
4,798577446115127296,@em_pent @briianalamarii @ashlynnxx17 holy thr...,\U0001f498


In [8]:
# Load the per-emoji V/A score table.
va_scores_path = get_data_path('emoji_va_scores.csv')
va_scores = pd.read_csv(va_scores_path)

In [9]:
# Preview the first rows of the score table.
va_scores.head()

Unnamed: 0.1,Unnamed: 0,Emoji,Unicode codepoint,Occurrences,Position,Negative,Neutral,Positive,Unicode name,Unicode block,V,A,System
0,0,😂,0x1f602,14622,0.805101,3614,4163,6845,FACE WITH TEARS OF JOY,Emoticons,0.851667,0.79,Android
1,1,😂,0x1f602,14622,0.805101,3614,4163,6845,FACE WITH TEARS OF JOY,Emoticons,0.811667,0.851667,iOS
2,2,😍,0x1f60d,6359,0.765292,329,1390,4640,SMILING FACE WITH HEART-SHAPED EYES,Emoticons,0.905,0.805,Android
3,3,😍,0x1f60d,6359,0.765292,329,1390,4640,SMILING FACE WITH HEART-SHAPED EYES,Emoticons,0.926667,0.856667,iOS
4,4,😭,0x1f62d,5526,0.803352,2412,1218,1896,LOUDLY CRYING FACE,Emoticons,0.068333,0.841667,Android


In [10]:
# Build the join key: the emoji's Python unicode-escape text (e.g. "\U0001f602"),
# the same form the tweets use. Series.map does this in one pass instead of
# writing one cell at a time through iterrows()/.loc.
# NOTE(review): an emoji made of several code points escapes to several
# sequences, while the tweet side keeps only the first one -- such rows would
# never match in the merge; confirm whether the score file contains any.
va_scores['Unicode_emoji'] = va_scores['Emoji'].map(
    lambda symbol: symbol.encode('unicode-escape').decode('ASCII')
)

In [11]:
# Show only the first rows rather than dumping the whole score table.
va_scores.head(10)

Unnamed: 0.1,Unnamed: 0,Emoji,Unicode codepoint,Occurrences,Position,Negative,Neutral,Positive,Unicode name,Unicode block,V,A,System,Unicode_emoji
0,0,😂,0x1f602,14622,0.805101,3614,4163,6845,FACE WITH TEARS OF JOY,Emoticons,0.851667,0.790000,Android,\U0001f602
1,1,😂,0x1f602,14622,0.805101,3614,4163,6845,FACE WITH TEARS OF JOY,Emoticons,0.811667,0.851667,iOS,\U0001f602
2,2,😍,0x1f60d,6359,0.765292,329,1390,4640,SMILING FACE WITH HEART-SHAPED EYES,Emoticons,0.905000,0.805000,Android,\U0001f60d
3,3,😍,0x1f60d,6359,0.765292,329,1390,4640,SMILING FACE WITH HEART-SHAPED EYES,Emoticons,0.926667,0.856667,iOS,\U0001f60d
4,4,😭,0x1f62d,5526,0.803352,2412,1218,1896,LOUDLY CRYING FACE,Emoticons,0.068333,0.841667,Android,\U0001f62d
5,5,😭,0x1f62d,5526,0.803352,2412,1218,1896,LOUDLY CRYING FACE,Emoticons,0.171667,0.855000,iOS,\U0001f62d
6,6,😘,0x1f618,3648,0.854480,193,702,2753,FACE THROWING A KISS,Emoticons,0.858333,0.663333,Android,\U0001f618
7,7,😘,0x1f618,3648,0.854480,193,702,2753,FACE THROWING A KISS,Emoticons,0.885000,0.721667,iOS,\U0001f618
8,8,😊,0x1f60a,3186,0.813302,189,754,2243,SMILING FACE WITH SMILING EYES,Emoticons,0.861667,0.576667,Android,\U0001f60a
9,9,😊,0x1f60a,3186,0.813302,189,754,2243,SMILING FACE WITH SMILING EYES,Emoticons,0.881667,0.613333,iOS,\U0001f60a


In [12]:
# Left join on the escaped emoji: every tweet is kept, unmatched tweets get
# empty score columns, and a tweet appears once per matching score row.
result = data.merge(va_scores, how='left', on='Unicode_emoji')

In [13]:
# Keep only the tweet fields, the matched emoji and its V/A scores.
output_columns = ['Tweet_id', 'Tweet', 'Unicode_emoji', 'Emoji', 'V', 'A']
result = result[output_columns]

In [14]:
# Preview the merged tweet / score rows.
result.head()

Unnamed: 0,Tweet_id,Tweet,Unicode_emoji,Emoji,V,A
0,798577437726666752,so ready for thanksgiving \U0001f60a\U0001f60a...,\U0001f60a,😊,0.861667,0.576667
1,798577437726666752,so ready for thanksgiving \U0001f60a\U0001f60a...,\U0001f60a,😊,0.881667,0.613333
2,798577441920978944,This weather makes me wanna stay in bed and wa...,\U0001f384,,,
3,798577441916809216,Okay \U0001f634 https://t.co/JPCJl9gRX7,\U0001f634,😴,0.621667,0.2
4,798577441916809216,Okay \U0001f634 https://t.co/JPCJl9gRX7,\U0001f634,😴,0.68,0.188333


In [15]:
# Write the merged dataset into data/processed (the row index is written too).
dataset_path = get_data_path('Dataset_tweets.csv')
result.to_csv(dataset_path)