In [1]:
import pandas as pd
import numpy as np
import emoji
import nltk

In [2]:
import nltk
import ssl

nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('averaged_perceptron_tagger')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\doraz\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\doraz\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\doraz\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\doraz\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


True

## emoji related functions

In [3]:
def emojiname2icon(name):
    return emoji.emojize(f":{name.replace(' ','_')}:")

emojiname2icon('smiling face with hearts')

'🥰'

## Function: Remove special characters 

In [11]:
import re

def remove_special(text):
    # Remove URLs
    url_pattern = re.compile(r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\\(\\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+')
    text = url_pattern.sub('', text)
    
    # Remove words starting with # and @ for all languages
    hashtag_pattern = re.compile(r'(?:^|\s)[@#](?:\w+|[^\s\w])+')
    text = hashtag_pattern.sub('', text)
    
    # Remove punctuation
    punctuation_pattern = re.compile(r'[^\w\s]', re.UNICODE)
    text = punctuation_pattern.sub('', text)
    

    # Remove emoji, number, tab
    emoji_number_pattern = re.compile("["
        u"\U0001F600-\U0001F64F"  # emoticons
        u"\U0001F300-\U0001F5FF"  # symbols & pictographs
        u"\U0001F680-\U0001F6FF"  # transport & map symbols
        u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
        u"0-9"           # numbers
        u"\n\t"           # newline and tab characters

                           "]+", flags=re.UNICODE)
    text = emoji_number_pattern.sub(r'', text)



    
    return text

remove_special('💋💫😀\nProduction appear product door 1234')

'Production appear product door '

## Function: Remove Stop words and lemmatization

In [3]:
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet

nltk.download('averaged_perceptron_tagger')
  
lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words('english'))

def get_wordnet_pos(treebank_tag):
    if treebank_tag.startswith('J'):
        return wordnet.ADJ
    elif treebank_tag.startswith('V'):
        return wordnet.VERB
    elif treebank_tag.startswith('N'):
        return wordnet.NOUN
    elif treebank_tag.startswith('R'):
        return wordnet.ADV
    else:
        return None

def remove_stopword_and_lemmatized(text):

  
  #tokenized and change to lower
    word_tokens = word_tokenize(text.lower())

  #remove stop word
    filtered_sentence = [w for w in word_tokens if not w in stop_words]


  #get part of speech
    pos = nltk.pos_tag(filtered_sentence)


    #lemmatization with POS
    lemmeanized_sentence = [lemmatizer.lemmatize(filtered_sentence[i],get_wordnet_pos(pos[i][1])) if get_wordnet_pos(pos[i][1]) != None else lemmatizer.lemmatize(filtered_sentence[i]) for i in range(len(filtered_sentence))]

      #remove duplicates (in order)
    return " ".join(list(dict.fromkeys(lemmeanized_sentence)))

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\doraz\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


## Function: apply preprocessing functions

In [10]:
def content_preprocessing(df):
    df['cleaned content'] = df['content'].apply(remove_special)
    df['cleaned_content'] = df['cleaned content'].apply(remove_stopword_and_lemmatized)
    
    #get emoji id for name
    df['emoji_id'] = df['emoji_name'].apply(lambda name: emojiname2id_dic[name])
    
    return df[['emoji_id','content','cleaned_content']]

## Combine data from different data sources

In [12]:
#Produce Emoji_DF[emoji_id, emoji_name]
top_20_emoji_name = [

    'face with tears of joy', # funny happy
    
    'red heart', #love
    
    'broken heart', #no love
    
    'thumbs up', #encouragement
    
    'smiling face with smiling eyes', #happy
    
    'loudly crying face', #pure sad
    
    'clapping hands', #congradualation
    
    'fire', #hot /sexy
    
    'face screaming in fear', #shock
    
    'pile of poo', #non-sense, disapproval
    
    'face with symbols on mouth', #anger
    
    'eggplant', #horny
    
    'face savoring food', # craving
    
    'hundred points', #approval
    
    'folded hands', #pray
    
    
]

emoji_reference = {'emoji_name':top_20_emoji_name,'emoji_id':[i for i in range(len(top_20_emoji_name))]}
emoji_df =  pd.DataFrame(emoji_reference)
emoji_df

emojiname2id_dic = {emoji_df.emoji_name[i]:emoji_df.index[i] for i in range(len(emoji_df.index))}
emojiid2name_dic = {emoji_df.index[i]:emoji_df.emoji_name[i] for i in range(len(emoji_df.index))}


# load data from different data source
scrape1 = pd.read_csv('processed_data.csv', index_col = 0)
scrape2 = pd.read_csv('processed_data_batch2.csv')


df = pd.DataFrame(columns=['emoji_name','emoji','content','cleaned content'])
df = pd.concat([df,scrape1,scrape2])
top_20_emoji_df = pd.DataFrame(columns=['emoji_name','emoji','content','cleaned content'])
for name in top_20_emoji_name:
    top_20_emoji_df = pd.concat([top_20_emoji_df,df[df.emoji_name==name]])

kaggle_tweets = pd.read_csv('kaggle_tweets 15 selected emoji.csv',names=['emoji_name','content'])


In [14]:
kaggle_tweets.shape

(7390718, 2)

In [15]:
#concat everything togehter 
total_df = pd.concat([top_20_emoji_df,kaggle_tweets])
total_df

Unnamed: 0,emoji_name,emoji,content,cleaned content
47,face with tears of joy,😂,@PeakSanti But why this photo?😀😀😂😂,But why this photo?😀😀😂😂
206,face with tears of joy,😂,@Sachin_Ro45 Mass troll 😀😀😀😀😂😂😂😂😂😂\n@AdnanSami...,Mass troll 😀😀😀😀😂😂😂😂😂😂\n plz watch this video
386,face with tears of joy,😂,The people I work with thinks the bank has all...,The people I work with thinks the bank has all...
426,face with tears of joy,😂,@Jamespa97888596 😂😂😂 all friends again 👌👌😀,😂😂😂 all friends again 👌👌😀
454,face with tears of joy,😂,How you so in love with a nigga that's never w...,How you so in love with a nigga that's never w...
...,...,...,...,...
7390713,loudly crying face,,&amp; to think I was going to split my tax ref...,
7390714,face with tears of joy,,&amp; to think I was going to split my tax ref...,
7390715,loudly crying face,,please !! remove the suspension from the pleas...,
7390716,clapping hands,,please. 👏🏼 so because i stan someone means i d...,


## sampled training and testing data from combined data

In [16]:
def get_n_sample(df, n, random_state):
    sample_data =  pd.DataFrame()
    for name in top_20_emoji_name:
        sample = df[df.emoji_name == name].sample(n=n,  random_state=random_state)
        sample_data = pd.concat([sample_data,sample])
    return sample_data

In [18]:
#randomly sampled from total df for tarining and testing 
from sklearn.model_selection import train_test_split

# Change sample size to 10000 from 8000; test_size changed to 0.2 from 0,4
a = get_n_sample(total_df,15000,4222)
train, test = train_test_split(get_n_sample(total_df,15000,4222), test_size=0.3,random_state=4222)

In [20]:
# apply data preprocessing on selected content
train_df, test_df = content_preprocessing(train), content_preprocessing(test)

In [21]:
# remove trained data with cleaned content > 0 words
train_df['cleaned_content_len']= train_df.cleaned_content.apply(lambda x: len(x.split()))
test_df['cleaned_content_len']= test_df.cleaned_content.apply(lambda x: len(x.split()))
train_df, test_df  = train_df[train_df.cleaned_content_len >=1],test_df[test_df.cleaned_content_len >=1]
train_df

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train_df['cleaned_content_len']= train_df.cleaned_content.apply(lambda x: len(x.split()))
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test_df['cleaned_content_len']= test_df.cleaned_content.apply(lambda x: len(x.split()))


Unnamed: 0,emoji_id,content,cleaned_content,cleaned_content_len
219336,7,announcement on Sunday 6pm ! 🔥\n,announcement sunday pm,3
2000022,6,Chelsea legend Frank Lampard will receive the ...,chelsea legend frank lampard receive football ...,12
4886330,8,HE JUST DESTROYED THIS MAN 😱\n,destroy man,2
57132,10,It’s twitter I’m sure 🤬\n,twitter im sure,3
2479832,5,Yeah and I think u need a phone no too😭\n,yeah think u need phone,5
...,...,...,...,...
2678399,6,Dirty Heads - Celebrate feat. The Unlikely Can...,dirty head celebrate feat unlikely candidate o...,12
3440404,4,ETHAN AND GRAYSON WERE NOMINATED FOR A SHORTY ...,ethan grayson nominate shorty award make sure ...,13
1005035,1,[EXO-CBX ALBUM GIVEAWAY] Hi! will be hosting h...,exocbx album giveaway hi host first please giv...,11
3623033,13,TEN who? He’s “HUNDRED” now!! 😂💯🙌🏼\n,ten he hundred,3


In [29]:
train_df.to_csv('train 15 emoji.csv', index = False)
test_df.to_csv('test 15 emoji.csv', index = False)



# Data

In [30]:
train_df

Unnamed: 0,emoji_id,content,cleaned_content,cleaned_content_len
219336,7,announcement on Sunday 6pm ! 🔥\n,announcement sunday pm,3
2000022,6,Chelsea legend Frank Lampard will receive the ...,chelsea legend frank lampard receive football ...,12
4886330,8,HE JUST DESTROYED THIS MAN 😱\n,destroy man,2
57132,10,It’s twitter I’m sure 🤬\n,twitter im sure,3
2479832,5,Yeah and I think u need a phone no too😭\n,yeah think u need phone,5
...,...,...,...,...
2678399,6,Dirty Heads - Celebrate feat. The Unlikely Can...,dirty head celebrate feat unlikely candidate o...,12
3440404,4,ETHAN AND GRAYSON WERE NOMINATED FOR A SHORTY ...,ethan grayson nominate shorty award make sure ...,13
1005035,1,[EXO-CBX ALBUM GIVEAWAY] Hi! will be hosting h...,exocbx album giveaway hi host first please giv...,11
3623033,13,TEN who? He’s “HUNDRED” now!! 😂💯🙌🏼\n,ten he hundred,3


In [31]:
test_df

Unnamed: 0,emoji_id,content,cleaned_content,cleaned_content_len
5026717,4,might be reuniting with la famiglia next month...,might reunite la famiglia next month che bello,8
5686547,12,getting that booty ate 🍑😋\n,get booty ate,3
1801897,1,Which baddies wnat to be posted?😋❤️\n,baddie wnat post,3
3639073,3,Yeah our King is avid reader and so wise and H...,yeah king avid reader wise son mohammed learns...,9
381513,7,The HOTTEST Air Jordans Of All Time😍🔥🔥🔥💦💦👌🏻\n,hot air jordan time,4
...,...,...,...,...
3775628,1,Veggie Facts! We RULE! ❤️❤️❤️\n,veggie fact rule,3
2145057,2,BLESSING 💋💘❤️💓💔💕💖💗💙💚💛💜🖤💝💞💟❣️💟💞💝🖤💜💛💚💙💗💖💕💔💓❤️💘💋💘...,blessing,1
5485976,5,Ugh this hurts me 😭😭😭\n,ugh hurt,2
6916006,0,"Perfect example of, ""Tit for tat"": 😂\n",perfect example tit tat,4


In [32]:
emoji_df

Unnamed: 0,emoji_name,emoji_id
0,face with tears of joy,0
1,red heart,1
2,broken heart,2
3,thumbs up,3
4,smiling face with smiling eyes,4
5,loudly crying face,5
6,clapping hands,6
7,fire,7
8,face screaming in fear,8
9,pile of poo,9
