In [2]:
import csv
import os
from emoji import UNICODE_EMOJI
import nltk
from utils import Indexer
emoji_set = set(UNICODE_EMOJI)

In [3]:
def process_text(text):
    text = text.replace("RT ", "")
    text = text.replace("\n", " ")
    words = text.split(" ")

    for word in words:
        if word.startswith("@") or word.startswith("http"):
            text = text.replace(word, "")
    
    return text.strip()

In [4]:
def get_emojis_in_text(text, emoji_set):
    char_set = set(text)
    #print(char_set)
    all_emojis = emoji_set & char_set
    
    return all_emojis

In [5]:
def clean_text(tweet):
    cleaned_str = tweet['processed_text']
    for emoji in tweet['emojis_in_text']:
        cleaned_str = tweet['processed_text'].replace(emoji, "")
        
    return cleaned_str

In [6]:
tweets = []
with open('twitter_october.csv', 'r') as csv_file:
    print("reading csv...")
    csv_reader = csv.DictReader(csv_file)
    print("done")
    print("processing text")
    count = 0
    for row in csv_reader:
        if row['text'] == 'text':
            continue
        row['processed_text'] = process_text(row['text'])
        row['emojis_in_text'] = get_emojis_in_text(row['processed_text'], emoji_set)
        tweets.append(row)
        count += 1
        if count % 500000 == 0:
            print("processed", count, "tweets")

reading csv...
done
processing text
processed 500000 tweets
processed 1000000 tweets
processed 1500000 tweets
processed 2000000 tweets
processed 2500000 tweets
processed 3000000 tweets
processed 3500000 tweets
processed 4000000 tweets
processed 4500000 tweets
processed 5000000 tweets


In [7]:
len(tweets)

5169003

In [8]:
tweets[0]

OrderedDict([('created_at', 'Sat Oct 20 20:41:00 +0000 2018'),
             ('id', '1053747937560420357'),
             ('id_str', '1053747937560420357'),
             ('text',
              'RT @Anthonyybaby: Said Michael was from Texas 😂😂 https://t.co/Y5w9DM6Zv3'),
             ('in_reply_to_status_id', ''),
             ('user_id', '1546406606'),
             ('user_id_str', '1546406606'),
             ('user_screen_name', 'sothats_jada'),
             ('processed_text', 'Said Michael was from Texas 😂😂'),
             ('emojis_in_text', {'😂'})])

In [9]:
from collections import Counter

def get_labels(tweet, indexer):
    
    all_emojis = tweet['emojis_in_text']
    emojis = {}
    c = Counter()
    for emoji in all_emojis:
        indexer.add_and_get_index(emoji)
        c[emoji] += tweet['processed_text'].count(emoji)
    
    #print(all_emojis)
    #print("txt", tweet['processed_text'])
    max_score = max(c.values())
    labels = [indexer.index_of(k) for k in c if c[k] == max_score]
    
    return labels

In [10]:
class DataPoint():
    def __init__(self, text, label):
        self.text = text
        self.label = label

In [11]:
def create_dataset(tweets, indexer, label_counter):
    dataset = []
    count = 0
    for idx, tweet in enumerate(tweets):
        #print(idx, tweet['processed_text'])
        try:
            labels = get_labels(tweet, indexer)
        except:
            continue
        cleaned_text = clean_text(tweet)
        for label in labels:
            if indexer.get_object(label) is not '♀' and indexer.get_object(label) is not '♂':
                datapoint = DataPoint(cleaned_text, label)
                dataset.append(datapoint)
                label_counter[indexer.get_object(label)] += 1
                
        count += 1
        if count % 500000 == 0:
            print("created", count, "datapoints")
            
    return dataset

In [12]:
indexer = Indexer()
label_counter = Counter()
dataset = create_dataset(tweets, indexer, label_counter)

created 500000 datapoints
created 1000000 datapoints
created 1500000 datapoints
created 2000000 datapoints
created 2500000 datapoints
created 3000000 datapoints
created 3500000 datapoints
created 4000000 datapoints
created 4500000 datapoints
created 5000000 datapoints


In [23]:
most_common = label_counter.most_common(32)
print(most_common)

print(most_common[8][0].decode())

[('😂', 617504), ('😭', 294681), ('❤', 280220), ('😍', 200159), ('🔥', 168289), ('🤣', 102484), ('💕', 101975), ('🙏', 97199), ('♀', 92854), ('✨', 87744), ('😩', 87401), ('💜', 86638), ('♂', 77390), ('😊', 74571), ('💀', 68176), ('🤷', 67752), ('👀', 66779), ('💙', 64025), ('🎃', 63107), ('🤔', 62330), ('😘', 62149), ('👏', 62025), ('🙌', 59498), ('🎉', 58619), ('💯', 58415), ('🤦', 55387), ('👍', 54447), ('👉', 54279), ('💖', 50452), ('🙄', 49965), ('😎', 49308), ('😁', 46164)]


AttributeError: 'str' object has no attribute 'decode'

In [25]:
common_emojis = []
for idx, x in enumerate(most_common):
    if idx == 8 or idx == 12:
        continue
    common_emojis.append(x[0])
print(common_emojis)

len(common_emojis)

['😂', '😭', '❤', '😍', '🔥', '🤣', '💕', '🙏', '✨', '😩', '💜', '😊', '💀', '🤷', '👀', '💙', '🎃', '🤔', '😘', '👏', '🙌', '🎉', '💯', '🤦', '👍', '👉', '💖', '🙄', '😎', '😁']


30