In [102]:
import csv
import os
from emoji import UNICODE_EMOJI, demojize, unicode_codes
import html
import nltk
import re
from utils import Indexer
# Single-character emojis that the notebook looks for in tweet text.
emoji_set = {
    '😂', '😭', '❤', '😍', '🔥', '🤣', '💕', '🙏', '✨', '😩',
    '💜', '😊', '💀', '🤷', '👀', '💙', '🎃', '🤔', '😘', '👏',
    '🙌', '🎉', '💯', '🤦', '👍', '👉', '💖', '🙄', '😎', '😁',
}

In [103]:
def process_text(text):
    """Strip retweet markers, @mentions and links from a raw tweet.

    The text is split on any whitespace and rebuilt from the tokens that are
    kept, so removal works on whole tokens only:

    * a token that is exactly ``RT`` is dropped (words that merely end in
      "RT", such as "SMART", are left alone);
    * tokens starting with ``@`` or ``http`` are dropped (text that merely
      contains a mention's characters, such as "me@a.com", is left alone).

    HTML entities (``&amp;`` etc.) are unescaped afterwards.

    Args:
        text: Raw tweet text.

    Returns:
        The cleaned text, with runs of whitespace collapsed to single spaces
        and no leading or trailing whitespace.
    """
    kept = [
        token
        for token in text.split()
        if token != "RT" and not token.startswith(("@", "http"))
    ]
    # Unescape last, as before, so an entity can never be mistaken for a
    # mention or link while filtering.
    return html.unescape(" ".join(kept)).strip()

In [104]:
def get_emojis_in_text(text, emoji_set):
    """Return the members of ``emoji_set`` that occur as characters of ``text``.

    The text is reduced to its set of distinct characters, which is then
    intersected with ``emoji_set``.
    """
    return emoji_set & set(text)

In [113]:
# Adapted from the emoji package's emojize/demojize helpers.
#
# demojize() is asked to wrap each emoji name in these two private-use code
# points instead of the default ":" so that ordinary colon-delimited text in a
# tweet (for example the ":30:" inside "10:30:45") is not mistaken for an
# emoji name and deleted.
_EMOJI_OPEN = "\ue000"
_EMOJI_CLOSE = "\ue001"
# Compiled once at definition time rather than on every call.
_EMOJI_NAME_PATTERN = re.compile(
    "%s[^%s%s]*%s" % (_EMOJI_OPEN, _EMOJI_OPEN, _EMOJI_CLOSE, _EMOJI_CLOSE)
)


def clean_tweet(tweet):
    """Return the tweet's processed text with emojis removed.

    Emojis in ``tweet['processed_text']`` are first rewritten by ``demojize``
    into delimited names, the delimited names are deleted, and the remaining
    text is re-joined with single spaces.

    Args:
        tweet: Mapping with a ``'processed_text'`` string entry.

    Returns:
        The emoji-free text with whitespace runs collapsed and no leading or
        trailing whitespace.
    """
    marked = demojize(tweet['processed_text'],
                      delimiters=(_EMOJI_OPEN, _EMOJI_CLOSE))
    without_emojis = _EMOJI_NAME_PATTERN.sub("", marked)
    return ' '.join(without_emojis.split())

In [106]:
# Load the tweet dump, keeping one row per tweet id and only rows whose
# processed text contains at least one emoji from `emoji_set`.
tweets = []
unique_tweet_ids = set()
# newline='' is what the csv module requires so quoted fields containing line
# breaks are parsed correctly; the encoding is explicit because the text
# carries emoji and the platform default is not UTF-8 everywhere.
with open('twitter_october.csv', 'r', encoding='utf-8', newline='') as csv_file:
    print("reading csv...")
    csv_reader = csv.DictReader(csv_file)
    print("done")
    print("processing text")
    count = 0
    for row in csv_reader:
        # Skip header lines repeated inside the file body.
        if row['text'] == 'text':
            continue
        # Check for duplicates first so repeated tweets skip the text
        # processing entirely.
        if row['id'] in unique_tweet_ids:
            continue
        row['processed_text'] = process_text(row['text'])
        row['emojis_in_text'] = get_emojis_in_text(row['processed_text'], emoji_set)
        if not row['emojis_in_text']:
            continue
        tweets.append(row)
        unique_tweet_ids.add(row['id'])
        count += 1
        if count % 500000 == 0:
            print("processed", count, "tweets")

reading csv...
done
processing text
processed 500000 tweets
processed 1000000 tweets
processed 1500000 tweets
processed 2000000 tweets
processed 2500000 tweets


In [107]:
# Number of rows currently held in `tweets`.
len(tweets)

2944811

In [108]:
from collections import Counter

def get_labels(tweet, indexer):
    """Return the indexer ids of the most frequent emoji(s) in a tweet.

    Every emoji in ``tweet['emojis_in_text']`` is registered with ``indexer``
    via ``add_and_get_index`` and counted in ``tweet['processed_text']``.
    The emojis whose count equals the maximum are returned as indices
    obtained from ``indexer.index_of``; ties yield several labels.

    Emojis are visited in sorted order. ``emojis_in_text`` is a set of
    strings, whose iteration order changes between interpreter runs, so
    iterating it directly would register emojis with the indexer in a
    different order on every run.

    Args:
        tweet: Mapping with ``'emojis_in_text'`` (iterable of emoji strings)
            and ``'processed_text'`` (str).
        indexer: Object providing ``add_and_get_index(obj)`` and
            ``index_of(obj)``.

    Returns:
        List of label indices; empty when the tweet has no emojis.
    """
    ordered_emojis = sorted(tweet['emojis_in_text'])
    if not ordered_emojis:
        return []

    text = tweet['processed_text']
    counts = {}
    for emoji in ordered_emojis:
        indexer.add_and_get_index(emoji)
        counts[emoji] = text.count(emoji)

    max_score = max(counts.values())
    return [indexer.index_of(emoji)
            for emoji in ordered_emojis
            if counts[emoji] == max_score]

In [109]:
class DataPoint:
    """A single training example: a piece of text paired with its label."""

    def __init__(self, text, label):
        # Both values are stored as given, without copying or validation.
        self.text, self.label = text, label

In [110]:
def create_dataset(tweets, indexer, label_counter):
    """Build a list of ``DataPoint`` objects from processed tweets.

    For each tweet the labels come from ``get_labels`` and the text from
    ``clean_tweet``; one ``DataPoint`` is emitted per label, so a tweet with
    tied labels appears several times. Labels whose indexed object is the
    bare gender sign '♀' or '♂' are skipped.

    Args:
        tweets: Iterable of tweet mappings accepted by ``get_labels`` and
            ``clean_tweet``.
        indexer: Object passed to ``get_labels`` and queried with
            ``get_object(label)`` (presumably mapping an index back to its
            emoji -- confirm against ``utils.Indexer``).
        label_counter: Mutable counter, updated in place with the number of
            datapoints created per indexed object.

    Returns:
        The list of created ``DataPoint`` objects.
    """
    excluded = ('♀', '♂')
    dataset = []
    count = 0
    for tweet in tweets:
        try:
            labels = get_labels(tweet, indexer)
        except Exception:
            # Best effort: a tweet that cannot be labelled is skipped.
            # Catching Exception (not a bare except) lets KeyboardInterrupt
            # still stop a long run.
            continue

        cleaned_text = clean_tweet(tweet)
        for label in labels:
            labelled_object = indexer.get_object(label)
            # Compare by value; identity checks against string literals are
            # not reliable.
            if labelled_object not in excluded:
                dataset.append(DataPoint(cleaned_text, label))
                label_counter[labelled_object] += 1

        # Counts tweets handled, not datapoints appended.
        count += 1
        if count % 500000 == 0:
            print("created", count, "datapoints")

    return dataset

In [111]:
# Build a dataset from the first 100 tweets and print each datapoint's text
# followed by its label.
label_counter = Counter()
indexer = Indexer()
dataset = create_dataset(tweets[:100], indexer, label_counter)

for data in dataset:
    print(data.text, data.label, sep="\n")

Said Michael was from Texas
0
United vibe they so cuteee awwww #XFactor
1
Happy 47th Birthday to Snoop Dogg
2
TRUE FACTS
3
What did y'all call this back then in school? Me: fraps
0
When you fall for someone, everything about them becomes beautiful.
4
Personalize Yours #etsy#etsymntt #etsy #ebay #etsyretwt #epiconetsy #etsyaaa #bride…
5
Personalize Yours #etsy#etsymntt #etsy #ebay #etsyretwt #epiconetsy #etsyaaa #bride…
6
Why be mad if he keeping it
3
Appreciate ya
3
Appreciate ya
7
I don’t see any boyfriends getting love for Sweetest Day. It’s rough out here
0
We're keeping up with the lead! 20 more minutes to go.
6
When you see a fat ass pothole but it’s too late
0
Blessings on blessings
5
- LOVE YOU BTS.
9
We were terrific this evening, Andrew. You boys are top of the league, I see.
10
Ellis was two weeks late and my feet were like fucking slabs I’ll never ever do it again. Can’t wait to hear your news xx
0
Stop addressing everything & let them be mad FUCK EM
11
Was it his bellybutto