In [1]:
import csv
import os
from emoji import UNICODE_EMOJI, demojize, unicode_codes
import html
import nltk
import re
from utils import Indexer
import string
# The 30 emoji used as candidate labels; the loading cell only keeps tweets
# whose processed text contains at least one of them.
emoji_set = {
    '😂', '😭', '❤', '😍', '🔥', '🤣', '💕', '🙏', '✨', '😩',
    '💜', '😊', '💀', '🤷', '👀', '💙', '🎃', '🤔', '😘', '👏',
    '🙌', '🎉', '💯', '🤦', '👍', '👉', '💖', '🙄', '😎', '😁',
}
# Characters deleted from tweet text: ASCII punctuation with the apostrophe
# swapped for the ellipsis character, so apostrophes survive and "…" is removed.
punc = string.punctuation.replace("'", "…")

In [2]:
def process_text(text, strip_chars=None):
    """Normalise a raw tweet into lower-cased text without mentions, URLs or punctuation.

    Steps, in order:
      1. decode HTML entities (``&amp;`` -> ``&``),
      2. drop the retweet marker ``RT `` when it is a standalone word,
      3. turn newlines into spaces and lower-case everything,
      4. discard every space-separated token that contains ``@`` or starts
         with ``http``,
      5. delete every character listed in ``strip_chars``.

    Args:
        text: the raw tweet text.
        strip_chars: string of characters to delete. Defaults to the
            module-level ``punc``.

    Returns:
        The processed text with leading/trailing whitespace removed. A removed
        token does not leave a gap behind (the remaining tokens are re-joined
        with single spaces); whitespace that was already doubled in the input
        is preserved.
    """
    if strip_chars is None:
        strip_chars = punc

    # Entities must be decoded *before* punctuation is stripped; otherwise
    # "&amp;" loses its "&" and ";" first and is left behind as the word "amp".
    text = html.unescape(text)

    # \b keeps words that merely end in "RT" (e.g. "SMART ") intact.
    text = re.sub(r"\bRT ", "", text)
    text = text.replace("\n", " ").lower()

    # Filter whole tokens instead of str.replace()-ing them out of the text:
    # replacing would also cut the token out of any longer token that contains
    # it (e.g. a URL that is a prefix of another URL).
    kept_tokens = [
        token for token in text.split(" ")
        if "@" not in token and not token.startswith("http")
    ]
    text = " ".join(kept_tokens)

    text = text.translate(str.maketrans("", "", strip_chars))
    return text.strip()

In [3]:
def get_emojis_in_text(text, emoji_set):
    """Return the members of ``emoji_set`` that appear in ``text``.

    The text is compared character by character, so only emoji that consist
    of a single character can be matched.

    Args:
        text: string to inspect.
        emoji_set: set of single-character emoji to look for.

    Returns:
        The subset of ``emoji_set`` found in ``text`` (empty if none).
    """
    return emoji_set & {character for character in text}

In [4]:
# Pattern adapted from the emoji package: matches one ":alias:" token, i.e. a
# colon-delimited run of the characters an emoji alias may contain.
# Raw string so that "\+" and "\-" reach the regex engine without triggering
# Python's invalid-escape-sequence warning; compiled once instead of per call.
_EMOJI_ALIAS_PATTERN = re.compile(r"(:[a-zA-Z0-9\+\-_&.ô’Åéãíç()!#*]+:)")


def clean_tweet(tweet):
    """Return the tweet's processed text with every emoji removed.

    Emoji are first rewritten to their textual ``:alias:`` form with
    ``demojize`` and those alias tokens are then deleted; finally all runs of
    whitespace are collapsed to single spaces.

    Args:
        tweet: mapping with a ``'processed_text'`` entry.

    Returns:
        The emoji-free, whitespace-normalised text.
    """
    demojized_tweet = demojize(tweet['processed_text'])
    without_aliases = _EMOJI_ALIAS_PATTERN.sub("", demojized_tweet)
    return ' '.join(without_aliases.split())

In [5]:
# Load the raw tweets, keeping one row per tweet id and only rows whose
# processed text contains at least one emoji from emoji_set.
tweets = []
unique_tweet_ids = set()
# encoding is explicit so the emoji decode identically on every platform;
# newline='' lets the csv module handle line endings inside quoted fields itself.
with open('twitter_october.csv', 'r', newline='', encoding='utf-8') as csv_file:
    print("reading csv...")
    # DictReader is lazy: rows are only read as the loop below consumes them.
    csv_reader = csv.DictReader(csv_file)
    print("done")
    print("processing text")
    count = 0
    for row in csv_reader:
        # A row whose text column is literally 'text' is a repeated header line.
        if row['text'] == 'text':
            continue
        # Check for duplicates first so repeated tweets are not processed again.
        if row['id'] in unique_tweet_ids:
            continue
        row['processed_text'] = process_text(row['text'])
        row['emojis_in_text'] = get_emojis_in_text(row['processed_text'], emoji_set)
        if not row['emojis_in_text']:
            continue
        tweets.append(row)
        unique_tweet_ids.add(row['id'])
        count += 1
        # count only advances for kept tweets, so this reports kept rows.
        if count % 200000 == 0:
            print("processed", count, "tweets")

reading csv...
done
processing text
processed 200000 tweets
processed 400000 tweets
processed 600000 tweets
processed 800000 tweets
processed 1000000 tweets
processed 1200000 tweets
processed 1400000 tweets
processed 1600000 tweets
processed 1800000 tweets
processed 2000000 tweets
processed 2200000 tweets
processed 2400000 tweets
processed 2600000 tweets
processed 2800000 tweets


In [6]:
# Number of tweets kept after the emoji filter and the tweet-id de-duplication.
len(tweets)

2936623

In [7]:
from collections import Counter

def get_labels(tweet, indexer):
    """Return the indexer ids of the most frequent emoji in a tweet.

    Every emoji in ``tweet['emojis_in_text']`` is passed to
    ``indexer.add_and_get_index`` (presumably registering it — confirm against
    utils.Indexer) and counted in ``tweet['processed_text']``. All emoji tied
    for the highest count are returned.

    Args:
        tweet: mapping with ``'emojis_in_text'`` (iterable of emoji) and
            ``'processed_text'`` (str).
        indexer: object providing ``add_and_get_index(obj)`` and
            ``index_of(obj)``.

    Returns:
        List of ``indexer.index_of(emoji)`` values for the most frequent
        emoji; an empty list when the tweet has no emoji.
    """
    text = tweet['processed_text']
    counts = Counter()
    # sorted(): iteration order of a set of strings changes between interpreter
    # runs (string hash randomisation), which would make the order in which
    # emoji reach the indexer differ from run to run.
    for emoji in sorted(tweet['emojis_in_text']):
        indexer.add_and_get_index(emoji)
        counts[emoji] += text.count(emoji)

    # Nothing to label; avoids max() failing on an empty sequence.
    if not counts:
        return []

    max_score = max(counts.values())
    return [indexer.index_of(emoji) for emoji, score in counts.items() if score == max_score]

In [8]:
class DataPoint:
    """One dataset example: a piece of text together with its label."""

    def __init__(self, text, label):
        # Both values are stored exactly as given.
        self.text, self.label = text, label

In [9]:
def create_dataset(tweets, indexer, label_counter):
    """Turn tweets into (cleaned text, label) datapoints.

    For each tweet the labels come from ``get_labels`` and the text from
    ``clean_tweet``; one ``DataPoint`` is produced per label. Labels whose
    object is the bare gender sign '♀' or '♂' are skipped.

    Args:
        tweets: iterable of tweet mappings accepted by ``get_labels`` and
            ``clean_tweet``.
        indexer: object providing ``get_object(label)`` plus whatever
            ``get_labels`` needs.
        label_counter: Counter-like mapping, incremented in place once per
            produced datapoint under the label's object.

    Returns:
        List of ``DataPoint`` instances.
    """
    excluded_objects = ('♀', '♂')
    dataset = []
    count = 0
    for tweet in tweets:
        try:
            labels = get_labels(tweet, indexer)
        except Exception:
            # Best effort: a tweet that cannot be labelled is skipped.
            # `Exception` (not a bare except) so Ctrl-C still interrupts the run.
            continue

        cleaned_text = clean_tweet(tweet)
        for label in labels:
            label_object = indexer.get_object(label)
            # Compare by value: `is not` on a string literal tests identity,
            # which is not guaranteed to hold for equal strings.
            if label_object not in excluded_objects:
                dataset.append(DataPoint(cleaned_text, label))
                label_counter[label_object] += 1

        # Counts labelled tweets, not datapoints (a tweet may yield several).
        count += 1
        if count % 500000 == 0:
            print("created", count, "datapoints")

    return dataset

In [15]:
# Start from an empty label indexer and empty per-label tallies.
indexer, label_counter = Indexer(), Counter()
# Only the first 50000 loaded tweets are converted into datapoints here.
dataset = create_dataset(tweets[:50000], indexer, label_counter)



In [16]:
# Number of datapoints produced; a tweet contributes one datapoint per label,
# so this can exceed the number of tweets passed to create_dataset.
len(dataset)

56928

In [17]:
# The indexer's object -> integer mapping (presumably emoji -> label id;
# the Indexer class lives in utils — confirm there).
index_dict = indexer.objs_to_ints

In [18]:
# Persist the label mapping as one "<key>|<index>" line per entry.
# The keys are emoji, so the encoding is fixed to UTF-8 rather than left to
# the platform default (which cannot encode emoji on some systems).
with open('indexer.csv', 'w', encoding='utf-8') as f:
    for key, index in index_dict.items():
        f.write("%s|%s\n" % (key, index))

In [19]:
# Persist the dataset as one "<label>|<text>" line per datapoint.
with open('dataset.csv', 'w', encoding='utf-8') as f:
    for datapoint in dataset:
        # Keep only word characters and whitespace; this also removes any "|"
        # that would otherwise clash with the field delimiter.
        text = re.sub(r'[^\w\s]', '', datapoint.text)
        # Write the sanitised text (the raw datapoint.text was written before,
        # leaving the substitution above without effect).
        f.write("%s|%s\n" % (datapoint.label, text))