In [54]:
import csv
import os
from emoji import UNICODE_EMOJI, demojize, unicode_codes
import nltk
import re
from utils import Indexer
# Emoji the pipeline looks for in tweets. The loading cell keeps only tweets that
# contain at least one of these, and get_labels() derives its labels from the ones found.
# Matching is done one character at a time (see get_emojis_in_text), so each entry
# here is compared against individual characters of the tweet text.
emoji_set = set(['😂', '😭', '❤', '😍', '🔥', '🤣', '💕', '🙏', '✨', '😩', '💜', '😊', '💀', '🤷', '👀', '💙', '🎃', '🤔', '😘', '👏', '🙌', '🎉', '💯', '🤦', '👍', '👉', '💖', '🙄', '😎', '😁'])

In [2]:
def process_text(text):
    """Strip retweet markers, @mentions and links from a raw tweet.

    Newlines are flattened to single spaces. Every space-separated token that
    starts with "@" or "http" is blanked out; the spaces that surrounded a
    removed token are kept, so interior runs of spaces may remain.

    Args:
        text: Raw tweet text.

    Returns:
        The processed text with leading and trailing whitespace stripped.
    """
    # Remove "RT " only where "RT" starts a token. A plain substring replace
    # would also eat the tail of ordinary words ("SMART move" -> "SMAmove").
    text = re.sub(r'(?<!\S)RT ', '', text)
    text = text.replace("\n", " ")

    # Filter token by token instead of str.replace(word, ""): a substring
    # replace of "@ho" would also corrupt an unrelated token like "me@home".
    kept = ["" if word.startswith(("@", "http")) else word
            for word in text.split(" ")]

    return " ".join(kept).strip()

In [3]:
def get_emojis_in_text(text, emoji_set):
    """Return the members of ``emoji_set`` that occur in ``text``.

    The text is broken into its individual characters, so an entry of
    ``emoji_set`` is found only if it equals a single character of ``text``.
    """
    return emoji_set & set(text)

In [80]:
# Colon-delimited codes such as ":some_name:". Written as a raw string so the
# regex escapes \+ and \- are not treated as (invalid) Python string escapes,
# and compiled once instead of on every call.
_EMOJI_CODE_PATTERN = re.compile(r':[a-zA-Z0-9\+\-_&.ô’Åéãíç()!#*]+:')


def clean_tweet(tweet):
    """Return the tweet's processed text with emoji removed and whitespace normalised.

    The text is first passed through ``demojize`` (imported from the ``emoji``
    package), which is expected to rewrite each emoji as a colon-delimited
    code; those codes are then deleted and runs of whitespace are collapsed
    to single spaces.

    NOTE: the pattern cannot tell a code produced by ``demojize`` from a
    colon-delimited span the user typed, so text like ":30:" inside "10:30:45"
    is removed as well.

    Args:
        tweet: Mapping with a 'processed_text' entry.

    Returns:
        The cleaned text as a single-spaced string.
    """
    demojized_tweet = demojize(tweet['processed_text'])
    without_codes = _EMOJI_CODE_PATTERN.sub("", demojized_tweet)
    return ' '.join(without_codes.split())

In [5]:
# Load the tweet dump, keeping only rows whose processed text contains at
# least one emoji from emoji_set.
tweets = []
# The file is decoded explicitly as UTF-8: the tweets are full of emoji and the
# platform default encoding is not guaranteed to handle them. `newline` is left
# at its default so line breaks inside tweets reach process_text() as "\n",
# which is the only form it flattens.
with open('twitter_october.csv', 'r', encoding='utf-8') as csv_file:
    print("reading csv...")
    # DictReader is lazy: rows are actually read inside the loop below, so the
    # "done" message only marks that the reader has been created.
    csv_reader = csv.DictReader(csv_file)
    print("done")
    print("processing text")
    count = 0
    for row in csv_reader:
        # Skip rows that merely repeat the header line.
        if row['text'] == 'text':
            continue
        row['processed_text'] = process_text(row['text'])
        row['emojis_in_text'] = get_emojis_in_text(row['processed_text'], emoji_set)
        if not row['emojis_in_text']:
            continue
        tweets.append(row)
        # `count` tracks kept tweets only, not every row read.
        count += 1
        if count % 500000 == 0:
            print("processed", count, "tweets")

reading csv...
done
processing text
processed 500000 tweets
processed 1000000 tweets
processed 1500000 tweets
processed 2000000 tweets
processed 2500000 tweets


In [6]:
# Number of tweets kept by the loading cell (rows with at least one emoji from emoji_set).
len(tweets)

2954678

In [81]:
from collections import Counter

def get_labels(tweet, indexer):
    """Return the label indices of the most frequent emoji in a tweet.

    Every emoji listed in ``tweet['emojis_in_text']`` is passed to
    ``indexer.add_and_get_index`` and counted in ``tweet['processed_text']``.
    The indices (via ``indexer.index_of``) of all emoji tied for the highest
    count are returned.

    Emoji are visited in sorted order. Iterating the set directly would visit
    them in an order that can change between interpreter runs, which would make
    the indices assigned by the indexer, and the order of tied labels,
    irreproducible.

    Args:
        tweet: Mapping with 'processed_text' (str) and 'emojis_in_text'
            (iterable of emoji strings).
        indexer: Object providing ``add_and_get_index(obj)`` and ``index_of(obj)``.

    Returns:
        List of label indices; empty when the tweet lists no emoji.
    """
    text = tweet['processed_text']
    ordered_emojis = sorted(tweet['emojis_in_text'])
    if not ordered_emojis:
        # max() below would raise on an empty sequence; no emoji means no labels.
        return []

    counts = {}
    for emoji in ordered_emojis:
        indexer.add_and_get_index(emoji)
        counts[emoji] = text.count(emoji)

    max_score = max(counts.values())
    return [indexer.index_of(emoji) for emoji in ordered_emojis
            if counts[emoji] == max_score]

In [82]:
class DataPoint:
    """One example: a piece of text together with its label."""

    def __init__(self, text, label):
        self.text, self.label = text, label

In [86]:
def create_dataset(tweets, indexer, label_counter):
    """Build one DataPoint per (tweet, label) pair.

    For each tweet, labels come from ``get_labels`` and the text from
    ``clean_tweet``. A DataPoint is emitted for every label whose object (as
    returned by ``indexer.get_object``) is not one of the gender signs
    '♀' / '♂', and ``label_counter`` is incremented for that object.

    Args:
        tweets: Iterable of tweet mappings accepted by get_labels/clean_tweet.
        indexer: Object providing ``get_object(index)`` plus whatever
            ``get_labels`` requires of it.
        label_counter: Mutable mapping supporting ``counter[key] += 1``
            (e.g. collections.Counter); updated in place.

    Returns:
        List of DataPoint objects.
    """
    excluded_symbols = ('♀', '♂')
    dataset = []
    count = 0
    for tweet in tweets:
        try:
            labels = get_labels(tweet, indexer)
        except Exception:
            # Best-effort: a tweet whose labels cannot be computed is skipped.
            # Catching Exception rather than using a bare except lets
            # KeyboardInterrupt / SystemExit stop a long run.
            continue

        cleaned_text = clean_tweet(tweet)
        for label in labels:
            symbol = indexer.get_object(label)
            # Compare by value. The previous `is not '♀'` tested object
            # identity, which does not reliably hold for equal strings.
            if symbol not in excluded_symbols:
                dataset.append(DataPoint(cleaned_text, label))
                label_counter[symbol] += 1

        # Counts tweets handled, even though the message says "datapoints".
        count += 1
        if count % 500000 == 0:
            print("created", count, "datapoints")

    return dataset

In [94]:
# Build a small dataset from the first 100 tweets, then dump those same
# tweets so the raw rows can be inspected.
indexer = Indexer()
label_counter = Counter()
sample_tweets = tweets[:100]
dataset = create_dataset(sample_tweets, indexer, label_counter)

for tweet in sample_tweets:
    print(tweet)
    print('\n')

OrderedDict([('created_at', 'Sat Oct 20 20:41:00 +0000 2018'), ('id', '1053747937560420357'), ('id_str', '1053747937560420357'), ('text', 'RT @Anthonyybaby: Said Michael was from Texas 😂😂 https://t.co/Y5w9DM6Zv3'), ('in_reply_to_status_id', ''), ('user_id', '1546406606'), ('user_id_str', '1546406606'), ('user_screen_name', 'sothats_jada'), ('processed_text', 'Said Michael was from Texas 😂😂'), ('emojis_in_text', {'😂'})])


OrderedDict([('created_at', 'Sat Oct 20 20:41:00 +0000 2018'), ('id', '1053747937581391873'), ('id_str', '1053747937581391873'), ('text', 'United vibe 👏🏽👏🏽👏🏽 they so cuteee awwww #XFactor'), ('in_reply_to_status_id', ''), ('user_id', '153357937'), ('user_id_str', '153357937'), ('user_screen_name', 'jadachantina'), ('processed_text', 'United vibe 👏🏽👏🏽👏🏽 they so cuteee awwww #XFactor'), ('emojis_in_text', {'👏'})])


OrderedDict([('created_at', 'Sat Oct 20 20:41:00 +0000 2018'), ('id', '1053747937552072704'), ('id_str', '1053747937552072704'), ('text', 'RT @MefeaterM: Ha

In [11]:
# Keep a handle on the indexer's `objs_to_ints` attribute — presumably the
# emoji -> label-index mapping filled in by get_labels(); confirm against utils.Indexer.
index_dict = indexer.objs_to_ints

In [66]:
# sample = "United vibe 🏽🏽🏽 they so cuteee 🏻 awwww #XFactor"
# sample = "United vibe they so cuteee 🏻 awwww #XFactor"
# print ("👏🏻" in UNICODE_EMOJI)
def test_clean_tweet(tweet):
    """Scratch variant of clean_tweet that takes the tweet text directly.

    The string is passed through ``demojize`` (imported from the ``emoji``
    package), which is expected to rewrite emoji as colon-delimited codes, and
    every colon-delimited code is then deleted. Unlike clean_tweet, the
    whitespace left behind by removed codes is not collapsed.

    Args:
        tweet: Tweet text as a plain string.

    Returns:
        The text with all colon-delimited codes removed.
    """
    # Raw string: \+ and \- are regex escapes, not Python string escapes.
    emoji_code = re.compile(r':[a-zA-Z0-9\+\-_&.ô’Åéãíç()!#*]+:')
    return emoji_code.sub("", demojize(tweet))
# Example input for test_clean_tweet: each 👏 is followed by a separate modifier character.
sample = "United vibe 👏🏽👏🏽👏🏽 they so cuteee awwww #XFactor"

United vibe  they so cuteee awwww #XFactor

