In [61]:
import emoji
import pandas as pd
from pandas import DataFrame
from gensim.models.keyedvectors import KeyedVectors

In [62]:
def get_queries() -> DataFrame:
    """
        Build the table of search terms used to query Twitter.

        Reads the seed terms from '../data/raw/queries.json' and, for every
        column name of that table, looks up the 6 most similar entries in the
        pre-trained word2vec model (loaded with limit=400000). Underscores in
        the returned entries are replaced by spaces.

        Returns: the seed rows followed by the 6 similar-term rows, with a
        fresh 0..n-1 index.
    """
    seed_terms = pd.read_json('../data/raw/queries.json')

    # Pre-trained GoogleNews vectors in binary word2vec format
    model_path = '../models/GoogleNews-vectors-negative300.bin'
    model = KeyedVectors.load_word2vec_format(model_path, binary=True, limit=400000)

    # One list of neighbouring terms per column; the similarity score is not needed
    expanded = {
        label: [
            neighbour.replace('_',' ')
            for neighbour, _score in model.most_similar(label, topn=6)
        ]
        for label in seed_terms.columns
    }

    combined = pd.concat([seed_terms, DataFrame(expanded)])
    return combined.reset_index(drop=True)
    

# Build the search-term table once; printing the shape is a quick sanity check
# on how many rows (terms) and columns came back.
df_query = get_queries()
print(df_query.shape)

(33, 4)


In [63]:
import tweepy

# Credentials are read from a local JSON file rather than hard-coded in the
# notebook; only the 'BEARER_TOKEN' entry is used here.
filepath = '../api.key.json'
keys = pd.read_json(filepath, typ='series')
auth = tweepy.OAuth2BearerHandler(keys['BEARER_TOKEN'])

# wait_on_rate_limit=True makes the client wait for the rate limit to
# replenish instead of raising a rate-limit error, so a long paginated
# collection run is not aborted midway with its results lost.
api = tweepy.API(auth, wait_on_rate_limit=True)

print(api)

<tweepy.api.API object at 0x00000298C77F4BE0>


In [64]:
def create_df_by_queries(df_query:DataFrame, max_items:int) -> DataFrame:
    """
        Iterates over the search terms in df_query and uses each one as a
        Twitter search query. Extracts tweet id, text, and timestamp and
        returns a Pandas DataFrame.

        Every column name of df_query becomes the label of the tweets found
        through that column's terms. Terms starting with ':' are passed through
        emoji.emojize first. Each term is lower-cased, prefixed with '#' and
        suffixed with ' -filter:retweets' before being sent to the API.

        Params:
            df_query: one column per label, one search term per cell.
                Missing cells are skipped.
            max_items: defines number of elements returned per search term
                (upper bound).

        Returns: DataFrame with columns 'Id', 'Text', 'CreatedAt', 'Label'.
        Rows whose 'Text' duplicates an earlier row are dropped (first kept).

        Uses the module-level `api` client.
    """
    dataset = []
    for emotion in df_query.columns:
        # dropna: a missing cell is a float NaN and has no .startswith
        for search_term in df_query[emotion].dropna():
            if search_term.startswith(':'):
                search_term = emoji.emojize(search_term)
            search_term = f"#{search_term.lower()} -filter:retweets"
            # tweet_mode='extended' asks for the untruncated tweet body; in the
            # default mode the 'text' field is cut off for longer tweets.
            cursor = tweepy.Cursor(
                api.search_tweets, search_term,
                count=100, lang='en', tweet_mode='extended'
            )
            for status in cursor.items(max_items):
                json_str = status._json
                id_ = json_str['id']
                # Extended mode delivers the body under 'full_text'; fall back
                # to 'text' in case a payload still uses the old key.
                text = json_str.get('full_text', json_str.get('text'))
                timestamp = pd.to_datetime(json_str['created_at']).strftime('%Y-%m-%d %H:%M:%S')
                dataset.append((id_, text, timestamp, emotion))
    return DataFrame(dataset, columns=['Id','Text', 'CreatedAt', 'Label']).drop_duplicates(subset='Text', keep='first')

# Collect at most 500 tweets per search term in df_query.
df = create_df_by_queries(df_query, max_items=500)

# Checkpoint: write the collected tweets to disk (without the index column)
# so they can be reloaded from the CSV instead of querying the API again.
df.to_csv('../data/raw/raw_emotions.csv', index=False)
print(df.shape)

(18945, 4)


In [67]:
# Number of collected rows per label, to check how balanced the classes are.
df['Label'].value_counts()

joy        6440
fear       5039
anger      3802
sadness    3664
Name: Label, dtype: int64

In [65]:
# # Quick Analysis for better tags
# from collections import Counter

# df = pd.read_csv('../data/raw/raw_emotions.csv')

# df_tmp = df[df['Label'] == 'anger']

# hashtags = df_tmp['Text'].str.findall(r'#\w+')

# counter = Counter([tag for hashtag_list in hashtags.values for tag in hashtag_list])
# counter.most_common()

In [66]:
# ## find emoticons
# import re
# import emoji
# from collections import Counter
# from warnings import filterwarnings
# filterwarnings('ignore')

# df = pd.read_csv('../data/raw/raw_emotions.csv')
# df_tmp = df[df['Label'] == 'anger']

# emoji_patterns = re.compile('|'.join(re.escape(p) for p in emoji.UNICODE_EMOJI_ENGLISH))

# emoji_lists = df_tmp['Text'].str.findall(emoji_patterns)
# emojis = [emoji for emoji_list in emoji_lists.values for emoji in emoji_list]

# counter = Counter(emojis)
# print(counter.most_common(n=10))

# emoji_str = [emoji.demojize(e) for (e, _) in counter.most_common(n=10)]
# print(emoji_str)