In [2]:
import pandas as pd
import json
import ast
from pprint import pprint
import tensorflow as tf
import tensorflow_hub as hub
import numpy as np

## Reading data

In [8]:
df_origin = pd.read_csv('data/random_tweets.csv')
df_origin['context_annotations'] = df_origin['context_annotations'].apply(ast.literal_eval)
df_origin['entities'] = df_origin['entities'].apply(ast.literal_eval)

df = df_origin.drop('id', axis=1)

## Selecting the annotated samples

In [36]:
anotated_df = df.drop(df[df['context_annotations'].str.len() == 0].index)

In [37]:
anotated_df

Unnamed: 0,text,context_annotations,entities
3,#Israel is working with #HongKong on a trial w...,"[{'domain': {'id': '30', 'name': 'Entities [En...","{'hashtags': [{'start': 0, 'end': 7, 'tag': 'I..."
5,@coinexcom 1. ETH 2. LINK 3. CET 4. RIPPLE 5. ...,"[{'domain': {'id': '30', 'name': 'Entities [En...","{'hashtags': [{'start': 53, 'end': 60, 'tag': ..."
9,I forgot to tweet about going live. Still kind...,"[{'domain': {'id': '45', 'name': 'Brand Vertic...","{'annotations': [{'start': 132, 'end': 138, 'p..."
12,Count me in. I've got time on my hands to help...,"[{'domain': {'id': '10', 'name': 'Person', 'de...","{'urls': [{'start': 48, 'end': 71, 'url': 'htt..."
13,RT @Booker4KY: I am a proud Kentuckian. A lot ...,"[{'domain': {'id': '159', 'name': 'States', 'd...","{'annotations': [{'start': 67, 'end': 74, 'pro..."
...,...,...,...
177920,@MayraFlores2022 @newsmax You offer nothing mo...,"[{'domain': {'id': '10', 'name': 'Person', 'de...","{'mentions': [{'start': 0, 'end': 16, 'usernam..."
177924,@apoyo_bts07 @BTS_twt Higual #PremiosMTVMiaw #...,"[{'domain': {'id': '29', 'name': 'Events [Enti...","{'hashtags': [{'start': 30, 'end': 45, 'tag': ..."
177927,RT @starfuryevents: Competition Time! To celeb...,"[{'domain': {'id': '3', 'name': 'TV Shows', 'd...","{'annotations': [{'start': 77, 'end': 102, 'pr..."
177929,RT @Atlacoya5: Friendly reminder from WenZhou:...,"[{'domain': {'id': '172', 'name': 'Global TV S...","{'annotations': [{'start': 38, 'end': 44, 'pro..."


In [53]:
print('the number of annotated samples are: ', anotated_df.shape[0])

the number of annotated samples are:  67336


In [31]:
def get_domain_and_entities(annotations):
    subjects = set()
    for ann in annotations:
        subjects.add(ann['domain']['name'])
        subjects.add(ann['entity']['name'])
    return list(subjects)
print(get_domain_and_entities(anotated_df.iloc[0].context_annotations))

['Cryptocurrencies', 'Technology', 'Cybersecurity', 'Bloomberg Technology', 'Entities [Entity Service]', 'Information security', 'Events [Entity Service]', 'Interests and Hobbies Category']


In [38]:
anotated_df['subjects'] = anotated_df['context_annotations'].apply(get_domain_and_entities)

## Selecting the most common subjects to be used as classes
Here, we count the occurrencies of each subject, and select the 10 most common.

We choose to remove the subjects that has a therm in it's name equal to another common subject. For example: 'Brand Vertical' and 'Brand Category' is removed if 'Brand' is also common. That was done to avoid having subjects that looks too much like each other


In [48]:
from collections import Counter

subject_counter = Counter()
for sample in anotated_df.iloc:
    subject_counter.update(sample['subjects'])

In [111]:
used_classes = subject_counter.most_common(10)
removed_classes = set()
for i in range(len(used_classes)):
    for j in range(i + 1,len(used_classes)):
        if (used_classes[i][0] in used_classes[j][0]) or (used_classes[j][0] in used_classes[i][0]):
            removed_classes.add(j)

            
removed_classes = list(removed_classes)
removed_classes.sort(reverse=True)
print('The following classes will be removed:')
for removed_class in removed_classes:
    print(used_classes[removed_class])
    del used_classes[removed_class]
    
used_classes_names = [used_class[0] for used_class in used_classes]
used_classes_names

The following classes will be removed:
('Brand Vertical', 7350)
('Brand Category', 12140)


['Person',
 'Brand',
 'Musician',
 'Music Genre',
 'Politician',
 'Interests and Hobbies Category',
 'TV Shows',
 'K-pop']

## Selecting samples from the used classes

At this point, each sample can belong to more then 1 class, so we select the sample if any of it's classes is common

In [112]:
anotated_df_common = anotated_df[anotated_df['subjects'].apply(
    lambda x: any([subject in used_classes_names for subject in x])
)].copy()

In [118]:
print('there are', anotated_df_common.shape[0], 'samples for the',len(used_classes),'used classes')

there are 54148 samples for the 8 used classes


## Removing the unused subjects

In [114]:
anotated_df_common['subjects'] = anotated_df_common['subjects'].apply(
    lambda subjects: [subject for subject in subjects if subject in used_classes_names]
)

In [115]:
anotated_df_common

Unnamed: 0,text,context_annotations,entities,subjects
3,#Israel is working with #HongKong on a trial w...,"[{'domain': {'id': '30', 'name': 'Entities [En...","{'hashtags': [{'start': 0, 'end': 7, 'tag': 'I...",[Interests and Hobbies Category]
9,I forgot to tweet about going live. Still kind...,"[{'domain': {'id': '45', 'name': 'Brand Vertic...","{'annotations': [{'start': 132, 'end': 138, 'p...",[Brand]
12,Count me in. I've got time on my hands to help...,"[{'domain': {'id': '10', 'name': 'Person', 'de...","{'urls': [{'start': 48, 'end': 71, 'url': 'htt...","[Person, Politician]"
13,RT @Booker4KY: I am a proud Kentuckian. A lot ...,"[{'domain': {'id': '159', 'name': 'States', 'd...","{'annotations': [{'start': 67, 'end': 74, 'pro...","[Politician, Person]"
16,@PaulCowland_ @DrewPritchard @discoveryplusUK ...,"[{'domain': {'id': '45', 'name': 'Brand Vertic...","{'mentions': [{'start': 0, 'end': 13, 'usernam...",[Brand]
...,...,...,...,...
177914,RT @itzz_blitz1: Mr doyin okupe is hinting tha...,"[{'domain': {'id': '10', 'name': 'Person', 'de...","{'annotations': [{'start': 17, 'end': 24, 'pro...","[Person, Politician]"
177917,RT @YourGrowFriend: I have a dream wherein Pre...,"[{'domain': {'id': '10', 'name': 'Person', 'de...","{'mentions': [{'start': 3, 'end': 18, 'usernam...","[Politician, Person]"
177920,@MayraFlores2022 @newsmax You offer nothing mo...,"[{'domain': {'id': '10', 'name': 'Person', 'de...","{'mentions': [{'start': 0, 'end': 16, 'usernam...","[Person, Politician]"
177924,@apoyo_bts07 @BTS_twt Higual #PremiosMTVMiaw #...,"[{'domain': {'id': '29', 'name': 'Events [Enti...","{'hashtags': [{'start': 30, 'end': 45, 'tag': ...","[K-pop, Music Genre, Musician, Person]"


In [116]:
'asd' in 'as ddd' or 'as ddd' in 'asd'

False

In [86]:
a = set()

In [89]:
a.add('as')

In [90]:
a

{'a', 'as', 's'}

[8, 3]