In [1]:
import pandas as pd
from collections import defaultdict, Counter
from tqdm import tqdm
from more_itertools import take

## Load All Tweets sorted by id

In [2]:
# Register every data row of the tweets file under the key
# "<row number>_<first tab-separated field>". Rows registered here are flagged
# as present in the tweets file; the metrics and DistilBERT flags start as False.
user_ids = defaultdict(dict)

with open('../data/all_tweets_sorted.tsv', mode='r', encoding='utf8') as fin:
    # Lines starting with 'id' are skipped and do not consume a row number.
    data_lines = (raw for raw in tqdm(fin) if not raw.startswith('id'))
    for index, line in enumerate(data_lines, start=1):
        first_field = line.strip().split('\t')[0]
        d_key = f'{index}_{first_field.strip()}'
        user_ids[d_key] = {'tweets': True, 'metrics': False, 'distil': False}

print(f'\nLoaded ids from tweets: {len(user_ids)}')

8581628it [00:18, 455467.50it/s]


Loaded ids from tweets: 8581627





## Load All Tweet metrics sorted by id

In [3]:
# Walk the metrics file building "<row number>_<first comma-separated field>"
# keys and flag each key as present in the metrics file.
with open('../data/all_tweet_metrics_sorted.csv', mode='r', encoding='utf8') as fin:
    # Lines starting with 'id' are skipped and do not consume a row number.
    data_lines = (raw for raw in tqdm(fin) if not raw.startswith('id'))
    for index, line in enumerate(data_lines, start=1):
        first_field = line.strip().split(',')[0]
        d_key = f'{index}_{first_field.strip()}'

        flags = user_ids.get(d_key)
        if flags is None:
            # Key not registered yet: record it as absent from the other two files.
            flags = user_ids[d_key] = {'tweets': False, 'distil': False}

        flags['metrics'] = True

8581628it [00:09, 890961.95it/s] 


## Load All DistilBert results sorted by id

In [4]:

# Walk the DistilBERT results file building "<row number>_<first comma-separated field>"
# keys and flag each key as present in the DistilBERT file.
with open('../data/all_tweets_distilbert_sorted.csv', mode='r', encoding='utf8') as fin:
    # Lines starting with 'id' are skipped and do not consume a row number.
    data_lines = (raw for raw in tqdm(fin) if not raw.startswith('id'))
    for index, line in enumerate(data_lines, start=1):
        first_field = line.strip().split(',')[0]
        d_key = f'{index}_{first_field.strip()}'

        flags = user_ids.get(d_key)
        if flags is None:
            # Key not registered yet: record it as absent from the other two files.
            flags = user_ids[d_key] = {'tweets': False, 'metrics': False}

        flags['distil'] = True

8581628it [00:11, 733923.43it/s]


## Make dataframe to compare

In [5]:
# Flatten the per-key flags into one row per key:
# (key, present in tweets, present in metrics, present in DistilBERT).
data = [
    (d_key, d_val['tweets'], d_val['metrics'], d_val['distil'])
    for d_key, d_val in tqdm(user_ids.items())
]

df = pd.DataFrame(data, columns=['id', 't', 'm', 'd'])

# DataFrame.info() prints its own report and returns None, so wrapping it in
# print() only adds a stray "None" line to the output.
df.info()

# Last expression of the cell: shown through the notebook's rich display.
df.head()

100%|██████████| 8581627/8581627 [00:02<00:00, 3417468.69it/s]


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8581627 entries, 0 to 8581626
Data columns (total 4 columns):
 #   Column  Dtype 
---  ------  ----- 
 0   id      object
 1   t       bool  
 2   m       bool  
 3   d       bool  
dtypes: bool(3), object(1)
memory usage: 90.0+ MB
None
           id     t     m     d
0      1_7094  True  True  True
1     2_56935  True  True  True
2  3_10581681  True  True  True
3  4_13458821  True  True  True
4  5_22038081  True  True  True


## Comparisons

In [6]:
# Number of keys flagged as present in all three files.
in_all_three = df.t & df.m & df.d
int(in_all_three.sum())

8581627

In [7]:
# Total number of distinct keys collected across the three files.
df.shape[0]

16895377

In [8]:
# Number of keys that are NOT flagged as present in the tweets file.
int((~df.t).sum())

8313750

In [9]:
# Keys present in both tweets and metrics, minus keys present in tweets;
# 0 means every tweets key is also flagged for the metrics file.
in_tweets = df.t
int((in_tweets & df.m).sum()) - int(in_tweets.sum())

0

In [10]:
# Number of keys flagged as present in both the tweets and the DistilBERT files.
int((df.t & df.d).sum())

267777

## Detect Issue

In [11]:
# Keys flagged as present in the DistilBERT file, reduced to the part after the
# "<row number>_" prefix (the second '_'-separated piece of each key).
d_series = df.loc[df.d, 'id'].str.split('_').str[1]
d_series.head()

0        7094
1       56935
2    10581681
3    13458821
4    22038081
Name: id, dtype: object

In [12]:
# Tally how many DistilBERT rows carry each id, then peek at the first five
# (id, count) pairs.
d_counts = Counter()
d_counts.update(d_series)
take(5, d_counts.items())

TypeError: 'dict_items' object is not subscriptable

In [16]:
# Keys flagged as present in the tweets file, reduced to the part after the
# "<row number>_" prefix (the second '_'-separated piece of each key).
t_series = df.loc[df.t, 'id'].str.split('_').str[1]
t_series.head()

0        7094
1       56935
2    10581681
3    13458821
4    22038081
Name: id, dtype: object

In [17]:
# Spot-check the tally stored for one id before the subtraction below.
sample_id = '7094'
d_counts[sample_id]
Out[19]: 1
# Subtract one from the tally of every id that occurs in t_series, once per
# occurrence. Counter.subtract does this in a single pass, which avoids
# millions of scalar `.loc` lookups and no longer shadows the builtin `id`.
# NOTE: d_counts is modified in place, so re-running this cell subtracts again.
d_counts.subtract(t_series.tolist())

In [18]:
# Peek at the first five (id, remaining tally) pairs after the subtraction.
first_five = take(5, d_counts.items())
first_five

[('7094', 0), ('56935', 0), ('10581681', 0), ('13458821', 0), ('22038081', 0)]

In [19]:
# Classify every id by the sign of its remaining tally: zero means the two
# sources agree, negative is counted as "duplicated", positive as "not done".
# NOTE(review): d_counts starts from the DistilBERT ids and is decremented once
# per tweets row, so a negative tally means more tweets rows than DistilBERT
# rows for that id. Confirm that the "Duplicated" / "Not Done" labels are the
# intended way round.
mismatch = 0
duplicated = 0
not_done = 0

for count in tqdm(d_counts.values()):
    if count == 0:
        continue

    mismatch += 1
    if count < 0:
        duplicated += 1
    else:
        not_done += 1

print(f'Mismatch: {mismatch}, Duplicated: {duplicated}, Not Done: {not_done}')


100%|██████████| 8163928/8163928 [00:01<00:00, 5670409.11it/s]

Mismatch: 97, Duplicated: 97, Not Done: 0





## Separate tweets not yet processed by DistilBERT

In [2]:
# Group the second tab-separated field of every line by the first field.
# The first and last character of the first field are dropped
# (presumably surrounding quotes; TODO confirm against the file).
tweets = defaultdict(list)
with open('all_tweets.tsv', mode='r', encoding='utf8') as fin:
    for raw_line in tqdm(fin):
        fields = raw_line.strip().split('\t')
        user_key = fields[0][1:-1]
        tweets[user_key].append(fields[1])

8581627it [00:26, 322348.75it/s]


In [3]:
# Copy each DistilBERT row to the "(2)" file only while its id still has an
# unmatched tweet, consuming one tweet per copied row. Whatever is left in
# `tweets` afterwards has no DistilBERT row.
# NOTE: `tweets` is modified in place, so this cell is not safe to re-run
# without rebuilding `tweets` first.
with open('all_tweets_distilbert.csv', mode='r', encoding='utf8') as fin,\
    open('all_tweets_distilbert(2).csv', mode='w', encoding='utf8') as fout:

    for line in tqdm(fin):
        # The id is the first comma-separated field minus its first and last
        # character (presumably surrounding quotes; TODO confirm).
        d_key = line.strip().split(',')[0][1:-1]

        # A key stays in `tweets` after its list is emptied, so a plain
        # membership test would let pop(0) raise IndexError on a surplus
        # DistilBERT row. .get() never inserts into the defaultdict and an
        # empty list is falsy, so surplus rows are skipped like unknown ids.
        pending = tweets.get(d_key)
        if pending:
            pending.pop(0)
            fout.write(line)

8581527it [00:30, 283987.64it/s]


In [5]:
# Write every tweet still held in `tweets` as "<id>\t<tweet text>", one per line.
# Ids whose list is empty contribute no lines.
with open('all_tweets_distilbert_remaining.tsv', mode='w', encoding='utf8') as fout:
    for d_key, d_val in tqdm(tweets.items()):
        fout.writelines(f'{d_key}\t{twt.strip()}\n' for twt in d_val)

100%|██████████| 8163928/8163928 [00:01<00:00, 5940731.45it/s]


## Cross-match after separation

In [13]:
# Load the original DistilBERT results; the file has no header row, so the
# columns are labelled 0..n.
original_distil_path = 'all_tweets_distilbert.csv'
df = pd.read_csv(original_distil_path, header=None)
df.head()

Unnamed: 0,0,1,2,3,4,5,6,7
0,7094,0.013405,0.061273,0.886439,0.033778,0.002741,0.002365,love
1,56935,0.008617,0.876822,0.01339,0.080446,0.016895,0.00383,joy
2,10581681,0.55907,0.005555,0.000891,0.422958,0.010552,0.000974,sadness
3,13458821,0.001669,0.005266,0.004516,0.956323,0.029439,0.002787,anger
4,22038081,0.035983,0.002554,0.010766,0.943404,0.006737,0.000557,anger


In [14]:
# Tally how many rows of the loaded frame carry each value of column 0.
prev_counts = Counter(df[0])

In [15]:
# Peek at the first five (id, count) pairs of the tally.
first_five = take(5, prev_counts.items())
first_five

[(7094, 1), (56935, 1), (10581681, 1), (13458821, 1), (22038081, 1)]

In [16]:
# Load the filtered "(2)" DistilBERT results; the file has no header row, so
# the columns are labelled 0..n. This rebinds `df` to the filtered file.
filtered_distil_path = 'all_tweets_distilbert(2).csv'
df = pd.read_csv(filtered_distil_path, header=None)
df.head()

Unnamed: 0,0,1,2,3,4,5,6,7
0,7094,0.013405,0.061273,0.886439,0.033778,0.002741,0.002365,love
1,56935,0.008617,0.876822,0.01339,0.080446,0.016895,0.00383,joy
2,10581681,0.55907,0.005555,0.000891,0.422958,0.010552,0.000974,sadness
3,13458821,0.001669,0.005266,0.004516,0.956323,0.029439,0.002787,anger
4,22038081,0.035983,0.002554,0.010766,0.943404,0.006737,0.000557,anger


In [18]:
# For every row of the currently loaded frame, decrement the tally of its
# column-0 value; values missing from prev_counts are reported and left alone.
# Iterating the column directly avoids one scalar `df.loc[i, 0]` lookup per
# row and no longer shadows the builtin `id`.
# NOTE: prev_counts is modified in place, so re-running this cell subtracts again.
for row_id in df[0]:
    if row_id in prev_counts:
        prev_counts[row_id] -= 1
    else:
        print(f'error {row_id}')

In [None]:
# Collect every id whose tally did not return to zero and report how many there are.
ids = [d_key for d_key, d_val in tqdm(prev_counts.items()) if d_val != 0]

print(len(ids))