In [2]:
import sys,os,inspect

sys.path.insert(1, os.path.join(sys.path[0], '..'))

import pandas as pd
import numpy as np
import json
import tqdm
import datetime as dt
from io import StringIO

import azure.cosmos.cosmos_client as azurecosmos
import azure.storage.blob as azureblob

from transformers import AutoModel, AutoTokenizer, Trainer, TrainingArguments, AutoModelForSequenceClassification
import torch

from cs_config import *
from cs_tools import *

  from .autonotebook import tqdm as notebook_tqdm


# Remove AC (audience_contact) tweets written by SU accounts (the MPs listed in mps_seed_info.csv)

In [17]:
# Load the seed-account table (appears to hold one row per MP, judging by the displayed output).
# twitter_user_id is read with object dtype so pandas does not convert the ids to numbers.
su = pd.read_csv('../data/mps_seed_info.csv', dtype={'twitter_user_id': object})
su

Unnamed: 0,os_user_id,name,twitter_screen_name,party,constituency,twitter_followers,twitter_user_id,twitter_verified,twitter_found,token,seed_flag,twitter_screen_name_orig,twitter_screen_name_old,updated
0,1,Aaron Bell,aaronbell4nul,Conservative,Newcastle-under-Lyme,5314.0,240808845,True,True,MP,True,aaronbell4nul,[],
1,2,Abena Oppong-Asare,abenaopp,Labour,Erith and Thamesmead,17255.0,350223904,True,True,MP,True,abenaopp,[],
2,3,Adam Afriyie,adamafriyie,Conservative,Windsor,17412.0,22031058,True,True,MP,True,adamafriyie,[],
3,4,Afzal Khan,afzal4gorton,Labour,"Manchester, Gorton",32676.0,202610289,True,True,MP,True,afzal4gorton,[],
4,5,Alan Brown,alanbrownsnp,Scottish National Party,Kilmarnock and Loudoun,12506.0,3011043981,True,True,MP,True,alanbrownsnp,[],
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
586,587,Yasmin Qureshi,yasminqureshimp,Labour,Bolton South East,24956.0,72341341,True,True,MP,True,yasminqureshimp,[],
587,588,Yvette Cooper,yvettecoopermp,Labour,"Normanton, Pontefract and Castleford",344927.0,328634628,True,True,MP,True,yvettecoopermp,[],
588,589,Yvonne Fovargue,y_fovarguemp,Labour,Makerfield,13811.0,568174146,True,True,MP,True,y_fovarguemp,[],
589,590,Zarah Sultana,zarahsultana,Labour,Coventry South,218758.0,3056307455,True,True,MP,True,zarahsultana,[],


In [37]:
# get tweets that are by MPs

m_cosmos = get_cosmos_client(Cosmos.host, Cosmos.key, Cosmos.mps_db, Cosmos.mps_container)

# Build the body of the array literal used in the query: every non-null
# twitter_user_id wrapped in double quotes and separated by ", ".
# (Uses str.join rather than a loop variable named `id`, which would shadow the
# builtin id() for every later cell in the notebook.)
su_user_ids = su[su.twitter_user_id.notna()].twitter_user_id.tolist()
array_str = ", ".join(f'"{user_id}"' for user_id in su_user_ids)

# Fetch the `id` field of every valid audience_contact item whose user_id is one
# of the seed accounts.
# NOTE(review): the filter is assembled by string interpolation; the ids come from
# the local seed CSV, so they are assumed not to contain quote characters.
su_twids = query_cosmos_field(
    m_cosmos,
    'id',
    filter=f"c.bucket=\"audience_contact\" and c.valid=true and ARRAY_CONTAINS([{array_str}], c.user_id)"
)

In [38]:
# number of tweet ids returned by the Cosmos query
# NOTE(review): the saved output below is a 2-tuple, which a bare len() cannot
# return; the output looks stale from an earlier version of this cell. Re-run to refresh.
len(su_twids)

(7847, 7847)

In [40]:
# drop every tweet whose id came back from the SU query, then report
# (rows before, rows after, rows removed)
df = pd.read_csv('../data/mps_valid_ac_tweets_idtxt_rc.csv', dtype={'id': object})
written_by_su = df['id'].isin(su_twids)
df2 = df.loc[~written_by_su].reset_index(drop=True)
rows_before, rows_after = len(df), len(df2)
rows_before, rows_after, rows_before - rows_after

(2599492, 2591833, 7659)

In [41]:
# Persist the filtered frame (tweets by SU accounts removed) without the index column.
# The utf-8-sig encoding writes a byte-order mark at the start of the file.
df2.to_csv('../data/mps_valid_ac_tweets_idtxt_rc_nosu.csv', index=False, encoding='utf-8-sig')

# Analysis of reply counts in the remaining tweets

In [42]:
# Reload the filtered CSV, keeping ids as object dtype.
# Note: this rebinds `df`, which earlier held the unfiltered tweets.
df = pd.read_csv('../data/mps_valid_ac_tweets_idtxt_rc_nosu.csv', dtype={'id':object})
df

Unnamed: 0,id,text_replaced_b,replycount
0,1489750623213948928,"As of 12:00am, 5th February 2022, [MP] owes ou...",0.0
1,1489750748212641794,"Hey [MP] - saw your pathetic ‘drip,drip,drip’ ...",0.0
2,1489750775907692546,Not just in the UK [MP],0.0
3,1489750802570915844,[USER] world leaders are waiting for you on Bi...,0.0
4,1489750860519329805,[USER] would you a factual talking figure chal...,0.0
...,...,...,...
2591828,1520191052979703808,It all started in England and still emanates f...,0.0
2591829,1520191085137612801,[MP] what type of porn was it just out of curi...,1.0
2591830,1520191097993060352,Approximately 90% of #Tigray’s pop. needs huma...,0.0
2591831,1520191151399227393,|'The main hospital in #Ethiopia’s war-ravaged...,0.0


In [43]:
def rc_counts(df, checks=(0, 1, 2, 3, 10, 100)):
    """Summarise how many rows have given reply counts.

    Parameters
    ----------
    df : pandas.DataFrame
        Must have a numeric ``replycount`` column.
    checks : iterable of numbers
        Reply-count values to report on, one output row per value.

    Returns
    -------
    pandas.DataFrame
        One row per value ``c`` in ``checks`` with the columns:

        - ``replies``: the value ``c``
        - ``count``: rows with ``replycount == c``
        - ``prop_of_all``: ``count`` divided by the number of rows in ``df``
        - ``prop_of_replies``: ``count`` divided by the number of rows with
          ``replycount > 0`` (NaN when ``c <= 0``)
        - ``at_least_count``: rows with ``replycount >= c``
        - ``at_least_prop_of_all`` / ``at_least_prop_of_replies``: the same
          two proportions for ``at_least_count``

        A proportion whose denominator is zero (empty frame, or no row with
        any reply) is reported as NaN instead of raising ZeroDivisionError.
    """
    checks = list(checks)
    replycount = df.replycount
    total = df.shape[0]
    anyrep = int((replycount > 0).sum())

    def _proportion(numerator, denominator):
        # An undefined proportion is NaN rather than a division error.
        return numerator / denominator if denominator else np.nan

    # Count each threshold once and reuse the result for every derived column.
    exact = [int((replycount == c).sum()) for c in checks]
    at_least = [int((replycount >= c).sum()) for c in checks]

    dfr = pd.DataFrame(data={
        'replies': checks,
        'count': exact,
        'prop_of_all': [_proportion(n, total) for n in exact],
        # "share of replied-to tweets" is only meaningful for thresholds above zero
        'prop_of_replies': [
            _proportion(n, anyrep) if c > 0 else np.nan for c, n in zip(checks, exact)
        ],
        'at_least_count': at_least,
        'at_least_prop_of_all': [_proportion(n, total) for n in at_least],
        'at_least_prop_of_replies': [
            _proportion(n, anyrep) if c > 0 else np.nan for c, n in zip(checks, at_least)
        ],
    })
    return dfr

# reply-count breakdown for the tweets currently held in df, using the default thresholds
dfr = rc_counts(df)
dfr

Unnamed: 0,replies,count,prop_of_all,prop_of_replies,at_least_count,at_least_prop_of_all,at_least_prop_of_replies
0,0,2348693,0.90619,,2591833,1.0,
1,1,146433,0.056498,0.602258,243140,0.09381,1.0
2,2,35686,0.013769,0.146771,96707,0.037312,0.397742
3,3,15478,0.005972,0.063659,61021,0.023544,0.250971
4,10,1524,0.000588,0.006268,20986,0.008097,0.086312
5,100,25,1e-05,0.000103,2882,0.001112,0.011853


In [51]:
# keep only the tweets that received at least one reply and write them to their own CSV
replied = df.loc[df['replycount'] > 0].reset_index(drop=True)
replied.to_csv(
    '../data/mps_valid_ac_tweets_idtxt_rc_nosu_anyreplies.csv',
    index=False,
    encoding='utf-8-sig',
)

In [50]:
# inspect the tweet with the third-highest reply count (position 2 after a descending sort)
# (this cell only looks at one row; the reply-only copy for labelling is written by the to_csv cell)
has_replies = df.loc[df['replycount'] > 0]
by_replies_desc = has_replies.sort_values(by='replycount', ascending=False)
by_replies_desc.iloc[2]

id                                               1499851517859438594
text_replaced_b    We do not mind if the next [USER] TV show is a...
replycount                                                    6498.0
Name: 836179, dtype: object

In [6]:
# upload files to blob storage
# 'data' is passed to get_blob_client; presumably the container name (TODO confirm).
# Each upload passes the local CSV path and then the same file name without its
# .csv extension (presumably the destination blob name; verify against upload_file_to_blob).
# NOTE(review): this cell's execution count (6) does not follow the cells that write
# these files; confirm the notebook still works under Restart & Run All.
csblob = get_blob_client(CounterSpeechBlobStorage.connect_str, 'data')
upload_file_to_blob(csblob, '../data/mps_valid_ac_tweets_idtxt_rc_nosu.csv', 'mps_valid_ac_tweets_idtxt_rc_nosu')
upload_file_to_blob(csblob, '../data/mps_valid_ac_tweets_idtxt_rc_nosu_anyreplies.csv', 'mps_valid_ac_tweets_idtxt_rc_nosu_anyreplies')