In [2]:
import sys, os
import pandas as pd
import numpy as np
import json
import tqdm
import datetime as dt
from io import StringIO

import azure.cosmos.cosmos_client as azurecosmos
import azure.storage.blob as azureblob

from transformers import AutoModel, AutoTokenizer, Trainer, TrainingArguments, AutoModelForSequenceClassification
import torch

from cs_config import *
from cs_tools import *

  from .autonotebook import tqdm as notebook_tqdm


# Cosmos Examples

Get clients

In [2]:
# One client per database/container pair, both on the same Cosmos host and key.
# The query examples in the cells shown here all use m_cosmos.
f_cosmos = get_cosmos_client(
    Cosmos.host,
    Cosmos.key,
    Cosmos.footballer_db,
    Cosmos.footballer_container,
)
m_cosmos = get_cosmos_client(
    Cosmos.host,
    Cosmos.key,
    Cosmos.mps_db,
    Cosmos.mps_container,
)

## Queries

Query results (i.e. tweets) are returned as a list of dictionaries, one dictionary per document.

### Select: whole documents

In [3]:
# Fetch one whole document (select='*'); limit=1 keeps the output readable.
# print_info=True echoes the generated query string so it can be inspected.
r = query_cosmos(
    m_cosmos,
    select='*',
    limit=1,
    print_info=True,
)
# List the fields carried by the returned document.
r[0].keys()

Querying central: SELECT * FROM c OFFSET 0 LIMIT 1
1 results returned


dict_keys(['created_at', 'id', 'id_str', 'text', 'source', 'truncated', 'in_reply_to_status_id', 'in_reply_to_status_id_str', 'in_reply_to_user_id', 'in_reply_to_user_id_str', 'in_reply_to_screen_name', 'user', 'geo', 'coordinates', 'place', 'contributors', 'retweeted_status', 'is_quote_status', 'quote_count', 'reply_count', 'retweet_count', 'favorite_count', 'entities', 'favorited', 'retweeted', 'filter_level', 'lang', 'timestamp_ms', 'id_num', 'date', 'datetime', '_rid', '_self', '_etag', '_attachments', 'user_id', 'text_replaced', 'seed_MP', 'non_seed_USER', 'MP', 'text_replaced_b', 'no_content', 'valid_lang', 'seed_author', 'type_RT', 'type_QT', 'type_RP', 'type_SA', 'type_str', 'seed_parent', 'bucket', 'valid', 'processed', '_ts'])

Bad pipe message: %s [b'\xcd{,,\xb6iRZ.\xc0\xc2\xe1\x94\x17\x1c)\xecW \xc9\xf6\xf6\xe3\x8a\xc7\x80@\x80\xec\xdc\xc1\xd1\x04S\xe1\xf59<mM\xdd\xe4\x08\xeb\x88),\xcbl\xa3\x0e\x00\x08\x13\x02\x13\x03\x13\x01\x00\xff\x01\x00\x00\x8f\x00\x00\x00\x0e\x00\x0c\x00\x00\t127.0.0.1\x00\x0b\x00\x04\x03\x00\x01\x02\x00\n\x00\x0c\x00\n\x00\x1d\x00\x17\x00\x1e\x00\x19\x00\x18\x00#\x00\x00\x00\x16\x00\x00\x00\x17\x00\x00\x00\r\x00\x1e\x00\x1c\x04\x03\x05\x03\x06\x03\x08\x07\x08']
Bad pipe message: %s [b'\t\x08\n\x08\x0b\x08\x04']
Bad pipe message: %s [b'\x08\x06\x04\x01\x05\x01\x06', b'']
Bad pipe message: %s [b'\x03\x02\x03\x04\x00-\x00\x02\x01\x01\x003\x00&\x00$\x00\x1d\x00 T\xad\xd5\xc2\xed\x9b\xa8\xb5\xa7\x11\x83\xaa\xa6\xb7\xdb\x95\xd3f\xfa\x1d\xe9\xe0']
Bad pipe message: %s [b'\xf4\xb1', b"\xe4\x7f\x9a\x00\x8f\xec\xd1\xde\xe0\xc7i\x80\xe3s\x00\x00|\xc0,\xc00\x00\xa3\x00\x9f\xcc\xa9\xcc\xa8\xcc\xaa\xc0\xaf\xc0\xad\xc0\xa3\xc0\x9f\xc0]\xc0a\xc0W\xc0S\xc0+\xc0/\x00\xa2\x00\x9e\xc0\xae\xc0\xac\xc0\xa

### Select: individual fields

In [4]:
# Return a single field only - here just the tweet id.
# The result is a flat list of values rather than a list of dictionaries.
r = query_cosmos_field(
    m_cosmos,
    field='id',
    limit=1,
    print_info=True,
)
print(r)

# Several fields can be requested at once with a comma-separated select clause;
# each result is then a dictionary holding just those fields.
r = query_cosmos(
    m_cosmos,
    select='c.id, c.text_replaced_b, c.seed_MP',
    limit=1,
    print_info=True,
)
print(r)

Querying central: SELECT c.id FROM c OFFSET 0 LIMIT 1
1 results returned
['1503158969224814592']
Querying central: SELECT c.id, c.text_replaced_b, c.seed_MP FROM c OFFSET 0 LIMIT 1
1 results returned
[{'id': '1503158969224814592', 'text_replaced_b': 'A very serious question...  Why are the lives and prospects of the homeless and destitute in Britain of less value to the British Government than the undocumented migrants crossing the English Channel from safe-haven-country France? [MP]. Why? You lot are employed by us!', 'seed_MP': ['@pritipatel']}]


### Select: document counts

In [5]:
# With no filter or date range, this counts every document in the container.
r = query_cosmos_count(m_cosmos, print_info=True)
r

Querying central: SELECT VALUE COUNT(1) FROM c
1 results returned


45886884

### Query based on datetime

In [6]:
# Date strings have to be converted into the Cosmos datetime format (DT_COSMOS) before querying.
# The inputs are day/month/year: '01/02/22' appears as 2022-02-01 in the printed query.
start, end = (
    dt_string_conversion(date_str, DatetimeFormats.DT_DATE, DatetimeFormats.DT_COSMOS)
    for date_str in ('01/02/22', '01/03/22')
)

# Once in that format, datetimes can be compared to get tweets between two points in time.
# In the generated query dt_start is inclusive (>=) and dt_end is exclusive (<).
r = query_cosmos_count(m_cosmos, dt_start=start, dt_end=end, print_info=True)
r

Querying central: SELECT VALUE COUNT(1) FROM c WHERE c.datetime>="2022-02-01T00:00:00.0000000Z" AND c.datetime<"2022-03-01T00:00:00.0000000Z"
1 results returned


10853535

### Query based on variables

In [7]:
# String values need their own quotes inside the filter expression;
# single-quoted Python literals avoid having to escape them.
r = query_cosmos_count(m_cosmos, filter='c.bucket="audience_contact"')
print(r)

# Boolean values are written in lowercase: true / false.
r = query_cosmos_count(m_cosmos, filter='c.valid=true')
print(r)

# Several conditions can be combined with AND / OR, grouped with parentheses.
r = query_cosmos_count(
    m_cosmos,
    filter=(
        '(c.valid=true AND c.bucket="audience_contact") '
        'AND (c.type_str="standalone" OR c.type_str="reply")'
    ),
    print_info=True,
)
print(r)

2805981
43539322
Querying central: SELECT VALUE COUNT(1) FROM c WHERE ((c.valid=true AND c.bucket="audience_contact") AND (c.type_str="standalone" OR c.type_str="reply"))
1 results returned
1695323


### Query by tweet IDs

In [8]:
# Documents can be fetched for an explicit list of tweet IDs.
# Step 1: collect up to 1000 ids to use as the lookup list.
tweet_ids = query_cosmos_field(
    m_cosmos,
    field='id',
    limit=1000,
    print_info=True,
)

# Step 2: retrieve only the id and text of the documents with those ids.
r = query_cosmos_by_ids(
    m_cosmos,
    ids=tweet_ids,
    select='c.id, c.text',
)
r[0]

Querying central: SELECT c.id FROM c OFFSET 0 LIMIT 1000


Batching IDs: 100%|██████████| 999/999 [00:00<00:00, 421879.75it/s]
Querying batches:   0%|          | 0/1 [00:00<?, ?it/s]

1000 results returned
[1000]


Querying batches: 100%|██████████| 1/1 [00:02<00:00,  2.33s/it]

1000 docs retrieved in 0:00:02.331617





{'id': '1503158969224814592',
 'text': 'A very serious question...\n\nWhy are the lives and prospects of the homeless and destitute in Britain of less value to the British Government than the undocumented migrants crossing the English Channel from safe-haven-country France? @pritipatel. Why? You lot are employed by us!'}

### Create tweet dataframe

In [9]:
# Build a dataframe from the documents in `r` (set by the query-by-ids cell),
# keeping only the requested columns.
df = create_tweet_df(
    r,
    cols=['id', 'text'],
)
df

Unnamed: 0,id,text
0,1503158969224814592,A very serious question...\n\nWhy are the live...
1,1503158970059534341,The catastrophic humanitarian crisis in #Tigra...
2,1503158970034311172,"The UAE, China, Turkey and Iran continues to s..."
3,1503158972215443463,For over 16 month 'the Ethiopian government ha...
4,1503158975243915271,So @BorisJohnson has ditched the Animals Abroa...
...,...,...
995,1503161097125347333,The Int community has displayed their condemna...
996,1503161098886950920,"Tigray Health Bureau's findings confirm 120,00..."
997,1503161099327197185,@ai_clayton @AnnDuffieldnews @thecoastguy @All...
998,1503161100489138183,"@sajidjavid Profit must be in there as well, t..."


### Sample tweets from Cosmos DB

In [3]:
# Draw a random sample of 100 tweet IDs from the tweets matching the date window and filter.
# The printed summary reports the total pool, the valid pool and the sample size.
# NOTE(review): no seed is passed, so the sample presumably differs between runs - confirm.
all_ids, valid_ids, sample_ids = get_random_cosmos_sample_ids(
    m_cosmos,
    n=100,
    dt_start=dt_string_conversion('01/02/22', DatetimeFormats.DT_DATE, DatetimeFormats.DT_COSMOS),
    dt_end=dt_string_conversion('02/02/22', DatetimeFormats.DT_DATE, DatetimeFormats.DT_COSMOS),
    filter='c.valid=true AND c.bucket="audience_contact" AND c.type_str="standalone"',
    # optional: exclude_ids = [list of ids to exclude, eg. from previous sample]
)

- Total Pool Size  : 20425
- Excluded IDs     : N/A
- Valid Pool Size  : 20425
- Sample size      : 100
Retrieved 100 samples from 20425 ids


In [13]:
# Blob client the sample is uploaded through ('test' is reported as the blob container in the output).
cs_blob = get_blob_client(
    CounterSpeechBlobStorage.connect_str,
    'test',
)

# Draw a random sample of 10 documents from the first six hours of 1 Feb 2022.
# The sample is also uploaded to blob storage, and only the documents are returned
# (return_docs=True, return_ids=False).
sample_docs = create_random_cosmos_sample(
    m_cosmos,
    cs_blob,
    n=10,
    dt_start=dt.datetime(2022, 2, 1, 0).strftime(DatetimeFormats.DT_COSMOS),
    dt_end=dt.datetime(2022, 2, 1, 6).strftime(DatetimeFormats.DT_COSMOS),
    filter='c.valid=true AND c.bucket="audience_contact" AND c.type_str="standalone"',
    return_ids=False,
    return_docs=True,
    # Further optional arguments:
    #   processing  = *function to apply to all docs in sample before upload*
    #   blob_prefix = *prefix to add to blob name, ie. folder, eg. 'sample1/'*
    #   save_prefix = *local path to folder, eg. 'sample1/'*
    #   exclude_ids = [list of ids to exclude, eg. from previous sample]
)

### Getting ids:


Batching IDs: 100%|██████████| 9/9 [00:00<00:00, 30198.99it/s]
Querying batches:   0%|          | 0/1 [00:00<?, ?it/s]

- Total Pool Size  : 1869
- Excluded IDs     : N/A
- Valid Pool Size  : 1869
- Sample size      : 10
Retrieved 10 samples from 1869 ids
### Saving ids locally to '' (local)
### Saving ids to test:
### Getting sample docs:
[10]


Querying batches: 100%|██████████| 1/1 [00:00<00:00,  2.54it/s]
Blob Upload: 100%|██████████| 10/10 [00:00<00:00, 53.29it/s]

10 docs retrieved in 0:00:00.393356
### Uploading sample docs to blob container test:





### Create dataframe from sampled tweets

In [15]:
# Build a dataframe from the sampled documents, keeping the id, the text_replaced_b field,
# the timestamp and the seed MP handles.
df = create_tweet_df(
    sample_docs,
    cols=[
        'id',
        'text_replaced_b',
        'datetime',
        'seed_MP',
    ],
)
df

Unnamed: 0,id,text_replaced_b,datetime,seed_MP
0,1488301096439328773,Day 13 It's now just 58 days until the energy ...,2022-02-01T00:00:07.0000000Z,"[@BorisJohnson, @KwasiKwarteng]"
1,1488303047491334149,"Ayder Hospital in Mekelle, the largest hospita...",2022-02-01T00:07:52.0000000Z,[@vickyford]
2,1488305281830469632,Kudos to those who delivered some home truths ...,2022-02-01T00:16:45.0000000Z,[@DawnButlerBrent]
3,1488331550613819392,Medics from Ayder hospital in Mekelle said mor...,2022-02-01T02:01:08.0000000Z,"[@DavidLammy, @lynbrownmp]"
4,1488341347392270337,[MP] BULLY!,2022-02-01T02:40:03.0000000Z,[@Ianblackford_MP]
5,1488349589178662912,[MP] please could you explain why there is a n...,2022-02-01T03:12:48.0000000Z,[@BorisJohnson]
6,1488363404926586882,"Thousands of deaths in #Tigray ""overwhelmingly...",2022-02-01T04:07:42.0000000Z,[@SarahChampionMP]
7,1488367634123038723,"In #Tigray, half of all pregnant & breastfeedi...",2022-02-01T04:24:31.0000000Z,[@jeremycorbyn]
8,1488377834569621506,The entire population of Tigray is starving. ‘...,2022-02-01T05:05:03.0000000Z,[@vickyford]
9,1488391444515725313,Good Moaning [MP] I have come across a book ...,2022-02-01T05:59:08.0000000Z,[@Keir_Starmer]


# Blob Storage Examples

### Get client

In [4]:
# Client used by the upload/download examples below.
# The second argument is presumably the blob container name - confirm against get_blob_client.
cs_blob = get_blob_client(
    CounterSpeechBlobStorage.connect_str,
    'test',
)

### Upload dataframe as csv

In [11]:
# Small throwaway dataframe to demonstrate the upload.
df = pd.DataFrame({'a': [1, 2, 3], 'b': [4, 5, 6]})

# Upload under the blob name 'testdf'. The download example reads 'testdf.csv',
# so the helper presumably appends the extension - confirm.
upload_df_to_blob_as_csv(cs_blob, 'testdf', df)
df

Unnamed: 0,a,b
0,1,4
1,2,5
2,3,6


### Download csv to dataframe

In [12]:
# Read the uploaded csv back into a dataframe; it should match the frame uploaded in the previous example.
df2 = download_csv_from_blob_to_df(
    cs_blob,
    'testdf.csv',
)
df2

Unnamed: 0,a,b
0,1,4
1,2,5
2,3,6


# Load huggingface models for inference

### Footballer Abuse Model

Local model

In [11]:
# Load the model from the local directory 'footballer_abuse_model/'.
# NOTE(review): 'temp/' and 16 look like a working directory and a batch size - confirm
# against HuggingfaceInferenceModel.
model = HuggingfaceInferenceModel('footballer_abuse_model/', 'temp/', 16)

# Score a few example texts; return_df=True gives a dataframe with text, probs and labels columns.
df = model(
    [
        'I hate you',
        'I love you',
        '[MP]  your a schmuck. Imagine if it was your wife that passed',
    ],
    return_df=True,
)
df

Didn't find file footballer_abuse_model/tokenizer.json. We won't load it.
loading file footballer_abuse_model/spm.model
loading file None
loading file footballer_abuse_model/added_tokens.json
loading file footballer_abuse_model/special_tokens_map.json
loading file footballer_abuse_model/tokenizer_config.json
Adding [MASK] to the vocabulary
Adding [USER] to the vocabulary
Adding [PLAYER] to the vocabulary
Adding [BODY] to the vocabulary
Adding [CLUB] to the vocabulary
Adding [URL] to the vocabulary
loading configuration file footballer_abuse_model/config.json
Model config DebertaV2Config {
  "_name_or_path": "footballer_abuse_model/",
  "architectures": [
    "DebertaV2ForSequenceClassification"
  ],
  "attention_probs_dropout_prob": 0.1,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-07,
  "max_position_embeddings": 512,
  "max_relative_positions": -1,
  "model_type": "deberta-

Unnamed: 0,text,probs,labels
0,I hate you,0.992004,1
1,I love you,0.001655,0
2,[MP] your a schmuck. Imagine if it was your w...,0.961913,1


### Hatemoji

Hugging Face-hosted model

In [6]:
# Load the model by its hub identifier; the log shows the config being resolved from huggingface.co.
# NOTE(review): 'temp/' and 16 look like a working directory and a batch size - confirm
# against HuggingfaceInferenceModel.
model = HuggingfaceInferenceModel('HannahRoseKirk/Hatemoji', 'temp/', 16)

# Score the same example texts as in the footballer abuse example so the two models can be compared.
df = model(
    [
        'I hate you',
        'I love you',
        '[MP]  your a schmuck. Imagine if it was your wife that passed',
    ],
    return_df=True,
)
df

loading configuration file https://huggingface.co/HannahRoseKirk/Hatemoji/resolve/main/config.json from cache at /home/onlinesafety/.cache/huggingface/transformers/a0c8d234e4a92f0d24045042af6129ea92d0fd9505db052a14b68fda7e420495.7904fe6ee2bb189dc549bfe42f870c902aa3d5f49b3a81862227be11f8f17adb
Model config DebertaConfig {
  "_name_or_path": "HannahRoseKirk/Hatemoji",
  "architectures": [
    "DebertaForSequenceClassification"
  ],
  "attention_probs_dropout_prob": 0.1,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-07,
  "max_position_embeddings": 512,
  "max_relative_positions": -1,
  "model_type": "deberta",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "pooler_dropout": 0,
  "pooler_hidden_act": "gelu",
  "pooler_hidden_size": 768,
  "pos_att_type": [
    "c2p",
    "p2c"
  ],
  "position_biased_input": false,
  "relative_attention": true,
  "

Unnamed: 0,text,probs,labels
0,I hate you,4e-05,0
1,I love you,3.8e-05,0
2,[MP] your a schmuck. Imagine if it was your w...,0.841205,1
