In [1]:
import glob
import json
from typing import Tuple, List, AnyStr

import numpy as np
import pandas as pd
import torch
from torch import nn
from torch.utils.data import Dataset, DataLoader
from transformers import BertConfig, BertForSequenceClassification, BertTokenizer, PreTrainedTokenizer
from tqdm import tqdm

from datareader import text_to_batch_transformer

In [36]:
class AggDataset(Dataset):
    def __init__(self, df: pd.DataFrame, tokenizer: BertTokenizer):
        self.dataset = df
        self.dataset['statement'] = self.dataset['statement'].str.normalize(
            'NFKD')
        self.tokenizer = tokenizer

    def __len__(self):
        return self.dataset.shape[0]

    def __getitem__(self, index: int) -> Tuple:
        row = self.dataset.values[index]
        return text_to_batch_transformer([row[0]], self.tokenizer)

In [3]:
bert_model = 'bert-base-uncased'
bert_config = BertConfig.from_pretrained(bert_model, num_labels=2)
tokenizer = BertTokenizer.from_pretrained(bert_model)
model = BertForSequenceClassification.from_pretrained(
    bert_model, config=bert_config)
model.load_state_dict(
    torch.load("models/pheme-pu-solo/model_charliehebdo.pth"))

<All keys matched successfully>

In [31]:
def parse_tweet(series: pd.Series) -> pd.Series:
    series = series.str.replace(r"(@[\w|\d]+|\#[\w|\d]+|https\S+)", " ")
    for s in [r"\s{2,}", r"RT:\s?", r"^s+\$", r"\\u", r"[^\s\w,!?]"]:
        series = series.str.replace(s, "")
    return series.str.replace(r"\s+", " ")

def collate_batch_transformer(
        input_data: Tuple) -> Tuple[torch.Tensor, torch.Tensor]:
    input_ids = [i[0][0] for i in input_data]
    masks = [i[1][0] for i in input_data]
    max_length = max([len(i) for i in input_ids])

    input_ids = [(i + [0] * (max_length - len(i))) for i in input_ids]
    masks = [(m + [0] * (max_length - len(m))) for m in masks]

    assert (all(len(i) == max_length for i in input_ids))
    assert (all(len(m) == max_length for m in masks))
    return torch.tensor(input_ids), torch.tensor(masks)



In [103]:
df_agg = pd.read_csv("../Data/PU/PHEME.csv", sep="\t")
df_agg = df_agg[df_agg["Topic"]=="sydneysiege"]
df_agg["statement"] = df_agg["Statement"]
df_agg["statement"].to_list()

["BREAKING: Hostages are being forced to hold an ISIS flag at a Lindt cafe in Sydney's Martin Place, as police man the doors outside #7NEWS",
 'BREAKING: Gunman takes hostages in cafe in Martin Place, Sydney  http://t.co/vj8G6m6VYs',
 'UPDATE: An ISIS flag is being displayed in the window of a cafeÃÅ under siege in Sydney‚Äôs Martin Place. #9News',
 "#BREAKING: Hostages are being held and a siege is taking place at Sydney's Lindt Chocolat Cafe in Martin Place.",
 'BREAKING: A Sydney cafe at Martin Place is being held up - hostages inside have their hands against the windows, ISIS flag visible.. #7News',
 "BREAKING: Live coverage of hostage situation unfolding in Sydney's Martin Place http://t.co/TTG8ye71Zg http://t.co/iTwradHTb3",
 'Terrifying photo of hostages in Martin Place in Sydney, being held by men waving an ISIS flag http://t.co/6ozBRQVNlG',
 'RT @tomsteinfort: Terrifying photo of hostages in Martin Place in Sydney, being held by men waving an ISIS flag http://t.co/CRqVuAd0PN',

In [97]:
def explode(row):
    return eval(row)

df_agg = pd.read_csv("../Data/AggregateAnalysis.csv", sep="\t")
df_agg = pd.DataFrame(df_agg["sample_tweets"].apply(explode).explode().tolist(), columns = ["Id", "statement"])
# df_agg["statement"] = parse_tweet(df_agg["statement"])


In [106]:
device = torch.device("cuda:0") if torch.cuda.is_available else torch.device("cpu")
dataset = AggDataset(df_agg, tokenizer)
dataloader = DataLoader(dataset, batch_size=64, collate_fn=collate_batch_transformer)
with torch.no_grad():
    labels = list()
    model = model.to(device)
    for batch in tqdm(dataloader):
        batch = tuple(t.to(device) for t in batch)
        input_ids = batch[0]
        masks = batch[1]
        logits = model(input_ids, attention_mask=masks)
        labels.extend(list(np.argmax(nn.Softmax()(logits[0]).detach().cpu().numpy(), axis=-1)))

  labels.extend(list(np.argmax(nn.Softmax()(logits[0]).detach().cpu().numpy(), axis=-1)))
100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 20/20 [00:02<00:00,  8.90it/s]


In [109]:
labels

[1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 0,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 0,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 0,
 1,
 0,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 0,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 0,
 1,
 1,
 1,
 0,
 1,
 0,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 0,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 0,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 0,
 1,


In [93]:
df_agg["statement"].tolist()

['FAKE!!!! That fence Been there! U yankees have Never been to the border! ',
 'DAN CRENSHAW is a GlobalistONE',
 'Abbott only started attempting to do anything about everything, this past year Hes a RINO',
 'No, you didnt read properly The daily cases on 2020 were Max 700 a day Total about 25k for 2020 In 2021, we had Max a day So way more people in Vic got covid, less deaths in 2021My unvaxxed friend got covid and she had an awful experience Vaxxed friend got covid, nothing',
 '',
 'Only a moron thinks the government is the answer',
 'I believe in 2 The rest of it is just blowing smoke You cant end property taxes, modifications are extremely rare, the school indoctrination stuff is absolutely ridiculous and the border is federal Anything to actually help Texans, like expanding Medicare, legal weed?',
 'Followed all of the unfollowed!Thanks Madame Tickle!!',
 'Thank you , my sweet friend! Appreciate the ride! Will catch up on following ASAP! Hope everyone had a great New years day!!',