In [1]:
import pandas as pd
import os
import json
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from transformers.data.processors.utils import InputExample

In [2]:
raw_data = pd.read_csv('./data/raw_data.csv')
raw_data.head()

Unnamed: 0,text_comments,text_only,comments_only,label,count
0,"Breaking: At least 10 dead, 5 injured after tO...","Breaking: At least 10 dead, 5 injured after tO...",The religion of peace strikes again.\n[SEP]Hi ...,rumour,9
1,France: 10 people dead after shooting at HQ of...,France: 10 people dead after shooting at HQ of...,MT France: 10 dead after shooting at HQ of sat...,rumour,7
2,Ten killed in shooting at headquarters of Fren...,Ten killed in shooting at headquarters of Fren...,must be that peace loving religion again\n[SEP...,rumour,5
3,BREAKING: 10 dead in shooting at headquarters ...,BREAKING: 10 dead in shooting at headquarters ...,WTF &gt; BREAKING 10 dead in shooting at headq...,rumour,13
4,Reuters: 10 people shot dead at headquarters o...,Reuters: 10 people shot dead at headquarters o...,watch yourself in Paris bud\n[SEP]islamist ter...,rumour,16


In [3]:
rumour_data = raw_data[raw_data['label']=='rumour']
rumour_data.head(10)

Unnamed: 0,text_comments,text_only,comments_only,label,count
0,"Breaking: At least 10 dead, 5 injured after tO...","Breaking: At least 10 dead, 5 injured after tO...",The religion of peace strikes again.\n[SEP]Hi ...,rumour,9
1,France: 10 people dead after shooting at HQ of...,France: 10 people dead after shooting at HQ of...,MT France: 10 dead after shooting at HQ of sat...,rumour,7
2,Ten killed in shooting at headquarters of Fren...,Ten killed in shooting at headquarters of Fren...,must be that peace loving religion again\n[SEP...,rumour,5
3,BREAKING: 10 dead in shooting at headquarters ...,BREAKING: 10 dead in shooting at headquarters ...,WTF &gt; BREAKING 10 dead in shooting at headq...,rumour,13
4,Reuters: 10 people shot dead at headquarters o...,Reuters: 10 people shot dead at headquarters o...,watch yourself in Paris bud\n[SEP]islamist ter...,rumour,16
5,10 people have died in a shooting at the Paris...,10 people have died in a shooting at the Paris...,herregud RT@SkyNews: 10 people have died in a ...,rumour,22
6,French radio says attackers on offices of Pari...,French radio says attackers on offices of Pari...,figures....when will France and the EU learn.\...,rumour,10
7,BREAKING: 10 reportedly shot dead at Paris HQ ...,BREAKING: 10 reportedly shot dead at Paris HQ ...,"How dreadful.\n[SEP]Nowhere is safe anymore? ""...",rumour,3
8,BREAKING NEWS: Ten dead in shooting at headqua...,BREAKING NEWS: Ten dead in shooting at headqua...,“@jenanmoussa: BREAKING NEWS: Ten dead in shoo...,rumour,10
9,Witness says multiple gunmen involved in shoot...,Witness says multiple gunmen involved in shoot...,En français et plus à jour\n[SEP]This issue m...,rumour,2


In [4]:
sample_data = rumour_data.sample(n=20)

In [5]:
sample_data1 = sample_data[['text_comments','label']]
sample_data1 = sample_data1.rename(columns = {'text_comments':'text'})
sample_data1['label'] = LabelEncoder().fit_transform(sample_data1['label'])

sample_data2 = sample_data[['text_only','label']]
sample_data2 = sample_data2.rename(columns = {'text_only':'text'})
sample_data2['label'] = LabelEncoder().fit_transform(sample_data2['label'])

In [6]:
sample_data1

Unnamed: 0,text,label
4011,Updated: The shooting incident in Ottawa had t...,0
2195,"In 2009 #Ferguson PD beat a black man, then ch...",0
2181,So police narrative in #Ferguson is unarmed te...,0
4983,I hope for everyones sake that this is fake bu...,0
4103,Sergeant-at-Arms Kevin Vickers hailed as hero ...,0
2130,KKK already raising money for the police offic...,0
4081,"Ottawa shooting: Nathan Cirillo, reservist fro...",0
4960,Update: Five hostages escape as a gunman conti...,0
279,#CharlieHebdo latest:\n- 2 gunmen believed hol...,0
3259,BREAKING: A #Lufthansa #Germanwings Airbus jet...,0


In [7]:
sample_data2

Unnamed: 0,text,label
4011,Updated: The shooting incident in Ottawa had t...,0
2195,"In 2009 #Ferguson PD beat a black man, then ch...",0
2181,So police narrative in #Ferguson is unarmed te...,0
4983,I hope for everyones sake that this is fake bu...,0
4103,Sergeant-at-Arms Kevin Vickers hailed as hero ...,0
2130,KKK already raising money for the police offic...,0
4081,"Ottawa shooting: Nathan Cirillo, reservist fro...",0
4960,Update: Five hostages escape as a gunman conti...,0
279,#CharlieHebdo latest:\n- 2 gunmen believed hol...,0
3259,BREAKING: A #Lufthansa #Germanwings Airbus jet...,0


In [8]:
data1_InputExamples = sample_data1.apply(lambda x: InputExample(guid=None, text_a = x['text'], text_b = None, label = x['label']), axis = 1)

data2_InputExamples = sample_data2.apply(lambda x: InputExample(guid=None, text_a = x['text'], text_b = None, label = x['label']), axis = 1)


In [9]:
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
from torch.nn import CrossEntropyLoss, MSELoss
import torch.nn.functional as F

import torch.optim as optim
from torchtext.data import BucketIterator

from torch.utils.data import DataLoader, RandomSampler, SequentialSampler, TensorDataset
from torch.utils.data.distributed import DistributedSampler
from tqdm import tqdm, trange

from transformers import (
    WEIGHTS_NAME,
    AdamW,
    get_linear_schedule_with_warmup,
    BertConfig,
    BertModel,
    BertPreTrainedModel,
    BertTokenizer,
    BertweetTokenizer,
    AutoModel,
    AutoTokenizer
)

from transformers import glue_convert_examples_to_features as convert_examples_to_features
from transformers import glue_output_modes as output_modes
from transformers import glue_processors as processors
from transformers.data.processors.utils import InputExample, DataProcessor

import logging

logger=logging.getLogger(__name__)

MODEL_CLASSES={
    "bert":(BertConfig,BertTokenizer),
    "bertweet":(BertConfig,BertweetTokenizer)
}

my_label_list=[0, 1]
MAX_SEQ_LENGTH=200

class BertForClassification(BertPreTrainedModel):
    def __init__(self, config):
        super().__init__(config)
        self.num_labels = 2

        self.bert = BertModel(config)
        self.dropout = nn.Dropout(config.hidden_dropout_prob)
        self.classifier = nn.Linear(config.hidden_size, self.num_labels)

        self.init_weights()

    def forward(
        self,
        input_ids=None,
        attention_mask=None,
        token_type_ids=None,
        position_ids=None,
        head_mask=None,
        inputs_embeds=None,
        labels=None,
    ):

        outputs = self.bert(
            input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
            position_ids=position_ids,
            head_mask=head_mask,
            inputs_embeds=inputs_embeds,
        )

        sequence_output, pooled_output=outputs[:2]

        pooled_output = self.dropout(pooled_output)
        logits = self.classifier(pooled_output)

        
        outputs = (logits, pooled_output, sequence_output,)

        if labels is not None:
            
            if self.num_labels == 1:
                loss_fct = MSELoss()
                loss = loss_fct(logits.view(-1), labels.view(-1))
            else:
                loss_fct = CrossEntropyLoss()
                loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))
            outputs = (loss,) + outputs

        
        return outputs  # loss, logits, pooled_output, sequence_output
    


In [10]:
model_path = 'text_comments'

In [11]:
args_eval={"model_name_or_path": "./trained_models/classification_models_" + model_path,
    "config_name": "./trained_models/classification_models_" + model_path,
    "tokenizer_name": "./trained_models/classification_models_" + model_path,
      }

config_class, tokenizer_class = MODEL_CLASSES["bert"]
model_class=BertForClassification


config = config_class.from_pretrained(
    args_eval["config_name"],
    finetuning_task="", 
    cache_dir=None,
)
tokenizer1 = tokenizer_class.from_pretrained(
    args_eval["tokenizer_name"],
    do_lower_case=True,
    cache_dir=None,
)
model1 = model_class.from_pretrained(
    args_eval["model_name_or_path"],
    from_tf=bool(".ckpt" in args_eval["model_name_or_path"]),
    config=config,
    cache_dir=None,
)


model1.to("cuda")

BertForClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_aff

In [12]:
data1_features = convert_examples_to_features(data1_InputExamples, tokenizer1, label_list=my_label_list, output_mode="classification",  max_length=MAX_SEQ_LENGTH )


val_input_ids = torch.tensor([f.input_ids for f in data1_features], dtype=torch.long)
val_attention_mask = torch.tensor([f.attention_mask for f in data1_features], dtype=torch.long)
val_token_type_ids = torch.tensor([f.token_type_ids for f in data1_features], dtype=torch.long)
val_the_labels = torch.tensor([f.label for f in data1_features], dtype=torch.long)


eval_dataset1 = TensorDataset(val_input_ids, val_attention_mask, val_token_type_ids, val_the_labels)



In [13]:
def evaluate(model, tokenizer, eval_dataset):


    logger.info("***** Running evaluation  *****")
    logger.info("  Num examples = %d", len(eval_dataset))
    logger.info("  Batch size = %d", 16)
    eval_loss = 0.0
    nb_eval_steps = 0
    preds = None
    out_label_ids = None

    eval_sampler =RandomSampler(eval_dataset)
    eval_dataloader = DataLoader(eval_dataset, sampler=eval_sampler, batch_size=16)

    for batch in tqdm(eval_dataloader, desc="Evaluating"):
        model.eval()
        batch = tuple(t.to("cuda") for t in batch)

        with torch.no_grad():
            inputs = {"input_ids": batch[0], "attention_mask": batch[1], "labels": batch[3]}
            outputs = model(**inputs)
            tmp_eval_loss, logits = outputs[:2]

            eval_loss += tmp_eval_loss.mean().item()
        nb_eval_steps += 1
        if preds is None:
            preds = logits.detach().cpu().numpy()
            out_label_ids = inputs["labels"].detach().cpu().numpy()
        else:
            preds = np.append(preds, logits.detach().cpu().numpy(), axis=0)
            out_label_ids = np.append(out_label_ids, inputs["labels"].detach().cpu().numpy(), axis=0)

    eval_loss = eval_loss / nb_eval_steps

    preds = np.argmax(preds, axis=1)
    
#     accuracy,f1 = acc_and_f1(preds, out_label_ids)


    return preds

In [14]:
preds1 = evaluate(model1, tokenizer1, eval_dataset1)

print(preds1)

Evaluating: 100%|████████████████████████████████████████████████████████████████████████| 2/2 [00:00<00:00,  2.86it/s]

[1 1 0 1 1 1 1 0 1 1 0 1 0 1 1 1 1 0 1 1]





In [15]:
model_path = 'comments_only'

In [16]:
args_eval={"model_name_or_path": "./trained_models/classification_models_" + model_path,
    "config_name": "./trained_models/classification_models_" + model_path,
    "tokenizer_name": "./trained_models/classification_models_" + model_path,
      }

config_class, tokenizer_class = MODEL_CLASSES["bert"]
model_class=BertForClassification


config = config_class.from_pretrained(
    args_eval["config_name"],
    finetuning_task="", 
    cache_dir=None,
)
tokenizer2 = tokenizer_class.from_pretrained(
    args_eval["tokenizer_name"],
    do_lower_case=True,
    cache_dir=None,
)
model2 = model_class.from_pretrained(
    args_eval["model_name_or_path"],
    from_tf=bool(".ckpt" in args_eval["model_name_or_path"]),
    config=config,
    cache_dir=None,
)


model2.to("cuda")

BertForClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_aff

In [17]:
data2_features = convert_examples_to_features(data2_InputExamples, tokenizer2, label_list=my_label_list, output_mode="classification",  max_length=MAX_SEQ_LENGTH )


val_input_ids = torch.tensor([f.input_ids for f in data2_features], dtype=torch.long)
val_attention_mask = torch.tensor([f.attention_mask for f in data2_features], dtype=torch.long)
val_token_type_ids = torch.tensor([f.token_type_ids for f in data2_features], dtype=torch.long)
val_the_labels = torch.tensor([f.label for f in data2_features], dtype=torch.long)


eval_dataset2 = TensorDataset(val_input_ids, val_attention_mask, val_token_type_ids, val_the_labels)

In [18]:
preds2 = evaluate(model2, tokenizer2, eval_dataset2)

print(preds2)

Evaluating: 100%|████████████████████████████████████████████████████████████████████████| 2/2 [00:00<00:00, 15.24it/s]

[1 0 1 1 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1]





In [19]:
for i in range(len(preds1)):
    if preds1[i]==0 and preds2[i]==1:
        print('ORIGINAL POST:\n')
        print(sample_data1.iloc[i,0].split('[SEP]')[0])
        print('COMMENTS')
        for j in range(1,len(sample_data1.iloc[i,0].split('[SEP]'))):
            print(sample_data1.iloc[i,0].split('[SEP]')[j])
            
        print('-'*60)

ORIGINAL POST:

So police narrative in #Ferguson is unarmed teen suspected in unarmed robbery was shot to death. We've still got a problem, Chief.

COMMENTS
Your little narrative is unraveling. That tends to happen when you jump to conclusions.

picture of robber shows a weapon

so shooting an unarmed black teen in cold blood b/c he could be a robbery suspect is ok?



The conclusion that was jumped to was made by officers when they shot MB. (cont)

It does not show a weapon. Nor do the police claim there was a weapon. You can read the police report online.

just saying cop had reason.to susoect and be alert, if suspect then fought cop or went for gun, that may be reason to shoot

You don't know what happened. 'They' shot him? Multiple cops, huh? A few days ago, he was just an innocent kid, remember?

oversimplification missing most important piece of narrative--struggle b/t MB &amp; cop, where MB reached for gun...

case in court will likely turn on that piece of narrative

I explain 