# Misogyny Classifier Training

In [14]:
import pandas as pd
import re, torch, time, datetime, random, collections
from transformers import BertTokenizer, AdamW, BertForSequenceClassification, get_linear_schedule_with_warmup
from keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
import numpy as np
from itertools import chain

## 1 Load the data

In [2]:
mis_df = pd.read_csv('final_labels.csv')

## 2 Preprocessing

### 2.1 Delete symbols using regular expression

In [3]:
r_anum = re.compile(r'([^\sa-z0-9.(?)!])+')
r_white = re.compile(r'[\s.(?)!]+')

text = []
text_sen = []
for body in mis_df['body']:
    body = str(body)
    body_clean = r_white.sub(' ', r_anum.sub('', body.lower())).strip()
    text.append(body_clean)

In [4]:
mis_df['cleaned_text'] = text

### 2.2 Tokenization

In [5]:
# Create sentence and label lists
sentences = mis_df.cleaned_text

# We need to add special tokens at the beginning and end of each sentence for BERT to work properly
sentences_with_token = ["[CLS] " + sentence + " [SEP]" for sentence in sentences]

tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)
tokenized_texts = [tokenizer.tokenize(sent) for sent in sentences]

### 2.3 Create labels

In [6]:
labels = []
for label_str in mis_df.level_1:
    if label_str == "Nonmisogynistic":
        labels.append(0)
    elif label_str == "Misogynistic":
        labels.append(1)
    else:
        labels.append(2)

In [7]:
occurrences = collections. Counter(labels)
occurrences

Counter({0: 5868, 1: 699})

In [8]:
mis_df['labels'] = labels

In [9]:
MAX_LEN = 128

# Use the BERT tokenizer to convert the tokens to their index numbers in the BERT vocabulary
input_ids = [tokenizer.convert_tokens_to_ids(x) for x in tokenized_texts]

# Pad our input tokens
input_ids = pad_sequences(input_ids, maxlen=MAX_LEN, dtype="long", truncating="post", padding="post")

In [None]:
# Create attention masks
attention_masks = []

# Create a mask of 1s for each token followed by 0s for padding
for seq in input_ids:
    seq_mask = [float(i>0) for i in seq]
    attention_masks.append(seq_mask)

In [None]:
# Use train_test_split to split our data into train and validation sets for training
train_inputs, validation_inputs, train_labels, validation_labels = train_test_split(input_ids, labels, 
                                                            random_state=2020, test_size=0.1)
train_masks, validation_masks, _, _ = train_test_split(attention_masks, input_ids,
                                             random_state=2020, test_size=0.1)

### 3 On with BERT!

In [None]:
train_inputs = torch.tensor(train_inputs)
validation_inputs = torch.tensor(validation_inputs)
train_labels = torch.tensor(train_labels)
validation_labels = torch.tensor(validation_labels)
train_masks = torch.tensor(train_masks)
validation_masks = torch.tensor(validation_masks)

In [None]:
# Select a batch size for training. For fine-tuning BERT on a specific task, the authors recommend a batch size of 16 or 32
batch_size = 32

# Create an iterator of our data with torch DataLoader. This helps save on memory during training because, unlike a for loop, 
# with an iterator the entire dataset does not need to be loaded into memory

train_data = TensorDataset(train_inputs, train_masks, train_labels)
train_sampler = RandomSampler(train_data)
train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=batch_size)

validation_data = TensorDataset(validation_inputs, validation_masks, validation_labels)
validation_sampler = SequentialSampler(validation_data)
validation_dataloader = DataLoader(validation_data, sampler=validation_sampler, batch_size=batch_size)

In [None]:
torch.cuda.empty_cache() 

In [None]:
model = BertForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=2)
model.cuda()

Downloading:   0%|          | 0.00/420M [00:00<?, ?B/s]

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.weight', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.bias', 'cls.predictions.decoder.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, element

In [None]:
param_optimizer = list(model.named_parameters())
no_decay = ['bias', 'gamma', 'beta']
optimizer_grouped_parameters = [
    {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
     'weight_decay_rate': 0.01},
    {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
     'weight_decay_rate': 0.0}
]

# This variable contains all of the hyperparemeter information our training loop needs
optimizer = AdamW(model.parameters(),
                  lr = 2e-5, # args.learning_rate - default is 5e-5, our notebook had 2e-5
                  eps = 1e-8 # args.adam_epsilon  - default is 1e-8.
                )



In [None]:
# Number of training epochs (authors recommend between 2 and 4)
epochs = 2

# Total number of training steps is number of batches * number of epochs.
total_steps = len(train_dataloader) * epochs

# Create the learning rate scheduler.
scheduler = get_linear_schedule_with_warmup(optimizer, 
                                            num_warmup_steps = 0, # Default value in run_glue.py
                                            num_training_steps = total_steps)

In [None]:
# Function to calculate the accuracy of our predictions vs labels
def flat_accuracy(preds, labels):
    pred_flat = np.argmax(preds, axis=1).flatten()
    labels_flat = labels.flatten()
    return np.sum(pred_flat == labels_flat) / len(labels_flat)

In [None]:
def format_time(elapsed):
    '''
    Takes a time in seconds and returns a string hh:mm:ss
    '''
    # Round to the nearest second.
    elapsed_rounded = int(round((elapsed)))
    
    # Format as hh:mm:ss
    return str(datetime.timedelta(seconds=elapsed_rounded))

In [None]:
gpu = torch.cuda.is_available()
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
if gpu:
    n_gpu = torch.cuda.device_count()
    torch.cuda.get_device_name(0)

In [None]:
# This training code is based on the `run_glue.py` script here:
# https://github.com/huggingface/transformers/blob/5bfcd0485ece086ebcbed2d008813037968a9e58/examples/run_glue.py#L128

# Set the seed value all over the place to make this reproducible.
seed_val = 42

random.seed(seed_val)
np.random.seed(seed_val)
torch.manual_seed(seed_val)
torch.cuda.manual_seed_all(seed_val)

# Store the average loss after each epoch so we can plot them.
loss_values = []

# For each epoch...
for epoch_i in range(0, epochs):
    
    # ========================================
    #               Training
    # ========================================
    
    # Perform one full pass over the training set.

    print("")
    print('======== Epoch {:} / {:} ========'.format(epoch_i + 1, epochs))
    print('Training...')

    # Measure how long the training epoch takes.
    t0 = time.time()

    # Reset the total loss for this epoch.
    total_loss = 0

    # Put the model into training mode. Don't be mislead--the call to 
    # `train` just changes the *mode*, it doesn't *perform* the training.
    # `dropout` and `batchnorm` layers behave differently during training
    # vs. test (source: https://stackoverflow.com/questions/51433378/what-does-model-train-do-in-pytorch)
    model.train()

    # For each batch of training data...
    for step, batch in enumerate(train_dataloader):

        # Progress update every 40 batches.
        if step % 40 == 0 and not step == 0:
            # Calculate elapsed time in minutes.
            elapsed = format_time(time.time() - t0)
            
            # Report progress.
            print('  Batch {:>5,}  of  {:>5,}.    Elapsed: {:}.'.format(step, len(train_dataloader), elapsed))

        # Unpack this training batch from our dataloader. 
        #
        # As we unpack the batch, we'll also copy each tensor to the GPU using the 
        # `to` method.
        #
        # `batch` contains three pytorch tensors:
        #   [0]: input ids 
        #   [1]: attention masks
        #   [2]: labels 
        b_input_ids = batch[0].to(device)
        b_input_mask = batch[1].to(device)
        b_labels = batch[2].to(device)

        # Always clear any previously calculated gradients before performing a
        # backward pass. PyTorch doesn't do this automatically because 
        # accumulating the gradients is "convenient while training RNNs". 
        # (source: https://stackoverflow.com/questions/48001598/why-do-we-need-to-call-zero-grad-in-pytorch)
        model.zero_grad()        

        # Perform a forward pass (evaluate the model on this training batch).
        # This will return the loss (rather than the model output) because we
        # have provided the `labels`.
        # The documentation for this `model` function is here: 
        # https://huggingface.co/transformers/v2.2.0/model_doc/bert.html#transformers.BertForSequenceClassification
        outputs = model(b_input_ids, 
                    token_type_ids=None, 
                    attention_mask=b_input_mask, 
                    labels=b_labels)
        
        # The call to `model` always returns a tuple, so we need to pull the 
        # loss value out of the tuple.
        loss = outputs[0]

        # Accumulate the training loss over all of the batches so that we can
        # calculate the average loss at the end. `loss` is a Tensor containing a
        # single value; the `.item()` function just returns the Python value 
        # from the tensor.
        total_loss += loss.item()

        # Perform a backward pass to calculate the gradients.
        loss.backward()

        # Clip the norm of the gradients to 1.0.
        # This is to help prevent the "exploding gradients" problem.
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)

        # Update parameters and take a step using the computed gradient.
        # The optimizer dictates the "update rule"--how the parameters are
        # modified based on their gradients, the learning rate, etc.
        optimizer.step()

        # Update the learning rate.
        scheduler.step()

    # Calculate the average loss over the training data.
    avg_train_loss = total_loss / len(train_dataloader)            
    
    # Store the loss value for plotting the learning curve.
    loss_values.append(avg_train_loss)

    print("")
    print("  Average training loss: {0:.2f}".format(avg_train_loss))
    print("  Training epcoh took: {:}".format(format_time(time.time() - t0)))
        
    # ========================================
    #               Validation
    # ========================================
    # After the completion of each training epoch, measure our performance on
    # our validation set.

    print("")
    print("Running Validation...")

    t0 = time.time()

    # Put the model in evaluation mode--the dropout layers behave differently
    # during evaluation.
    model.eval()

    # Tracking variables 
    eval_loss, eval_accuracy = 0, 0
    nb_eval_steps, nb_eval_examples = 0, 0

    # Evaluate data for one epoch
    for batch in validation_dataloader:
        
        # Add batch to GPU
        batch = tuple(t.to(device) for t in batch)
        
        # Unpack the inputs from our dataloader
        b_input_ids, b_input_mask, b_labels = batch
        
        # Telling the model not to compute or store gradients, saving memory and
        # speeding up validation
        with torch.no_grad():        

            # Forward pass, calculate logit predictions.
            # This will return the logits rather than the loss because we have
            # not provided labels.
            # token_type_ids is the same as the "segment ids", which 
            # differentiates sentence 1 and 2 in 2-sentence tasks.
            # The documentation for this `model` function is here: 
            # https://huggingface.co/transformers/v2.2.0/model_doc/bert.html#transformers.BertForSequenceClassification
            outputs = model(b_input_ids, 
                            token_type_ids=None, 
                            attention_mask=b_input_mask)
        
        # Get the "logits" output by the model. The "logits" are the output
        # values prior to applying an activation function like the softmax.
        logits = outputs[0]

        # Move logits and labels to CPU
        logits = logits.detach().cpu().numpy()
        label_ids = b_labels.to('cpu').numpy()
        
        # Calculate the accuracy for this batch of test sentences.
        tmp_eval_accuracy = flat_accuracy(logits, label_ids)
        
        # Accumulate the total accuracy.
        eval_accuracy += tmp_eval_accuracy

        # Track the number of batches
        nb_eval_steps += 1

    # Report the final accuracy for this validation run.
    print("  Accuracy: {0:.2f}".format(eval_accuracy/nb_eval_steps))
    print("  Validation took: {:}".format(format_time(time.time() - t0)))

print("")
print("Training complete!")


Training...
  Batch    40  of    185.    Elapsed: 0:00:52.
  Batch    80  of    185.    Elapsed: 0:01:44.
  Batch   120  of    185.    Elapsed: 0:02:36.
  Batch   160  of    185.    Elapsed: 0:03:28.

  Average training loss: 0.31
  Training epcoh took: 0:04:00

Running Validation...
  Accuracy: 0.90
  Validation took: 0:00:10

Training...
  Batch    40  of    185.    Elapsed: 0:00:52.
  Batch    80  of    185.    Elapsed: 0:01:44.
  Batch   120  of    185.    Elapsed: 0:02:36.
  Batch   160  of    185.    Elapsed: 0:03:28.

  Average training loss: 0.22
  Training epcoh took: 0:04:00

Running Validation...
  Accuracy: 0.90
  Validation took: 0:00:10

Training complete!


In [None]:
model.save_pretrained("bert_mis_gpu")

In [None]:
model.cpu()

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, element

In [None]:
model.save_pretrained("bert_mis_cpu")

## 4. Define some functions

In [51]:
# Delete certain keywords from dataframe

def df_shrink(df,keyword_list):
    to_be_deleted = []
    for keyword in df['Keyword']:
        if keyword in keyword_list:
            to_be_deleted.append(1)
        else:
            to_be_deleted.append(0)
    df['to_be_deleted'] = to_be_deleted
    df1 = df[df['to_be_deleted'] == 0]
    return df1

In [12]:
# Train the classifier and return the predictions
MAX_LEN = 128
batch_size = 32 
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)
model = BertForSequenceClassification.from_pretrained("bert-mis", num_labels=2)
model.cuda()

def train(df):

    sentences = df['Clean Text']
    sentences_with_token = ["[CLS] " + str(sentence) + " [SEP]" for sentence in sentences]
    
    tokenized_texts = [tokenizer.tokenize(sent) for sent in sentences_with_token]

    input_ids = [tokenizer.convert_tokens_to_ids(x) for x in tokenized_texts]
    input_ids = pad_sequences(input_ids, maxlen=MAX_LEN, dtype="long", truncating="post", padding="post")
    
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    
    # Create a mask of 1s for each token followed by 0s for padding
    attention_masks = []
    for seq in input_ids:
        seq_mask = [float(i>0) for i in seq]
        attention_masks.append(seq_mask) 

    # Convert to tensors.
    prediction_inputs = torch.tensor(input_ids)
    prediction_masks = torch.tensor(attention_masks)
    
    # Create the DataLoader.
    prediction_data = TensorDataset(prediction_inputs, prediction_masks)
    prediction_dataloader = DataLoader(prediction_data, batch_size=batch_size)
    
    # Tracking variables 
    predictions = []

    # Predict 
    for batch in prediction_dataloader:
        # Add batch to GPU
        batch = tuple(t.to(device) for t in batch)

        # Unpack the inputs from our dataloader
        b_input_ids, b_input_mask = batch

        # Telling the model not to compute or store gradients, saving memory and 
        # speeding up prediction
        with torch.no_grad():
            # Forward pass, calculate predictions
            outputs = model(b_input_ids, token_type_ids=None, 
                      attention_mask=b_input_mask)

        results = outputs[0]

        # Move logits and labels to CPU
        results = results.detach().cpu().numpy()

        # Store predictions and true labels
        predictions.append(results)

    return predictions

## 4. Now, load and preprocess the Twitter data

In [2]:
tweets_df = pd.read_csv('tweets_df_full_export.csv')

In [3]:
len(tweets_df)

2763637

### 4.1 Take a smaller sample

In [5]:
tweets_df_smaller = tweets_df.sample(n=300000)

### 4.2 Delete words occurred in this sample less than 1,000 times

In [6]:
occurrences_1 = collections.Counter(tweets_df_smaller.Keyword)
len(occurrences_1)

420

In [7]:
keywords_rare = [k for k,v in occurrences_1.items() if v < 1000]

In [8]:
len(keywords_rare)

361

In [10]:
tweets_df_smaller_1 = df_shrink(tweets_df_smaller,keywords_rare)

In [11]:
tweets_df_smaller_1.shape

(259903, 11)

### 4.3 Train

In [13]:
%time predictions = train(tweets_df_smaller_1)

CPU times: user 11min 50s, sys: 1.26 s, total: 11min 52s
Wall time: 11min 53s


In [16]:
# Get the prediction results
predictions_flat = list(chain.from_iterable(predictions))

prediction_set = []
for i in range(len(tweets_df_smaller_1)):
    prediction_set.append(np.argmax(predictions_flat[i]))
    
tweets_df_smaller_1['mis_prediction'] = prediction_set
mis_keywords = tweets_df_smaller_1['Keyword'][tweets_df_smaller_1['mis_prediction'] == 1]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


In [18]:
keywords_count = collections.Counter(tweets_df_smaller_1['Keyword'])
mis_keywords_count = collections.Counter(mis_keywords)
weights_df = pd.DataFrame({'n':pd.Series(keywords_count), 'mis_n':pd.Series(mis_keywords_count)})

In [21]:
weights_df.to_csv('weights_1.csv')

### 4.4 Second round training - selecting keywords

In [27]:
already_used_words = list(weights_df.index)

In [24]:
occurrences_2 = collections.Counter(tweets_df.Keyword)

In [33]:
keywords_rare_1 = [k for k,v in occurrences_2.items() if v < 1000]

In [52]:
tweets_df_1 = df_shrink(tweets_df,already_used_words + keywords_rare_1)

In [53]:
len(tweets_df_1)

320001

In [54]:
%time predictions = train(tweets_df_1)

CPU times: user 14min 34s, sys: 1.77 s, total: 14min 35s
Wall time: 14min 37s


In [55]:
# Get the prediction results
predictions_flat = list(chain.from_iterable(predictions))

prediction_set = []
for i in range(len(tweets_df_smaller_1)):
    prediction_set.append(np.argmax(predictions_flat[i]))
    
tweets_df_smaller_1['mis_prediction'] = prediction_set
mis_keywords = tweets_df_smaller_1['Keyword'][tweets_df_smaller_1['mis_prediction'] == 1]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


In [56]:
keywords_count = collections.Counter(tweets_df_smaller_1['Keyword'])
mis_keywords_count = collections.Counter(mis_keywords)
weights_df = pd.DataFrame({'n':pd.Series(keywords_count), 'mis_n':pd.Series(mis_keywords_count)})

In [57]:
weights_df.to_csv('weights_2.csv')