# Requirements

In [None]:
import tensorflow as tf
import torch
import pandas as pd
import numpy as np
from tqdm.notebook import tqdm

!pip install transformers
!pip install bert-tensorflow

Collecting transformers
  Downloading transformers-4.16.2-py3-none-any.whl (3.5 MB)
[K     |████████████████████████████████| 3.5 MB 31.9 MB/s 
Collecting sacremoses
  Downloading sacremoses-0.0.47-py2.py3-none-any.whl (895 kB)
[K     |████████████████████████████████| 895 kB 64.9 MB/s 
[?25hCollecting tokenizers!=0.11.3,>=0.10.1
  Downloading tokenizers-0.11.4-cp37-cp37m-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (6.8 MB)
[K     |████████████████████████████████| 6.8 MB 56.9 MB/s 
[?25hCollecting pyyaml>=5.1
  Downloading PyYAML-6.0-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (596 kB)
[K     |████████████████████████████████| 596 kB 15.2 MB/s 
Collecting huggingface-hub<1.0,>=0.1.0
  Downloading huggingface_hub-0.4.0-py3-none-any.whl (67 kB)
[K     |████████████████████████████████| 67 kB 6.9 MB/s 
Installing collected packages: pyyaml, tokenizers, sacremoses, huggingface-hub, transformers
  Attempting uninstall: pyyaml
 

In [None]:
# Mount Google Drive inside the Colab runtime at /content/gdrive.
from google.colab import drive
drive.mount('/content/gdrive') 
# File name of the saved model and its full location on the mounted Drive;
# `path` is the target of torch.save(...) in the training loop.
model_save_name = 'homo_4e_16bs.pt'
path = F"/content/gdrive/My Drive/{model_save_name}" 

Mounted at /content/gdrive


In [None]:
from transformers import AutoTokenizer, AutoModel
from transformers import BertTokenizer, BertForSequenceClassification
from torch.utils.data import TensorDataset


# Load the tokenizer that belongs to the 'bert-base-uncased' checkpoint (the same
# checkpoint the classification model is loaded from later); text is lower-cased.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)

Downloading:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/226k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/455k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/570 [00:00<?, ?B/s]

# Prepare data - HOPE

In [None]:
# Read both labelled files; they are merged into one frame here and split again later.
train = pd.read_csv("Hope_ENG_train.csv")
dev = pd.read_csv("Hope_ENG_dev.csv")

# Merge, give the label column a clean name, and drop rows containing missing values.
data = (
    pd.concat([train, dev])
    .rename(columns={"label;;": "labels"})
    .dropna()
)

In [None]:
# Inspect the distinct raw label strings and how often each one occurs.
# The same two classes appear with zero, one or two trailing ';' characters,
# so the strings have to be normalised before they can be used as targets.
data.labels.value_counts()

Non_hope_speech;;    23258
Hope_speech;;         2217
Non_hope_speech;        77
Hope_speech;            16
Non_hope_speech         12
Hope_speech              1
Name: labels, dtype: int64

In [None]:
# we set hope_speech = 1; non_hope_speech = 0
def label_col(row):
    """Map the raw label string of one row to its integer class id.

    The source files carry a varying number of trailing ';' characters on the
    label (e.g. 'Hope_speech', 'Hope_speech;', 'Hope_speech;;'). Instead of
    enumerating every variant, the trailing semicolons and surrounding
    whitespace are stripped before the lookup.

    Args:
        row: mapping-like object (e.g. a DataFrame row) with a 'labels' entry.

    Returns:
        1 for hope speech, 0 for non-hope speech, and None when the value is
        not a string or is not one of the two known labels.
    """
    raw_label = row['labels']
    if not isinstance(raw_label, str):
        # e.g. NaN from a missing cell: nothing to map
        return None

    # Remove trailing ';' / whitespace artefacts, then any leading whitespace.
    cleaned = raw_label.rstrip('; \t\r\n').lstrip()
    return {'Non_hope_speech': 0, 'Hope_speech': 1}.get(cleaned)


In [None]:
# Replace the raw label strings with integer class ids and save the cleaned
# frame; this CSV is the dataset used from here on.
data['labels'] = data.apply(label_col, axis=1)
# index=False: the index left over from concatenating the two files carries no
# information, and writing it would come back as a spurious "Unnamed: 0"
# column when the CSV is read again.
data.to_csv('data.csv', index=False)

# New file starts from here

In [None]:
# Reload the label-encoded dataset saved above. No index column is specified, so the
# frame gets a fresh default index, which the split below uses as row identifiers.
data = pd.read_csv('data.csv')

In [None]:
from sklearn.model_selection import train_test_split

# Split the row indices (not the rows themselves) into a training and a validation part.
# 15% of the rows go to validation, stratified on the labels so both parts keep
# approximately the same class ratio; random_state fixes the shuffle.
X_train, X_val, Y_train, Y_val = train_test_split(
    data.index.values,
    data.labels.values,
    test_size=0.15,
    random_state=17,
    stratify=data.labels.values
)

In [None]:
# Tag every row with the split it was assigned to, then count rows per (label, split)
data['data_type'] = 'not_set'
for split_name, split_index in (('train', X_train), ('val', X_val)):
    data.loc[split_index, 'data_type'] = split_name
data.groupby(['labels', 'data_type']).count()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 0,text
labels,data_type,Unnamed: 2_level_1,Unnamed: 3_level_1
0,train,19844,19844
0,val,3503,3503
1,train,1899,1899
1,val,335,335


In [None]:
def _encode_texts(texts, max_length=256):
    """Tokenize ``texts`` into fixed-length PyTorch tensors.

    Every sequence is padded to ``max_length`` and longer ones are truncated
    to it. ``padding='max_length'`` and ``truncation=True`` replace the
    deprecated ``pad_to_max_length=True`` and the implicit truncation fallback
    the tokenizer warned about; the resulting tensors are the same.

    Args:
        texts: iterable of strings to encode.
        max_length: number of tokens per encoded sequence.

    Returns:
        The tokenizer's batch encoding (holds 'input_ids' and 'attention_mask').
    """
    return tokenizer.batch_encode_plus(
        list(texts),  # pass a plain list of strings rather than a numpy array
        add_special_tokens=True,
        return_attention_mask=True,  # marks real tokens vs. padding
        padding='max_length',
        truncation=True,
        max_length=max_length,
        return_tensors='pt'
    )


# Encode training dataset using the tokenizer
encoded_data_train = _encode_texts(data[data.data_type == 'train'].text.values)

# Encode validation dataset using the tokenizer
encoded_data_val = _encode_texts(data[data.data_type == 'val'].text.values)

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


In [None]:
# Extract token IDs, attention masks and labels for the training split.
# The labels are selected with the same `data_type == 'train'` mask that was used
# to pick the texts for encoding, so the i-th label belongs to the i-th encoded row.
input_ids_train = encoded_data_train['input_ids']
attention_masks_train = encoded_data_train['attention_mask']
labels_train = torch.tensor(data[data.data_type == 'train'].labels.values)
# Display the label tensor as a quick sanity check.
labels_train

tensor([0, 0, 0,  ..., 0, 0, 1])

In [None]:
# Extract token IDs, attention masks and labels for the validation split
# (same row selection as the encoding step, so rows and labels line up).
input_ids_val = encoded_data_val['input_ids']
attention_masks_val = encoded_data_val['attention_mask']
labels_val = torch.tensor(data[data.data_type == 'val'].labels.values)

In [None]:
# Bundle the extracted tensors into datasets; each item is the tuple
# (input_ids, attention_mask, label), the order the training and evaluation code index into.
dataset_train = TensorDataset(input_ids_train, attention_masks_train, labels_train)
dataset_val = TensorDataset(input_ids_val, attention_masks_val, labels_val)
# Report the number of examples in each dataset.
print("Train dataset length: {}\nValidation dataset length: {}".format(len(dataset_train), len(dataset_val)))

Train dataset length: 21743
Validation dataset length: 3838


In [None]:
from torch.utils.data import DataLoader, RandomSampler, SequentialSampler

# Number of examples per batch
batch_size = 16

# Training loader: examples are drawn in a new random order on every pass
dataloader_train = DataLoader(
    dataset_train,
    sampler=RandomSampler(dataset_train),
    batch_size=batch_size)

# Validation loader: iterate in dataset order. Shuffling adds nothing to
# evaluation, and a sequential pass keeps predictions aligned with the rows
# of the validation split and leaves the random number stream untouched.
dataloader_val = DataLoader(
    dataset_val,
    sampler=SequentialSampler(dataset_val),
    batch_size=batch_size)


In [None]:
from transformers import BertForSequenceClassification
# Load pre-trained BERT with a sequence-classification head configured for two classes
# (0 = non-hope speech, 1 = hope speech, as encoded during data preparation).
# Attention maps and hidden states are switched off; the training and evaluation code
# only reads the loss (outputs[0]) and the logits (outputs[1]).
model = BertForSequenceClassification.from_pretrained("bert-base-uncased",
                                                      num_labels = 2,
                                                      output_attentions = False,
                                                      output_hidden_states = False)

Downloading:   0%|          | 0.00/420M [00:00<?, ?B/s]

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.dense.weight', 'cls.seq_relationship.bias', 'cls.predictions.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

In [None]:
from transformers import get_linear_schedule_with_warmup

# Define model optimizer -> AdamW.
# The PyTorch implementation is used because the AdamW class shipped with
# `transformers` is deprecated and absent from newer releases. weight_decay is
# set to 0.0 explicitly to keep the behaviour of the previous optimizer
# (torch.optim.AdamW would otherwise default to 0.01).
optimizer = torch.optim.AdamW(
    model.parameters(),
    lr=1e-5,
    eps=1e-8,
    weight_decay=0.0
)

# Define model scheduler: no warm-up, then the learning rate decreases linearly
# over the total number of optimisation steps (batches per epoch * epochs).
epochs = 4
scheduler = get_linear_schedule_with_warmup(optimizer,
                                            num_warmup_steps=0,
                                            num_training_steps=len(dataloader_train)*epochs)



In [None]:
import random

# Define random seeds: the same value seeds Python's `random`, NumPy and PyTorch
# (CPU generator and all CUDA devices).
# NOTE(review): this cell runs after the model and the data loaders have been created,
# so anything initialised randomly before this point was drawn before seeding. Consider
# moving the seeding to the top of the notebook if run-to-run reproducibility matters.
seed_val = 17
random.seed(seed_val)
np.random.seed(seed_val)
torch.manual_seed(seed_val)
torch.cuda.manual_seed_all(seed_val)

In [None]:
# Use the GPU when CUDA is available, otherwise fall back to the CPU,
# and move the model onto that device.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)
# Display the selected device.
device

device(type='cuda')

In [None]:
from sklearn.metrics import f1_score, precision_score, recall_score, accuracy_score

# Returns the F1 score computed on the predictions
def _flatten_for_metrics(preds, labels):
    """Return (true labels, predicted class ids) as flat arrays.

    The predicted class of each row is the index of its largest score along
    axis 1. Shared by all metric helpers below so the conversion lives in one place.
    """
    return labels.flatten(), np.argmax(preds, axis=1).flatten()


def f1_score_func(preds, labels):
    """Return the weighted-average F1 score of the predictions.

    Args:
        preds: 2-D array of per-class scores, one row per example.
        labels: array of true class ids.
    """
    y_true, y_pred = _flatten_for_metrics(preds, labels)
    return f1_score(y_true, y_pred, average='weighted')


def prec_func(preds, labels):
    """Return the weighted-average precision of the predictions (same arguments as f1_score_func)."""
    y_true, y_pred = _flatten_for_metrics(preds, labels)
    return precision_score(y_true, y_pred, average='weighted')


def recall_func(preds, labels):
    """Return the weighted-average recall of the predictions (same arguments as f1_score_func)."""
    y_true, y_pred = _flatten_for_metrics(preds, labels)
    return recall_score(y_true, y_pred, average='weighted')


def acc_func(preds, labels):
    """Return the accuracy of the predictions (same arguments as f1_score_func)."""
    y_true, y_pred = _flatten_for_metrics(preds, labels)
    return accuracy_score(y_true, y_pred)

In [None]:
# Evaluates the model using the validation set
def evaluate(dataloader_val):
    """Run the model over a labelled loader without updating any weights.

    Uses the notebook-level `model` and `device`. The model is switched to
    evaluation mode and is left in that mode when the function returns.

    Args:
        dataloader_val: sized iterable of batches, each indexable as
            (input_ids, attention_mask, labels).

    Returns:
        Tuple (average loss per batch, logits of all examples stacked along
        axis 0, true labels of all examples stacked along axis 0).
    """
    model.eval()

    running_loss = 0
    logit_chunks = []
    label_chunks = []

    for batch in dataloader_val:
        on_device = [tensor.to(device) for tensor in batch]

        # Forward pass only: no gradients are tracked during evaluation
        with torch.no_grad():
            outputs = model(input_ids=on_device[0],
                            attention_mask=on_device[1],
                            labels=on_device[2])

        batch_loss, batch_logits = outputs[0], outputs[1]
        running_loss += batch_loss.item()

        # Move results back to the CPU as numpy arrays for the metric functions
        logit_chunks.append(batch_logits.detach().cpu().numpy())
        label_chunks.append(on_device[2].cpu().numpy())

    mean_loss = running_loss / len(dataloader_val)
    all_logits = np.concatenate(logit_chunks, axis=0)
    all_labels = np.concatenate(label_chunks, axis=0)

    return mean_loss, all_logits, all_labels

In [None]:
# Fine-tune the model for `epochs` passes over the training data; after every
# epoch the model is saved and evaluated on the validation set.
for epoch in tqdm(range(1, epochs + 1)):

    model.train()  # back to training mode (evaluate() leaves the model in eval mode)

    loss_train_total = 0

    progress_bar = tqdm(dataloader_train, desc='Epoch {:1d}'.format(epoch), leave=False, disable=False)
    for batch in progress_bar:
        model.zero_grad()  # clear gradients left over from the previous step
        batch = tuple(b.to(device) for b in batch)
        inputs = {'input_ids': batch[0],
                  'attention_mask': batch[1],
                  'labels': batch[2]}

        outputs = model(**inputs)

        loss = outputs[0]
        batch_loss = loss.item()
        loss_train_total += batch_loss
        loss.backward()  # to backpropagate

        # Rescale the gradients when their global norm exceeds 1.0
        # (guards against exploding gradients; small gradients are left untouched)
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)

        optimizer.step()
        scheduler.step()
        # Show the loss of the current batch as-is. It was previously divided by
        # len(batch), which is the number of tensors in the batch tuple (3), not
        # the number of examples, so the displayed value was off by that factor.
        progress_bar.set_postfix({'training_loss': '{:.3f}'.format(batch_loss)})

    # Checkpoint after every epoch (overwrites the same file).
    # NOTE(review): this pickles the whole model object; saving model.state_dict()
    # is the more portable convention, but would change how the file must be loaded.
    torch.save(model, path)
    tqdm.write(f'\nEpoch {epoch}/{epochs}')

    loss_train_avg = loss_train_total / len(dataloader_train)
    tqdm.write(f'Training loss: {loss_train_avg}')  # make sure that model is still training

    val_loss, predictions, true_vals = evaluate(dataloader_val)  # to check overtraining (or overfitting)
    val_f1 = f1_score_func(predictions, true_vals)
    val_prec = prec_func(predictions, true_vals)
    val_recall = recall_func(predictions, true_vals)
    val_acc = acc_func(predictions, true_vals)

    tqdm.write(f'Validation loss: {val_loss}')
    tqdm.write(f'F1 Score(weighted) : {val_f1}')
    tqdm.write(f'Prec Score(weighted) : {val_prec}')
    tqdm.write(f'Recall Score(weighted) : {val_recall}')
    tqdm.write(f'Acc Score : {val_acc}')

# Testing Hope

In [None]:
# Load the unlabelled test set and display it.
# As the preview shows, the raw column header is 'text;;'.
data_test = pd.read_csv('English hope speech new test.csv')
data_test

Unnamed: 0,text;;
0,These Abandon Hope videos only cement my pessi...
1,Wow your videos are long. They have good points;;
2,I know this is none of my business but 75k+ in...
3,Hope? Is that a new Pokémon?;;
4,Almost every time someone depicts a davidian s...
...,...
384,Great questions finally... great answers as we...
385,That is a man.;;
386,I wish people who criticise her would watch th...
387,I just wanna hug Madonna.nnnI LOVE how she is ...


In [None]:
# Remove the ';;' artefact from the column name so the texts are reachable as data_test.text.
# NOTE(review): only the header is renamed. Judging by the preview of the loaded frame, the
# text values themselves still end in ';;'; confirm whether they should be cleaned as well.
data_test = data_test.rename(columns={"text;;": "text"})

In [None]:
# Encode the test texts with the same tokenizer settings used for the training data.
# padding='max_length' and truncation=True replace the deprecated
# pad_to_max_length=True and the implicit truncation fallback; every sequence
# is still padded / cut to exactly max_length tokens.
encoded_data_test = tokenizer.batch_encode_plus(
    list(data_test.text.values),  # plain list of strings rather than a numpy array
    add_special_tokens=True,
    return_attention_mask=True,
    padding='max_length',
    truncation=True,
    max_length=256,
    return_tensors='pt'
)




In [None]:
# Extract token IDs and attention masks from the encoded test set
# (there are no labels here: the dataset below holds only these two tensors).
input_ids_test = encoded_data_test['input_ids']
attention_masks_test = encoded_data_test['attention_mask']

# Each dataset item is the tuple (input_ids, attention_mask).
dataset_test = TensorDataset(input_ids_test, attention_masks_test)
print("Test dataset length: {}".format(len(dataset_test)))

Test dataset length: 389


In [None]:
from torch.utils.data import DataLoader
# No batch_size is passed, so the loader yields one example per batch (the DataLoader default).
# The post-processing cell further down takes element [0] of each batch's argmax and relies on that.
dataloader_test = DataLoader(dataset_test)

In [None]:
def predict(dataset_test):
    """Run the model over unlabelled batches and collect the raw logits.

    Uses the notebook-level `model` and `device`. The model is put into
    evaluation mode first, so dropout is disabled no matter which cell ran
    before this one.

    Args:
        dataset_test: iterable of batches (the notebook passes a DataLoader,
            despite the parameter name); each batch is indexable as
            (input_ids, attention_mask).

    Returns:
        List with one numpy array of logits per batch, in iteration order.
    """
    model.eval()
    predictions = []

    # Inference only: no gradients are needed
    with torch.no_grad():
        for row in dataset_test:
            row = tuple(r.to(device) for r in row)
            outputs = model(input_ids=row[0], attention_mask=row[1])

            # Without labels there is no loss, so the logits are the first output
            logits = outputs[0]
            predictions.append(logits.detach().cpu().numpy())

    return predictions

# Predict values for test dataset.
# `predictions` is a list holding one logits array per batch of the loader.
predictions = predict(dataloader_test)

In [None]:
# Number of batches that were predicted
print(len(predictions))

# Stack the per-batch logits and take the highest-scoring class of every row.
# This works for any batch size (the earlier per-batch `[0]` lookup silently
# assumed one example per batch), and .tolist() yields plain Python ints so the
# printed list stays readable and .count() behaves as expected.
results = np.concatenate(predictions, axis=0).argmax(axis=1).tolist() if predictions else []

print(results)
print(results.count(0))  # examples predicted as class 0
print(results.count(1))  # examples predicted as class 1

389
[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 1, 1, 0, 1, 0, 1, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1,

In [None]:
# Independent copy of the predicted class ids, in the same order
pred = list(results)

In [None]:
# Attach the predicted class ids to the test frame and save it as an Excel workbook
# (without the DataFrame index).
data_test['pred'] = pred
data_test.to_excel('hope_4e_16bs.xlsx', index=False)

# Convert numerical preds back to text

In [None]:
# Read back the workbook written by the prediction step. The file name must match
# the one passed to to_excel ('hope_4e_16bs.xlsx'); the doubled '.xlsx.xlsx'
# extension pointed at a file this notebook never creates.
preds = pd.read_excel('hope_4e_16bs.xlsx')

In [None]:
# we set hope_speech = 1; non_hope_speech = 0
def label_col (row):
    """Translate the numeric prediction of one row back into its label name.

    Note: this redefines (and from here on shadows) the `label_col` helper
    used during data preparation, which maps in the opposite direction.

    Args:
        row: mapping-like object (e.g. a DataFrame row) with a 'pred' entry.

    Returns:
        'Non_hope_speech' for 0, 'Hope_speech' for 1, otherwise None.
    """
    code = row['pred']
    if code == 0:
        return 'Non_hope_speech'
    return 'Hope_speech' if code == 1 else None
  

In [None]:
# Replace the numeric class ids in the 'pred' column with their label names, row by row.
preds['pred'] = preds.apply(lambda row: label_col(row), axis=1)

In [None]:
preds

Unnamed: 0,id,text,pred
0,1,What do you mean by the word sniped?,Non_hope_speech
1,2,I love this video!! I’m bisexual and it’s just...,Hope_speech
2,3,ya the irony but then i don't want to come off...,Non_hope_speech
3,4,A PERSON'S CHARACTER MATTERS. PERIOD!!,Non_hope_speech
4,5,@Blaster of Gasters,Non_hope_speech
...,...,...,...
2841,2842,+Ashrenneemakeup I think it's all a deliberate...,Non_hope_speech
2842,2843,Sheriff David Clarke. This guy is amazing.,Non_hope_speech
2843,2844,Abandorn Hope Situation,Non_hope_speech
2844,2845,Sheriff Clarke you are a person of such strong...,Non_hope_speech


In [None]:
# Write the frame with textual labels to a new workbook. index=False is not passed,
# so the DataFrame index is written as an extra leading column.
preds.to_excel('preds_converted_hope.xlsx')