# **#Phase 1** Train on Train + Dev -> First model

## Requirements

In [None]:
import tensorflow as tf
import torch
import pandas as pd
import numpy as np
from tqdm.notebook import tqdm
from transformers import AutoTokenizer, AutoModel
from transformers import BertTokenizer, BertForSequenceClassification
from torch.utils.data import TensorDataset
import torch.nn.functional as F

!pip install transformers
!pip install bert-tensorflow

In [None]:
from google.colab import drive
drive.mount('/content/gdrive') 
model_save_name = 'Homo_lt.pt'
path = F"/content/gdrive/My Drive/{model_save_name}" 

In [None]:
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)

## Prepare Data - HOMO Train + Dev

In [None]:
data1 = pd.read_excel("homo_train.xlsx")
data1 = data1.rename(columns={'text            ': 'text'})
data2 = pd.read_excel("homo_dev.xlsx")
data2 = data2.rename(columns={"label": "category"})
data2 = data2.rename(columns={'text                        ': 'text'})
data = pd.concat([data1, data2], axis=0)
data = data.dropna()
data = data.rename(columns={"category": "labels"})
data

Unnamed: 0,category,text
0,Non-anti-LGBT+ content,"I support her, very smart ponnu"
1,Homophobic,priyadharshini kannan same gender attraction ...
2,Non-anti-LGBT+ content,Bro u name and phone number (or)mobile number ...
3,Non-anti-LGBT+ content,experience Thaks bro I love you so much bro ...
4,Non-anti-LGBT+ content,world is becoming bad day by day....
...,...,...
787,Non-anti-LGBT+ content,"Hi ma, I am a mother of 2 kids ma, I support H..."
788,Non-anti-LGBT+ content,Behavior is very cheaper. Recently I travelled...
789,Non-anti-LGBT+ content,Boomi Raja then u should never watch this my d...
790,Non-anti-LGBT+ content,Fables movie mind blowing


In [None]:
def label_col (row):
  if row['labels'] == 'Non-anti-LGBT+ content':
    return 0
  elif row['labels'] == 'Homophobic':
    return 1
  elif row['labels'] == 'Transphobic':
    return 2

In [None]:
data['labels'] = data.apply(lambda row: label_col(row), axis=1)
data.to_csv('data.csv')
data.labels.value_counts()

Non-anti-LGBT+ content    3732
Homophobic                 215
Transphobic                  8
Name: labels, dtype: int64

## Training

In [None]:
#data = pd.read_csv('data.csv')

In [None]:
from sklearn.model_selection import train_test_split

# Split dataset in traning and validation(test)
X_train, X_val, Y_train, Y_val = train_test_split(
    data.index.values,
    data.labels.values,
    test_size=0.10,
    random_state=17,
    stratify=data.labels.values
)

In [None]:
# Check datasets composition
data['data_type'] = ['not_set'] * data.shape[0]
data.loc[X_train, 'data_type'] = 'train'
data.loc[X_val, 'data_type'] = 'val'
data.groupby(['labels', 'data_type']).count()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 0,text
labels,data_type,Unnamed: 2_level_1,Unnamed: 3_level_1
0,train,3358,3358
0,val,374,374
1,train,194,194
1,val,21,21
2,train,7,7
2,val,1,1


In [None]:
data = data.dropna()

In [None]:
# Encode training dataset using the tokenizer
encoded_data_train = tokenizer.batch_encode_plus(
    data[data.data_type == 'train'].text.values,
    add_special_tokens=True,
    return_attention_mask=True, 
    pad_to_max_length=True,
    max_length=256,
    return_tensors='pt'
)

# Encode validation dataset using the tokenizer
encoded_data_val = tokenizer.batch_encode_plus(
    data[data.data_type == 'val'].text.values,
    add_special_tokens=True,
    return_attention_mask=True,  
    pad_to_max_length=True,
    max_length=256,
    return_tensors='pt'
)

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


In [None]:
# Extract IDs, attention masks and labels from training dataset
input_ids_train = encoded_data_train['input_ids']
attention_masks_train = encoded_data_train['attention_mask']
labels_train = torch.tensor(data[data.data_type == 'train'].labels.values)
labels_train

tensor([0, 1, 0,  ..., 0, 0, 0])

In [None]:
# Extract IDs, attention masks and labels from validation dataset
input_ids_val = encoded_data_val['input_ids']
attention_masks_val = encoded_data_val['attention_mask']
labels_val = torch.tensor(data[data.data_type == 'val'].labels.values)

In [None]:
# Create train and validation dataset from extracted features
dataset_train = TensorDataset(input_ids_train, attention_masks_train, labels_train)
dataset_val = TensorDataset(input_ids_val, attention_masks_val, labels_val)
print("Train dataset length: {}\nValidation dataset length: {}".format(len(dataset_train), len(dataset_val)))

Train dataset length: 3559
Validation dataset length: 396


In [None]:
from torch.utils.data import DataLoader, RandomSampler, SequentialSampler

# Define the size of each batch
batch_size = 16

# Load training dataset
dataloader_train= DataLoader(
    dataset_train,
    sampler=RandomSampler(dataset_train),
    batch_size=batch_size)

# Load valuation dataset
dataloader_val= DataLoader(
    dataset_val,
    sampler=RandomSampler(dataset_val),
    batch_size=batch_size)


In [None]:
from transformers import BertForSequenceClassification
# Load pre-trained BERT model
model = BertForSequenceClassification.from_pretrained("bert-base-uncased",
                                                      num_labels = 3,
                                                      output_attentions = False,
                                                      output_hidden_states = False)

Downloading:   0%|          | 0.00/420M [00:00<?, ?B/s]

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

In [None]:
from transformers import AdamW, get_linear_schedule_with_warmup

# Define model optimizer -> Adam
optimizer = AdamW(
    model.parameters(),
    lr = 1e-5, 
    eps=1e-8
)
# Define model scheduler
epochs = 4
scheduler = get_linear_schedule_with_warmup(optimizer,
                                            num_warmup_steps=0,
                                            num_training_steps=len(dataloader_train)*epochs)



In [None]:
import random

# Define random seeds
seed_val = 17
random.seed(seed_val)
np.random.seed(seed_val)
torch.manual_seed(seed_val)
torch.cuda.manual_seed_all(seed_val)

In [None]:
# Define processor type for torch
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)
device

device(type='cuda')

In [None]:
from sklearn.metrics import f1_score, precision_score, recall_score, accuracy_score

# Returns the F1 score computed on the predictions
def f1_score_func(preds, labels):
    preds_flat=np.argmax(preds, axis=1).flatten()
    labels_flat=labels.flatten()
    return f1_score(labels_flat, preds_flat, average='weighted')

# Returns the precision, accuracy and recall score computed on the predictions
def prec_func(preds, labels):
    preds_flat=np.argmax(preds, axis=1).flatten()
    labels_flat=labels.flatten()
    return precision_score(labels_flat, preds_flat, average='weighted')

def recall_func(preds, labels):
    preds_flat=np.argmax(preds, axis=1).flatten()
    labels_flat=labels.flatten()
    return recall_score(labels_flat, preds_flat, average='weighted')
  

def acc_func(preds, labels):
    preds_flat=np.argmax(preds, axis=1).flatten()
    labels_flat=labels.flatten()
    return accuracy_score(labels_flat, preds_flat)

In [None]:
# Evaluates the model using the validation set
def evaluate(dataloader_val):
  model.eval()
  loss_val_total = 0
  predictions, true_vals = [], []

  for batch in dataloader_val:
      batch = tuple(b.to(device) for b in batch)
      inputs = {'input_ids': batch[0],
        'attention_mask': batch[1],
        'labels': batch[2],
        }

      with torch.no_grad():
          outputs = model(**inputs)

      loss = outputs[0]
      logits = outputs[1]
      loss_val_total += loss.item()

      logits = logits.detach().cpu().numpy()
      label_ids = inputs['labels'].cpu().numpy()
      predictions.append(logits)
      true_vals.append(label_ids)

  loss_val_avg = loss_val_total / len(dataloader_val)

  predictions = np.concatenate(predictions, axis=0)
  true_vals = np.concatenate(true_vals, axis=0)

  return loss_val_avg, predictions, true_vals

In [None]:
for epoch in tqdm(range(1, epochs + 1)):

    model.train()  # model is training

    loss_train_total = 0

    progress_bar = tqdm(dataloader_train, desc='Epoch {:1d}'.format(epoch), leave=False, disable=False)
    for batch in progress_bar:
        model.zero_grad()
        batch = tuple(b.to(device) for b in batch)
        inputs = {'input_ids': batch[0],
                  'attention_mask': batch[1],
                  'labels': batch[2]}
        
        outputs = model(**inputs)

        loss = outputs[0]
        loss_train_total += loss.item()
        loss.backward()  # to backpropagate

        torch.nn.utils.clip_grad_norm_(model.parameters(),
                                      1.0)  # prevents the gradient from being too small or too big

        optimizer.step()
        scheduler.step()
        progress_bar.set_postfix({'training_loss': '{:.3f}'.format(loss.item() / len(batch))})

    
    torch.save(model, path)
    tqdm.write(f'\nEpoch {epoch}/{epochs}')

    loss_train_avg = loss_train_total / len(dataloader_train)
    tqdm.write(f'Training loss: {loss_train_avg}')  # make sure that model is still training

    val_loss, predictions, true_vals = evaluate(dataloader_val)  # to check overtraining (or overfitting)
    val_f1 = f1_score_func(predictions, true_vals)
    val_prec = prec_func(predictions, true_vals)
    val_recall = recall_func(predictions, true_vals)
    val_acc = acc_func(predictions, true_vals)

    tqdm.write(f'Validation loss: {val_loss}')
    tqdm.write(f'F1 Score(weighted) : {val_f1}')
    tqdm.write(f'Prec Score(weighted) : {val_prec}')
    tqdm.write(f'Recall Score(weighted) : {val_recall}')
    tqdm.write(f'Acc Score : {val_acc}')

  0%|          | 0/4 [00:00<?, ?it/s]

Epoch 1:   0%|          | 0/223 [00:00<?, ?it/s]


Epoch 1/4
Training loss: 0.2732727803172953
Validation loss: 0.18075892359018325
F1 Score(weighted) : 0.9174603174603174
Prec Score(weighted) : 0.8919753086419753
Recall Score(weighted) : 0.9444444444444444
Acc Score : 0.9444444444444444


  _warn_prf(average, modifier, msg_start, len(result))


Epoch 2:   0%|          | 0/223 [00:00<?, ?it/s]


Epoch 2/4
Training loss: 0.1558717460759953
Validation loss: 0.14834365148097275
F1 Score(weighted) : 0.9379421877385494
Prec Score(weighted) : 0.9362080860147871
Recall Score(weighted) : 0.9494949494949495
Acc Score : 0.9494949494949495


  _warn_prf(average, modifier, msg_start, len(result))


Epoch 3:   0%|          | 0/223 [00:00<?, ?it/s]


Epoch 3/4
Training loss: 0.10869033877516475
Validation loss: 0.15509031355381012
F1 Score(weighted) : 0.9507896049952125
Prec Score(weighted) : 0.9495622895622896
Recall Score(weighted) : 0.952020202020202
Acc Score : 0.952020202020202


  _warn_prf(average, modifier, msg_start, len(result))


Epoch 4:   0%|          | 0/223 [00:00<?, ?it/s]


Epoch 4/4
Training loss: 0.07565013915008026
Validation loss: 0.15792244888842105
F1 Score(weighted) : 0.9526400874082992
Prec Score(weighted) : 0.9500516980831941
Recall Score(weighted) : 0.9570707070707071
Acc Score : 0.9570707070707071


  _warn_prf(average, modifier, msg_start, len(result))


# **#Phase 2** Getting proba for HopeData + Adding Hope & Non-Hope -> Second model

## Predict on Hope Data

In [None]:
# Evaluates the model using the validation set
def predict(dataset_test):
    predictions = []
    probas = []
    for row in dataset_test:
      row = tuple(r.to(device) for r in row)
      inputs = {'input_ids': row[0],
        'attention_mask': row[1]
        }

      with torch.no_grad():
          outputs = model(**inputs)

      logits = outputs[0]
      probas.append(F.softmax(logits, dim=-1))
      logits = logits.detach().cpu().numpy()
      predictions.append(logits)

    return probas,predictions

In [None]:
dataTrain = pd.read_csv("Hope_ENG_train.csv", header=None)
dataDev = pd.read_csv("Hope_ENG_dev.csv", header=None)
data = pd.concat([dataTrain,dataDev])
data = data.dropna()
data = data.rename(columns={1: "labels"})
data = data.rename(columns={0: 'text'})
data

Unnamed: 0,text,labels
0,these tiktoks radiate gay chaotic energy and i...,Non_hope_speech
1,@Champions Again He got killed for using false...,Non_hope_speech
2,It's not that all lives don't matter,Non_hope_speech
3,Is it really that difficult to understand? Bla...,Non_hope_speech
4,Whenever we say black isn't that racists? Why...,Non_hope_speech
...,...,...
2836,Such fake sentiment. .,Non_hope_speech
2837,@A G black lives arent undervalued compared to...,Non_hope_speech
2838,People who pulled it down can and will be arre...,Non_hope_speech
2839,@Aaron Castellanos It will be a two hour movie...,Non_hope_speech


In [None]:
data.labels.value_counts()

Non_hope_speech    23347
Hope_speech         2234
Name: labels, dtype: int64

In [None]:
'''def label_col (row):
  if row['labels'] == 'Non-anti-LGBT+ content':
    return 0
  elif row['labels'] == 'Homophobic':
    return 1
  elif row['labels'] == 'Transphobic':
    return 2
'''

def label_col(row):
  if row['labels'] == 'Non_hope_speech;;':
    return 1
  elif row['labels'] == 'Hope_speech;;':
    return 0
  elif row['labels'] == 'Non_hope_speech;':
    return 1
  elif row['labels'] == 'Hope_speech;':
    return 0
  elif row['labels'] == 'Non_hope_speech':
    return 1
  elif row['labels'] == 'Hope_speech':
    return 0

In [None]:
data['labels'] = data.apply(lambda row: label_col(row), axis=1)
data

Unnamed: 0,text,labels
0,these tiktoks radiate gay chaotic energy and i...,1
1,@Champions Again He got killed for using false...,1
2,It's not that all lives don't matter,1
3,Is it really that difficult to understand? Bla...,1
4,Whenever we say black isn't that racists? Why...,1
...,...,...
2836,Such fake sentiment. .,1
2837,@A G black lives arent undervalued compared to...,1
2838,People who pulled it down can and will be arre...,1
2839,@Aaron Castellanos It will be a two hour movie...,1


In [None]:
data_test = data[data["labels"] == 1] 
data_test = data_test[:8500]
data_test

Unnamed: 0,text,labels
0,these tiktoks radiate gay chaotic energy and i...,1
1,@Champions Again He got killed for using false...,1
2,It's not that all lives don't matter,1
3,Is it really that difficult to understand? Bla...,1
4,Whenever we say black isn't that racists? Why...,1
...,...,...
9268,@ya-lol-ey I'm glad to hear that.,1
9269,Ya'll are saying God wants us to love everyone...,1
9270,Because there are MORE lives that matter and t...,1
9272,@Wren Linnet I have been reading recently that...,1


In [None]:
# Encode validation dataset using the tokenizer
encoded_data_test = tokenizer.batch_encode_plus(
    data_test.text.values,
    add_special_tokens=True,
    return_attention_mask=True,  
    pad_to_max_length=True,
    max_length=256,
    return_tensors='pt'
)



In [None]:
# Extract IDs, attention masks and labels from validation dataset
input_ids_test = encoded_data_test['input_ids']
attention_masks_test = encoded_data_test['attention_mask']

dataset_test = TensorDataset(input_ids_test, attention_masks_test)
print("Test dataset length: {}".format(len(dataset_test)))

Test dataset length: 8500


In [None]:
from torch.utils.data import DataLoader
dataloader_test = DataLoader(dataset_test)

In [None]:
probas, predictions = predict(dataloader_test)

In [None]:
results = []
for i, prediction in enumerate(predictions):
  predicted = np.argmax(prediction, axis=1)[0]
  results.append(predicted)

In [None]:
pred = []
for prediction in results:
  pred.append(prediction)

In [None]:
data_test['pred'] = pred
data_test

Unnamed: 0,text,labels,pred
0,these tiktoks radiate gay chaotic energy and i...,1,0
1,@Champions Again He got killed for using false...,1,0
2,It's not that all lives don't matter,1,0
3,Is it really that difficult to understand? Bla...,1,0
4,Whenever we say black isn't that racists? Why...,1,0
...,...,...,...
9268,@ya-lol-ey I'm glad to hear that.,1,0
9269,Ya'll are saying God wants us to love everyone...,1,0
9270,Because there are MORE lives that matter and t...,1,1
9272,@Wren Linnet I have been reading recently that...,1,0


In [None]:
data_test.pred.value_counts()

0    7929
1     571
Name: pred, dtype: int64

In [None]:
probs_1 = []
probs_2 = []
for pro in probas:
  probs_1.append(pro[0].cpu().numpy()[1])  
  probs_2.append(pro[0].cpu().numpy()[2])  
data_test['probs_1'] = probs_1
data_test['probs_2'] = probs_2

In [None]:
poolDataWithP = data_test[data_test["pred"] == 1] 
poolDataWithP

Unnamed: 0,text,labels,pred,probs_1,probs_2
10,There is justice for the natives the governmen...,1,1,0.542852,0.019499
12,Injustice is the way the world works. A millio...,1,1,0.791034,0.024194
40,The whole thing is a joke...lol all black liv...,1,1,0.813446,0.026244
49,The media and crooked politicians whipped this...,1,1,0.656302,0.022861
52,Black lives matter and not all lives?nWhat hyp...,1,1,0.847641,0.029826
...,...,...,...,...,...
9210,nothing will change and not going to and I wou...,1,1,0.800124,0.024935
9217,Petition for the rebuild and reinforced model ...,1,1,0.873590,0.033854
9230,The Supreme Holy High GOD has created all men ...,1,1,0.516016,0.016153
9247,black lives matter apparently no lives matter ...,1,1,0.874505,0.034932


In [None]:
sorted_data_test = data_test.sort_values(by='probs_1', ascending=False)
sorted_data_test = sorted_data_test[:200]
sorted_data_test  

Unnamed: 0,text,labels,pred,probs_1,probs_2
7714,There is no threat to justice because there is...,1,1,0.911506,0.040698
1199,This is protest that will bring justice not th...,1,1,0.910517,0.036565
4432,Racism will end when humanity destroys itself.,1,1,0.909233,0.043774
5511,its hypocrisy in purest form...,1,1,0.908857,0.040580
4241,obviously the mosques will now be burnt to the...,1,1,0.908400,0.048675
...,...,...,...,...,...
2612,These people don't want justice,1,1,0.841419,0.029958
1931,Are they gonna pull down all the schools,1,1,0.840574,0.030726
2861,all them people booing and some of them was wh...,1,1,0.839554,0.031872
1943,A nation of opioid and meth addicts hearing vo...,1,1,0.839229,0.041479


In [None]:
sorted_data_test=sorted_data_test.drop(['probs_1'], 1) 
sorted_data_test=sorted_data_test.drop(['probs_2'], 1) 
sorted_data_test=sorted_data_test.drop(['labels'], 1) 
sorted_data_test. rename (columns = {'pred':'labels'}, inplace = True)
sorted_data_test

Unnamed: 0,text,labels
7714,There is no threat to justice because there is...,1
1199,This is protest that will bring justice not th...,1
4432,Racism will end when humanity destroys itself.,1
5511,its hypocrisy in purest form...,1
4241,obviously the mosques will now be burnt to the...,1
...,...,...
2612,These people don't want justice,1
1931,Are they gonna pull down all the schools,1
2861,all them people booing and some of them was wh...,1
1943,A nation of opioid and meth addicts hearing vo...,1


In [None]:
temp = data[data["labels"] == 0] 
temp

Unnamed: 0,text,labels
9,Network Engineer here- 23 and currently workin...,0
19,I'm still hiding my gender to my parents and t...,0
20,all lives matter .without that we never have p...,0
26,Randomgirlwhosings0804 Why does she feel the n...,0
45,it doesn't matter what your family does when y...,0
...,...,...
2792,i’m actually about to start my college on civi...,0
2801,@Sasha Dumse that is true. But we should ALL l...,0
2808,Women need to keep fighting,0
2833,“God gave me a choice and my choice is love” t...,0


In [None]:
train = pd.read_csv('data.csv')
train

Unnamed: 0.1,Unnamed: 0,labels,text
0,0,0,"I support her, very smart ponnu"
1,1,1,priyadharshini kannan same gender attraction ...
2,2,0,Bro u name and phone number (or)mobile number ...
3,3,0,experience Thaks bro I love you so much bro ...
4,4,0,world is becoming bad day by day....
...,...,...,...
3950,787,0,"Hi ma, I am a mother of 2 kids ma, I support H..."
3951,788,0,Behavior is very cheaper. Recently I travelled...
3952,789,0,Boomi Raja then u should never watch this my d...
3953,790,0,Fables movie mind blowing


In [None]:
data = pd.concat([train,sorted_data_test,temp])  
data  

Unnamed: 0.1,Unnamed: 0,labels,text
0,0.0,0,"I support her, very smart ponnu"
1,1.0,1,priyadharshini kannan same gender attraction ...
2,2.0,0,Bro u name and phone number (or)mobile number ...
3,3.0,0,experience Thaks bro I love you so much bro ...
4,4.0,0,world is becoming bad day by day....
...,...,...,...
2792,,0,i’m actually about to start my college on civi...
2801,,0,@Sasha Dumse that is true. But we should ALL l...
2808,,0,Women need to keep fighting
2833,,0,“God gave me a choice and my choice is love” t...


## New Training Phase

In [None]:
model_save_name = 'Homo_Enhanced_lt.pt'
path = F"/content/gdrive/My Drive/{model_save_name}" 

In [None]:
from sklearn.model_selection import train_test_split

# Split dataset in traning and validation(test)
X_train, X_val, Y_train, Y_val = train_test_split(
    data.index.values,
    data.labels.values,
    test_size=0.10,
    random_state=17,
    stratify=data.labels.values
)

In [None]:
# Check datasets composition
data['data_type'] = ['not_set'] * data.shape[0]
data.loc[X_train, 'data_type'] = 'train'
data.loc[X_val, 'data_type'] = 'val'
data.groupby(['labels', 'data_type']).count()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 0,text
labels,data_type,Unnamed: 2_level_1,Unnamed: 3_level_1
0,train,3292,5252
0,val,440,714
1,train,194,363
1,val,21,52
2,train,7,7
2,val,1,1


In [None]:
data = data.dropna()

In [None]:
# Encode training dataset using the tokenizer
encoded_data_train = tokenizer.batch_encode_plus(
    data[data.data_type == 'train'].text.values,
    add_special_tokens=True,
    return_attention_mask=True, 
    pad_to_max_length=True,
    max_length=256,
    return_tensors='pt'
)

# Encode validation dataset using the tokenizer
encoded_data_val = tokenizer.batch_encode_plus(
    data[data.data_type == 'val'].text.values,
    add_special_tokens=True,
    return_attention_mask=True,  
    pad_to_max_length=True,
    max_length=256,
    return_tensors='pt'
)



In [None]:
# Extract IDs, attention masks and labels from training dataset
input_ids_train = encoded_data_train['input_ids']
attention_masks_train = encoded_data_train['attention_mask']
labels_train = torch.tensor(data[data.data_type == 'train'].labels.values)
labels_train

tensor([0, 1, 0,  ..., 0, 0, 0])

In [None]:
# Extract IDs, attention masks and labels from validation dataset
input_ids_val = encoded_data_val['input_ids']
attention_masks_val = encoded_data_val['attention_mask']
labels_val = torch.tensor(data[data.data_type == 'val'].labels.values)

In [None]:
# Create train and validation dataset from extracted features
dataset_train = TensorDataset(input_ids_train, attention_masks_train, labels_train)
dataset_val = TensorDataset(input_ids_val, attention_masks_val, labels_val)
print("Train dataset length: {}\nValidation dataset length: {}".format(len(dataset_train), len(dataset_val)))

Train dataset length: 3493
Validation dataset length: 462


In [None]:
from torch.utils.data import DataLoader, RandomSampler, SequentialSampler

# Define the size of each batch
batch_size = 16

# Load training dataset
dataloader_train= DataLoader(
    dataset_train,
    sampler=RandomSampler(dataset_train),
    batch_size=batch_size)

# Load valuation dataset
dataloader_val= DataLoader(
    dataset_val,
    sampler=RandomSampler(dataset_val),
    batch_size=batch_size)


In [None]:
from transformers import BertForSequenceClassification
# Load pre-trained BERT model
model = BertForSequenceClassification.from_pretrained("bert-base-uncased",
                                                      num_labels = 3,
                                                      output_attentions = False,
                                                      output_hidden_states = False)

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

In [None]:
from transformers import AdamW, get_linear_schedule_with_warmup

# Define model optimizer -> Adam
optimizer = AdamW(
    model.parameters(),
    lr = 1e-5, 
    eps=1e-8
)
# Define model scheduler
epochs = 4
scheduler = get_linear_schedule_with_warmup(optimizer,
                                            num_warmup_steps=0,
                                            num_training_steps=len(dataloader_train)*epochs)



In [None]:
import random

# Define random seeds
seed_val = 17
random.seed(seed_val)
np.random.seed(seed_val)
torch.manual_seed(seed_val)
torch.cuda.manual_seed_all(seed_val)

In [None]:
# Define processor type for torch
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)
device

device(type='cuda')

In [None]:
from sklearn.metrics import f1_score, precision_score, recall_score, accuracy_score

# Returns the F1 score computed on the predictions
def f1_score_func(preds, labels):
    preds_flat=np.argmax(preds, axis=1).flatten()
    labels_flat=labels.flatten()
    return f1_score(labels_flat, preds_flat, average='weighted')

# Returns the precision, accuracy and recall score computed on the predictions
def prec_func(preds, labels):
    preds_flat=np.argmax(preds, axis=1).flatten()
    labels_flat=labels.flatten()
    return precision_score(labels_flat, preds_flat, average='weighted')

def recall_func(preds, labels):
    preds_flat=np.argmax(preds, axis=1).flatten()
    labels_flat=labels.flatten()
    return recall_score(labels_flat, preds_flat, average='weighted')
  

def acc_func(preds, labels):
    preds_flat=np.argmax(preds, axis=1).flatten()
    labels_flat=labels.flatten()
    return accuracy_score(labels_flat, preds_flat)

In [None]:
# Evaluates the model using the validation set
def evaluate(dataloader_val):
  model.eval()
  loss_val_total = 0
  predictions, true_vals = [], []

  for batch in dataloader_val:
      batch = tuple(b.to(device) for b in batch)
      inputs = {'input_ids': batch[0],
        'attention_mask': batch[1],
        'labels': batch[2],
        }

      with torch.no_grad():
          outputs = model(**inputs)

      loss = outputs[0]
      logits = outputs[1]
      loss_val_total += loss.item()

      logits = logits.detach().cpu().numpy()
      label_ids = inputs['labels'].cpu().numpy()
      predictions.append(logits)
      true_vals.append(label_ids)

  loss_val_avg = loss_val_total / len(dataloader_val)

  predictions = np.concatenate(predictions, axis=0)
  true_vals = np.concatenate(true_vals, axis=0)

  return loss_val_avg, predictions, true_vals

In [None]:
drive.mount('/content/gdrive') 

for epoch in tqdm(range(1, epochs + 1)):

    model.train()  # model is training

    loss_train_total = 0

    progress_bar = tqdm(dataloader_train, desc='Epoch {:1d}'.format(epoch), leave=False, disable=False)
    for batch in progress_bar:
        model.zero_grad()
        batch = tuple(b.to(device) for b in batch)
        inputs = {'input_ids': batch[0],
                  'attention_mask': batch[1],
                  'labels': batch[2]}
        
        outputs = model(**inputs)

        loss = outputs[0]
        loss_train_total += loss.item()
        loss.backward()  # to backpropagate

        torch.nn.utils.clip_grad_norm_(model.parameters(),
                                      1.0)  # prevents the gradient from being too small or too big

        optimizer.step()
        scheduler.step()
        progress_bar.set_postfix({'training_loss': '{:.3f}'.format(loss.item() / len(batch))})

    
    torch.save(model, path)
    tqdm.write(f'\nEpoch {epoch}/{epochs}')

    loss_train_avg = loss_train_total / len(dataloader_train)
    tqdm.write(f'Training loss: {loss_train_avg}')  # make sure that model is still training

    val_loss, predictions, true_vals = evaluate(dataloader_val)  # to check overtraining (or overfitting)
    val_f1 = f1_score_func(predictions, true_vals)
    val_prec = prec_func(predictions, true_vals)
    val_recall = recall_func(predictions, true_vals)
    val_acc = acc_func(predictions, true_vals)

    tqdm.write(f'Validation loss: {val_loss}')
    tqdm.write(f'F1 Score(weighted) : {val_f1}')
    tqdm.write(f'Prec Score(weighted) : {val_prec}')
    tqdm.write(f'Recall Score(weighted) : {val_recall}')
    tqdm.write(f'Acc Score : {val_acc}')

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


  0%|          | 0/4 [00:00<?, ?it/s]

Epoch 1:   0%|          | 0/219 [00:00<?, ?it/s]


Epoch 1/4
Training loss: 0.30145578717464183
Validation loss: 0.18523628646829005
F1 Score(weighted) : 0.9291521486643437
Prec Score(weighted) : 0.9070294784580498
Recall Score(weighted) : 0.9523809523809523
Acc Score : 0.9523809523809523


  _warn_prf(average, modifier, msg_start, len(result))


Epoch 2:   0%|          | 0/219 [00:00<?, ?it/s]


Epoch 2/4
Training loss: 0.17032412021765359
Validation loss: 0.15954487167041878
F1 Score(weighted) : 0.9343156255173207
Prec Score(weighted) : 0.9544515498962354
Recall Score(weighted) : 0.9545454545454546
Acc Score : 0.9545454545454546


  _warn_prf(average, modifier, msg_start, len(result))


Epoch 3:   0%|          | 0/219 [00:00<?, ?it/s]


Epoch 3/4
Training loss: 0.11636793274713093
Validation loss: 0.15718349173729276
F1 Score(weighted) : 0.9469797178130511
Prec Score(weighted) : 0.9433869899501828
Recall Score(weighted) : 0.9545454545454546
Acc Score : 0.9545454545454546


  _warn_prf(average, modifier, msg_start, len(result))


Epoch 4:   0%|          | 0/219 [00:00<?, ?it/s]


Epoch 4/4
Training loss: 0.08112253150456164
Validation loss: 0.1660958607761382
F1 Score(weighted) : 0.9502993460440269
Prec Score(weighted) : 0.9490458033504391
Recall Score(weighted) : 0.9588744588744589
Acc Score : 0.9588744588744589


  _warn_prf(average, modifier, msg_start, len(result))


## Predict on Test HOMO

In [None]:
data_test = pd.read_excel('homo_test.xlsx')
data_test.to_csv('data.csv')
data_test = pd.read_csv('data.csv')
data_test = data_test.drop(['Unnamed: 0'], axis = 1)
data_test = data_test.rename(columns={'text                    ': 'text'})
data_test = data_test.dropna()
data_test

Unnamed: 0,text
0,Spr....2016 poitan feel happy with my partner ...
1,R u still with ur partner
2,excellent movie..no unnecessary drama or scene...
3,"For those who speak about culture., pre marita..."
4,Best movie and people not understand relations...
...,...
985,Looks like Karthik took advantage on Varun whe...
986,i am really crying pro😢😢😢😭😭😭😭😭😭 ...
987,They may be transgender but don't ever forgot ...
988,It is their own choice. I support them No doub...


In [None]:
# Encode validation dataset using the tokenizer
encoded_data_test = tokenizer.batch_encode_plus(
    data_test.text.values,
    add_special_tokens=True,
    return_attention_mask=True,  
    pad_to_max_length=True,
    max_length=256,
    return_tensors='pt'
)



In [None]:
# Extract IDs, attention masks and labels from validation dataset
input_ids_test = encoded_data_test['input_ids']
attention_masks_test = encoded_data_test['attention_mask']

dataset_test = TensorDataset(input_ids_test, attention_masks_test)
print("Test dataset length: {}".format(len(dataset_test)))

Test dataset length: 990


In [None]:
from torch.utils.data import DataLoader
dataloader_test = DataLoader(dataset_test)

In [None]:
# Evaluates the model using the validation set
def predict(dataset_test):
    predictions = []

    for row in dataset_test:
      row = tuple(r.to(device) for r in row)
      inputs = {'input_ids': row[0],
        'attention_mask': row[1]
        }

      with torch.no_grad():
          outputs = model(**inputs)

      logits = outputs[0]
      logits = logits.detach().cpu().numpy()
      predictions.append(logits)

    return predictions

# Predict values for test dataset
predictions = predict(dataloader_test)

In [None]:
results = []
for i, prediction in enumerate(predictions):
  predicted = np.argmax(prediction, axis=1)[0]
  # print(f"index: {i} -- prediction: {predicted}")
  results.append(predicted)
pred = []
for prediction in results:
  pred.append(prediction)
  
data_test['pred'] = pred
data_test

Unnamed: 0,text,pred
0,Spr....2016 poitan feel happy with my partner ...,0
1,R u still with ur partner,0
2,excellent movie..no unnecessary drama or scene...,0
3,"For those who speak about culture., pre marita...",0
4,Best movie and people not understand relations...,0
...,...,...
985,Looks like Karthik took advantage on Varun whe...,0
986,i am really crying pro😢😢😢😭😭😭😭😭😭 ...,0
987,They may be transgender but don't ever forgot ...,0
988,It is their own choice. I support them No doub...,0


In [None]:
# convrt names
def label_col (row):
  if row['pred'] == 0:#'Non-anti-LGBT+ content':
    return 'Non-anti-LGBT+ content'
  elif row['pred'] == 1: #'Homophobic':
    return 'Homophobic'
  elif row['pred'] == 2:#'Transphobic':
    return #'Transphobic':
    
data_test['pred'] = data_test.apply(lambda row: label_col(row), axis=1)
data_test.to_excel('preds_homo_doubleaugmentation.xlsx', index=False)
from google.colab import files
files.download('preds_homo_doubleaugmentation.xlsx')
data_test

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

Unnamed: 0,text,pred
0,Spr....2016 poitan feel happy with my partner ...,Non-anti-LGBT+ content
1,R u still with ur partner,Non-anti-LGBT+ content
2,excellent movie..no unnecessary drama or scene...,Non-anti-LGBT+ content
3,"For those who speak about culture., pre marita...",Non-anti-LGBT+ content
4,Best movie and people not understand relations...,Non-anti-LGBT+ content
...,...,...
985,Looks like Karthik took advantage on Varun whe...,Non-anti-LGBT+ content
986,i am really crying pro😢😢😢😭😭😭😭😭😭 ...,Non-anti-LGBT+ content
987,They may be transgender but don't ever forgot ...,Non-anti-LGBT+ content
988,It is their own choice. I support them No doub...,Non-anti-LGBT+ content


# **#Phase 3** Adding only top-K non-Hope -> Third model

In [None]:
data = pd.concat([train,sorted_data_test])  
data #3955+200=4155 

Unnamed: 0.1,Unnamed: 0,labels,text
0,0.0,0,"I support her, very smart ponnu"
1,1.0,1,priyadharshini kannan same gender attraction ...
2,2.0,0,Bro u name and phone number (or)mobile number ...
3,3.0,0,experience Thaks bro I love you so much bro ...
4,4.0,0,world is becoming bad day by day....
...,...,...,...
2612,,1,These people don't want justice
1931,,1,Are they gonna pull down all the schools
2861,,1,all them people booing and some of them was wh...
1943,,1,A nation of opioid and meth addicts hearing vo...


## New Training Phase

In [None]:
model_save_name = 'Homo_EnhancedLess_lt.pt'
path = F"/content/gdrive/My Drive/{model_save_name}" 

In [None]:
from sklearn.model_selection import train_test_split

# Split dataset in traning and validation(test)
X_train, X_val, Y_train, Y_val = train_test_split(
    data.index.values,
    data.labels.values,
    test_size=0.10,
    random_state=17,
    stratify=data.labels.values
)

In [None]:
# Check datasets composition
data['data_type'] = ['not_set'] * data.shape[0]
data.loc[X_train, 'data_type'] = 'train'
data.loc[X_val, 'data_type'] = 'val'
data.groupby(['labels', 'data_type']).count()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 0,text
labels,data_type,Unnamed: 2_level_1,Unnamed: 3_level_1
0,train,3351,3351
0,val,381,381
1,train,193,363
1,val,22,52
2,train,7,7
2,val,1,1


In [None]:
data = data.dropna()

In [None]:
# Encode training dataset using the tokenizer
encoded_data_train = tokenizer.batch_encode_plus(
    data[data.data_type == 'train'].text.values,
    add_special_tokens=True,
    return_attention_mask=True, 
    pad_to_max_length=True,
    max_length=256,
    return_tensors='pt'
)

# Encode validation dataset using the tokenizer
encoded_data_val = tokenizer.batch_encode_plus(
    data[data.data_type == 'val'].text.values,
    add_special_tokens=True,
    return_attention_mask=True,  
    pad_to_max_length=True,
    max_length=256,
    return_tensors='pt'
)



In [None]:
# Extract IDs, attention masks and labels from training dataset
input_ids_train = encoded_data_train['input_ids']
attention_masks_train = encoded_data_train['attention_mask']
labels_train = torch.tensor(data[data.data_type == 'train'].labels.values)
labels_train

tensor([0, 1, 0,  ..., 0, 0, 0])

In [None]:
# Extract IDs, attention masks and labels from validation dataset
input_ids_val = encoded_data_val['input_ids']
attention_masks_val = encoded_data_val['attention_mask']
labels_val = torch.tensor(data[data.data_type == 'val'].labels.values)

In [None]:
# Create train and validation dataset from extracted features
dataset_train = TensorDataset(input_ids_train, attention_masks_train, labels_train)
dataset_val = TensorDataset(input_ids_val, attention_masks_val, labels_val)
print("Train dataset length: {}\nValidation dataset length: {}".format(len(dataset_train), len(dataset_val)))

Train dataset length: 3551
Validation dataset length: 404


In [None]:
from torch.utils.data import DataLoader, RandomSampler, SequentialSampler

# Define the size of each batch
batch_size = 16

# Load training dataset
dataloader_train= DataLoader(
    dataset_train,
    sampler=RandomSampler(dataset_train),
    batch_size=batch_size)

# Load valuation dataset
dataloader_val= DataLoader(
    dataset_val,
    sampler=RandomSampler(dataset_val),
    batch_size=batch_size)


In [None]:
from transformers import BertForSequenceClassification
# Load pre-trained BERT model
model = BertForSequenceClassification.from_pretrained("bert-base-uncased",
                                                      num_labels = 3,
                                                      output_attentions = False,
                                                      output_hidden_states = False)

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

In [None]:
from transformers import AdamW, get_linear_schedule_with_warmup

# Define model optimizer -> Adam
optimizer = AdamW(
    model.parameters(),
    lr = 1e-5, 
    eps=1e-8
)
# Define model scheduler
epochs = 4
scheduler = get_linear_schedule_with_warmup(optimizer,
                                            num_warmup_steps=0,
                                            num_training_steps=len(dataloader_train)*epochs)



In [None]:
import random

# Define random seeds
seed_val = 17
random.seed(seed_val)
np.random.seed(seed_val)
torch.manual_seed(seed_val)
torch.cuda.manual_seed_all(seed_val)

In [None]:
# Define processor type for torch
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)
device

device(type='cuda')

In [None]:
for epoch in tqdm(range(1, epochs + 1)):

    model.train()  # model is training

    loss_train_total = 0

    progress_bar = tqdm(dataloader_train, desc='Epoch {:1d}'.format(epoch), leave=False, disable=False)
    for batch in progress_bar:
        model.zero_grad()
        batch = tuple(b.to(device) for b in batch)
        inputs = {'input_ids': batch[0],
                  'attention_mask': batch[1],
                  'labels': batch[2]}
        
        outputs = model(**inputs)

        loss = outputs[0]
        loss_train_total += loss.item()
        loss.backward()  # to backpropagate

        torch.nn.utils.clip_grad_norm_(model.parameters(),
                                      1.0)  # prevents the gradient from being too small or too big

        optimizer.step()
        scheduler.step()
        progress_bar.set_postfix({'training_loss': '{:.3f}'.format(loss.item() / len(batch))})

    
    torch.save(model, path)
    tqdm.write(f'\nEpoch {epoch}/{epochs}')

    loss_train_avg = loss_train_total / len(dataloader_train)
    tqdm.write(f'Training loss: {loss_train_avg}')  # make sure that model is still training

    val_loss, predictions, true_vals = evaluate(dataloader_val)  # to check overtraining (or overfitting)
    val_f1 = f1_score_func(predictions, true_vals)
    val_prec = prec_func(predictions, true_vals)
    val_recall = recall_func(predictions, true_vals)
    val_acc = acc_func(predictions, true_vals)

    tqdm.write(f'Validation loss: {val_loss}')
    tqdm.write(f'F1 Score(weighted) : {val_f1}')
    tqdm.write(f'Prec Score(weighted) : {val_prec}')
    tqdm.write(f'Recall Score(weighted) : {val_recall}')
    tqdm.write(f'Acc Score : {val_acc}')

  0%|          | 0/4 [00:00<?, ?it/s]

Epoch 1:   0%|          | 0/222 [00:00<?, ?it/s]


Epoch 1/4
Training loss: 0.28866802285182047
Validation loss: 0.2327439164241346
F1 Score(weighted) : 0.9154379769187111
Prec Score(weighted) : 0.8893797176747378
Recall Score(weighted) : 0.943069306930693
Acc Score : 0.943069306930693


  _warn_prf(average, modifier, msg_start, len(result))


Epoch 2:   0%|          | 0/222 [00:00<?, ?it/s]


Epoch 2/4
Training loss: 0.15686382398193888
Validation loss: 0.19165030940292546
F1 Score(weighted) : 0.9285711263434034
Prec Score(weighted) : 0.9284709297997469
Recall Score(weighted) : 0.9455445544554455
Acc Score : 0.9455445544554455


  _warn_prf(average, modifier, msg_start, len(result))


Epoch 3:   0%|          | 0/222 [00:00<?, ?it/s]


Epoch 3/4
Training loss: 0.10374869877862669
Validation loss: 0.19849143178166392
F1 Score(weighted) : 0.9437731555413607
Prec Score(weighted) : 0.9428908880735788
Recall Score(weighted) : 0.9529702970297029
Acc Score : 0.9529702970297029


  _warn_prf(average, modifier, msg_start, len(result))


Epoch 4:   0%|          | 0/222 [00:00<?, ?it/s]


Epoch 4/4
Training loss: 0.07638257932912156
Validation loss: 0.20901718714202827
F1 Score(weighted) : 0.9437731555413607
Prec Score(weighted) : 0.9428908880735788
Recall Score(weighted) : 0.9529702970297029
Acc Score : 0.9529702970297029


  _warn_prf(average, modifier, msg_start, len(result))


## Predict on Test HOMO

In [None]:
data_test = pd.read_excel('homo_test.xlsx')
data_test.to_csv('data.csv')
data_test = pd.read_csv('data.csv')
data_test = data_test.drop(['Unnamed: 0'], axis = 1)
data_test = data_test.rename(columns={'text                    ': 'text'})
data_test = data_test.dropna()
data_test

Unnamed: 0,text
0,Spr....2016 poitan feel happy with my partner ...
1,R u still with ur partner
2,excellent movie..no unnecessary drama or scene...
3,"For those who speak about culture., pre marita..."
4,Best movie and people not understand relations...
...,...
985,Looks like Karthik took advantage on Varun whe...
986,i am really crying pro😢😢😢😭😭😭😭😭😭 ...
987,They may be transgender but don't ever forgot ...
988,It is their own choice. I support them No doub...


In [None]:
# Encode validation dataset using the tokenizer
encoded_data_test = tokenizer.batch_encode_plus(
    data_test.text.values,
    add_special_tokens=True,
    return_attention_mask=True,  
    pad_to_max_length=True,
    max_length=256,
    return_tensors='pt'
)



In [None]:
# Extract IDs, attention masks and labels from validation dataset
input_ids_test = encoded_data_test['input_ids']
attention_masks_test = encoded_data_test['attention_mask']

dataset_test = TensorDataset(input_ids_test, attention_masks_test)
print("Test dataset length: {}".format(len(dataset_test)))

Test dataset length: 990


In [None]:
from torch.utils.data import DataLoader
dataloader_test = DataLoader(dataset_test)

In [None]:
# Evaluates the model using the validation set
def predict(dataset_test):
    predictions = []

    for row in dataset_test:
      row = tuple(r.to(device) for r in row)
      inputs = {'input_ids': row[0],
        'attention_mask': row[1]
        }

      with torch.no_grad():
          outputs = model(**inputs)

      logits = outputs[0]
      logits = logits.detach().cpu().numpy()
      predictions.append(logits)

    return predictions

# Predict values for test dataset
predictions = predict(dataloader_test)

In [None]:
results = []
for i, prediction in enumerate(predictions):
  predicted = np.argmax(prediction, axis=1)[0]
  # print(f"index: {i} -- prediction: {predicted}")
  results.append(predicted)
pred = []
for prediction in results:
  pred.append(prediction)
data_test['pred'] = pred
data_test

Unnamed: 0,text,pred
0,Spr....2016 poitan feel happy with my partner ...,0
1,R u still with ur partner,0
2,excellent movie..no unnecessary drama or scene...,0
3,"For those who speak about culture., pre marita...",0
4,Best movie and people not understand relations...,0
...,...,...
985,Looks like Karthik took advantage on Varun whe...,0
986,i am really crying pro😢😢😢😭😭😭😭😭😭 ...,0
987,They may be transgender but don't ever forgot ...,0
988,It is their own choice. I support them No doub...,0


In [None]:
def label_col (row):
  if row['pred'] == 0:#'Non-anti-LGBT+ content':
    return 'Non-anti-LGBT+ content'
  elif row['pred'] == 1: #'Homophobic':
    return 'Homophobic'
  elif row['pred'] == 2:#'Transphobic':
    return #'Transphobic':
data_test['pred'] = data_test.apply(lambda row: label_col(row), axis=1)
data_test.to_excel('preds_homo_augmentation.xlsx', index=False)
from google.colab import files
files.download('preds_homo_augmentation.xlsx')
data_test

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

Unnamed: 0,text,pred
0,Spr....2016 poitan feel happy with my partner ...,Non-anti-LGBT+ content
1,R u still with ur partner,Non-anti-LGBT+ content
2,excellent movie..no unnecessary drama or scene...,Non-anti-LGBT+ content
3,"For those who speak about culture., pre marita...",Non-anti-LGBT+ content
4,Best movie and people not understand relations...,Non-anti-LGBT+ content
...,...,...
985,Looks like Karthik took advantage on Varun whe...,Non-anti-LGBT+ content
986,i am really crying pro😢😢😢😭😭😭😭😭😭 ...,Non-anti-LGBT+ content
987,They may be transgender but don't ever forgot ...,Non-anti-LGBT+ content
988,It is their own choice. I support them No doub...,Non-anti-LGBT+ content
