In [None]:
import pandas as pd
import os
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from tqdm.notebook import tqdm
import csv

%matplotlib inline

In [None]:
# display all pandas columns
pd.set_option('display.max_columns', None)

### Open and Examine Data

In [None]:
input_csv_path = './input_ai_data.csv'

In [None]:
def load_input_csv_to_dataframe(input_csv_path: str, max_columns: int = 30) -> pd.DataFrame:
    """Read a comma-separated file into a DataFrame of strings.

    The first CSV row supplies the column names; every later row becomes a
    data row. Only the first ``max_columns`` fields of each row are kept.

    Args:
        input_csv_path: Path of the CSV file to read.
        max_columns: Number of leading fields kept per row (default 30).

    Returns:
        DataFrame whose columns are taken from the first row. Rows shorter
        than the widest row are padded with ``None`` by pandas. An empty
        file yields an empty DataFrame.
    """
    # newline="" hands line endings to the csv module untouched, so line
    # breaks embedded in quoted fields are preserved exactly.
    with open(input_csv_path, "r", newline="") as f:
        reader = csv.reader(f, delimiter=",")
        all_lines = [line[:max_columns] for line in tqdm(reader)]

    # Nothing to promote to a header: return early instead of failing on iloc[0].
    if not all_lines:
        return pd.DataFrame()

    # Promote the first row to the header and keep the remaining rows as data.
    df = pd.DataFrame(all_lines)
    df.columns = df.iloc[0]
    df = df[1:]

    return df

df = load_input_csv_to_dataframe(input_csv_path)

In [None]:
# keep only the columns relevant to remark rating, then drop exact duplicate rows
remark_columns = [
    'Category ID',
    'Player Level',
    'User Note',
    'Occurrence Remarks',
    'Expert Remark Rating',
    'Expert Level',
]
df = df[remark_columns].drop_duplicates()
print(df.shape)

In [None]:
# take all before taxon tree (what was added post processing)
# Missing cells (None, created when a CSV row is shorter than the header) are
# treated as empty text instead of raising AttributeError on .split.
df['User Note'] = df['User Note'].fillna('').str.split('Taxon Tree', n=1).str[0]

In [None]:
# process user note; join together all users' comments
# A space separates the two texts so the last word of the note and the first
# word of the remarks stay separate tokens; missing cells count as empty text,
# and the final strip keeps "both empty" equal to '' for the filter that follows.
user_note_text = df['User Note'].fillna('')
occurrence_remarks_text = df['Occurrence Remarks'].fillna('')
df['Joint User Note'] = (user_note_text + ' ' + occurrence_remarks_text).str.strip()
df = df.drop(['User Note', 'Occurrence Remarks'], axis=1)

In [None]:
# keep only rows whose joint note has real content
# (missing values and whitespace-only notes are treated as empty too)
df = df[df['Joint User Note'].fillna('').str.strip() != '']

In [None]:
# drop rows whose expert remark rating is missing or blank
# (the CSV is read as text, so "missing" shows up as '' or None rather than NaN)
rating_text = df['Expert Remark Rating'].fillna('').str.strip()
df = df[rating_text != '']

In [None]:
# Bar chart of how many rows carry each expert rating value
fig, ax = plt.subplots()
sns.countplot(x='Expert Remark Rating', data=df, ax=ax)
ax.set_title('Expert Remark Distribution');

In [None]:
df = df.reset_index(drop=True)

In [None]:
# turn expert remark rating into an int (text -> float -> truncated integer)
df['Expert Remark Rating'] = df['Expert Remark Rating'].astype(float).astype('int64')

### Preprocess text
https://machinelearningknowledge.ai/11-techniques-of-text-preprocessing-using-nltk-in-python/

In [None]:
df.head(1)

In [None]:
#!pip install nltk 

In [None]:
import re
import nltk
from nltk import word_tokenize
from nltk.tokenize import RegexpTokenizer
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk import word_tokenize,pos_tag

# Fetch the NLTK resources used by the tokenizer, stopword list, POS tagger and lemmatizer below
for nltk_resource in ('punkt', 'stopwords', 'averaged_perceptron_tagger', 'wordnet', 'omw-1.4'):
    nltk.download(nltk_resource)

# English stopword list consulted by remove_stopwords
en_stopwords = stopwords.words('english')


def remove_whitespace(text):
    """Collapse every run of whitespace in ``text`` to one space and trim both ends."""
    words = text.split()
    return " ".join(words)


def remove_stopwords(text):
    """Return the tokens of ``text`` that are not English stopwords.

    Args:
        text: Iterable of string tokens.

    Returns:
        List of the tokens, in their original order and casing, whose
        lower-cased form is not in ``en_stopwords``.
    """
    # The stopword list is lower-case, so compare case-insensitively; otherwise
    # capitalised stopwords such as "The" would be kept.
    # A set makes each membership test constant-time.
    stopword_set = set(en_stopwords)
    return [token for token in text if token.lower() not in stopword_set]


# Compiled once at definition time instead of building a tokenizer on every call.
_WORD_PATTERN = re.compile(r"\w+")


def remove_punct(text):
    """Return the word tokens of ``text`` with punctuation removed.

    Args:
        text: Iterable of string tokens.

    Returns:
        List of maximal runs of word characters (``\\w+``) found in the
        space-joined tokens; a token such as "it's" therefore splits into
        "it" and "s".
    """
    return _WORD_PATTERN.findall(' '.join(text))


def lemmatization(text):
    """Lemmatize each token using its part-of-speech tag.

    Args:
        text: List of string tokens.

    Returns:
        List with one lemma per input token, in the same order.
    """
    # First letter of the tag -> WordNet POS code. Penn Treebank adjective tags
    # (JJ, JJR, JJS) start with "J" while WordNet calls adjectives "a", so "j"
    # has to be mapped explicitly; any other tag falls back to noun.
    tag_initial_to_wordnet = {'j': 'a', 'a': 'a', 'r': 'r', 'n': 'n', 'v': 'v'}
    wordnet = WordNetLemmatizer()
    return [
        wordnet.lemmatize(token, tag_initial_to_wordnet.get(tag[:1].lower(), 'n'))
        for token, tag in pos_tag(text)
    ]


def remove_tag(text):
    """Join the tokens with spaces and delete every ``<...>`` span (shortest match)."""
    joined = ' '.join(text)
    return re.sub('<.*?>', r'', joined)


def remove_urls(text):
    """Delete http(s):// links and www.-prefixed links from ``text``."""
    return re.sub(r'https?://\S+|www\.\S+', r'', text)

In [None]:
# take subset of 10000 first; the fixed seed makes the sample (and every file
# and model derived from it) reproducible across kernel restarts
subset_df = df.sample(10_000, random_state=2020).copy()

In [None]:
tqdm.pandas()

# (progress message, transform) pairs, applied to 'Joint User Note' in this order.
# NOTE(review): URL removal runs after punctuation has been stripped, so by then
# "://" and "." are already gone and the URL pattern probably never matches.
# Moving it earlier would need the same change in the test-set preprocessing cell.
preprocessing_steps = [
    ('Removing Whitespace', remove_whitespace),
    ('Tokenizing words', word_tokenize),
    ('Removing stopwords', lambda x: remove_stopwords(x)),
    ('Removing punctuation', lambda x: " ".join(remove_punct(x))),
    ('Removing URLs', remove_urls),
    ('Lemmatizing', lambda x: " ".join(lemmatization(x.split()))),
]

for message, transform in preprocessing_steps:
    print(message)
    subset_df['Joint User Note'] = subset_df['Joint User Note'].progress_apply(transform)

In [None]:
# create and save text file: one "__label__<rating> <note>" line per row
with open('./remarks.txt', 'w+') as wf:
    for rating, note in zip(subset_df['Expert Remark Rating'], subset_df['Joint User Note']):
        wf.write(f"__label__{rating} {note}\n")

In [None]:
# for total of 10000 data points: the first 9000 lines of remarks.txt become the
# training file and the last 1000 lines the validation file
# NOTE(review): the counts are hard-coded; if remarks.txt ever holds fewer than
# 10000 lines the two files would share lines.
!head -n 9000 remarks.txt > remarks.train
!tail -n 1000 remarks.txt > remarks.valid

### Train Using FastText
https://fasttext.cc/docs/en/supervised-tutorial.html

In [None]:
import fasttext

In [None]:
model = fasttext.train_supervised(input="remarks.train", lr=1, epoch=100, wordNgrams=3)

In [None]:
#model.save_model("model_remarks.bin") #load_model does the inverse

# model.test("remarks.valid") # precision, recall at 1; how often highest ranked document contains correct answer
# Precision and recall at 5. Every line written to remarks.txt carries exactly
# one label, so (assuming the usual precision@k = correct predictions / k)
# precision@5 cannot exceed 1/5 = 0.2 here; recall@5 is the more informative value.
model.test("remarks.valid", k=5)

In [None]:
model.predict("what an amazing kangaroo", k=-1)

In [None]:
model.test("remarks.valid", k=-1)

### Results are not that great; let's try BERT
https://colab.research.google.com/github/rap12391/transformers_multilabel_toxic/blob/master/toxic_multilabel.ipynb#scrollTo=0DF3ddjej5vd

In [None]:
import tensorflow as tf
import torch
from torch.nn import BCEWithLogitsLoss, BCELoss
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.metrics import classification_report, confusion_matrix, multilabel_confusion_matrix, f1_score, accuracy_score
import pickle
from tqdm import tqdm, trange
from ast import literal_eval

In [None]:
# Fail fast when TensorFlow does not report a GPU at '/device:GPU:0'.
# NOTE(review): in the cells visible here TensorFlow is used only for this check,
# while training runs through PyTorch — confirm whether a torch-based check would do.
device_name = tf.test.gpu_device_name()
if device_name != '/device:GPU:0':
    raise SystemError('GPU device not found')
print('Found GPU at: {}'.format(device_name))

In [None]:
# Use the first CUDA device when PyTorch can see one, otherwise fall back to CPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
n_gpu = torch.cuda.device_count()
# Show the GPU name only when there is a GPU; get_device_name(0) raises on a
# CPU-only machine, which would defeat the fallback above.
torch.cuda.get_device_name(0) if n_gpu > 0 else "cpu"

In [None]:
from transformers import BertTokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True) 

In [None]:
from sklearn.preprocessing import OneHotEncoder
enc = OneHotEncoder()
enc.fit(subset_df['Expert Remark Rating'].values.reshape(-1,1))

In [None]:
# create list of labels and comments
labels = list(enc.transform(subset_df['Expert Remark Rating'].values.reshape(-1,1)).toarray())
comments = list(subset_df['Joint User Note'].values)

In [None]:
max_length = 100  # every comment is padded or truncated to this many tokens
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True) # tokenizer
# padding='max_length' replaces the deprecated pad_to_max_length=True, and
# truncation=True states explicitly that longer comments are cut at max_length,
# so every encoded row has the same length.
encodings = tokenizer.batch_encode_plus(
    comments,
    max_length=max_length,
    padding='max_length',
    truncation=True,
)
print('tokenizer outputs: ', encodings.keys())

In [None]:
input_ids = encodings['input_ids'] # tokenized and encoded sentences
token_type_ids = encodings['token_type_ids'] # token type ids
attention_masks = encodings['attention_mask'] # attention masks

In [None]:
# Hold out 10% of the encoded examples for validation, stratified on the labels.
# train_test_split returns a (train, validation) pair per input, in input order.
(
    train_inputs, validation_inputs,
    train_labels, validation_labels,
    train_token_types, validation_token_types,
    train_masks, validation_masks,
) = train_test_split(
    input_ids, labels, token_type_ids, attention_masks,
    random_state=2020, test_size=0.10, stratify=labels,
)

# The model consumes torch tensors, so convert every split
train_inputs, train_labels, train_masks, train_token_types = (
    torch.tensor(part)
    for part in (train_inputs, train_labels, train_masks, train_token_types)
)

validation_inputs, validation_labels, validation_masks, validation_token_types = (
    torch.tensor(part)
    for part in (validation_inputs, validation_labels, validation_masks, validation_token_types)
)

In [None]:
# Mini-batch size used by both dataloaders below; 32 is used here to avoid memory issues.
batch_size = 32

# Wrap the tensors in torch DataLoaders so the loops below receive one mini-batch at a time.
# Each dataset yields (input ids, attention mask, labels, token type ids) in that order.

# Training batches are drawn in random order
train_data = TensorDataset(train_inputs, train_masks, train_labels, train_token_types)
train_sampler = RandomSampler(train_data)
train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=batch_size)

# Validation batches are read in their stored order
validation_data = TensorDataset(validation_inputs, validation_masks, validation_labels, validation_token_types)
validation_sampler = SequentialSampler(validation_data)
validation_dataloader = DataLoader(validation_data, sampler=validation_sampler, batch_size=batch_size)

In [None]:
from transformers import BertForSequenceClassification

# Load model, the pretrained model will include a single linear classification layer on top for classification.
# The output size follows the categories the one-hot encoder actually learned,
# so the logits always line up with the one-hot label vectors.
model = BertForSequenceClassification.from_pretrained(
    "bert-base-uncased",
    num_labels=len(enc.categories_[0]),
)
# Move to the device selected earlier (GPU when available, CPU otherwise)
# instead of unconditionally requiring CUDA.
model.to(device)

In [None]:
# setting custom optimization parameters. You may implement a scheduler here as well.
# Two parameter groups: biases and LayerNorm weights are excluded from weight
# decay, everything else decays at 0.01.
# The per-group key must be 'weight_decay' (the name AdamW reads); any other
# key is silently ignored and the optimizer default is used for every group.
param_optimizer = list(model.named_parameters())
no_decay = ['bias', 'LayerNorm.weight']
optimizer_grouped_parameters = [
    {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
     'weight_decay': 0.01},
    {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
     'weight_decay': 0.0}
]

In [None]:
# transformers.AdamW is deprecated in favour of PyTorch's own implementation.
# eps=1e-6 and weight_decay=0.0 reproduce the defaults of the optimizer used
# before; a 'weight_decay' entry in a parameter group still overrides the default.
optimizer = torch.optim.AdamW(optimizer_grouped_parameters, lr=2e-5, eps=1e-6, weight_decay=0.0)
# optimizer = torch.optim.AdamW(model.parameters(),lr=2e-5)  # Default optimization

In [None]:
# Take the label count from the model itself so the reshapes below always
# match the width of the logits.
num_labels = model.config.num_labels

# Per-step training loss, kept for plotting
train_loss_set = []

# Number of training epochs (authors recommend between 2 and 4)
epochs = 3

# The loss module holds no per-step state, so build it once outside the loops.
# Sigmoid + binary cross-entropy is applied to each label column independently.
loss_func = BCEWithLogitsLoss()

# trange is a tqdm wrapper around the normal python range
for _ in trange(epochs, desc="Epoch"):

  # ---------------------------------------------------------------- Training

  # Set our model to training mode (as opposed to evaluation mode)
  model.train()

  # Tracking variables
  tr_loss = 0  # running loss
  nb_tr_examples, nb_tr_steps = 0, 0

  # Train the data for one epoch
  for step, batch in enumerate(train_dataloader):
    # Add batch to the selected device
    batch = tuple(t.to(device) for t in batch)
    # Unpack the inputs from our dataloader
    b_input_ids, b_input_mask, b_labels, b_token_types = batch
    # Clear out the gradients (by default they accumulate)
    optimizer.zero_grad()

    # Forward pass without labels; the loss is computed here rather than by the model
    outputs = model(b_input_ids, token_type_ids=None, attention_mask=b_input_mask)
    logits = outputs[0]
    loss = loss_func(
        logits.view(-1, num_labels),
        b_labels.type_as(logits).view(-1, num_labels)  # labels converted to the logits' dtype
    )
    step_loss = loss.item()
    train_loss_set.append(step_loss)

    # Backward pass
    loss.backward()
    # Update parameters and take a step using the computed gradient
    optimizer.step()
    # Update tracking variables
    tr_loss += step_loss
    nb_tr_examples += b_input_ids.size(0)
    nb_tr_steps += 1

  print("Train loss: {}".format(tr_loss/nb_tr_steps))

  # -------------------------------------------------------------- Validation

  # Put model in evaluation mode to evaluate on the validation set
  model.eval()

  # Variables to gather full output
  logit_preds,true_labels,pred_labels,tokenized_texts = [],[],[],[]

  # Predict
  for i, batch in enumerate(validation_dataloader):
    batch = tuple(t.to(device) for t in batch)
    # Unpack the inputs from our dataloader
    b_input_ids, b_input_mask, b_labels, b_token_types = batch
    with torch.no_grad():
      # Forward pass
      outs = model(b_input_ids, token_type_ids=None, attention_mask=b_input_mask)
      b_logit_pred = outs[0]
      pred_label = torch.sigmoid(b_logit_pred)

      b_logit_pred = b_logit_pred.detach().cpu().numpy()
      pred_label = pred_label.to('cpu').numpy()
      b_labels = b_labels.to('cpu').numpy()

    # Keep a CPU copy so the batch's device memory is not held for the whole epoch
    tokenized_texts.append(b_input_ids.cpu())
    logit_preds.append(b_logit_pred)
    true_labels.append(b_labels)
    pred_labels.append(pred_label)

  # Flatten outputs: one entry per example instead of one per batch
  pred_labels = [item for sublist in pred_labels for item in sublist]
  true_labels = [item for sublist in true_labels for item in sublist]

  # Calculate Accuracy: a label counts as predicted when its sigmoid score exceeds the threshold
  threshold = 0.50
  pred_bools = [pl>threshold for pl in pred_labels]
  true_bools = [tl==1 for tl in true_labels]
  val_f1_accuracy = f1_score(true_bools,pred_bools,average='micro')*100
  val_flat_accuracy = accuracy_score(true_bools, pred_bools)*100

  print('F1 Validation Accuracy: ', val_f1_accuracy)
  print('Flat Validation Accuracy: ', val_flat_accuracy)

In [None]:
# Draw the test set only from rows that were NOT sampled into subset_df, so no
# example used for training/validation is scored again as "test" data.
held_out_df = df.drop(index=subset_df.index)
# take subset of (up to) 10000 to test; the seed makes the draw reproducible
test_df = held_out_df.sample(n=min(10_000, len(held_out_df)), random_state=2020).copy()

# Same preprocessing steps, in the same order, as applied to subset_df
test_df['Joint User Note'] = test_df['Joint User Note'].progress_apply(remove_whitespace)
test_df['Joint User Note'] = test_df['Joint User Note'].progress_apply(word_tokenize)
test_df['Joint User Note'] = test_df['Joint User Note'].progress_apply(lambda x: remove_stopwords(x))
test_df['Joint User Note'] = test_df['Joint User Note'].progress_apply(lambda x: " ".join(remove_punct(x)))
test_df['Joint User Note'] = test_df['Joint User Note'].progress_apply(remove_urls)
test_df['Joint User Note'] = test_df['Joint User Note'].progress_apply(lambda x: " ".join(lemmatization(x.split())))

In [None]:
test_labels = list(enc.transform(test_df['Expert Remark Rating'].values.reshape(-1,1)).toarray())
test_comments = list(test_df['Joint User Note'].values)

In [None]:
# Encoding input data
# padding='max_length' replaces the deprecated pad_to_max_length=True, and
# truncation=True states explicitly that longer comments are cut at max_length.
test_encodings = tokenizer.batch_encode_plus(
    test_comments,
    max_length=max_length,
    padding='max_length',
    truncation=True,
)
test_input_ids = test_encodings['input_ids']
test_token_type_ids = test_encodings['token_type_ids']
test_attention_masks = test_encodings['attention_mask']

In [None]:
# Make tensors out of data
test_inputs = torch.tensor(test_input_ids)
test_labels = torch.tensor(test_labels)
test_masks = torch.tensor(test_attention_masks)
test_token_types = torch.tensor(test_token_type_ids)
# Create test dataloader
test_data = TensorDataset(test_inputs, test_masks, test_labels, test_token_types)
test_sampler = SequentialSampler(test_data)
test_dataloader = DataLoader(test_data, sampler=test_sampler, batch_size=batch_size)
# Save test dataloader
#torch.save(test_dataloader,'test_data_loader')

In [None]:
# Test

# Put model in evaluation mode before scoring the test set
model.eval()

#track variables
logit_preds,true_labels,pred_labels,tokenized_texts = [],[],[],[]

# Predict
for i, batch in enumerate(test_dataloader):
  batch = tuple(t.to(device) for t in batch)
  # Unpack the inputs from our dataloader
  b_input_ids, b_input_mask, b_labels, b_token_types = batch
  with torch.no_grad():
    # Forward pass
    outs = model(b_input_ids, token_type_ids=None, attention_mask=b_input_mask)
    b_logit_pred = outs[0]
    pred_label = torch.sigmoid(b_logit_pred)

    b_logit_pred = b_logit_pred.detach().cpu().numpy()
    pred_label = pred_label.to('cpu').numpy()
    b_labels = b_labels.to('cpu').numpy()

  # Keep a CPU copy of the input ids (they are decoded back to text later) so
  # device memory is not held for every batch of the test set.
  tokenized_texts.append(b_input_ids.cpu())
  logit_preds.append(b_logit_pred)
  true_labels.append(b_labels)
  pred_labels.append(pred_label)

# Flatten outputs: one entry per example instead of one per batch
tokenized_texts = [item for sublist in tokenized_texts for item in sublist]
pred_labels = [item for sublist in pred_labels for item in sublist]
true_labels = [item for sublist in true_labels for item in sublist]
# Converting flattened binary values to boolean values
true_bools = [tl==1 for tl in true_labels]

In [None]:
str_target_labels = [str(e) for e in enc.categories_[0]]
pred_bools = [pl>0.50 for pl in pred_labels] #boolean output after thresholding

# Print and save classification report
print('Test F1 Accuracy: ', f1_score(true_bools, pred_bools,average='micro'))
print('Test Flat Accuracy: ', accuracy_score(true_bools, pred_bools),'\n')
clf_report = classification_report(true_bools,pred_bools,target_names=str_target_labels)
# The report is already a string: write it as plain text so the .txt file is
# readable, and use a context manager so the handle is always closed.
with open('classification_report.txt', 'w') as report_file:
    report_file.write(clf_report)
print(clf_report)

In [None]:
# Column index of the one-hot vector -> label name; built from the label list
# itself so it stays correct for any number of rating categories.
idx2label = dict(enumerate(str_target_labels))
print(idx2label)

In [None]:
# For every example, list the column positions that are True in its boolean
# label vector, so idx2label can turn them into label names
true_label_idxs = [np.where(vals)[0].flatten().tolist() for vals in true_bools]
pred_label_idxs = [np.where(vals)[0].flatten().tolist() for vals in pred_bools]

In [None]:
# Translate each list of label indices into label names via idx2label;
# an example with no indices maps to an empty list
true_label_texts = [[idx2label[val] for val in vals] for vals in true_label_idxs]
pred_label_texts = [[idx2label[val] for val in vals] for vals in pred_label_idxs]

In [None]:
# Decoding input ids to comment text
comment_texts = [tokenizer.decode(text,skip_special_tokens=True,clean_up_tokenization_spaces=False) for text in tokenized_texts]

In [None]:
# Converting lists to df
comparisons_df = pd.DataFrame({'comment_text': comment_texts, 'true_labels': true_label_texts, 'pred_labels':pred_label_texts})
comparisons_df.to_csv('comparisons.csv')
comparisons_df.head()