In [None]:
import pandas as pd
import os
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from tqdm.notebook import tqdm
import csv

%matplotlib inline

In [None]:
# display all pandas columns
pd.set_option('display.max_columns', None)

### Open and Examine Data

In [None]:
input_csv_path = './input_ai_data.csv'

In [None]:
def load_input_csv_to_dataframe(input_csv_path: str, max_columns: int = 31) -> pd.DataFrame:
    """Load the raw CSV export into a DataFrame of strings.

    The file is parsed with the ``csv`` module, every row is truncated to its
    first ``max_columns`` fields, and the first row is used as the header.
    No type conversion is done: every cell stays a string.

    Args:
        input_csv_path: Path of the CSV file to read.
        max_columns: Number of leading fields kept from each row. Defaults to
            31, the value that used to be hard-coded.

    Returns:
        A DataFrame whose column labels come from the header row. The header
        row itself is dropped, so the index of the first data row is 1.

    Raises:
        ValueError: If the file contains no rows at all (not even a header).
    """
    # newline="" is what the csv module requires for file objects; without it
    # line breaks inside quoted fields are not interpreted correctly.
    with open(input_csv_path, "r", newline="") as f:
        reader = csv.reader(f, delimiter=",")
        all_lines = [line[:max_columns] for line in tqdm(reader)]

    if not all_lines:
        raise ValueError(f"{input_csv_path} is empty; expected at least a header row")

    # first row becomes the header, the remaining rows are the data
    df = pd.DataFrame(all_lines)
    df.columns = df.iloc[0]
    df = df[1:]

    return df

df = load_input_csv_to_dataframe(input_csv_path)

In [None]:
# keep only the columns relevant to the remark rating
df = df[['Category ID', 'Occurrence Remarks', 'Expert Field Note Rating', 'Expert Remark Rating']]
# drop rows that are exact duplicates across these four columns
df = df.drop_duplicates()
print(df.shape)

In [None]:
df.head(2)

In [None]:
df.sample(1)['Occurrence Remarks'].iloc[0]

In [None]:
# drop rows whose occurrence remark is the empty string
# (all values are still raw strings read from the CSV at this point)
df = df[df['Occurrence Remarks'] != '']
print(df.shape)

In [None]:
# drop rows whose expert field note rating is missing (empty string) or is '0.0';
# ratings are still strings here, hence the string comparisons
# NOTE(review): 'Expert Remark Rating' is not filtered the same way - confirm it
# never contains '' before the int(float(...)) conversion further down.
df = df[df['Expert Field Note Rating'] != '']
df = df[df['Expert Field Note Rating'] != '0.0']


In [None]:
df['Expert Field Note Rating'].value_counts()

In [None]:
# really not that many data points to go off of..
sns.countplot(x='Expert Field Note Rating', data=df)
plt.title('Expert Field Note Distribution');

In [None]:
# same count plot as above, but for the second label column ('Expert Remark Rating')
sns.countplot(x='Expert Remark Rating', data=df)
plt.title('Expert Remark Rating Distribution');

In [None]:
df = df.reset_index(drop=True)

In [None]:
# convert BOTH rating columns from strings to ints (via float, so a value like '0.0' parses)
# NOTE(review): float('') raises ValueError; only 'Expert Field Note Rating' had its
# empty strings removed above - confirm 'Expert Remark Rating' has none.
df['Expert Field Note Rating'] = df['Expert Field Note Rating'].apply(lambda x: int(float(x)))
df['Expert Remark Rating'] = df['Expert Remark Rating'].apply(lambda x: int(float(x)))

In [None]:
# field note rating must start from 0
# df['Expert Field Note Rating'] -= 1

In [None]:
df.shape

### Preprocess text
https://machinelearningknowledge.ai/11-techniques-of-text-preprocessing-using-nltk-in-python/

In [None]:
df.head(1)

In [None]:
#!pip install nltk 

In [None]:
import re
import nltk
from nltk import word_tokenize
from nltk.tokenize import RegexpTokenizer
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk import word_tokenize,pos_tag

# download the NLTK data packages this notebook relies on
# (presumably for the tokenizer, stopword list, POS tagger and WordNet lemmatizer used below)
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('averaged_perceptron_tagger')
nltk.download('wordnet')
nltk.download('omw-1.4')
# English stopword list consulted by remove_stopwords()
en_stopwords = stopwords.words('english')


def remove_whitespace(text):
    """Collapse every run of whitespace in ``text`` to one space and trim both ends."""
    words = text.split()
    return " ".join(words)


def remove_stopwords(text):
    """Return the tokens of ``text`` that are not in the module-level ``en_stopwords`` list.

    ``text`` is an iterable of tokens. The membership test is exact, so it is
    case-sensitive with respect to the entries of ``en_stopwords``.
    """
    return list(filter(lambda token: token not in en_stopwords, text))


def remove_punct(text):
    """Join the tokens with spaces and re-tokenize on ``\\w+``.

    Only runs of word characters survive, so punctuation-only tokens disappear
    and tokens containing punctuation are split at it. Returns a list of tokens.
    """
    joined = ' '.join(text)
    return RegexpTokenizer(r"\w+").tokenize(joined)


def lemmatization(text):
    """Lemmatize a list of tokens, using each token's part-of-speech tag.

    Args:
        text: List of string tokens.

    Returns:
        List of lemmas, one per input token, in the same order.
    """
    # First letter of the tag returned by pos_tag -> POS code passed to the
    # WordNet lemmatizer. Adjective tags (JJ, JJR, JJS) start with 'J', so 'j'
    # has to be mapped to 'a'; previously adjectives fell through to the noun
    # default and were never lemmatized as adjectives.
    tag_to_wordnet_pos = {'j': 'a', 'a': 'a', 'r': 'r', 'n': 'n', 'v': 'v'}

    wordnet = WordNetLemmatizer()
    return [
        # any tag outside the map is lemmatized as a noun
        wordnet.lemmatize(token, tag_to_wordnet_pos.get(tag[0].lower(), 'n'))
        for token, tag in pos_tag(text)
    ]


def remove_tag(text):
    """Join the tokens with spaces and delete every ``<...>`` span (shortest match).

    ``text`` is a list of tokens; the result is a single string.
    """
    joined = ' '.join(text)
    return re.sub('<.*?>', r'', joined)


def remove_urls(text):
    """Delete every ``http(s)://...`` or ``www. ...`` run of non-space characters from ``text``."""
    return re.sub(r'https?://\S+|www\.\S+', r'', text)

In [None]:
# Text preprocessing of 'Occurrence Remarks', one step per pass so each gets a progress bar.
# Every step overwrites the column, so this cell must be run exactly once per freshly loaded df.
tqdm.pandas()

print('Removing Whitespace')
df['Occurrence Remarks'] = df['Occurrence Remarks'].progress_apply(remove_whitespace)

# URLs have to be removed while the text is still a raw string. After tokenizing
# and stripping punctuation, 'https://' and 'www.' no longer exist in the text,
# so the URL pattern could never match.
# NOTE: any inference-time preprocessing must apply the steps in this same order.
print('Removing URLs')
df['Occurrence Remarks'] = df['Occurrence Remarks'].progress_apply(remove_urls)

print('Tokenizing words')
df['Occurrence Remarks'] = df['Occurrence Remarks'].progress_apply(word_tokenize)

print('Removing stopwords')
df['Occurrence Remarks'] = df['Occurrence Remarks'].progress_apply(remove_stopwords)

# remove_punct returns a token list; join it back into one string
print('Removing punctuation')
df['Occurrence Remarks'] = df['Occurrence Remarks'].progress_apply(lambda x: " ".join(remove_punct(x)))

# lemmatization works on tokens, so split, lemmatize and join again
print('Lemmatizing')
df['Occurrence Remarks'] = df['Occurrence Remarks'].progress_apply(lambda x: " ".join(lemmatization(x.split())))

In [None]:
df.head(1)

In [None]:
df = df.drop_duplicates('Occurrence Remarks', keep='last')

In [None]:
# include category, see if it makes a difference.
# Each remark becomes "category id <Category ID>. <preprocessed remark>"; the prefix is
# added AFTER text preprocessing, so it keeps its punctuation.
# NOTE: not idempotent - re-running this cell prepends the prefix a second time.
df['Occurrence Remarks'] = df.apply(lambda x: f"category id {x['Category ID']}. {x['Occurrence Remarks']}", axis=1)

In [None]:
df.shape

In [None]:
df.to_pickle('./remarks_data.pickle') # save as picklefile

In [None]:
# write the fastText input file: one "__label__<rating> <remark>" line per row
with open('./remarks.txt', 'w+') as wf:
    rating_and_remark = df[['Expert Field Note Rating', 'Occurrence Remarks']].values
    for rating, remark in rating_and_remark:
        wf.write(f"__label__{rating} {remark}" + '\n')

In [None]:
# Split remarks.txt into the fastText train / validation files.
# The split point is computed from the real number of lines, so the two files never
# overlap and no line is dropped when the dataset size changes. Fixed `head -n 1200` /
# `tail -n 262` counts are only correct for a file of exactly 1462 lines.
VALID_FRACTION = 262 / 1462  # same proportion as a 1200 / 262 split

with open('remarks.txt', 'r') as rf:
    remark_lines = rf.readlines()

n_valid = round(len(remark_lines) * VALID_FRACTION)
n_train = len(remark_lines) - n_valid

# first n_train lines -> training file, remaining lines -> validation file
with open('remarks.train', 'w') as train_f:
    train_f.writelines(remark_lines[:n_train])
with open('remarks.valid', 'w') as valid_f:
    valid_f.writelines(remark_lines[n_train:])

print(f'{n_train} training lines, {n_valid} validation lines')

### Train Using FastText
https://fasttext.cc/docs/en/supervised-tutorial.html

In [None]:
import fasttext

In [None]:
model = fasttext.train_supervised(input="remarks.train", lr=1, epoch=100, wordNgrams=3)

In [None]:
#model.save_model("model_remarks.bin") #load_model does the inverse

# model.test("remarks.valid") # precision, recall at 1; how often highest ranked document contains correct answer
model.test("remarks.valid", k=5) # precision, recall at 5. want > 0.2 precision, otherwise no better than random

In [None]:
# predict simply reflects the distribution of the data; almost guessing here..
# The query uses the same "category id <id>. <remark>" prefix the training lines were
# written with; a different prefix word would be a token the model never saw in that position.
model.predict("category id 23. what an amazing kangaroo", k=-1)

In [None]:
model.test("remarks.valid", k=-1)

### Results are not that great; let's try BERT
-  MultiLabel Classification: https://colab.research.google.com/github/rap12391/transformers_multilabel_toxic/blob/master/toxic_multilabel.ipynb#scrollTo=0DF3ddjej5vd
-  MultiClass Classification: https://github.com/susanli2016/NLP-with-Python/blob/master/Text_Classification_With_BERT.ipynb

In [None]:
import torch
from tqdm.notebook import tqdm
from transformers import BertTokenizer, BertForSequenceClassification
from torch.utils.data import TensorDataset

In [None]:
# shift the labels down by one so they start at 0 (assumes ratings run 1-5 - TODO confirm)
# NOTE: in-place and not idempotent - re-running this cell shifts the labels again.
df['Expert Field Note Rating'] -= 1 # make 0-4

In [None]:
num_labels = len(df['Expert Field Note Rating'].unique())
num_labels

In [None]:
from sklearn.model_selection import train_test_split

X_train, X_val, y_train, y_val = train_test_split(df.index.values, 
                                                  df['Expert Field Note Rating'].values, 
                                                  test_size=0.10, 
                                                  random_state=42, 
                                                  stratify=df['Expert Field Note Rating'].values)

In [None]:
# tag every row with the split it belongs to; rows in neither index list keep 'not_set'
df['data_type'] = 'not_set'

df.loc[X_val, 'data_type'] = 'val'
df.loc[X_train, 'data_type'] = 'train'

In [None]:
df.groupby(['Expert Field Note Rating', 'data_type']).count()

In [None]:
# turn the tagged rows into text / label arrays for each split
train_rows = df[df.data_type == 'train']
val_rows = df[df.data_type == 'val']

X_train = train_rows['Occurrence Remarks'].values
y_train = train_rows['Expert Field Note Rating'].values
X_val = val_rows['Occurrence Remarks'].values
y_val = val_rows['Expert Field Note Rating'].values

In [None]:
# try under-sampling, see if this improves our results (we have a lot less data as a result)
# Only the training split is resampled; the validation arrays are left untouched.
from imblearn.under_sampling  import RandomUnderSampler

under_sampler = RandomUnderSampler(random_state=42)
# the sampler is given 2-D inputs, so the 1-D text array is reshaped to a single column
# NOTE(review): y is also passed as a column vector - presumably a plain 1-D y_train would do; confirm.
X_res, y_res = under_sampler.fit_resample(X_train.reshape(-1,1), y_train.reshape(-1,1))
print(X_res.shape, y_res.shape)
# flatten back to 1-D arrays and overwrite the training split with the resampled one
X_train=X_res.reshape(-1)
y_train=y_res.reshape(-1)

In [None]:
X_train.shape, X_val.shape

In [None]:
# load tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)

In [None]:
# Tokenize both splits with identical settings: special tokens added, truncation at
# 256 tokens, padding enabled, PyTorch tensors returned.
encoded_data_train = tokenizer.batch_encode_plus(
    X_train, 
    add_special_tokens=True, 
    return_attention_mask=True, 
    padding=True, 
    max_length=256, # tested 512 here, doesn't make much of a difference
    return_tensors='pt',
    truncation=True
)

encoded_data_val = tokenizer.batch_encode_plus(
    X_val, 
    add_special_tokens=True, 
    return_attention_mask=True, 
    padding=True, 
    max_length=256, # can go up to 512 but we run into some memory issues..
    return_tensors='pt',
    truncation=True
)


# pull out the tensors the model consumes, plus the labels as a tensor
input_ids_train = encoded_data_train['input_ids']
attention_masks_train = encoded_data_train['attention_mask']
labels_train = torch.tensor(y_train)

input_ids_val = encoded_data_val['input_ids']
attention_masks_val = encoded_data_val['attention_mask']
labels_val = torch.tensor(y_val)

In [None]:
dataset_train = TensorDataset(input_ids_train, attention_masks_train, labels_train)
dataset_val = TensorDataset(input_ids_val, attention_masks_val, labels_val)

In [None]:
len(dataset_train), len(dataset_val)

In [None]:
model = BertForSequenceClassification.from_pretrained("bert-base-uncased",
                                                      num_labels=num_labels,
                                                      output_attentions=False,
                                                      output_hidden_states=False)

In [None]:
from torch.utils.data import DataLoader, RandomSampler, SequentialSampler

batch_size = 16 

dataloader_train = DataLoader(dataset_train, 
                              sampler=RandomSampler(dataset_train), 
                              batch_size=batch_size)

dataloader_validation = DataLoader(dataset_val, 
                                   sampler=SequentialSampler(dataset_val), 
                                   batch_size=batch_size)

In [None]:
from transformers import get_linear_schedule_with_warmup
from torch.optim import AdamW

optimizer = AdamW(model.parameters(), lr=1e-5, eps=1e-8)

In [None]:
epochs = 5

scheduler = get_linear_schedule_with_warmup(optimizer, 
                                            num_warmup_steps=0,
                                            num_training_steps=len(dataloader_train)*epochs)

In [None]:
df['Expert Field Note Rating'].unique() # can try to predict results as-is

In [None]:
from sklearn.metrics import f1_score

def f1_score_func(preds, labels):
    """Weighted F1 score of the arg-max class predictions against the true labels.

    ``preds`` holds one row of per-class scores per sample; the predicted class
    is the index of the largest score in each row.
    """
    predicted_classes = np.argmax(preds, axis=1).flatten()
    return f1_score(labels.flatten(), predicted_classes, average='weighted')

def accuracy_per_class(preds, labels):
    """Print, for each class present in ``labels``, ``<correct>/<total>`` for that class.

    ``preds`` holds one row of per-class scores per sample; the predicted class
    is the index of the largest score in each row. Nothing is returned.
    """
    predicted = np.argmax(preds, axis=1).flatten()
    actual = labels.flatten()

    for label in np.unique(actual):
        in_class = actual == label
        # samples of this class whose prediction matches the class
        n_correct = np.count_nonzero(predicted[in_class] == label)
        n_total = np.count_nonzero(in_class)
        print(f'Class: {label}')
        print(f'Accuracy: {n_correct}/{n_total}\n')

In [None]:
# Seed every random number generator used from here on (Python, NumPy, PyTorch CPU and CUDA).
# NOTE(review): this cell runs after the model and the data loaders were created in
# earlier cells, so anything randomly initialised there is not covered by this seed -
# confirm whether the seeding should move above the model creation.
import random

seed_val = 17
random.seed(seed_val)
np.random.seed(seed_val)
torch.manual_seed(seed_val)
torch.cuda.manual_seed_all(seed_val)

In [None]:
# use the GPU when one is available, otherwise fall back to the CPU
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)

print(device)

# see what gpus are available
n_gpu = torch.cuda.device_count()
# get_device_name(0) raises when there is no CUDA device, so only query it when
# one exists - otherwise this cell would fail on the CPU fallback path
torch.cuda.get_device_name(0) if n_gpu > 0 else 'no CUDA device available'

In [None]:
def evaluate(dataloader_val):
    """Run the global ``model`` over ``dataloader_val`` with gradients disabled.

    Each batch is indexed as (input_ids, attention_mask, labels) and moved to
    the global ``device`` before the forward pass.

    Returns:
        A tuple ``(avg_loss, logits, labels)``: the summed per-batch loss
        divided by the number of batches, and the logits / labels of all
        batches concatenated along axis 0 as NumPy arrays.
    """
    model.eval()

    running_loss = 0
    all_logits = []
    all_labels = []

    for batch in dataloader_val:
        on_device = [tensor.to(device) for tensor in batch]
        labels = on_device[2]

        with torch.no_grad():
            outputs = model(input_ids=on_device[0],
                            attention_mask=on_device[1],
                            labels=labels)

        # outputs[0] is the loss, outputs[1] the logits
        running_loss += outputs[0].item()
        all_logits.append(outputs[1].detach().cpu().numpy())
        all_labels.append(labels.cpu().numpy())

    avg_loss = running_loss / len(dataloader_val)

    return avg_loss, np.concatenate(all_logits, axis=0), np.concatenate(all_labels, axis=0)

In [None]:
# Fine-tuning loop: one pass over the training loader per epoch, a checkpoint after
# every epoch, then validation loss and weighted F1 on the validation loader.

# the checkpoint directory must exist before the first torch.save
os.makedirs('models', exist_ok=True)

for epoch in tqdm(range(1, epochs+1)):

    model.train()

    loss_train_total = 0

    progress_bar = tqdm(dataloader_train, desc='Epoch {:1d}'.format(epoch), leave=False, disable=False)
    for batch in progress_bar:

        model.zero_grad()

        batch = tuple(b.to(device) for b in batch)

        inputs = {'input_ids':      batch[0],
                  'attention_mask': batch[1],
                  'labels':         batch[2],
                 }

        outputs = model(**inputs)

        # outputs[0] is the scalar loss for this batch
        loss = outputs[0]
        loss_train_total += loss.item()
        loss.backward()

        # clip the gradient norm to 1.0 before the optimizer step
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)

        optimizer.step()
        scheduler.step()

        # Show the batch loss as-is. `len(batch)` is the number of tensors in the
        # tuple (3), not the batch size, so dividing by it under-reported the loss.
        progress_bar.set_postfix({'training_loss': '{:.3f}'.format(loss.item())})

    torch.save(model.state_dict(), f'models/finetuned_BERT_epoch_{epoch}.model')

    tqdm.write(f'\nEpoch {epoch}')

    # average of the per-batch losses over the epoch
    loss_train_avg = loss_train_total/len(dataloader_train)
    tqdm.write(f'Training loss: {loss_train_avg}')

    val_loss, predictions, true_vals = evaluate(dataloader_validation)
    val_f1 = f1_score_func(predictions, true_vals)
    tqdm.write(f'Validation loss: {val_loss}')
    tqdm.write(f'F1 Score (Weighted): {val_f1}')
    
HBox(children=(FloatProgress(value=0.0, max=5.0), HTML(value='')))

In [None]:
model = BertForSequenceClassification.from_pretrained("bert-base-uncased",
                                                      num_labels=num_labels,
                                                      output_attentions=False,
                                                      output_hidden_states=False)

model.to(device)

In [None]:
model.load_state_dict(torch.load('./models/finetuned_BERT_epoch_4.model', map_location=torch.device('cpu')))

In [None]:
_, predictions, true_vals = evaluate(dataloader_validation)
flattened_predictions = np.argmax(predictions, axis=1).flatten()

In [None]:
pd.DataFrame([X_val, flattened_predictions]).T.to_csv('output_predictions.csv')

In [None]:
flattened_predictions

In [None]:
# !pip install sklearn
import sklearn
from sklearn.metrics import classification_report
print(sklearn.metrics.classification_report(true_vals, flattened_predictions))

In [None]:
accuracy_per_class(predictions, true_vals)

### Load and Make Predictions on Sample String

In [None]:
import torch
import numpy as np

from transformers import BertTokenizer
from torch.utils.data import TensorDataset, DataLoader, SequentialSampler
from transformers import BertForSequenceClassification


def evaluate(dataloader_val):
    """Run the global ``model`` over ``dataloader_val`` with gradients disabled.

    Each batch is indexed as (input_ids, attention_mask, labels) and moved to
    the global ``device`` before the forward pass.

    Returns:
        A tuple ``(avg_loss, logits, labels)``: the summed per-batch loss
        divided by the number of batches, and the logits / labels of all
        batches concatenated along axis 0 as NumPy arrays.
    """
    model.eval()

    running_loss = 0
    all_logits = []
    all_labels = []

    for batch in dataloader_val:
        on_device = [tensor.to(device) for tensor in batch]
        labels = on_device[2]

        with torch.no_grad():
            outputs = model(input_ids=on_device[0],
                            attention_mask=on_device[1],
                            labels=labels)

        # outputs[0] is the loss, outputs[1] the logits
        running_loss += outputs[0].item()
        all_logits.append(outputs[1].detach().cpu().numpy())
        all_labels.append(labels.cpu().numpy())

    avg_loss = running_loss / len(dataloader_val)

    return avg_loss, np.concatenate(all_logits, axis=0), np.concatenate(all_labels, axis=0)

In [None]:
import re
import nltk
from nltk import word_tokenize
from nltk.tokenize import RegexpTokenizer
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk import word_tokenize,pos_tag

nltk.download('punkt')
nltk.download('stopwords')
nltk.download('averaged_perceptron_tagger')
nltk.download('wordnet')
nltk.download('omw-1.4')
en_stopwords = stopwords.words('english')

def remove_whitespace(text):
    """Collapse every run of whitespace in ``text`` to one space and trim both ends."""
    words = text.split()
    return " ".join(words)


def remove_stopwords(text):
    """Return the tokens of ``text`` that are not in the module-level ``en_stopwords`` list.

    ``text`` is an iterable of tokens. The membership test is exact, so it is
    case-sensitive with respect to the entries of ``en_stopwords``.
    """
    return list(filter(lambda token: token not in en_stopwords, text))


def remove_punct(text):
    """Join the tokens with spaces and re-tokenize on ``\\w+``.

    Only runs of word characters survive, so punctuation-only tokens disappear
    and tokens containing punctuation are split at it. Returns a list of tokens.
    """
    joined = ' '.join(text)
    return RegexpTokenizer(r"\w+").tokenize(joined)


def lemmatization(text):
    """Lemmatize a list of tokens, using each token's part-of-speech tag.

    Args:
        text: List of string tokens.

    Returns:
        List of lemmas, one per input token, in the same order.
    """
    # First letter of the tag returned by pos_tag -> POS code passed to the
    # WordNet lemmatizer. Adjective tags (JJ, JJR, JJS) start with 'J', so 'j'
    # has to be mapped to 'a'; previously adjectives fell through to the noun
    # default and were never lemmatized as adjectives.
    tag_to_wordnet_pos = {'j': 'a', 'a': 'a', 'r': 'r', 'n': 'n', 'v': 'v'}

    wordnet = WordNetLemmatizer()
    return [
        # any tag outside the map is lemmatized as a noun
        wordnet.lemmatize(token, tag_to_wordnet_pos.get(tag[0].lower(), 'n'))
        for token, tag in pos_tag(text)
    ]


def remove_tag(text):
    """Join the tokens with spaces and delete every ``<...>`` span (shortest match).

    ``text`` is a list of tokens; the result is a single string.
    """
    joined = ' '.join(text)
    return re.sub('<.*?>', r'', joined)


def remove_urls(text):
    """Delete every ``http(s)://...`` or ``www. ...`` run of non-space characters from ``text``."""
    return re.sub(r'https?://\S+|www\.\S+', r'', text)


def preprocess_input_text(text):
    """Apply the notebook's text preprocessing to one raw string.

    Steps, in order: collapse whitespace, remove URLs, tokenize, drop
    stopwords, strip punctuation, lemmatize, and join the tokens back into a
    single space-separated string.

    Args:
        text: Raw input string.

    Returns:
        The preprocessed string.
    """
    text = remove_whitespace(text)
    # URLs must go while the text is still a raw string: once it is tokenized
    # and punctuation is stripped, 'https://' and 'www.' are gone and the URL
    # pattern can no longer match anything.
    # NOTE: the training-time preprocessing must apply the steps in this same order.
    text = remove_urls(text)
    tokens = word_tokenize(text)
    tokens = remove_stopwords(tokens)
    tokens = remove_punct(tokens)
    tokens = lemmatization(tokens)
    return " ".join(tokens)


In [None]:
# Raw remark to classify and the category it belongs to. They are kept separate
# because the training text was built as "category id <id>. <preprocessed remark>":
# only the remark is preprocessed, and the prefix is added afterwards.
SAMPLE_CATEGORY_ID = 23
SAMPLE_STRING = """how must  i get level 1, this cannot be happening"""
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
num_labels = 5

# load tokenizer and model
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)
model = BertForSequenceClassification.from_pretrained("bert-base-uncased",
                                                      num_labels=num_labels,
                                                      output_attentions=False,
                                                      output_hidden_states=False)
model.to(device)
# overwrite the freshly initialised weights with the fine-tuned checkpoint
model.load_state_dict(torch.load('./models/finetuned_BERT_epoch_5.model', map_location=torch.device('cpu')))

# preprocess the remark, then add the category prefix in the training format
processed_sample_string = [f"category id {SAMPLE_CATEGORY_ID}. {preprocess_input_text(SAMPLE_STRING)}"]

# tokenize
encoded_string = tokenizer.batch_encode_plus(
    processed_sample_string,
    add_special_tokens=True,
    return_attention_mask=True,
    padding=True,
    max_length=256, # same truncation length the training data was encoded with
    return_tensors='pt',
    truncation=True
)

# separate to input ids, attention mask and labels
input_ids_sample = encoded_string['input_ids']
attention_masks_sample = encoded_string['attention_mask']
# evaluate() always passes labels to the model, so supply dummy labels (all 1);
# they only affect the returned loss, not the logits
labels_sample = torch.ones(len(processed_sample_string), dtype=torch.long)

# create tensor dataset
dataset_sample = TensorDataset(input_ids_sample, attention_masks_sample, labels_sample)
dataloader_sample = DataLoader(dataset_sample, sampler=SequentialSampler(dataset_sample), batch_size=1)

In [None]:
_, predictions, true_vals = evaluate(dataloader_sample)

In [None]:
# index of the largest logit = predicted 0-based class for the single sample
# (training labels were shifted down by 1 earlier in the notebook, so the rating is presumably this value + 1)
predictions.argmax()