In [17]:
!pip install -q transformers datasets

In [18]:
import pandas as pd

train_df = pd.read_json('../input/interview-type/comp_qu-cl_03_train.jsonl',  lines=True)
test_df = pd.read_json('../input/interview-type/comp_qu-cl_03_test.jsonl',  lines=True)
train_df.head(), test_df.head()

In [19]:
test_df['prompt'] = test_df['prompt'].apply(lambda x:x.replace('\n', ' '))
train_df['prompt'] = train_df['prompt'].apply(lambda x:x.replace('\n', ' '))
test_df

In [20]:
test_df['preds'] = test_df['prompt'].apply(lambda x: x.split('$&$')[1].split('|'))
train_df['preds'] = train_df['prompt'].apply(lambda x: x.split('$&$')[1].split('|'))
test_df

In [21]:
train_df['completion'] = train_df['completion'].str.replace('END', '')
test_df['completion'] = test_df['completion'].str.replace('END', '')
train_df.sample(5)

In [22]:
train_df

In [23]:
train_df['new_prompt'] = train_df.apply(lambda x:x['prompt'].split('$&$')[0], axis=1)
train_df['prompt'] = train_df['new_prompt']
train_df

In [24]:
test_df['new_prompt'] = test_df.apply(lambda x:x['prompt'].split('$&$')[0], axis=1)
test_df['prompt'] = test_df['new_prompt']
test_df

* Prepare the sentences (prompts)
* Prepare the candidate predictions (`preds`)
* Load the model and tokenizer

In [25]:
test_df.sample(5)

In [26]:
len(train_df.completion.unique())

In [27]:
train_df.describe()

In [28]:
train_df.to_csv('comp_qu-cl_03_train.csv', index=False)
test_df.to_csv('comp_qu-cl_03_test.csv', index=False)

In [29]:
from datasets import load_dataset

data_files = {
    'train': './comp_qu-cl_03_train.csv',
    'test' : './comp_qu-cl_03_test.csv'
}
ds = load_dataset('csv', data_files = data_files)
ds

In [30]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification

# model_name='distilbert-base-uncased-finetuned-sst-2-english'
# tokenizer = AutoTokenizer.from_pretrained(model_name)
# model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels = 154,ignore_mismatched_sizes=True)

In [31]:
from datasets import ClassLabel
# Class names in order of first appearance in the training data; a name's
# position in this sequence is the integer id used by `labels`.
classes = train_df.completion.unique()
# ClassLabel documents `names` as a list of strings, so convert the NumPy array,
# and derive the class count from the data instead of hard-coding it.
# NOTE(review): the count must still match the classifier head of the loaded
# model - confirm against ./model/config.json.
labels = ClassLabel(num_classes=len(classes), names=classes.tolist())
classes

# Load a pretrained Model and tokenizer

In [10]:
!mkdir model
!mkdir tokenizer

In [11]:
file_paths = ['../input/interviewsubjectclassification/tokenizer.json',
'../input/interviewsubjectclassification/tokenizer_config.json',
'../input/interviewsubjectclassification/special_tokens_map.json' ,
              '../input/interviewsubjectclassification/vocab.txt'
]
import json
import pickle

def load_files(file_paths, folder_path):
    """Copy every file in ``file_paths`` into ``folder_path`` under its base name.

    Files are copied byte-for-byte, so JSON, plain-text and binary (``.bin``)
    inputs are all handled the same way and no platform-default text encoding
    is involved. JSON files are no longer parsed and re-serialised, so their
    content reaches the destination unchanged.

    Args:
        file_paths: Iterable of source file paths.
        folder_path: Destination directory; created if it does not exist.

    Raises:
        OSError: If a source file cannot be read or the destination cannot be
            written.
    """
    # Local imports keep this cell runnable on a fresh kernel.
    import os
    import shutil

    os.makedirs(folder_path, exist_ok=True)
    for path in file_paths:
        destination = os.path.join(folder_path, os.path.basename(path))
        shutil.copyfile(path, destination)

load_files(file_paths, './tokenizer')

In [12]:
model_path = [
    '../input/interviewsubjectclassification/config.json',
    '../input/interviewsubjectclassification/pytorch_model.bin',
]
load_files(model_path, './model')

In [13]:
ls ./tokenizer

In [32]:
tokenizer = AutoTokenizer.from_pretrained('./tokenizer')
model = AutoModelForSequenceClassification.from_pretrained('./model')

In [33]:
def list_formatting(li):
    """Return a new list where each entry is stripped and given one trailing space.

    The trailing space matches how the class names are spelled elsewhere in
    this notebook (e.g. ``'Coding & Problem Solving '``).
    """
    return [entry.strip() + ' ' for entry in li]


test_df['new_preds'] = test_df['preds'].apply(list_formatting) 

In [None]:
import shutil
import os
def delete_folder(folder_path):
    """Remove everything inside ``folder_path``; the folder itself is kept.

    Regular files and symbolic links are unlinked, real sub-directories are
    removed recursively. A failure on one entry is printed and the remaining
    entries are still processed.

    Args:
        folder_path: Directory whose contents should be removed.
    """
    for file in os.listdir(folder_path):
        file_path = os.path.join(folder_path, file)
        try:
            # Test for links first: shutil.rmtree refuses to operate on a
            # symlink to a directory, and a dangling link is neither a file
            # nor a directory, so both would otherwise be left behind.
            if os.path.islink(file_path) or os.path.isfile(file_path):
                os.unlink(file_path)
            elif os.path.isdir(file_path):
                shutil.rmtree(file_path)
        except Exception as e:
            # Deliberately best-effort: report and continue with the next entry.
            print(e)
delete_folder('./tokenizer')

In [34]:
!ls ./model

In [35]:
tokenizer 

### Preprocessing the data

In [36]:
ds_clean = ds['train'].train_test_split(test_size = 0.1, seed=42)
ds_clean['validation'] = ds_clean.pop('test')
ds_clean['test'] = ds['test']
ds_clean

In [37]:
from transformers import DataCollatorWithPadding

def tokenize(example):
    """Tokenize a batch of prompts and attach their integer class labels.

    Called through ``map(..., batched=True)``, so ``example['prompt']`` and
    ``example['completion']`` each hold a list of values.

    Returns:
        The tokenizer output (prompts truncated to the model's maximum length)
        with an added ``labels`` entry holding the ``ClassLabel`` id of each
        completion.
    """
    encoded = tokenizer(example['prompt'], truncation=True)
    # Return the labels as part of the result instead of writing them into the
    # input batch: the new columns are taken from the returned mapping, so an
    # in-place edit of the input is not a reliable way to add one.
    encoded['labels'] = labels.str2int(example['completion'])
    return encoded

tokenized_ds = ds_clean.map(tokenize, batched=True)
collator = DataCollatorWithPadding(tokenizer=tokenizer)

In [38]:
tokenized_ds

In [39]:
tokenized_ds = tokenized_ds.remove_columns(['prompt', 'completion', 'preds', 'new_prompt'])
tokenized_ds.set_format('torch')
tokenized_ds

In [40]:
tokenized_ds['train'][0]

In [41]:
from torch.utils.data import DataLoader

train_loader=DataLoader(tokenized_ds['train'], shuffle=True, batch_size=16, collate_fn=collator)
test_loader=DataLoader(tokenized_ds['test'], collate_fn=collator)
val_loader=DataLoader(tokenized_ds['validation'], collate_fn=collator)

In [42]:
import torch
from transformers import get_scheduler

device = 'cuda' if torch.cuda.is_available() else 'cpu'
NUM_EPOCHS=5
NUM_TRAINING_STEP=NUM_EPOCHS* len(train_loader)

optim=torch.optim.AdamW(model.parameters(), lr=5e-5)
lr_scheduler=get_scheduler(name='linear', optimizer=optim, num_training_steps=NUM_TRAINING_STEP, num_warmup_steps=0)

In [43]:
for batch in train_loader:
    break
{k:v.shape for k, v in batch.items()}

In [44]:
from transformers import get_scheduler
import torch

optim = torch.optim.AdamW(model.parameters(), lr = 5e-5)
device = 'cuda' if torch.cuda.is_available() else 'cpu'


num_epochs = 3
num_training_steps = num_epochs * len(train_loader)
lr_scheduler = get_scheduler(
    "linear",
    optimizer=optim,
    num_warmup_steps=0,
    num_training_steps=num_training_steps,
)
print(num_training_steps)

In [45]:
criteria = torch.nn.CrossEntropyLoss()

In [48]:
print(f'Number of data points in the test set', len(test_loader))
print(f'Number of batches in the train set with each batch size as 16 is', len(train_loader))

In [50]:
from tqdm.auto import tqdm
def predict_proba(sentence):
    """Score each test row's candidate classes with the classifier.

    Walks ``test_loader`` batch by batch and, for the row of ``test_df`` at the
    same position, looks up the softmax probability of every candidate listed
    in that row's ``new_preds``.

    Args:
        sentence: Unused; kept so existing calls keep working. The inputs are
            taken from the global ``test_loader``.

    Returns:
        list[dict]: One dict per batch mapping candidate name -> probability
        (a 0-d tensor on ``device``). A candidate that cannot be scored is
        stored as ``{position_in_candidate_list: 0}``; the integer key marks
        the entry for removal downstream.
    """
    model.to(device)
    model.eval()
    all_top_k = []
    # Iterating the tqdm wrapper already advances the bar, so no explicit
    # update() call is made (that would count every batch twice).
    for idx, batch in enumerate(tqdm(test_loader)):
        batch = {k: v.to(device) for k, v in batch.items()}
        with torch.no_grad():
            outputs = model(**batch)
        probs = torch.softmax(outputs.logits, dim=-1)

        # Batch `idx` is paired with row `idx` of test_df and only probs[0] is
        # read, so this relies on test_loader yielding one example per batch
        # in the same order as test_df.
        candidates = test_df.iloc[idx]['new_preds']
        top_k = {}
        for i, candidate in enumerate(candidates):
            try:
                class_label = labels.str2int(candidate)
                top_k[candidate] = probs[0, class_label]
            except Exception:
                # Best effort: an unknown or out-of-range candidate is recorded
                # under an integer key. `Exception` (not a bare except) keeps
                # Ctrl-C / kernel interrupts working.
                top_k[i] = 0
        all_top_k.append(top_k)
    return all_top_k
all_top_k = predict_proba(test_df['prompt'].tolist())

In [52]:
len(all_top_k)

* If the key or the value of a pair is an int, drop that pair.
* Move each remaining value to the CPU and convert it to NumPy.
* Collect the results into one dictionary per test row.

In [54]:
all_top_k_cpu = []

for li in all_top_k:
    temp = {}
    for k, v in li.items():
        if not isinstance(v, int) and not isinstance(k, int):
            temp[k] = v.cpu().numpy()
    
    all_top_k_cpu.append(temp)
all_top_k_cpu[0]

In [55]:
from collections import OrderedDict

# For every test row keep the candidate with the highest probability.
NO_PREDICTION = ''  # placeholder for rows that have no scorable candidate

top_ks = []
for scores in all_top_k_cpu:
    # The winner is chosen per row, so a row without candidates can neither
    # inherit the previous row's winner nor hit an undefined name on row 0.
    best = max(scores, key=scores.get) if scores else NO_PREDICTION
    top_ks.append(best)
top_ks[0]

In [56]:
all_top_k_cpu[3]

In [59]:
df_final=pd.DataFrame({
    'Prompt':test_df['prompt'],
    'Completion':test_df['completion'],
    'Prediction': top_ks,
    'Top-K':all_top_k_cpu
})
df_final.sample(10)

In [60]:
from sklearn.metrics import f1_score
f1_score(df_final['Completion'], df_final['Prediction'], average='micro')

In [66]:
from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score 
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
import matplotlib.pyplot as plt
import seaborn as sn
sn.set()
import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings('ignore')

def metrics(actual, predicted):
    """Print and return accuracy plus macro-averaged precision, recall and F1.

    Args:
        actual: Ground-truth labels.
        predicted: Predicted labels, aligned with ``actual``.

    Returns:
        tuple: ``(accuracy, precision, recall, f1)``.
    """
    scores = (
        ('Accuracy:', accuracy_score(actual, predicted)),
        ('Precision:', precision_score(actual, predicted, average='macro')),
        ('Recall:', recall_score(actual, predicted, average='macro')),
        ('F1:', f1_score(actual, predicted, average='macro')),
    )
    for heading, value in scores:
        print(heading, value)
    return tuple(value for _, value in scores)

def confusion_metrics(actual, predicted):
    """Print and return the confusion matrix and the classification report.

    Args:
        actual: Ground-truth labels.
        predicted: Predicted labels, aligned with ``actual``.

    Returns:
        tuple: ``(confusion_matrix, classification_report)``.
    """
    matrix = confusion_matrix(actual, predicted)
    report = classification_report(actual, predicted)
    for heading, body in (('Confusion Matrix:', matrix),
                          ('Classification Report:', report)):
        print(heading)
        print(body)
    return matrix, report
a = confusion_metrics(df_final['Completion'], df_final['Prediction'])
b = metrics(df_final['Completion'], df_final['Prediction'])

In [67]:
# Persist the evaluation results: the full per-class report and a short summary.
acc, pre, rec, f1 = metrics(df_final['Completion'], df_final['Prediction'])
cr = classification_report(df_final['Completion'], df_final['Prediction'])
with open('report.txt', 'w') as f:
    f.write(str(cr))
# One "name: value" pair per line; without separators the values ran together
# (e.g. "F1" followed directly by its digits) and could not be read back.
with open('metric.txt', 'w') as f:
    for metric_name, value in (('Accuracy', acc), ('Precision', pre),
                               ('Recall', rec), ('F1', f1)):
        f.write(f'{metric_name}: {value}\n')

In [69]:
df_final.to_csv('final.csv',index=False)

In [70]:
import numpy as np

In [72]:
train_df['completion'].value_counts()

In [74]:
!pip install -q wandb
import wandb
!wandb login

In [76]:
import wandb

wandb.init(project="DistilBert-SST-2", entity="justalearner")

In [77]:
wandb.watch(model)

In [78]:
print(device)

In [None]:
from tqdm.auto import tqdm
from sklearn.metrics import accuracy_score

# Fine-tune the classifier and log per-epoch train/validation loss and accuracy.
# NOTE(review): lr_scheduler was built for `num_epochs * len(train_loader)`
# steps in an earlier cell; keep Num_epochs equal to that value.
Num_epochs = 3

model.to(device)
train_loss_per_epoch, val_loss_per_epoch, train_acc_per_epoch, val_acc_per_epoch = [], [], [], []
for epoch in tqdm(range(Num_epochs), desc='Epochs'):
    # ---- training pass ----
    model.train()
    batch_train_losses, batch_train_acc = [], []
    # One bar per epoch so every epoch shows its own progress from zero.
    progress_bar = tqdm(train_loader, desc=f'Training epoch {epoch}', leave=False)
    for batch in progress_bar:
        batch = {k: v.to(device) for k, v in batch.items()}
        output = model(input_ids=batch['input_ids'], attention_mask=batch['attention_mask'])
        # Labels are not passed to the model; the loss is computed explicitly.
        loss = criteria(output.logits, batch['labels'])
        acc = accuracy_score(batch['labels'].cpu().numpy(), output.logits.argmax(dim=1).cpu().numpy())
        loss.backward()
        optim.step()
        lr_scheduler.step()
        optim.zero_grad()
        batch_train_losses.append(loss.item())
        batch_train_acc.append(acc)
    avg_loss_per_batch = np.mean(batch_train_losses)
    avg_train_acc = np.mean(batch_train_acc)
    # Exactly one entry per epoch in each history list.
    train_loss_per_epoch.append(avg_loss_per_batch)
    train_acc_per_epoch.append(avg_train_acc)

    # ---- validation pass ----
    model.eval()
    batch_val_losses, batch_val_acc = [], []
    with torch.no_grad():
        for batch in val_loader:
            batch = {k: v.to(device) for k, v in batch.items()}
            output = model(input_ids=batch['input_ids'], attention_mask=batch['attention_mask'])
            loss = criteria(output.logits, batch['labels'])
            acc = accuracy_score(batch['labels'].cpu().numpy(), output.logits.argmax(dim=1).cpu().numpy())
            batch_val_losses.append(loss.item())
            batch_val_acc.append(acc)
    avg_val_loss = np.mean(batch_val_losses)
    avg_val_acc = np.mean(batch_val_acc)
    val_loss_per_epoch.append(avg_val_loss)
    val_acc_per_epoch.append(avg_val_acc)

    wandb.log({'train_loss': avg_loss_per_batch, 'val_loss': avg_val_loss,
               'val_acc': avg_val_acc, 'train_acc': avg_train_acc})
    print(f'loss on epoch {epoch}:{avg_loss_per_batch}, acc:{avg_train_acc}')
    print(f'val loss on epoch {epoch}:{avg_val_loss}, acc:{avg_val_acc}')

        

In [79]:
from datasets import load_metric

precision = load_metric('precision')
recall = load_metric('recall')
f1 = load_metric('f1')

model.eval()
for batch in tqdm(val_loader):
    batch = {k: v.to(device) for k, v in batch.items()}
    with torch.no_grad():
        outputs = model(**batch)

    logits = outputs.logits
    predictions = torch.argmax(logits, dim=-1)
    f1.add_batch(predictions=predictions, references=batch["labels"])
    precision.add_batch(predictions=predictions, references=batch["labels"])
    recall.add_batch(predictions=predictions, references=batch["labels"])

print(f'f1 : ', f1.compute(average='macro'))
print(f'precision : ', precision.compute(average='macro'))
print(f'recall : ', recall.compute(average='macro'))

In [None]:
tokenizer.save_pretrained('./')
model.save_pretrained('./')

In [None]:
test_df

In [80]:
test_df['prompt'] = test_df['prompt'].apply(lambda x:x.replace('\n', ''))

In [81]:
test_df['preds'] = test_df['prompt'].apply(lambda x: x.split('$&$')[1].split('|'))
tokenizer.decode(tokenizer.convert_tokens_to_ids(test_df['preds'][0][1]))
test_df.head()

In [82]:
from datasets import load_metric

precision = load_metric('precision')
recall = load_metric('recall')
f1 = load_metric('f1')
preds = []
model.eval()
for batch in tqdm(test_loader):
    batch = {k: v.to(device) for k, v in batch.items()}
    with torch.no_grad():
        outputs = model(**batch)

    logits = outputs.logits
    predictions = torch.argmax(logits, dim=-1)
    preds.append(predictions.cpu().numpy())
    f1.add_batch(predictions=predictions, references=batch["labels"])
    precision.add_batch(predictions=predictions, references=batch["labels"])
    recall.add_batch(predictions=predictions, references=batch["labels"])

print(f'f1 : ', f1.compute(average='macro'))
print(f'precision : ', precision.compute(average='macro'))
print(f'recall : ', recall.compute(average='macro'))

In [None]:
torch.save(model.state_dict(), '../input/interview-type/model.pt')

In [83]:
def predict(sentence):
    """Return the predicted class id for a single sentence.

    The sentence is tokenized, truncated/padded to 128 tokens, and run through
    the global ``model`` on ``device``.

    Args:
        sentence: Text to classify.

    Returns:
        int: Index of the highest-scoring class; decode it with
        ``labels.int2str``.
    """
    tokens = tokenizer(sentence, truncation=True, max_length=128, padding='max_length', return_tensors='pt')
    tokens = {k: v.to(device) for k, v in tokens.items()}
    # Inference only: skip autograd bookkeeping so no graph is kept alive.
    # NOTE(review): the model's train/eval mode is left as the caller set it.
    with torch.no_grad():
        logits = model(**tokens).logits
    prediction = torch.argmax(logits, dim=-1)
    return prediction.item()

ran_num = np.random.randint(0, len(test_df))
print(f'Actual :', test_df['completion'][ran_num])
print('Prediction :', labels.int2str(predict(test_df['prompt'][ran_num])))

In [84]:
preds_labels = []
for i in range(len(test_df)):
    preds_labels.append(labels.int2str(preds[i]))

In [85]:
from sklearn.metrics import classification_report
report = classification_report(test_df['completion'], preds_labels)
print(classification_report(test_df['completion'], preds_labels))

In [86]:
with open('report.txt', 'w') as f:
    f.write(str(report))

### cm


In [None]:
from tqdm.auto import tqdm
def predict_proba(sentence):
    """Score candidate classes for the test rows kept in ``test_df_512``.

    Walks ``test_loader`` batch by batch, skips rows that are not present in
    ``test_df_512`` and, for the others, looks up the softmax probability of
    every candidate in the row's ``new_preds``. ``test_df_512`` (with its
    ``new_preds`` column) must exist before this function is called.

    Args:
        sentence: Unused; kept so existing calls keep working. The inputs are
            taken from the global ``test_loader``.

    Returns:
        list[dict]: One dict per scored row mapping candidate name ->
        probability (a 0-d tensor on ``device``). A candidate that cannot be
        scored is stored as ``{position_in_candidate_list: 0}``.
    """
    model.to(device)
    model.eval()
    all_top_k = []
    # test_df_512 is a row filter of test_df and keeps the original index
    # labels. Assumes test_df still has its default 0..n-1 index, so a label
    # equals the row's position in test_loader - TODO confirm.
    kept_rows = set(test_df_512.index)
    # Iterating the tqdm wrapper already advances the bar; no update() needed.
    for idx, batch in enumerate(tqdm(test_loader)):
        if idx not in kept_rows:
            continue
        batch = {k: v.to(device) for k, v in batch.items()}
        with torch.no_grad():
            outputs = model(**batch)
        probs = torch.softmax(outputs.logits, dim=-1)

        # Only probs[0] is read, so this relies on one example per batch.
        candidates = test_df_512.at[idx, 'new_preds']
        top_k = {}
        for i, candidate in enumerate(candidates):
            try:
                class_label = labels.str2int(candidate)
                top_k[candidate] = probs[0, class_label]
            except Exception:
                # Best effort: unknown candidates get an integer key so they
                # can be filtered out later; interrupts are not swallowed.
                top_k[i] = 0
        all_top_k.append(top_k)
    return all_top_k
all_top_k = predict_proba(test_df['prompt'].tolist())

In [87]:
labels.str2int('Coding & Problem Solving ')

In [88]:
predict_proba(test_df['prompt'].tolist())

In [89]:
def list_formatting(li):
    """Normalise candidate names: strip whitespace, then append a single space.

    Returns a new list; the input list is left untouched.
    """
    def _with_trailing_space(word):
        return word.strip() + ' '

    return list(map(_with_trailing_space, li))


test_df['new_preds'] = test_df['preds'].apply(list_formatting) 
test_df_512 = test_df[test_df['prompt'].apply(lambda x: len(x) <= 512)]

In [91]:
test_df_512.iloc[0]['new_preds'][0]