In [15]:
# ! pip install torch

In [1]:
import pandas as pd
# Load cleaned data
df = pd.read_excel('../data/cleaned_data.xlsx', header=0)
df.head()

Unnamed: 0,x,y,char_len,word_len,clean,clean_char_len,clean_word_len
0,So there is no way for me to plug it in here i...,0,82,21,way plug u unless go converter,30,6
1,"Good case, Excellent value.",1,27,4,good case excellent value,25,4
2,Great for the jawbone.,1,22,4,great jawbone,13,2
3,Tied to charger for conversations lasting more...,0,79,11,tied charger conversation lasting 45 minute ma...,57,8
4,The mic is great.,1,17,4,mic great,9,2


In [2]:
# keep only the clean and y columns, drop nullsand duplicates in one go
df = df[['clean', 'y']].dropna().drop_duplicates().reset_index(drop=True)
print(len(df))

2921


In [3]:
from sklearn.model_selection import train_test_split

# Split the same way for tain, val and test
train_X, intermediate_X, train_y, intermediate_y = train_test_split(df['clean'], df['y'], test_size=0.3, random_state=42)
val_X, test_X, val_y, test_y = train_test_split(intermediate_X, intermediate_y, test_size=0.5, random_state=42)
print(len(train_X), len(val_X), len(test_X))

2044 438 439


In [4]:
from transformers import DistilBertTokenizerFast

  from .autonotebook import tqdm as notebook_tqdm


In [5]:
#Make the toknizer for detilbert
tokenizer = DistilBertTokenizerFast.from_pretrained("distilbert-base-uncased")

In [6]:
#now tokenize the data using that tokenizer on all sets
#Use the same length for all sets (like in preprocessing for the otehr models)
max_length = 44

train_enc = tokenizer(train_X.tolist(), padding='max_length', truncation=True, max_length=max_length, return_tensors='pt')
val_enc   = tokenizer(val_X.tolist(),   padding='max_length', truncation=True, max_length=max_length, return_tensors='pt')
test_enc  = tokenizer(test_X.tolist(),  padding='max_length', truncation=True, max_length=max_length, return_tensors='pt')

In [7]:
import torch
# Got to convert the labels to tensors 
train_labels = torch.tensor(train_y.values, dtype=torch.long)
val_labels   = torch.tensor(val_y.values, dtype=torch.long)
test_labels  = torch.tensor(test_y.values, dtype=torch.long)

In [8]:
from torch.utils.data import TensorDataset, DataLoader

#Now you have to make the datasets but the version for pytorch

batch_size = 16

train_dataset = TensorDataset(train_enc['input_ids'], train_enc['attention_mask'], train_labels)
val_dataset   = TensorDataset(val_enc['input_ids'],   val_enc['attention_mask'],   val_labels)
test_dataset  = TensorDataset(test_enc['input_ids'],  test_enc['attention_mask'],  test_labels)

train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
val_loader   = DataLoader(val_dataset,   batch_size=batch_size)
test_loader  = DataLoader(test_dataset,  batch_size=batch_size)

In [9]:
from transformers import DistilBertForSequenceClassification

#Now make the model
num_labels = 2  # binary classification
model = DistilBertForSequenceClassification.from_pretrained(
    "distilbert-base-uncased",
    num_labels=num_labels
)


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [10]:
device = "cuda" if torch.cuda.is_available() else "cpu"
model = model.to(device)

optimizer = torch.optim.AdamW(model.parameters(), lr=5e-5)
criterion = torch.nn.CrossEntropyLoss()

epochs = 2

for epoch in range(epochs):
    model.train()
    total_loss = 0

    for input_ids, attention_mask, labels in train_loader:
        input_ids = input_ids.to(device)
        attention_mask = attention_mask.to(device)
        labels = labels.to(device)

        optimizer.zero_grad()

        outputs = model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            labels=labels
        )

        loss = outputs.loss
        loss.backward()
        optimizer.step()

        total_loss += loss.item()

    print(f"Epoch {epoch+1}/{epochs} - Training loss: {total_loss:.4f}")


Epoch 1/2 - Training loss: 60.3618
Epoch 2/2 - Training loss: 34.1054


In [11]:
from sklearn.metrics import accuracy_score

model.eval()
val_preds = []
val_true  = []

with torch.no_grad():
    for input_ids, attention_mask, labels in val_loader:
        input_ids = input_ids.to(device)
        attention_mask = attention_mask.to(device)

        outputs = model(input_ids=input_ids, attention_mask=attention_mask)
        logits = outputs.logits
        preds = torch.argmax(logits, dim=1)

        val_preds.extend(preds.cpu().numpy())
        val_true.extend(labels.numpy())

print("Validation accuracy:", accuracy_score(val_true, val_preds))

Validation accuracy: 0.860730593607306


In [13]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, cohen_kappa_score

model.eval()
test_preds = []
test_true  = []

with torch.no_grad():
    for input_ids, attention_mask, labels in test_loader:
        input_ids = input_ids.to(device)
        attention_mask = attention_mask.to(device)

        logits = model(input_ids=input_ids, attention_mask=attention_mask).logits
        preds = torch.argmax(logits, dim=1)

        test_preds.extend(preds.cpu().numpy())
        test_true.extend(labels.numpy())

acc = accuracy_score(test_true, test_preds)
prec = precision_score(test_true, test_preds)
rec = recall_score(test_true, test_preds)
f1 = f1_score(test_true, test_preds)
kappa = cohen_kappa_score(test_true, test_preds)


print("Test metrics:")
print("Accuracy: ", acc)
print("Precision: ", prec)
print("Recall: ", rec)
print("F1 Score: ", f1)
print("Cohen's Kappa: ", kappa)

Test metrics:
Accuracy:  0.8314350797266514
Precision:  0.8091286307053942
Recall:  0.874439461883408
F1 Score:  0.8405172413793104
Cohen's Kappa:  0.6623427918095832
