In [1]:
import warnings
warnings.filterwarnings("ignore")
# Data processing libraries:
import numpy as np
import pandas as pd
import math
from scipy import stats
from scipy.stats import zscore
import string
import re
from transformers import pipeline
import time
import torch

# Plotting libraries:
from matplotlib import pyplot as plt

In [2]:
url = r"C:/users/thoma/Documents/Notebooks/Projet_ML/training_set_rel3.tsv"
df = pd.read_csv(url, sep='\t', decimal=".", encoding='unicode_escape')
df = df[['essay_set', 'essay','domain1_score']]
df = df.rename(columns={"essay": "text", "domain1_score": "label"})
for i in range(len(df)):

    if(df['essay_set'][i] == 1):
        min = 2
        max = 12
    if(df['essay_set'][i] == 2):
        min = 0
        max = 6
    if(df['essay_set'][i] == 3 or df['essay_set'][i]== 4):
        min = 0
        max = 3
    if(df['essay_set'][i] == 5 or df['essay_set'][i] == 6):
        min = 0
        max = 4
    if(df['essay_set'][i] == 7):
        min = 0
        max = 24
    if(df['essay_set'][i] == 8):
        min = 0
        max = 60

    df['label'][i] = ((df['label'][i] - min)/(max-min))

df = df[['label','text']]

x = []
y = []
for i in range(len(df)):
    x.append(df['text'][i])
    y.append(float(df['label'][i]))

print(len(x),len(y))

12976 12976


In [3]:
from sklearn.model_selection import train_test_split
train_texts, val_texts, train_labels, val_labels = train_test_split(x, y, test_size=.2,random_state=123)

In [4]:
from transformers import AutoTokenizer
from transformers import DistilBertTokenizerFast

tokenizer = DistilBertTokenizerFast.from_pretrained('distilbert-base-uncased')
#tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')

train_encodings = tokenizer(train_texts, truncation=True, padding=True)
val_encodings = tokenizer(val_texts, truncation=True, padding=True)

In [5]:
import torch

class EssayDataset(torch.utils.data.Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        return len(self.labels)

train_dataset = EssayDataset(train_encodings, train_labels)
val_dataset = EssayDataset(val_encodings, val_labels)

In [6]:
import evaluate
accuracy_metric = evaluate.load("accuracy")

In [7]:
from torch.utils.data import DataLoader
from transformers import AutoModelForSequenceClassification
from transformers import DistilBertForSequenceClassification, AdamW

#model = AutoModelForSequenceClassification.from_pretrained("bert-base-cased", num_labels=1)

device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

model = DistilBertForSequenceClassification.from_pretrained('distilbert-base-uncased',num_labels=1)
model.to(device)
model.train()

train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)

optim = AdamW(model.parameters(), lr=5e-5)

for epoch in range(1):
    for batch in train_loader:
        optim.zero_grad()
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)
        outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs[0]
        pred = outputs[1]
        results = accuracy_metric.compute(references=labels, predictions=pred)
        #train_acc = torch.sum(y_pred == target)
        loss.backward()
        optim.step()
        
        print(loss)
        print(results)

model.eval()

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


tensor(0.3599, device='cuda:0', grad_fn=<MseLossBackward0>)
{'accuracy': 0.9375}
tensor(0.2125, device='cuda:0', grad_fn=<MseLossBackward0>)
{'accuracy': 0.8125}
tensor(0.0513, device='cuda:0', grad_fn=<MseLossBackward0>)
{'accuracy': 0.9375}
tensor(0.0378, device='cuda:0', grad_fn=<MseLossBackward0>)
{'accuracy': 0.875}
tensor(0.0345, device='cuda:0', grad_fn=<MseLossBackward0>)
{'accuracy': 0.8125}
tensor(0.1158, device='cuda:0', grad_fn=<MseLossBackward0>)
{'accuracy': 0.875}
tensor(0.0535, device='cuda:0', grad_fn=<MseLossBackward0>)
{'accuracy': 0.875}
tensor(0.0402, device='cuda:0', grad_fn=<MseLossBackward0>)
{'accuracy': 0.9375}
tensor(0.0395, device='cuda:0', grad_fn=<MseLossBackward0>)
{'accuracy': 0.9375}
tensor(0.0314, device='cuda:0', grad_fn=<MseLossBackward0>)
{'accuracy': 0.9375}
tensor(0.0741, device='cuda:0', grad_fn=<MseLossBackward0>)
{'accuracy': 0.8125}
tensor(0.0381, device='cuda:0', grad_fn=<MseLossBackward0>)
{'accuracy': 0.875}
tensor(0.0640, device='cuda:0', 

DistilBertForSequenceClassification(
  (distilbert): DistilBertModel(
    (embeddings): Embeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (transformer): Transformer(
      (layer): ModuleList(
        (0-5): 6 x TransformerBlock(
          (attention): MultiHeadSelfAttention(
            (dropout): Dropout(p=0.1, inplace=False)
            (q_lin): Linear(in_features=768, out_features=768, bias=True)
            (k_lin): Linear(in_features=768, out_features=768, bias=True)
            (v_lin): Linear(in_features=768, out_features=768, bias=True)
            (out_lin): Linear(in_features=768, out_features=768, bias=True)
          )
          (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (ffn): FFN(
            (dropout): Dropout(p=0.1, inplace=False)
 

In [8]:
val_loader = DataLoader(val_dataset, batch_size=16, shuffle=True)
model.eval()
optim = AdamW(model.parameters(), lr=5e-5)
res = []
for batch in val_loader:
    optim.zero_grad()
    input_ids = batch['input_ids'].to(device)
    attention_mask = batch['attention_mask'].to(device)
    labels = batch['labels'].to(device)
    outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
    pred = outputs[1]
    results = accuracy_metric.compute(references=labels, predictions=pred)
    res.append(results['accuracy'])
np.mean(res)

0.9052914110429447

True