In [1]:
import numpy as np
import pandas as pd
import torch
from sentence_transformers import SentenceTransformer
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
import tensorflow as tf
from transformers import AutoTokenizer
from datasets import load_dataset, Dataset, Features
from transformers import DataCollatorWithPadding
import evaluate
from transformers import AutoModelForSequenceClassification, TrainingArguments, Trainer

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
data = pd.read_excel("Dataset.xlsx")
data = data.rename(columns={'Sentence': 'text', 'Label': 'label'})
data = data.sample(frac = 1).reset_index(drop=True)
data

Unnamed: 0,text,label
0,"Depending on the emotion, an appropriate measu...",Purpose
1,We apply this principle to gigapixel image ren...,Methods
2,"In this study, a random forest model with two ...",Methods
3,We present a framework for acuity-driven visua...,Purpose
4,"For this purpose, a procedure for characterizi...",Methods
...,...,...
246,"At last, we discuss several trends in auto-par...",Methods
247,"Using graphical demonstration, the impact of v...",Methods
248,It can not only enable the clients to preserve...,Background of Research
249,Results showed that the novel Hybrid RuDSTCs c...,Results


Test 1

In [3]:
sbert = SentenceTransformer('all-MiniLM-L12-v2')
X = sbert.encode(data['Sentence'])
y = data['Label'].replace({'a1': 0, 'a2': 1, 'a3' : 2, 'a4' : 3, 'a5' : 4}, inplace=False).to_numpy()
print(X.shape)
print(y.shape)

(251, 384)
(251,)


In [17]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.3, random_state = 24)

In [18]:
model = tf.keras.Sequential([
    tf.keras.layers.Dense(1024, activation = "relu"),
    tf.keras.layers.Dense(1024, activation = "relu"),
    tf.keras.layers.Dense(512, activation = "relu"),
    tf.keras.layers.Dropout(0.3),
    tf.keras.layers.Dense(512, activation = "relu"),
    tf.keras.layers.Dropout(0.3),
    tf.keras.layers.Dense(256, activation = "relu"),
    tf.keras.layers.Dense(128, activation = "relu"),
    tf.keras.layers.Dropout(0.3),
    tf.keras.layers.Dense(64, activation = "relu"),
    
    tf.keras.layers.Dense(32, activation = "relu"),
    tf.keras.layers.Dense(5, activation = "softmax")
])

In [26]:
model.compile(loss = "sparse_categorical_crossentropy", optimizer = "adam", metrics = ["accuracy"])
history = model.fit(X_train, y_train, epochs = 20, batch_size = 16)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


In [27]:
y_pred = model.predict(X_test)
y_pred_class = np.argmax(y_pred, axis = 1)
print(classification_report(y_pred_class, y_test))

              precision    recall  f1-score   support

           0       0.27      0.25      0.26        12
           1       0.40      0.40      0.40        10
           2       0.59      0.62      0.60        26
           3       0.46      0.43      0.44        14
           4       0.33      0.36      0.34        14

    accuracy                           0.45        76
   macro avg       0.41      0.41      0.41        76
weighted avg       0.44      0.45      0.45        76



Test 2

In [3]:

features = Features.from_dict({'text': {'dtype': 'string', 'id': None, '_type': 'Value'}, 'label': {'names': data['label'].unique().tolist(), 'id': None, '_type': 'ClassLabel'}})

train = data.groupby('label').head(10).reset_index(drop=True)
test = data.groupby('label').tail(-10).reset_index(drop=True)
train = Dataset.from_pandas(train, features=features)
test = Dataset.from_pandas(test, features=features)
print(train, test)

Dataset({
    features: ['text', 'label'],
    num_rows: 50
}) Dataset({
    features: ['text', 'label'],
    num_rows: 201
})


In [4]:
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")
def preprocess_function(examples):
    return tokenizer(examples["text"], truncation=True)

train = train.map(preprocess_function)
test = test.map(preprocess_function)

data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

Map: 100%|██████████| 50/50 [00:00<00:00, 466.30 examples/s]
Map: 100%|██████████| 201/201 [00:00<00:00, 909.85 examples/s]


In [10]:
def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    predictions = np.argmax(predictions, axis=1)
    return accuracy.compute(predictions=predictions, references=labels)

accuracy = evaluate.load("accuracy")
id2label = {i: label for i, label in enumerate(data['label'].unique().tolist())}
label2id = {label: i for i, label in enumerate(data['label'].unique().tolist())}

training_args = TrainingArguments(
    output_dir="/",
    learning_rate=1e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=10,
    weight_decay=0.01,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True
)

In [11]:
model2 = AutoModelForSequenceClassification.from_pretrained(
    "distilbert-base-uncased", num_labels=5, id2label=id2label, label2id=label2id
)

trainer = Trainer(
    model=model2,
    args=training_args,
    train_dataset=train,
    eval_dataset=test,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

trainer.train()
trainer.evaluate()

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['pre_classifier.bias', 'classifier.weight', 'classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
 10%|█         | 4/40 [00:20<02:31,  4.22s/it]
 10%|█         | 4/40 [00:45<02:31,  4.22s/it] 

{'eval_loss': 1.6099519729614258, 'eval_accuracy': 0.17412935323383086, 'eval_runtime': 23.3425, 'eval_samples_per_second': 8.611, 'eval_steps_per_second': 0.557, 'epoch': 1.0}


 20%|██        | 8/40 [01:11<03:58,  7.44s/it]
 20%|██        | 8/40 [01:34<03:58,  7.44s/it] 

{'eval_loss': 1.6069296598434448, 'eval_accuracy': 0.18407960199004975, 'eval_runtime': 22.9251, 'eval_samples_per_second': 8.768, 'eval_steps_per_second': 0.567, 'epoch': 2.0}


 30%|███       | 12/40 [02:01<03:38,  7.79s/it]
 30%|███       | 12/40 [02:23<03:38,  7.79s/it]

{'eval_loss': 1.6029309034347534, 'eval_accuracy': 0.19900497512437812, 'eval_runtime': 22.1417, 'eval_samples_per_second': 9.078, 'eval_steps_per_second': 0.587, 'epoch': 3.0}


 40%|████      | 16/40 [02:48<03:02,  7.62s/it]
 40%|████      | 16/40 [03:09<03:02,  7.62s/it]

{'eval_loss': 1.59913170337677, 'eval_accuracy': 0.21890547263681592, 'eval_runtime': 21.0562, 'eval_samples_per_second': 9.546, 'eval_steps_per_second': 0.617, 'epoch': 4.0}


 50%|█████     | 20/40 [03:32<02:34,  7.72s/it]
 50%|█████     | 20/40 [03:54<02:34,  7.72s/it]

{'eval_loss': 1.5963943004608154, 'eval_accuracy': 0.21393034825870647, 'eval_runtime': 21.3136, 'eval_samples_per_second': 9.431, 'eval_steps_per_second': 0.61, 'epoch': 5.0}


 60%|██████    | 24/40 [04:19<02:07,  7.94s/it]
 60%|██████    | 24/40 [04:41<02:07,  7.94s/it]

{'eval_loss': 1.5941962003707886, 'eval_accuracy': 0.208955223880597, 'eval_runtime': 21.7113, 'eval_samples_per_second': 9.258, 'eval_steps_per_second': 0.599, 'epoch': 6.0}


 70%|███████   | 28/40 [05:02<01:28,  7.41s/it]
 70%|███████   | 28/40 [05:24<01:28,  7.41s/it]

{'eval_loss': 1.5925624370574951, 'eval_accuracy': 0.21393034825870647, 'eval_runtime': 22.1565, 'eval_samples_per_second': 9.072, 'eval_steps_per_second': 0.587, 'epoch': 7.0}


 80%|████████  | 32/40 [05:48<01:00,  7.56s/it]
 80%|████████  | 32/40 [06:09<01:00,  7.56s/it]

{'eval_loss': 1.5917268991470337, 'eval_accuracy': 0.20398009950248755, 'eval_runtime': 20.7906, 'eval_samples_per_second': 9.668, 'eval_steps_per_second': 0.625, 'epoch': 8.0}


 90%|█████████ | 36/40 [06:30<00:28,  7.03s/it]
 90%|█████████ | 36/40 [06:52<00:28,  7.03s/it]

{'eval_loss': 1.5911369323730469, 'eval_accuracy': 0.19402985074626866, 'eval_runtime': 21.8503, 'eval_samples_per_second': 9.199, 'eval_steps_per_second': 0.595, 'epoch': 9.0}


100%|██████████| 40/40 [07:17<00:00,  7.60s/it]
100%|██████████| 40/40 [07:38<00:00,  7.60s/it]

{'eval_loss': 1.5909351110458374, 'eval_accuracy': 0.19402985074626866, 'eval_runtime': 21.2672, 'eval_samples_per_second': 9.451, 'eval_steps_per_second': 0.611, 'epoch': 10.0}


100%|██████████| 40/40 [07:46<00:00, 11.66s/it]


{'train_runtime': 466.9598, 'train_samples_per_second': 1.071, 'train_steps_per_second': 0.086, 'train_loss': 1.5626529693603515, 'epoch': 10.0}


100%|██████████| 13/13 [00:20<00:00,  1.58s/it]


{'eval_loss': 1.5909351110458374,
 'eval_accuracy': 0.19402985074626866,
 'eval_runtime': 21.4278,
 'eval_samples_per_second': 9.38,
 'eval_steps_per_second': 0.607,
 'epoch': 10.0}

In [12]:
tokenizer2 = AutoTokenizer.from_pretrained("distilbert-base-uncased-finetuned-sst-2-english")
data_collator2 = DataCollatorWithPadding(tokenizer=tokenizer2)
model3 = AutoModelForSequenceClassification.from_pretrained(
    "distilbert-base-uncased-finetuned-sst-2-english", num_labels=5, id2label = id2label, label2id = label2id, ignore_mismatched_sizes=True

)

def preprocess_function(examples):
    return tokenizer2(examples["text"], truncation=True)

train1 = train.map(preprocess_function)
test1 = test.map(preprocess_function)

trainer2 = Trainer(
    model=model3,
    args=training_args,
    train_dataset=train1,
    eval_dataset=test1,
    tokenizer=tokenizer2,
    data_collator=data_collator2,
    compute_metrics=compute_metrics,
)

trainer2.train()

Downloading (…)okenizer_config.json: 100%|██████████| 48.0/48.0 [00:00<00:00, 47.8kB/s]
To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to see activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development
Downloading (…)lve/main/config.json: 100%|██████████| 629/629 [00:00<00:00, 571kB/s]
Downloading (…)solve/main/vocab.txt: 100%|██████████| 232k/232k [00:00<00:00, 536kB/s]
Downloading model.safetensors: 100%|██████████| 268M/268M [00:14<00:00, 18.0MB/s] 
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased-finetuned-sst-2-english and are newly initialized because the shapes did not match:
- classifier.bias: found shape torch.Size([2]) in the checkpoint and torch.Size([5]) in the model instantiated
- classifier.weight: found shape torch.Size([2, 768]) in the checkpoint

{'eval_loss': 1.6149466037750244, 'eval_accuracy': 0.17412935323383086, 'eval_runtime': 23.3535, 'eval_samples_per_second': 8.607, 'eval_steps_per_second': 0.557, 'epoch': 1.0}


 20%|██        | 8/40 [01:09<03:41,  6.92s/it]
 20%|██        | 8/40 [01:28<03:41,  6.92s/it] 

{'eval_loss': 1.6122069358825684, 'eval_accuracy': 0.17412935323383086, 'eval_runtime': 19.6944, 'eval_samples_per_second': 10.206, 'eval_steps_per_second': 0.66, 'epoch': 2.0}


 30%|███       | 12/40 [01:53<03:18,  7.09s/it]
 30%|███       | 12/40 [02:15<03:18,  7.09s/it]

{'eval_loss': 1.6085256338119507, 'eval_accuracy': 0.208955223880597, 'eval_runtime': 22.0558, 'eval_samples_per_second': 9.113, 'eval_steps_per_second': 0.589, 'epoch': 3.0}


 40%|████      | 16/40 [02:40<02:58,  7.43s/it]
 40%|████      | 16/40 [03:02<02:58,  7.43s/it]

{'eval_loss': 1.6058290004730225, 'eval_accuracy': 0.22885572139303484, 'eval_runtime': 21.8818, 'eval_samples_per_second': 9.186, 'eval_steps_per_second': 0.594, 'epoch': 4.0}


 50%|█████     | 20/40 [03:26<02:37,  7.86s/it]
 50%|█████     | 20/40 [03:48<02:37,  7.86s/it]

{'eval_loss': 1.6042907238006592, 'eval_accuracy': 0.22885572139303484, 'eval_runtime': 21.5405, 'eval_samples_per_second': 9.331, 'eval_steps_per_second': 0.604, 'epoch': 5.0}


 60%|██████    | 24/40 [04:14<02:12,  8.30s/it]
 60%|██████    | 24/40 [04:38<02:12,  8.30s/it]

{'eval_loss': 1.6031250953674316, 'eval_accuracy': 0.22885572139303484, 'eval_runtime': 23.1842, 'eval_samples_per_second': 8.67, 'eval_steps_per_second': 0.561, 'epoch': 6.0}


 70%|███████   | 28/40 [05:00<01:33,  7.78s/it]
 70%|███████   | 28/40 [05:23<01:33,  7.78s/it]

{'eval_loss': 1.6026393175125122, 'eval_accuracy': 0.22388059701492538, 'eval_runtime': 22.6415, 'eval_samples_per_second': 8.878, 'eval_steps_per_second': 0.574, 'epoch': 7.0}


 80%|████████  | 32/40 [05:47<01:01,  7.74s/it]
 80%|████████  | 32/40 [06:09<01:01,  7.74s/it]

{'eval_loss': 1.6011980772018433, 'eval_accuracy': 0.22388059701492538, 'eval_runtime': 21.8256, 'eval_samples_per_second': 9.209, 'eval_steps_per_second': 0.596, 'epoch': 8.0}


 90%|█████████ | 36/40 [06:31<00:29,  7.41s/it]
 90%|█████████ | 36/40 [06:53<00:29,  7.41s/it]

{'eval_loss': 1.6003878116607666, 'eval_accuracy': 0.23880597014925373, 'eval_runtime': 21.5266, 'eval_samples_per_second': 9.337, 'eval_steps_per_second': 0.604, 'epoch': 9.0}


100%|██████████| 40/40 [07:17<00:00,  7.59s/it]
100%|██████████| 40/40 [07:39<00:00,  7.59s/it]

{'eval_loss': 1.6000033617019653, 'eval_accuracy': 0.24875621890547264, 'eval_runtime': 21.6838, 'eval_samples_per_second': 9.27, 'eval_steps_per_second': 0.6, 'epoch': 10.0}


100%|██████████| 40/40 [07:47<00:00, 11.70s/it]

{'train_runtime': 468.1127, 'train_samples_per_second': 1.068, 'train_steps_per_second': 0.085, 'train_loss': 1.5118670463562012, 'epoch': 10.0}





TrainOutput(global_step=40, training_loss=1.5118670463562012, metrics={'train_runtime': 468.1127, 'train_samples_per_second': 1.068, 'train_steps_per_second': 0.085, 'train_loss': 1.5118670463562012, 'epoch': 10.0})

In [13]:
tokenizer3 = AutoTokenizer.from_pretrained("MoritzLaurer/DeBERTa-v3-base-mnli-fever-anli")
data_collator3 = DataCollatorWithPadding(tokenizer=tokenizer3)
model4 = AutoModelForSequenceClassification.from_pretrained(
    "MoritzLaurer/DeBERTa-v3-base-mnli-fever-anli", num_labels=5, id2label = id2label, label2id = label2id, ignore_mismatched_sizes=True
)

def preprocess_function(examples):
    return tokenizer3(examples["text"], truncation=True)

train2 = train.map(preprocess_function)
test2 = test.map(preprocess_function)

trainer3 = Trainer(
    model=model4,
    args=training_args,
    train_dataset=train2,
    eval_dataset=test2,
    tokenizer=tokenizer3,
    data_collator=data_collator3,
    compute_metrics=compute_metrics,
)

trainer3.train()

Some weights of DebertaV2ForSequenceClassification were not initialized from the model checkpoint at MoritzLaurer/DeBERTa-v3-base-mnli-fever-anli and are newly initialized because the shapes did not match:
- classifier.bias: found shape torch.Size([3]) in the checkpoint and torch.Size([5]) in the model instantiated
- classifier.weight: found shape torch.Size([3, 768]) in the checkpoint and torch.Size([5, 768]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Map: 100%|██████████| 50/50 [00:00<00:00, 405.52 examples/s]
Map: 100%|██████████| 201/201 [00:00<00:00, 916.63 examples/s]


MlflowException: Changing param values is not allowed. Param with key='torch_dtype' was already logged with value='None' for run ID='4187cd981e3d49f0b589f5f4663d93b9'. Attempted logging new value 'float16'.