In [1]:
!pip install transformers[torch]
!pip install evaluate
!pip install chamd
!pip install pandas
!pip install datasets
!pip install huggingface_hub

Collecting transformers[torch]
  Downloading transformers-4.31.0-py3-none-any.whl (7.4 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.4/7.4 MB[0m [31m30.0 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.14.1 (from transformers[torch])
  Downloading huggingface_hub-0.16.4-py3-none-any.whl (268 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m268.8/268.8 kB[0m [31m27.6 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1 (from transformers[torch])
  Downloading tokenizers-0.13.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.8/7.8 MB[0m [31m86.2 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting safetensors>=0.3.1 (from transformers[torch])
  Downloading safetensors-0.3.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m7

In [2]:
import pandas as pd
import numpy as np
from datasets import Dataset
from transformers import AutoTokenizer, DataCollatorWithPadding, AutoModelForSequenceClassification, TrainingArguments, Trainer, AutoModelForSequenceClassification, TrainingArguments, Trainer
from chamd import ChatReader
from google.colab import drive
import os
import evaluate
drive.mount('/content/drive')
import threading
import torch.multiprocessing as mp

Mounted at /content/drive


In [5]:
# Accuracy metric object, loaded once and shared by every evaluation call.
accuracy = evaluate.load("accuracy")


def compute_metrics(eval_pred):
    """Compute accuracy for one evaluation pass.

    Args:
        eval_pred: A two-item iterable ``(predictions, labels)``.
            ``predictions`` is presumably a 2-D array of per-class scores
            (one row per sample) -- confirm against the caller.

    Returns:
        Whatever ``accuracy.compute`` returns for the arg-max class ids
        measured against ``labels``.
    """
    scores, references = eval_pred
    # Collapse the class axis: keep the index of the highest score per row.
    predicted_ids = np.argmax(scores, axis=1)
    return accuracy.compute(predictions=predicted_ids, references=references)

def train_distilBert_and_push(train_ds, run, df_id, valid_size=27, seed=None):
    """Fine-tune DistilBERT with one sample held out, then classify that sample.

    The sample at position ``df_id`` is removed from the data and used as a
    one-element test set. ``valid_size`` of the remaining samples are drawn at
    random as the validation set and the rest are used for training. The final
    model is saved under ``/content/drive/MyDrive/val27_models/<run>_<df_id>``
    and the intermediate checkpoint directory is deleted afterwards.

    Despite the function name, nothing is pushed to the Hub
    (``push_to_hub=False``).

    Args:
        train_ds: Object whose ``["text"]`` and ``["labels"]`` columns are
            iterables of equal length (the caller passes a ``datasets.Dataset``).
        run: Identifier of the current repetition; used in directory names.
        df_id: Position of the sample to hold out for testing.
        valid_size: Number of samples drawn for validation (default 27, the
            value that was previously hard-coded).
        seed: Optional ``random_state`` for the validation draw. ``None``
            (default) keeps the previous unseeded behaviour.

    Returns:
        The predicted class id (arg-max over the model outputs) for the
        held-out sample.

    Raises:
        IndexError: If ``df_id`` is not a valid position in ``train_ds``.
    """
    # Local import: the notebook's import cell does not provide shutil.
    import shutil

    tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")

    def preprocess_function(data):
        return tokenizer(data["text"], truncation=True)

    # Copy the columns into plain lists so that .pop() neither depends on the
    # concrete column type returned by train_ds nor touches the caller's data.
    train_text_list = list(train_ds["text"])
    train_labels_list = list(train_ds["labels"])
    test_text_list = [train_text_list.pop(df_id)]
    test_labels_list = [train_labels_list.pop(df_id)]

    test_df = pd.DataFrame({"text": test_text_list, "labels": test_labels_list})
    remaining_df = pd.DataFrame({"text": train_text_list, "labels": train_labels_list})
    valid_df = remaining_df.sample(n=valid_size, random_state=seed)
    train_df = remaining_df.drop(valid_df.index)

    tokenized_train = Dataset.from_pandas(train_df).map(preprocess_function, batched=True)
    tokenized_valid = Dataset.from_pandas(valid_df).map(preprocess_function, batched=True)
    tokenized_test = Dataset.from_pandas(test_df).map(preprocess_function, batched=True)
    data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

    id2label = {1: "NEGATIVE", 0: "POSITIVE"}
    label2id = {"NEGATIVE": 1, "POSITIVE": 0}

    model = AutoModelForSequenceClassification.from_pretrained(
        "distilbert-base-uncased", num_labels=2, id2label=id2label, label2id=label2id
    )

    # Name everything after the `run` argument. The previous code read the
    # caller's global `pool` when building the clean-up path, which only worked
    # while the caller happened to pass that same value as `run`.
    run_tag = str(run) + "_" + str(df_id)
    output_dir = "AD-distilBert" + run_tag

    training_args = TrainingArguments(
        output_dir=output_dir,
        learning_rate=2e-5,
        per_device_train_batch_size=4,
        per_device_eval_batch_size=4,
        num_train_epochs=8,
        weight_decay=0.01,
        evaluation_strategy="epoch",
        save_strategy="epoch",
        load_best_model_at_end=True,
        push_to_hub=False,
        # SECURITY: an access token used to be hard-coded here. Revoke that
        # token and supply a new one through the HF_TOKEN environment variable;
        # None is fine while push_to_hub is False.
        hub_token=os.environ.get("HF_TOKEN"),
    )

    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=tokenized_train,
        eval_dataset=tokenized_valid,
        tokenizer=tokenizer,
        data_collator=data_collator,
        compute_metrics=compute_metrics,
    )
    trainer.train()
    trainer.save_model("/content/drive/MyDrive/val27_models/" + run_tag)

    # Remove the per-epoch checkpoints now that the final model is saved.
    # Deleting the directory the Trainer was actually configured with avoids
    # both the hard-coded /content prefix and the shell interpolation of the
    # old `!rm -r`; ignore_errors keeps a missing directory non-fatal, as before.
    shutil.rmtree(training_args.output_dir, ignore_errors=True)

    predictions = trainer.predict(tokenized_test)
    # The test set holds exactly one sample, hence index [0].
    pred_result = np.argmax(predictions.predictions, axis=-1)[0]
    return pred_result

In [None]:
# Build one row per transcript file: the concatenated participant speech, the
# session id taken from the file metadata, and the class label.
fileUrl = "/content/drive/MyDrive/ADReSS-IS2020-train/ADReSS-IS2020-data/train/transcription/"
groups = ["cc", "cd"]
reader = ChatReader()
lines = []
for group in groups:
    group_dir = os.path.join(fileUrl, group)
    # os.listdir returns entries in arbitrary order. Sorting makes the row
    # positions in `df` (used as sample ids downstream) stable across runs.
    for fileName in sorted(os.listdir(group_dir)):
        file = reader.read_file(os.path.join(group_dir, fileName))
        # Keep only the utterances whose speaker code is "PAR".
        # NOTE(review): utterances are joined without a separator, exactly as
        # before -- confirm that line.text carries its own trailing whitespace.
        lines_to_add = "".join(
            line.text for line in file.lines if line.metadata["speaker"].text == "PAR"
        )
        # Label is 1 for files under "cc" and 0 for files under "cd".
        lines.append([lines_to_add, file.metadata["session"].text, 1 if group == "cc" else 0])

df = pd.DataFrame(lines, columns=["text", "session", "labels"])

# Leave-one-out evaluation, repeated for runs 10..19. Each run fine-tunes one
# model per held-out sample and writes that run's predictions to its own CSV.
threads = []  # not used in this cell; kept in case another cell references it

# The input data is identical for every run and every held-out index, so it is
# built once here instead of inside the inner loop.
df_train = pd.DataFrame({'text': list(df["text"]), 'labels': list(df["labels"])})
train_ds = Dataset.from_pandas(df_train)

for pool in range(10, 20):
    labels = []
    ids = []
    # Iterate over every sample actually loaded rather than a hard-coded 108.
    for idx in range(len(df_train)):
        try:
            label = train_distilBert_and_push(train_ds, pool, idx)
        except Exception as e:
            # Best-effort: one failed fold must not abort the whole run, but
            # report which fold failed so the gap in the CSV can be traced.
            print(f"pool {pool}, idx {idx} failed: {e!r}")
        else:
            labels.append(label)
            ids.append(idx)
    results = pd.DataFrame({"id": ids, "label": labels})
    results.to_csv("/content/drive/MyDrive/val27_finetunedLOO" + str(pool) + ".csv", index=True)