In [1]:
!pip install transformers
!pip install chamd
import numpy as np
import pandas as pd
import os
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import RandomForestClassifier
import torch
import transformers as ppb
import warnings
from chamd import ChatReader
import random
from sklearn.decomposition import PCA
from sklearn.svm import SVC
warnings.filterwarnings('ignore')

Collecting transformers
  Downloading transformers-4.32.0-py3-none-any.whl (7.5 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.5/7.5 MB[0m [31m15.9 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.15.1 (from transformers)
  Downloading huggingface_hub-0.16.4-py3-none-any.whl (268 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m268.8/268.8 kB[0m [31m22.2 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1 (from transformers)
  Downloading tokenizers-0.13.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.8/7.8 MB[0m [31m39.0 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting safetensors>=0.3.1 (from transformers)
  Downloading safetensors-0.3.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m44.5 MB/s[0m eta [36m0:00:0

In [2]:
fileUrl = "/content/drive/MyDrive/ADReSS-IS2020-train/ADReSS-IS2020-data/train/transcription/"
groups = ["cc", "cd"]
reader = ChatReader()
lines = []
for group in groups:
    for fileName in os.listdir(fileUrl + group):
        file = reader.read_file(fileUrl + group + "/" + fileName)
        lines_to_add = ""
        for line in file.lines:
            if line.metadata["speaker"].text == "PAR":
              lines_to_add = lines_to_add + line.text
        lines.append([lines_to_add, file.metadata["session"].text, 1 if group == "cc" else 0])

df = pd.DataFrame(lines, columns=["X", "session", "y"])

In [3]:
# For DistilBERT:
model_class, tokenizer_class, pretrained_weights = (ppb.DistilBertModel, ppb.DistilBertTokenizer, 'distilbert-base-uncased')

# Load pretrained model/tokenizer
tokenizer = tokenizer_class.from_pretrained(pretrained_weights)
model = model_class.from_pretrained(pretrained_weights)

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

Downloading model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

In [4]:

tokenized = df["X"].apply((lambda X: tokenizer.encode(X, add_special_tokens=True)))
max_len = 0
for i in tokenized.values:
    if len(i) > max_len:
        max_len = len(i)

padded = np.array([i + [0]*(max_len-len(i)) for i in tokenized.values])
attention_mask = np.where(padded != 0, 1, 0)



In [5]:
input_ids = torch.tensor(padded)
attention_mask = torch.tensor(attention_mask)

with torch.no_grad():
    last_hidden_states = model(input_ids, attention_mask=attention_mask)

features = last_hidden_states[0][:,0,:].numpy()
labels = df["y"]

df = pd.DataFrame({"embeddings" : last_hidden_states[0][:,0,:].tolist(), "group": df["y"]})

In [6]:
last_hidden_states[0][:,0,:].tolist()

Output hidden; open in https://colab.research.google.com to view.

In [10]:
import threading
runs = {}

def train_LR_and_save_preds(run,a):
  pred_list = []
  for i in range(0, 108):
      train_features, train_labels = df["embeddings"].tolist(), df["group"].tolist()
      test_embedding = train_features.pop(i)
      test_label = train_labels.pop(i)
      train_df = pd.DataFrame({"features": train_features, "labels": train_labels})
      valid_df = train_df.sample(n=11)
      train_df = train_df.drop(valid_df.index)
      train_features, train_labels = train_df["features"].tolist(), train_df["labels"].tolist()
      # Logistic Regression / Grid search
      parameters = {'C': np.linspace(0.0001, 100, 20)}
      grid_search = GridSearchCV(LogisticRegression(), parameters)
      grid_search.fit(train_features, train_labels)
      print("run " + str(run) + " i: " + str(i))
      lr_clf = LogisticRegression(C = grid_search.best_params_['C'])
      lr_clf.fit(train_features, train_labels)
      pred_list.append(lr_clf.predict([test_embedding])[0])
  pd.DataFrame({"pred" : pred_list}).to_csv("/content/drive/MyDrive/LR_PREDS_DB/" + str(run) + ".csv", index=True)

threads = []
for pool in range(37,50):
  t = threading.Thread(target = train_LR_and_save_preds, args = (pool, 7))
  threads.append(t)
  t.start()
  if pool >= 5:
    threads[0].join()
    threads.remove(threads[0])

for model_type in models.keys():
    result_df = {}
    for N in [5,15,25,35,45,1]:
        scores = []
        for run in range(0,100):
            selected_models = []
            for i_ensemble in range(0, N):
                idx = random.randint(0, 49)
                selected_models.append(models[model_type][idx])
            correct_votes = 0
            for df_idx in df.index:
                AD_votes = 0
                for model in selected_models:
                    AD_votes = AD_votes + model.predict([df["embeddings"][df_idx]])[0]
                final_vote = 1 if AD_votes > (N - AD_votes) else 0
                correct_votes = (correct_votes + 1) if final_vote == [df["group"][df_idx]][0] else correct_votes
            scores.append(correct_votes / len(df.index))
        result_df[str(N)] = scores
    #pd.DataFrame(result_df).to_csv("/content/drive/MyDrive/BERT_" + model_type + ".csv", index=True)

run 37 i: 0
run 37 i: 1
run 37 i: 2
run 37 i: 3
run 37 i: 4
run 37 i: 5
run 37 i: 6
run 37 i: 7
run 37 i: 8
run 37 i: 9
run 37 i: 10
run 37 i: 11
run 37 i: 12
run 37 i: 13
run 37 i: 14
run 37 i: 15
run 37 i: 16
run 37 i: 17
run 37 i: 18
run 37 i: 19
run 37 i: 20
run 37 i: 21
run 37 i: 22
run 37 i: 23
run 37 i: 24
run 37 i: 25
run 37 i: 26
run 37 i: 27
run 37 i: 28
run 37 i: 29
run 37 i: 30
run 37 i: 31
run 37 i: 32
run 37 i: 33
run 37 i: 34
run 37 i: 35
run 37 i: 36
run 37 i: 37
run 37 i: 38
run 37 i: 39
run 37 i: 40
run 37 i: 41
run 37 i: 42
run 37 i: 43
run 37 i: 44
run 37 i: 45
run 37 i: 46
run 37 i: 47
run 37 i: 48
run 37 i: 49
run 37 i: 50
run 37 i: 51
run 37 i: 52
run 37 i: 53
run 37 i: 54
run 37 i: 55
run 37 i: 56
run 37 i: 57
run 37 i: 58
run 37 i: 59
run 37 i: 60
run 37 i: 61
run 37 i: 62
run 37 i: 63
run 37 i: 64
run 37 i: 65
run 37 i: 66
run 37 i: 67
run 37 i: 68
run 37 i: 69
run 37 i: 70
run 37 i: 71
run 37 i: 72
run 37 i: 73
run 37 i: 74
run 37 i: 75
run 37 i: 76
run 37 i:

NameError: ignored

In [None]:
class PCASVM:
    def __init__(self):
        self.pca_model = PCA(n_components=107)
        self.svm_classifier = SVC()

    def fit(self, x_train, y_train):
        self.pca_model.fit(x_train)
        emb_comps = self.pca_model.transform(x_train)
        self.svm_classifier.fit(emb_comps, y_train)

    def predict(self, x):
        emb_comps = self.pca_model.transform(x)
        return self.svm_classifier.predict(emb_comps)

