## Import libraries

In [1]:
!pip install datasets
!pip install transformers




In [2]:
pip install accelerate -U

Note: you may need to restart the kernel to use updated packages.


In [3]:
import pandas as pd
import numpy as np
import os
from sklearn.model_selection import train_test_split
from datasets import load_dataset, load_metric # Load data
from transformers import AutoTokenizer # Tokenisation
from transformers import AutoModelForSequenceClassification # Classification
from transformers import TrainingArguments 
from transformers import Trainer 

  from .autonotebook import tqdm as notebook_tqdm


In [4]:
# The input data dir. Should contain the .tsv files (or other data files) for the task.
DATA_DIR = "/Users/ankit/Downloads/Ideathon/trial_Recommender/ideathon/data/"

LLM_MODEL = 'bert-base-uncased'
#LLM_MODEL = "vinai/bertweet-base"

# The name of the task to train.I'm going to name this 'yelp'.
TASK_NAME = 'complaints'

# The output directory where the fine-tuned model and checkpoints will be written.
OUTPUT_DIR = f'/Users/ankit/Downloads/Ideathon/trial_Recommender/ideathon/outputs/{TASK_NAME}/'

# The directory where the evaluation reports will be written to.
REPORTS_DIR = f'/Users/ankit/Downloads/Ideathon/trial_Recommender/ideathon/reports/{TASK_NAME}_evaluation_report/'

# This is where BERT will look for pre-trained models to load parameters from.
CACHE_DIR = '/Users/ankit/Downloads/Ideathon/trial_Recommender/ideathon/cache/'

if not os.path.exists(DATA_DIR):
    os.makedirs(DATA_DIR)
if not os.path.exists(CACHE_DIR):
    os.makedirs(CACHE_DIR)
if os.path.exists(REPORTS_DIR) and os.listdir(REPORTS_DIR):
        REPORTS_DIR += f'/report_{len(os.listdir(REPORTS_DIR))}'
        os.makedirs(REPORTS_DIR)
if not os.path.exists(REPORTS_DIR):
    os.makedirs(REPORTS_DIR)
    REPORTS_DIR += f'/report_{len(os.listdir(REPORTS_DIR))}'
    os.makedirs(REPORTS_DIR)
#if os.path.exists(OUTPUT_DIR) and os.listdir(OUTPUT_DIR):
#        raise ValueError("Output directory ({}) already exists and is not empty.".format(OUTPUT_DIR))
if not os.path.exists(OUTPUT_DIR):
    os.makedirs(OUTPUT_DIR)

## Read the raw data and load into dataset objects

In [7]:
raw_df = pd.read_csv('selected_columns.csv', header=None, names=['concatenated', 'numeric_condition'])
# binary label - 1 is a complain and 0 is not
print("Sample Data--\n", raw_df.head())
print("Data Length--",len(raw_df))

bert_df = pd.DataFrame({
    #'id':range(len(raw_df)),
    'label': raw_df["numeric_condition"],
    #'alpha': ['x']*len(raw_df),
    'text': raw_df["concatenated"]
})
print("Sample Data after transformation--\n", bert_df.head())
print("Data Length after transformation--",len(bert_df))

# Split the dataset
temp_bert_df, test_bert_df = train_test_split(bert_df, test_size=0.2, random_state=42)
train_bert_df, dev_bert_df = train_test_split(temp_bert_df, test_size=0.2, random_state=42)
print("Train, Dev and Test sizes--", len(train_bert_df), len(dev_bert_df), len(test_bert_df))

# Convert to tsv
train_bert_df.to_csv('/Users/ankit/Downloads/Ideathon/trial_Recommender/ideathon/data/train.tsv', sep='\t', index=False, header=True)
dev_bert_df.to_csv('/Users/ankit/Downloads/Ideathon/trial_Recommender/ideathon/data/dev.tsv', sep='\t', index=False, header=True)
test_bert_df.to_csv('/Users/ankit/Downloads/Ideathon/trial_Recommender/ideathon/data/test.tsv', sep='\t', index=False, header=True)

Sample Data--
                                         concatenated  numeric_condition
0  Study on Examination of Therapeutic Efficacy a...                  2
1  Factors Influencing Social Functioning of Peop...                  2
2  A Study to Assess the Rate of Hospitalization ...                  2
3  Involuntary Memories Investigation in Schizoph...                  2
4  Perception of Facial Emotions in Schizophrenia...                  2
Data Length-- 1028
Sample Data after transformation--
    label                                               text
0      2  Study on Examination of Therapeutic Efficacy a...
1      2  Factors Influencing Social Functioning of Peop...
2      2  A Study to Assess the Rate of Hospitalization ...
3      2  Involuntary Memories Investigation in Schizoph...
4      2  Perception of Facial Emotions in Schizophrenia...
Data Length after transformation-- 1028
Train, Dev and Test sizes-- 657 165 206


In [8]:
data_files = {
    "train": "/Users/ankit/Downloads/Ideathon/trial_Recommender/ideathon/data/train.tsv",
    "val": "/Users/ankit/Downloads/Ideathon/trial_Recommender/ideathon/data/dev.tsv",
    "test": "/Users/ankit/Downloads/Ideathon/trial_Recommender/ideathon/data/test.tsv"
}

twt_datasets = load_dataset("csv", data_files=data_files, delimiter='\t')
print(twt_datasets)

Downloading and preparing dataset csv/default to /Users/ankit/.cache/huggingface/datasets/csv/default-39117c2677f64854/0.0.0/eea64c71ca8b46dd3f537ed218fc9bf495d5707789152eb2764f5c78fa66d59d...


Downloading data files: 100%|██████████| 3/3 [00:00<00:00, 4600.70it/s]
Extracting data files: 100%|██████████| 3/3 [00:00<00:00, 236.95it/s]
                                                        

Dataset csv downloaded and prepared to /Users/ankit/.cache/huggingface/datasets/csv/default-39117c2677f64854/0.0.0/eea64c71ca8b46dd3f537ed218fc9bf495d5707789152eb2764f5c78fa66d59d. Subsequent calls will reuse this data.


100%|██████████| 3/3 [00:00<00:00, 34.30it/s]

DatasetDict({
    train: Dataset({
        features: ['label', 'text'],
        num_rows: 657
    })
    val: Dataset({
        features: ['label', 'text'],
        num_rows: 165
    })
    test: Dataset({
        features: ['label', 'text'],
        num_rows: 206
    })
})





## Perform pre-processing

In [9]:
#tokenizer = AutoTokenizer.from_pretrained("roberta-base")
tokenizer = AutoTokenizer.from_pretrained(LLM_MODEL, 
                                          use_fast=False,
                                          force_download=True)

# Function
def data_tokenizer(examples):
    return tokenizer(examples["text"], padding="max_length", truncation=True)

tokenized_data = twt_datasets.map(data_tokenizer, batched=True)
train_dataset = tokenized_data["train"]
eval_dataset = tokenized_data["val"]
test_dataset = tokenized_data["test"]
print("Data sizes-- Train",len(train_dataset),"Eval",len(eval_dataset), "Test",len(test_dataset))

Downloading (…)okenizer_config.json: 100%|██████████| 28.0/28.0 [00:00<00:00, 2.65kB/s]
Downloading (…)lve/main/config.json: 100%|██████████| 570/570 [00:00<00:00, 164kB/s]
Downloading (…)solve/main/vocab.txt: 100%|██████████| 232k/232k [00:01<00:00, 133kB/s]
Downloading (…)okenizer_config.json: 100%|██████████| 28.0/28.0 [00:00<00:00, 6.47kB/s]
                                                              

Data sizes-- Train 657 Eval 165 Test 206




## Perform Fine-tuning

In [10]:
model = AutoModelForSequenceClassification.from_pretrained(LLM_MODEL, num_labels=3)

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly i

In [11]:
metric = load_metric("accuracy")

def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    return metric.compute(predictions=predictions, references=labels)


  metric = load_metric("accuracy")


In [12]:
training_args = TrainingArguments("test_trainer", 
                                  evaluation_strategy="steps",
                                  eval_steps=100,  
                                  save_total_limit=2, 
                                  metric_for_best_model='accuracy',   
                                  greater_is_better=True, 
                                  load_best_model_at_end=True,
                                  report_to="none")
trainer = Trainer(model=model, 
                  args=training_args, 
                  train_dataset=train_dataset, 
                  eval_dataset=eval_dataset,
                  compute_metrics=compute_metrics
                 )

trainer.train()


                                                   
 40%|████      | 100/249 [55:28<1:32:25, 37.22s/it]

{'eval_loss': 0.0024560855235904455, 'eval_accuracy': 1.0, 'eval_runtime': 95.6753, 'eval_samples_per_second': 1.725, 'eval_steps_per_second': 0.219, 'epoch': 1.2}


 80%|████████  | 200/249 [1:50:53<31:21, 38.40s/it]  
 80%|████████  | 200/249 [1:52:30<31:21, 38.40s/it]

{'eval_loss': 0.0009559995960444212, 'eval_accuracy': 1.0, 'eval_runtime': 96.7698, 'eval_samples_per_second': 1.705, 'eval_steps_per_second': 0.217, 'epoch': 2.41}


100%|██████████| 249/249 [2:19:43<00:00, 33.67s/it]


{'train_runtime': 8383.3777, 'train_samples_per_second': 0.235, 'train_steps_per_second': 0.03, 'train_loss': 0.09791247815970915, 'epoch': 3.0}


TrainOutput(global_step=249, training_loss=0.09791247815970915, metrics={'train_runtime': 8383.3777, 'train_samples_per_second': 0.235, 'train_steps_per_second': 0.03, 'train_loss': 0.09791247815970915, 'epoch': 3.0})

## Evaluate

In [13]:
trainer.evaluate()

100%|██████████| 21/21 [01:38<00:00,  4.71s/it]


{'eval_loss': 0.000675668939948082,
 'eval_accuracy': 1.0,
 'eval_runtime': 106.7889,
 'eval_samples_per_second': 1.545,
 'eval_steps_per_second': 0.197,
 'epoch': 3.0}

In [14]:
results = trainer.predict(test_dataset=test_dataset)

100%|██████████| 26/26 [01:54<00:00,  4.40s/it]


In [15]:
trainer.save_model("/Users/ankit/Downloads/Ideathon/trial_Recommender")



In [25]:
from transformers import AutoModelForSequenceClassification, AutoTokenizer

In [35]:


model_path = "/Users/ankit/Downloads/Ideathon/trial_Recommender"
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased", use_fast=True)
model = AutoModelForSequenceClassification.from_pretrained(model_path)

input_text = "I'm having Autistic issues for years now"
tokenized_text = tokenizer(input_text,
                           truncation=True,
                           is_split_into_words=False,
                           return_tensors='pt')

outputs = model(**tokenized_text)
predicted_label = outputs.logits.argmax(-1)

print(predicted_label.item())


1
