In [1]:
gpu_info = !nvidia-smi
gpu_info = '\n'.join(gpu_info)
if gpu_info.find('failed') >= 0:
  print('Not connected to a GPU')
else:
  print(gpu_info)

Tue Mar 12 13:16:19 2024       
+---------------------------------------------------------------------------------------+
| NVIDIA-SMI 535.104.05             Driver Version: 535.104.05   CUDA Version: 12.2     |
|-----------------------------------------+----------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |         Memory-Usage | GPU-Util  Compute M. |
|                                         |                      |               MIG M. |
|   0  Tesla T4                       Off | 00000000:00:04.0 Off |                    0 |
| N/A   46C    P8               9W /  70W |      0MiB / 15360MiB |      0%      Default |
|                                         |                      |                  N/A |
+-----------------------------------------+----------------------+----------------------+
                                                                    

In [1]:
!pip install torch
!pip install hopsworks
!pip install numpy
!pip install evaluate
! pip install -U accelerate
! pip install -U transformers
!pip install datasets
!pip install pandas



In [2]:
import pandas as pd
from datasets import Dataset, DatasetDict
from transformers import (
    GPT2TokenizerFast,
    AutoTokenizer,
    AutoModelForSequenceClassification,
    TrainingArguments,
    Trainer,
)
import torch
import hopsworks
import numpy as np
import evaluate



In [3]:
# Tokenizer used to decode the stored token ids back into text. Loading this
# checkpoint through GPT2TokenizerFast emits a tokenizer-class mismatch warning
# (see the cell output).
ADA_TOKENIZER_CHECKPOINT = 'Xenova/text-embedding-ada-002'
tokenizer = GPT2TokenizerFast.from_pretrained(ADA_TOKENIZER_CHECKPOINT)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.
The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'GPT4Tokenizer'. 
The class this function is called from is 'GPT2TokenizerFast'.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [4]:
def get_decoding(dataset, embedding_object):
    """Return a new frame where the ``embeddings`` column is replaced by decoded ``text``.

    Every entry of ``dataset["embeddings"]`` is passed to
    ``embedding_object.decode``; the decoded values are stored in a ``text``
    column and the ``embeddings`` column is dropped. The input ``dataset`` is
    not modified.
    """
    texts = [embedding_object.decode(token_ids) for token_ids in dataset["embeddings"]]
    return dataset.assign(text=texts).drop(columns=["embeddings"])

def convstr2vec(temp):
    """Parse a string such as ``"[1, 2, 3]"`` into a list of ints.

    Args:
        temp: Comma-separated integers, optionally wrapped in square brackets.

    Returns:
        The parsed integers; an empty list for ``"[]"`` or a blank string.

    Raises:
        ValueError: If a comma-separated item is not an integer literal.
    """
    body = temp.strip().lstrip("[").rstrip("]")
    # Splitting an empty body yields [""] and int("") raises ValueError, so an
    # empty vector has to be handled before parsing.
    if not body.strip():
        return []
    return [int(item) for item in body.split(",")]

In [5]:
# Log in to Hopsworks and keep the returned project handle.
hopsworks_project = hopsworks.login()
# Feature store handle; the next cell uses it to look up the feature groups.
fs = hopsworks_project.get_feature_store()

Connected. Call `.close()` to terminate connection gracefully.

Logged in to project, explore it here https://c.app.hopsworks.ai:443/p/546965
Connected. Call `.close()` to terminate connection gracefully.


In [6]:
# Look up version 1 of the train and test feature groups, in that order.
training_fg, test_fg = (
    fs.get_or_create_feature_group(fg_name, version=1)
    for fg_name in ("news_sentiment_traindata", "news_sentiment_testdata")
)
# Read the full contents of each feature group.
training_features = training_fg.read()
testing_features = test_fg.read()

Finished: Reading data from Hopsworks, using ArrowFlight (3.32s) 
Finished: Reading data from Hopsworks, using ArrowFlight (1.32s) 


In [7]:
# Decode the stored token ids of both splits back into text with the same tokenizer.
training_data, testing_data = (
    get_decoding(features, tokenizer)
    for features in (training_features, testing_features)
)

In [8]:
# Print the decoded training frame (the display is truncated to the first and last rows).
print(training_data)

       label                                               text
0          0  Study Finds Most Packaged Foods Contain Danger...
1          2  Fed decision, Lululemon earnings, inflation ga...
2          1  $DVA - DaVita Q4 earnings up 51%; shares up 4%...
3          1  Hedge Funds Have Never Been More Bullish On Bi...
4          0  OPEC+ Weighs Emergency Response as Oil Slumps ...
...      ...                                                ...
12299      2  Uefa explores move into Champions League strea...
12300      1  $IEA - Infrastructure and Energy Alternatives ...
12301      1         Morgan Stanley joins mortgage sector bulls
12302      2  He does not believe, however, that HKScan or A...
12303      1  The rebuilds are designed to improve the machi...

[12304 rows x 2 columns]


In [9]:
BERT_CHECKPOINT = "bert-base-cased"
bert_tokenizer = AutoTokenizer.from_pretrained(BERT_CHECKPOINT)


def tokenize_function(examples):
    """Tokenize the ``text`` field of ``examples`` with ``bert_tokenizer``.

    Inputs are truncated and padded with ``padding="max_length"``.
    """
    # NOTE(review): padding every example to max_length looks costly for short
    # headlines; dynamic padding may be worth evaluating — confirm before changing.
    texts = examples["text"]
    return bert_tokenizer(texts, padding="max_length", truncation=True)


In [10]:
# Wrap each decoded pandas frame in a Hugging Face Dataset.
train_dataset, test_dataset = (
    Dataset.from_pandas(frame) for frame in (training_data, testing_data)
)

# Bundle the two splits under the keys 'train' and 'test'.
datasets = DatasetDict({'train': train_dataset, 'test': test_dataset})

In [11]:
# Run tokenize_function over every split of the DatasetDict, passing examples in batches.
tokenized_datasets = datasets.map(tokenize_function, batched=True)

Map:   0%|          | 0/12304 [00:00<?, ? examples/s]

Map:   0%|          | 0/3077 [00:00<?, ? examples/s]

In [12]:
# Spot-check the text of training example 50. Index the row first and then the
# column: `ds['text'][50]` loads the entire text column just to read one value.
tokenized_datasets['train'][50]['text']

'Edited Transcript of FHER3.SA earnings conference call or presentation 13-Nov-19 3:00pm GMT'

In [13]:
def get_compute_metrics(metric):
    """Build a ``compute_metrics`` callback bound to ``metric``.

    The returned function unpacks ``eval_pred`` into ``(logits, labels)``,
    takes the argmax over the last axis of the logits, and returns
    ``metric.compute(predictions=..., references=labels)``.
    """
    def compute_metrics(eval_pred):
        scores, references = eval_pred
        predicted_ids = np.argmax(scores, axis=-1)
        return metric.compute(predictions=predicted_ids, references=references)

    return compute_metrics

In [14]:
# Class index -> sentiment name, plus the inverse lookup derived from it.
id2label = {0: "Negative", 1: "Positive", 2: "Neutral"}
label2id = dict(zip(id2label.values(), id2label.keys()))

def model_init():
    """Return a ``bert-base-cased`` sequence classifier configured for three labels.

    The module-level ``id2label`` / ``label2id`` mappings are passed through to
    ``from_pretrained``.
    """
    return AutoModelForSequenceClassification.from_pretrained(
        'bert-base-cased',
        num_labels=3,
        id2label=id2label,
        label2id=label2id,
        return_dict=True,
    )

# Accuracy is both the reported evaluation metric and the best-checkpoint criterion.
metric = evaluate.load("accuracy")
compute_metrics = get_compute_metrics(metric)
training_args = TrainingArguments(
    output_dir="bert_sentiment_trainer",
    # NOTE(review): newer transformers releases rename this argument to
    # `eval_strategy` — confirm against the installed version.
    evaluation_strategy="steps",
    per_device_train_batch_size=16,
    per_device_eval_batch_size=4,
    learning_rate= 2.754984679344267e-05,
    save_total_limit=3,
    seed=42,
    lr_scheduler_type='constant_with_warmup',
    warmup_steps=50,
    # Training length is set by max_steps alone. The former `num_train_epochs=8`
    # was removed: a positive max_steps overrides it (the recorded run stopped
    # at step 3000, epoch 3.9), so it had no effect.
    max_steps=3000,
    # Save and evaluate on the same 250-step cadence.
    save_strategy="steps",
    save_steps=250,
    fp16=False,
    eval_steps=250,
    logging_steps=25,
    report_to=["tensorboard"],
    load_best_model_at_end=True,
    metric_for_best_model="accuracy",
    greater_is_better=True,
)

# Shuffle both tokenized splits with a fixed seed.
tokenized_train_dataset = tokenized_datasets["train"].shuffle(seed=55)
tokenized_test_dataset = tokenized_datasets["test"].shuffle(seed=55)

# The model is supplied through model_init rather than as a ready-made instance.
trainer = Trainer(
    model_init=model_init,
    args=training_args,
    tokenizer=bert_tokenizer,
    compute_metrics=compute_metrics,
    train_dataset=tokenized_train_dataset,
    eval_dataset=tokenized_test_dataset,
)

model.safetensors:   0%|          | 0.00/436M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [15]:
# Run fine-tuning; the returned TrainOutput (final step, training loss, runtime metrics) is displayed below.
trainer.train()

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Step,Training Loss,Validation Loss,Accuracy
250,0.4398,0.439519,0.842704
500,0.4888,0.404417,0.846604
750,0.3046,0.361853,0.877803
1000,0.2825,0.366385,0.882353
1250,0.2357,0.389798,0.877478
1500,0.2887,0.367265,0.885603
1750,0.1013,0.490585,0.881378
2000,0.1938,0.567942,0.869678
2250,0.1344,0.49862,0.879103
2500,0.1249,0.614644,0.883003


TrainOutput(global_step=3000, training_loss=0.26562184977531433, metrics={'train_runtime': 5850.3498, 'train_samples_per_second': 8.205, 'train_steps_per_second': 0.513, 'total_flos': 1.2629444050944e+16, 'train_loss': 0.26562184977531433, 'epoch': 3.9})

In [20]:
# Authenticate with the Hugging Face Hub. The CLI prompts for the token
# interactively, so the token does not appear in the notebook source.
!huggingface-cli login



    _|    _|  _|    _|    _|_|_|    _|_|_|  _|_|_|  _|      _|    _|_|_|      _|_|_|_|    _|_|      _|_|_|  _|_|_|_|
    _|    _|  _|    _|  _|        _|          _|    _|_|    _|  _|            _|        _|    _|  _|        _|
    _|_|_|_|  _|    _|  _|  _|_|  _|  _|_|    _|    _|  _|  _|  _|  _|_|      _|_|_|    _|_|_|_|  _|        _|_|_|
    _|    _|  _|    _|  _|    _|  _|    _|    _|    _|    _|_|  _|    _|      _|        _|    _|  _|        _|
    _|    _|    _|_|      _|_|_|    _|_|_|  _|_|_|  _|      _|    _|_|_|      _|        _|    _|    _|_|_|  _|_|_|_|

    A token is already saved on your machine. Run `huggingface-cli whoami` to get more information or `huggingface-cli logout` if you want to log out.
    Setting a new token will erase the existing one.
    To login, `huggingface_hub` requires a token generated from https://huggingface.co/settings/tokens .
Token: 
Add token as git credential? (Y/n) n
Token is valid (permission: write).
Your token has been saved to /root/.c

In [21]:
# Upload the trained model and training artifacts to the Hub; the resulting CommitInfo is displayed below.
trainer.push_to_hub()

model.safetensors:   0%|          | 0.00/433M [00:00<?, ?B/s]

Upload 3 LFS files:   0%|          | 0/3 [00:00<?, ?it/s]

training_args.bin:   0%|          | 0.00/4.92k [00:00<?, ?B/s]

events.out.tfevents.1710250037.64b715808c32.3460.0:   0%|          | 0.00/34.3k [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/dhanushbitra/bert_sentiment_trainer/commit/f3bd00ccada490fa877cd4384bfc6d47d6819ea7', commit_message='End of training', commit_description='', oid='f3bd00ccada490fa877cd4384bfc6d47d6819ea7', pr_url=None, pr_revision=None, pr_num=None)

In [22]:
from transformers import pipeline
import json  # no longer used in this cell; kept in case other cells rely on it

# Classify the raw test texts with the model currently published on the Hub.
# NOTE(review): this cell runs after `trainer.push_to_hub()` uploaded the freshly
# trained model to this same repo, so the "old model" loaded here is the new
# checkpoint — which is why the two metrics printed below are identical. To
# compare against the previous model, evaluate it before pushing (or load an
# earlier revision of the repo).
pipe = pipeline("text-classification", model="dhanushbitra/bert_sentiment_trainer")
results = pipe(list(testing_data["text"]))

# `results` is already a list of dicts, so build the frame from it directly.
# The earlier str() -> quote replacement -> json.loads round trip was
# unnecessary and breaks as soon as a value contains a quote character.
predictions = pd.DataFrame(results)

# Map the predicted label names back to class ids using the model's own config.
predicted_labels = [pipe.model.config.label2id[x] for x in predictions['label']]

# testing_data and test_dataset share the same (unshuffled) row order.
old_accuracy = metric.compute(predictions=predicted_labels, references=test_dataset['label'])['accuracy']
print("old model metric = " + str(old_accuracy))

# Accuracy of the in-memory trainer model on the tokenized test split.
new_accuracy = trainer.predict(tokenized_test_dataset).metrics["test_accuracy"]
print("new model metric = " + str(new_accuracy))

config.json:   0%|          | 0.00/883 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/433M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/1.19k [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/213k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/669k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/125 [00:00<?, ?B/s]

old model metric = 0.8947026324341891


new model metric = 0.8947026324341891
