In [2]:
from transformers import AutoModelForSequenceClassification

# Load the pretrained "bert-base-uncased" weights with a 2-label sequence-classification head.
# The load log reports that classifier.weight / classifier.bias are newly initialised,
# so the head has to be trained before the model's predictions are meaningful.
model = AutoModelForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=2)

Downloading model.safetensors: 100%|██████████| 440M/440M [00:27<00:00, 16.1MB/s] 
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [2]:
import numpy as np

import evaluate

# Module-level F1 metric object; compute_metrics calls metric.compute(...) on it.
metric = evaluate.load("f1")

In [3]:
def compute_metrics(eval_pred):
    """Score one evaluation pass with the module-level ``metric``.

    Args:
        eval_pred: A two-item sequence that unpacks as ``(logits, labels)``.

    Returns:
        Whatever ``metric.compute`` returns when given the arg-max class
        indices as ``predictions`` and ``labels`` as ``references``.
    """
    raw_scores, gold_labels = eval_pred
    # Collapse the last axis to the index of the highest score per example.
    predicted_ids = np.argmax(raw_scores, axis=-1)
    return metric.compute(predictions=predicted_ids, references=gold_labels)

In [4]:
from transformers import TrainingArguments, Trainer

# Training configuration handed to the Trainer.
training_args = TrainingArguments(
    # The inference cell later loads a checkpoint from ./test_trainer/.
    output_dir="test_trainer",
    # Evaluate once per epoch (the training log shows eval_* entries at epochs 1.0, 2.0, 3.0).
    evaluation_strategy="epoch",
    fp16=True,
    torch_compile=True,
)

In [5]:
from sklearn.model_selection import train_test_split 

In [6]:
from datasets import Dataset

In [7]:
import pandas as pd
# Read the labelled training CSV, convert to pandas' best-fit dtypes, and name the
# columns "text" / "labels" (the names the tokenizer and the stratified split use).
# Supplying exactly two names means the CSV must have exactly two columns.
df = (
    pd.read_csv("./train_data.csv")
    .convert_dtypes()
    .set_axis(["text", "labels"], axis=1)
)

In [8]:
# 80/20 split, stratified on the label column with a fixed seed; each part gets a
# fresh 0..n-1 index so the old row positions are not carried along.
df_train, df_test = (
    part.reset_index(drop=True)
    for part in train_test_split(
        df,
        test_size=0.2,
        random_state=42,
        stratify=df["labels"].values,
    )
)


In [9]:
# Wrap each pandas split in a Dataset object (train first, then test).
train_ds, test_ds = (Dataset.from_pandas(frame) for frame in (df_train, df_test))
# Trailing expression: the notebook displays both datasets (features and row counts).
train_ds, test_ds

(Dataset({
     features: ['text', 'labels'],
     num_rows: 16163
 }),
 Dataset({
     features: ['text', 'labels'],
     num_rows: 4041
 }))

In [1]:
from transformers import AutoTokenizer

# Tokenizer matching the "bert-base-uncased" checkpoint; used by tokenize_function
# and passed to the inference pipeline.
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")

  from .autonotebook import tqdm as notebook_tqdm


In [11]:
def tokenize_function(examples):
    """Tokenize the ``"text"`` field of a batch with the module-level ``tokenizer``.

    Args:
        examples: Mapping with a ``"text"`` entry holding the input(s) to encode.

    Returns:
        The result of calling ``tokenizer`` with ``padding="max_length"`` and
        ``truncation=True``.
    """
    texts = examples["text"]
    encoding_options = {"padding": "max_length", "truncation": True}
    return tokenizer(texts, **encoding_options)


In [12]:
# Apply tokenize_function to both splits in batched mode (train first, then test).
train_ds_tokenized, test_ds_tokenized = (
    split.map(tokenize_function, batched=True) for split in (train_ds, test_ds)
)


Map:   0%|          | 0/16163 [00:00<?, ? examples/s]

Map: 100%|██████████| 16163/16163 [00:01<00:00, 8862.13 examples/s]
Map: 100%|██████████| 4041/4041 [00:00<00:00, 9327.00 examples/s]


In [13]:
# Assemble the Trainer: what to train, how to train it, how to score it, and on which data.
trainer = Trainer(
    model=model,
    args=training_args,
    # Called with (logits, labels) at each evaluation; reports F1.
    compute_metrics=compute_metrics,
    # Tokenized 80% / 20% splits produced above.
    train_dataset=train_ds_tokenized,
    eval_dataset=test_ds_tokenized,
)

In [14]:
import io

In [15]:
# Run fine-tuning. The captured log shows 6063 optimisation steps over 3 epochs,
# with an evaluation (eval_loss / eval_f1) at the end of each epoch.
trainer.train()

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av

  8%|▊         | 500/6063 [01:16<07:00, 13.23it/s] 

{'loss': 0.3426, 'learning_rate': 4.590961570179779e-05, 'epoch': 0.25}


 16%|█▋        | 1000/6063 [01:56<06:24, 13.17it/s]

{'loss': 0.218, 'learning_rate': 4.178624443344879e-05, 'epoch': 0.49}


 25%|██▍       | 1500/6063 [02:36<05:45, 13.21it/s]

{'loss': 0.1733, 'learning_rate': 3.766287316509979e-05, 'epoch': 0.74}


 33%|███▎      | 2000/6063 [03:17<05:10, 13.07it/s]

{'loss': 0.1556, 'learning_rate': 3.353950189675079e-05, 'epoch': 0.99}


  torch.has_cuda,
  torch.has_cudnn,
  torch.has_mps,
  torch.has_mkldnn,
                                                     
 33%|███▎      | 2022/6063 [04:30<10:54:26,  9.72s/it]

{'eval_loss': 0.16917946934700012, 'eval_f1': 0.9396681749622927, 'eval_runtime': 18.864, 'eval_samples_per_second': 214.218, 'eval_steps_per_second': 26.824, 'epoch': 1.0}


 41%|████      | 2500/6063 [05:12<05:09, 11.53it/s]   

{'loss': 0.0825, 'learning_rate': 2.9416130628401783e-05, 'epoch': 1.24}


 49%|████▉     | 3000/6063 [05:57<04:25, 11.55it/s]

{'loss': 0.0663, 'learning_rate': 2.529275936005278e-05, 'epoch': 1.48}


 58%|█████▊    | 3500/6063 [06:40<03:20, 12.76it/s]

{'loss': 0.0775, 'learning_rate': 2.1177634834240475e-05, 'epoch': 1.73}


 66%|██████▌   | 4000/6063 [07:22<02:47, 12.28it/s]

{'loss': 0.0813, 'learning_rate': 1.7062510308428175e-05, 'epoch': 1.98}


                                                   
 67%|██████▋   | 4043/6063 [07:44<1:44:31,  3.10s/it]

{'eval_loss': 0.13454529643058777, 'eval_f1': 0.9602494154325799, 'eval_runtime': 17.1787, 'eval_samples_per_second': 235.233, 'eval_steps_per_second': 29.455, 'epoch': 2.0}


 74%|███████▍  | 4500/6063 [08:21<02:05, 12.49it/s]  

{'loss': 0.0313, 'learning_rate': 1.2939139040079171e-05, 'epoch': 2.23}


 82%|████████▏ | 5000/6063 [09:03<01:27, 12.13it/s]

{'loss': 0.0346, 'learning_rate': 8.815767771730167e-06, 'epoch': 2.47}


 91%|█████████ | 5500/6063 [09:45<00:44, 12.55it/s]

{'loss': 0.0226, 'learning_rate': 4.692396503381165e-06, 'epoch': 2.72}


 99%|█████████▉| 6000/6063 [10:27<00:05, 12.47it/s]

{'loss': 0.0351, 'learning_rate': 5.690252350321623e-07, 'epoch': 2.97}


                                                   
100%|██████████| 6063/6063 [10:51<00:00,  9.31it/s]

{'eval_loss': 0.14652298390865326, 'eval_f1': 0.9612827532264372, 'eval_runtime': 17.1694, 'eval_samples_per_second': 235.361, 'eval_steps_per_second': 29.471, 'epoch': 3.0}
{'train_runtime': 652.4314, 'train_samples_per_second': 74.32, 'train_steps_per_second': 9.293, 'train_loss': 0.10905001736819774, 'epoch': 3.0}





TrainOutput(global_step=6063, training_loss=0.10905001736819774, metrics={'train_runtime': 652.4314, 'train_samples_per_second': 74.32, 'train_steps_per_second': 9.293, 'train_loss': 0.10905001736819774, 'epoch': 3.0})

Error in callback <bound method _WandbInit._pause_backend of <wandb.sdk.wandb_init._WandbInit object at 0x7f02c590a690>> (for post_run_cell), with arguments args (<ExecutionResult object at 7f01dcf9d250, execution_count=15 error_before_exec=None error_in_exec=None info=<ExecutionInfo object at 7f01dcf9d450, raw_cell="trainer.train()" store_history=True silent=False shell_futures=True cell_id=vscode-notebook-cell:/home/cc/Dev/IdeaProjects/UConn/AIClub/demos/risk-evaluation/notebook092703c99b.ipynb#X20sZmlsZQ%3D%3D> result=TrainOutput(global_step=6063, training_loss=0.10905001736819774, metrics={'train_runtime': 652.4314, 'train_samples_per_second': 74.32, 'train_steps_per_second': 9.293, 'train_loss': 0.10905001736819774, 'epoch': 3.0})>,),kwargs {}:


TypeError: _WandbInit._pause_backend() takes 1 positional argument but 2 were given

In [16]:
import wandb
# Close the Weights & Biases run; the captured output shows the run history and
# summary tables being printed when this executes.
wandb.finish()

Error in callback <bound method _WandbInit._resume_backend of <wandb.sdk.wandb_init._WandbInit object at 0x7f02c590a690>> (for pre_run_cell), with arguments args (<ExecutionInfo object at 7f02ababbb10, raw_cell="import wandb
wandb.finish()" store_history=True silent=False shell_futures=True cell_id=vscode-notebook-cell:/home/cc/Dev/IdeaProjects/UConn/AIClub/demos/risk-evaluation/notebook092703c99b.ipynb#X21sZmlsZQ%3D%3D>,),kwargs {}:


TypeError: _WandbInit._resume_backend() takes 1 positional argument but 2 were given



0,1
eval/f1,▁██
eval/loss,█▁▃
eval/runtime,█▁▁
eval/samples_per_second,▁██
eval/steps_per_second,▁██
train/epoch,▁▂▂▃▃▄▄▅▅▅▆▇▇███
train/global_step,▁▂▂▃▃▄▄▅▅▅▆▇▇███
train/learning_rate,█▇▇▆▅▅▄▄▃▂▂▁
train/loss,█▅▄▄▂▂▂▂▁▁▁▁
train/total_flos,▁

0,1
eval/f1,0.96128
eval/loss,0.14652
eval/runtime,17.1694
eval/samples_per_second,235.361
eval/steps_per_second,29.471
train/epoch,3.0
train/global_step,6063.0
train/learning_rate,0.0
train/loss,0.0351
train/total_flos,1.275799196335104e+16


In [17]:
# Put the model in evaluation mode. As the last expression in the cell, the returned
# module is echoed, which is why the layer tree is printed below.
model.eval()

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12,

In [20]:
# Move the in-memory model to the CPU.
# NOTE(review): the inference pipeline below loads its weights from a checkpoint
# directory rather than from this object — confirm this cell is still needed.
model=model.to("cpu")

In [6]:
from transformers import pipeline
import torch

# Text-classification pipeline backed by the weights saved at training step 6000.
# NOTE(review): the training log ends at global_step=6063, so checkpoint-6000 is not
# the final state of the model — confirm that using this checkpoint is intended.
# NOTE(review): `tokenizer` must already exist in the kernel (it is created in the
# AutoTokenizer cell); run that cell first after a kernel restart.
classify_seq_for_risk_eval = pipeline(
    "text-classification",
    model="./test_trainer/checkpoint-6000/",
    tokenizer=tokenizer,
    # Use the GPU when one is available; otherwise fall back to CPU instead of failing.
    device="cuda" if torch.cuda.is_available() else "cpu",
)

In [4]:
import pandas as pd

In [7]:
import torch

submission_df = pd.read_csv("./test_data_post.csv")

# Classify every tweet, one pipeline call per row, so each entry of `preds` keeps the
# shape a single-string call returns (a one-element list holding a
# {'label': ..., 'score': ...} dict).
# truncation=True mirrors the training-time tokenisation (tokenize_function also
# truncates) so an over-long tweet is clipped rather than exceeding the model's
# 512 position embeddings and raising at inference time.
preds = [
    classify_seq_for_risk_eval(tweet, truncation=True)
    for tweet in submission_df["Tweets"]
]



In [8]:
# Store the raw pipeline outputs as a new column, one entry per tweet.
# NOTE(review): each cell holds the full pipeline result (label plus score), not a
# bare class id — confirm this is the format the submission expects.
submission_df["Risk Analysis"] = preds

In [9]:
# Display the scored frame (the notebook renders a truncated preview of the rows).
submission_df

Unnamed: 0,ID,Tweets,Risk Analysis
0,1,knowing what s right for you and your body isn...,"[{'label': 'LABEL_0', 'score': 0.9994314312934..."
1,2,looks like we need to boycott walmart exxonmob...,"[{'label': 'LABEL_1', 'score': 0.9998301267623..."
2,3,#cvshealth says of its employees are racially ...,"[{'label': 'LABEL_1', 'score': 0.9998119473457..."
3,4,be a part of cvshealth on our journey to trans...,"[{'label': 'LABEL_0', 'score': 0.9998739957809..."
4,5,juddlegum ford walmart delta deloitte nrcc lib...,"[{'label': 'LABEL_0', 'score': 0.9997418522834..."
...,...,...,...
5047,5048,cvs health is hiring in ma click the link in o...,"[{'label': 'LABEL_0', 'score': 0.9999178647994..."
5048,5049,merck cvs health aim to provide p amp l experi...,"[{'label': 'LABEL_0', 'score': 0.9998894929885..."
5049,5050,#scottsdale #jobs #hiring #opportunity cvs hea...,"[{'label': 'LABEL_0', 'score': 0.9999072551727..."
5050,5051,here s what i took to rejuvenate my health by ...,"[{'label': 'LABEL_1', 'score': 0.9967857599258..."
