# Bert with torch


In [1]:
!pip install transformers
# !pip install --index-url https://markovml:ha95HRmXKv9vXNSs@pypi.markovml.com/simple markovml

Collecting transformers
  Downloading transformers-4.31.0-py3-none-any.whl (7.4 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.4/7.4 MB[0m [31m64.5 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.14.1 (from transformers)
  Downloading huggingface_hub-0.16.4-py3-none-any.whl (268 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m268.8/268.8 kB[0m [31m32.2 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1 (from transformers)
  Downloading tokenizers-0.13.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.8/7.8 MB[0m [31m78.4 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting safetensors>=0.3.1 (from transformers)
  Downloading safetensors-0.3.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m77.7 MB/s[0m eta [36m0:00:0

In [3]:
import pandas as pd
import tensorflow as tf
import torch
import numpy
from tensorflow import keras
from sklearn.model_selection import train_test_split
from transformers import BertTokenizer, TFBertForSequenceClassification
data = pd.read_csv('/content/Cleaned_Tweets_stopwords.csv')
features = data['cleaned_text']
labels = data['label']

# Split the data into training and testing sets
features_train, features_test, labels_train, labels_test = train_test_split(features, labels, test_size=0.2, random_state=42)

In [None]:
!markov init --api-token=eEgdgto3SXBEzG8edLW85qZX

[?25h✔ Successfully logged in
[?25h

In [4]:
# Load pre-trained BERT model and tokenizer
model_name = 'bert-base-uncased'
tokenizer = BertTokenizer.from_pretrained(model_name)
model = TFBertForSequenceClassification.from_pretrained(model_name, num_labels=2)

# Tokenize the features
features_train_encoded = tokenizer(list(features_train), padding=True, truncation=True, return_tensors="tf")
features_test_encoded = tokenizer(list(features_test), padding=True, truncation=True, return_tensors="tf")

# Convert labels to TensorFlow tensors
labels_train_tensor = tf.convert_to_tensor(labels_train)
labels_test_tensor = tf.convert_to_tensor(labels_test)

# Extract features from BatchEncoding and convert to tensors
features_train_input_ids = tf.convert_to_tensor(features_train_encoded['input_ids'])
features_train_token_type_ids = tf.convert_to_tensor(features_train_encoded['token_type_ids'])
features_train_attention_mask = tf.convert_to_tensor(features_train_encoded['attention_mask'])

# Compile the model
optimizer = tf.keras.optimizers.Adam(learning_rate=2e-5)
loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
model.compile(optimizer=optimizer, loss=loss, metrics=['accuracy'])

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

Downloading model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

All PyTorch model weights were used when initializing TFBertForSequenceClassification.

Some weights or buffers of the TF 2.0 model TFBertForSequenceClassification were not initialized from the PyTorch model and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
my_project = markov.Project.get_by_id("vXmg4DkwjkU3JR")
recorder = my_project.create_experiment_recorder(
  experiment_name = "BERT_Bi-LSTM+CNN_Updated",
    experiment_notes = "Training the BERT model",
    hyper_parameters ={
        "Lstm units": 64,
        "dropout_rate": 0.2,
        "Activation": "relu",
        "Optimizer": "Adam",
        "learning rate":0.001
    }
)

# Register the experiment recorder with the MarkovML backend. Only a registered
# experiment recorder can be used to add records.
recorder.register()

✔ Successfully fetched project Team_2_Updated_Results
ℹ Creating MarkovML project object Team_2_Updated_Results. Call project_object.register() to register with MarkovML backend.
ℹ Creating MarkovML model object Model: BERT_Bi-LSTM+CNN_Updated. Call model_object.register() to register with MarkovML backend.
✔ Model creation for model Model: BERT_Bi-LSTM+CNN_Updated successful!
ℹ Creating ExperimentRecorder object BERT_Bi-LSTM+CNN_Updated. Call recorder_object.register() to register with MarkovML backend.
✔ ExperimentRecorder successfully registered.
ℹ You can view the experiment at \https://app.markovml.com/pioneer-wsp-27a2ukk2as/proj/vXmg4DkwjkU3JR/experiments/hp-5CDaWiCRnDLyfq59tiLmGGf]8;;\


<markov.api.recording.experiments.experiment_recorder.ExperimentRecorder at 0x790d299af220>

In [None]:
# Train the model
batch_size = 128
epochs = 10

# Create a variable to store the loss and accuracy for each step.
loss_and_accuracy = []

# Create a callback function that will store the loss and accuracy for each step.
class LossAndAccuracyCallback(keras.callbacks.Callback):
    def on_batch_end(self, batch, logs):
        loss_and_accuracy.append((logs['loss'], logs['accuracy']))

model.fit(
    (features_train_input_ids, features_train_token_type_ids, features_train_attention_mask),
    labels_train_tensor,
    batch_size=batch_size,
    epochs=epochs,
    callbacks=[LossAndAccuracyCallback()]
)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x790caa45df60>

In [5]:
# Train the model
batch_size = 128
epochs = 10

# Create a variable to store the loss and accuracy for each step.
loss_and_accuracy = []

# Create a callback function that will store the loss and accuracy for each step.
class LossAndAccuracyCallback(keras.callbacks.Callback):
    def on_batch_end(self, batch, logs):
        loss_and_accuracy.append((logs['loss'], logs['accuracy']))

# Train the model with the callback function.
model.fit(
    (features_train_input_ids, features_train_token_type_ids, features_train_attention_mask),
    labels_train_tensor,
    batch_size=batch_size,
    epochs=epochs,
    callbacks=[LossAndAccuracyCallback()]
)



Epoch 1/10
 3/42 [=>............................] - ETA: 1:17 - loss: 0.7053 - accuracy: 0.5130

KeyboardInterrupt: ignored

In [None]:
save_path = "BERT_MODEL"
model.save(save_path)

print("Model saved successfully.")



Model saved successfully.


In [None]:
!zip -r /content/file.zip /content/drive/MyDrive/BERT_MODEL


  adding: content/drive/MyDrive/BERT_MODEL/ (stored 0%)
  adding: content/drive/MyDrive/BERT_MODEL/assets/ (stored 0%)
  adding: content/drive/MyDrive/BERT_MODEL/variables/ (stored 0%)
  adding: content/drive/MyDrive/BERT_MODEL/variables/variables.data-00000-of-00001 (deflated 16%)
  adding: content/drive/MyDrive/BERT_MODEL/variables/saved_model.pb (deflated 92%)
  adding: content/drive/MyDrive/BERT_MODEL/variables/variables.index (deflated 79%)
  adding: content/drive/MyDrive/BERT_MODEL/keras_metadata.pb (deflated 96%)
  adding: content/drive/MyDrive/BERT_MODEL/fingerprint.pb (stored 0%)
  adding: content/drive/MyDrive/BERT_MODEL/.ipynb_checkpoints/ (stored 0%)


In [None]:
from google.colab import files
files.download("/content/file.zip")

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [None]:
import pandas as pd

# Assuming you have a list named 'loss_accuracy_list' containing tuples of (loss, accuracy)
# Example: loss_accuracy_list = [(loss1, acc1), (loss2, acc2), ...]

# Convert the list to a DataFrame
df = pd.DataFrame(loss_and_accuracy, columns=['Loss', 'Accuracy'])

# Print the DataFrame
print(df)
df.to_csv('loss_accuracy1.csv',index=False)
# df=pd.read_csv('/content/loss_accuracy.csv')
# loss_and_accuracy=df.values.tolist()
# loss_and_accuracy

         Loss  Accuracy
0    0.658149  0.664062
1    0.675327  0.601562
2    0.661967  0.627604
3    0.655124  0.634766
4    0.650022  0.640625
..        ...       ...
415  0.025647  0.992599
416  0.025149  0.992788
417  0.025289  0.992578
418  0.026382  0.992378
419  0.026302  0.992410

[420 rows x 2 columns]


In [None]:
with recorder:
  for i, (loss, accuracy) in enumerate(loss_and_accuracy):
    if (i + 1) % 41 == 0:
      recorder.add_record({"loss":loss})
      recorder.add_record({"accuracy":accuracy})



ℹ Starting recorder to track training data on markov backend using multiple threads. Please wait!


 [Elapsed Time: 0:00:00]  [|#                                   |] (  0.0 s/B) 


⠋ Please wait while we send the remaining data to the markov backend...

 [Elapsed Time: 0:00:06]  [|          #                         |] ( 16.3 B/s) 

⠙ Please wait while we send the remaining data to the markov backend...




✔ Recording has successfully completed.
ℹ You can view the experiment at \https://app.markovml.com/pioneer-wsp-27a2ukk2as/proj/vXmg4DkwjkU3JR/experiments/hp-5CDaWiCRnDLyfq59tiLmGGf]8;;\


In [None]:
# batch_size = 128

# # Create lists to store loss and accuracy values
# eval_loss_and_accuracy = []

# # Create a callback function that will store the loss and accuracy for each step.
# class LossAndAccuracyCallback(keras.callbacks.Callback):
#     def on_batch_end(self, batch, logs):
#         eval_loss_and_accuracy.append((logs['loss'], logs['accuracy']))

# # Evaluate the model
features_test_input_ids = tf.convert_to_tensor(features_test_encoded['input_ids'])
features_test_token_type_ids = tf.convert_to_tensor(features_test_encoded['token_type_ids'])
features_test_attention_mask = tf.convert_to_tensor(features_test_encoded['attention_mask'])

batch_size = 128

# Predict using the model
predictions = model.predict(
    (features_test_encoded['input_ids'], features_test_encoded['token_type_ids'], features_test_encoded['attention_mask']),
    batch_size=batch_size
)

# Extract logits from the predictions
logits = predictions.logits

# Apply softmax to obtain probabilities
predicted_probabilities = tf.nn.softmax(logits, axis=-1)






NameError: ignored

In [None]:
# Print the first few logits and probabilities
predicted_labels = tf.argmax(predicted_probabilities, axis=-1).numpy()
for i in range(5):
    print(f"Sample {i + 1}: Logits = {logits[i]}, Probabilities = {predicted_probabilities[i]},Predicted Label = {predicted_labels[i]}")


Sample 1: Logits = [-3.1736486  2.9537804], Probabilities = [0.00217743 0.99782264],Predicted Label = 1
Sample 2: Logits = [-3.2341022  2.9140835], Probabilities = [0.0021328 0.9978672],Predicted Label = 1
Sample 3: Logits = [ 3.527309  -3.2681208], Probabilities = [0.99888235 0.00111763],Predicted Label = 0
Sample 4: Logits = [-2.8124738  2.8604836], Probabilities = [0.00342591 0.99657416],Predicted Label = 1
Sample 5: Logits = [-1.7024715  1.6781546], Probabilities = [0.03290646 0.9670935 ],Predicted Label = 1


In [None]:
import numpy as np
max_values = np.max(predicted_probabilities, axis=1)
orig_copy = [[value] for value in max_values]
y_pred = [[value] for value in predicted_labels]
print(y_pred)
print(orig_copy)

[[1], [1], [0], [1], [1], [0], [0], [0], [0], [1], [1], [0], [1], [0], [1], [1], [0], [1], [1], [0], [0], [0], [1], [1], [0], [1], [1], [1], [1], [1], [0], [1], [0], [0], [0], [0], [0], [0], [1], [1], [1], [1], [1], [0], [1], [1], [0], [1], [1], [1], [0], [1], [1], [1], [0], [1], [1], [0], [1], [1], [0], [0], [0], [0], [1], [0], [1], [1], [0], [1], [0], [0], [1], [1], [1], [0], [0], [1], [0], [1], [0], [0], [1], [1], [0], [0], [0], [0], [1], [0], [0], [0], [1], [1], [0], [1], [1], [1], [0], [1], [1], [1], [0], [0], [0], [1], [1], [1], [1], [1], [0], [1], [0], [0], [1], [0], [1], [0], [0], [1], [0], [1], [1], [0], [0], [1], [1], [1], [1], [0], [1], [0], [0], [1], [0], [0], [0], [0], [0], [1], [1], [1], [0], [0], [0], [1], [0], [0], [1], [0], [0], [1], [1], [1], [1], [1], [1], [0], [0], [1], [0], [0], [1], [1], [0], [0], [0], [0], [1], [1], [0], [0], [0], [1], [1], [0], [0], [1], [1], [1], [0], [1], [1], [0], [0], [1], [0], [0], [1], [0], [0], [1], [0], [1], [1], [0], [0], [1], [0], [0],

In [None]:
from markov.api.schemas.model_recording import SingleTagInferenceRecord,RecordMetaType
from markov import EvaluationRecorder

evaluation_recorder = EvaluationRecorder(
    name=f"Test Eval on Best Bert Model",
    notes=f"Testing evaluation with MarkovML",
    model_id="CicGC79zMJUWrrzQMZ",  # or my_model.model_id
    dataset_id="4Sjrh3yito3qKjujy"

)

evaluation_recorder.register()

ℹ Creating EvaluationRecorder object Test Eval on Best Bert Model. Call recorder_object.register() to register with MarkovML backend.
✔ Evaluation recorder creation for EvaluationRecording(name='Test Eval on Best Bert Model', model_id='CicGC79zMJUWrrzQMZ', note='Testing evaluation with MarkovML', dataset_id='4Sjrh3yito3qKjujy', info={}) successful.


<markov.api.recording.evaluations.evaluation_recorder.EvaluationRecorder at 0x790cb7921960>

In [None]:
from markov.api.schemas.model_recording import SingleTagInferenceRecord,RecordMetaType,RecordCustomMetric
from markov import EvaluationRecorder
import random
def _get_cost(inferred, actual):
    if actual == inferred:
        return 0
    else:
        return random.randint(2, 5)

urid = 1
for prob, pred, orig, txt in zip(orig_copy, y_pred, labels_test, features_test):
    urid=urid+1
    mi_record = SingleTagInferenceRecord(
        inferred=float(pred[0]),
        actual=float(orig),
        urid=urid,
        score=float(prob[0]),
        custom_metrics=[
            RecordCustomMetric(label="Cost", value=_get_cost(float(pred[0]), float(orig))),
            RecordCustomMetric(label="Probability", value=float(prob[0]))
        ]
    )
    evaluation_recorder.add_record(mi_record)
outcome = evaluation_recorder.finish()
print(outcome)

Upload Progress : 100%|██████████| 2/2 [00:09<00:00,  4.89s/batch]


EvaluationRecordingFinishResponse(count=1321, recording_id='3ZkGspMpeikFp94gysWDpQM', return_code='OK', message='', run_id='3C8JtcQVh39y5zehkhX')


In [None]:
# Load and preprocess the unseen dataset
unseen_data = pd.read_csv('/content/Cleaned_Reddit.csv')
unseen_features = unseen_data['cleaned_text']
unseen_labels = unseen_data['label']

# Tokenize the unseen features
unseen_features_encoded = tokenizer(list(unseen_features), padding=True, truncation=True, return_tensors="tf")
unseen_labels_tensor = tf.convert_to_tensor(unseen_labels)


features_test_input_ids = tf.convert_to_tensor(unseen_features_encoded['input_ids'])
features_test_token_type_ids = tf.convert_to_tensor(unseen_features_encoded['token_type_ids'])
features_test_attention_mask = tf.convert_to_tensor(unseen_features_encoded['attention_mask'])

batch_size = 128

# Predict using the model
predictions = model.predict(
    (features_test_encoded['input_ids'], features_test_encoded['token_type_ids'], features_test_encoded['attention_mask']),
    batch_size=batch_size
)

# Extract logits from the predictions
logits = predictions.logits

# Apply softmax to obtain probabilities
predicted_probabilities = tf.nn.softmax(logits, axis=-1)





In [None]:
predicted_labels = tf.argmax(predicted_probabilities, axis=-1).numpy()
import numpy as np
max_values = np.max(predicted_probabilities, axis=1)
orig_copy = [[value] for value in max_values]
y_pred = [[value] for value in predicted_labels]
print(y_pred)
print(orig_copy)

[[1], [1], [0], [1], [1], [0], [0], [0], [0], [1], [1], [0], [1], [0], [1], [1], [0], [1], [1], [0], [0], [0], [1], [1], [0], [1], [1], [1], [1], [1], [0], [1], [0], [0], [0], [0], [0], [0], [1], [1], [1], [1], [1], [0], [1], [1], [0], [1], [1], [1], [0], [1], [1], [1], [0], [1], [1], [0], [1], [1], [0], [0], [0], [0], [1], [0], [1], [1], [0], [1], [0], [0], [1], [1], [1], [0], [0], [1], [0], [1], [0], [0], [1], [1], [0], [0], [0], [0], [1], [0], [0], [0], [1], [1], [0], [1], [1], [1], [0], [1], [1], [1], [0], [0], [0], [1], [1], [1], [1], [1], [0], [1], [0], [0], [1], [0], [1], [0], [0], [1], [0], [1], [1], [0], [0], [1], [1], [1], [1], [0], [1], [0], [0], [1], [0], [0], [0], [0], [0], [1], [1], [1], [0], [0], [0], [1], [0], [0], [1], [0], [0], [1], [1], [1], [1], [1], [1], [0], [0], [1], [0], [0], [1], [1], [0], [0], [0], [0], [1], [1], [0], [0], [0], [1], [1], [0], [0], [1], [1], [1], [0], [1], [1], [0], [0], [1], [0], [0], [1], [0], [0], [1], [0], [1], [1], [0], [0], [1], [0], [0],

In [None]:
from markov.api.schemas.model_recording import SingleTagInferenceRecord,RecordMetaType
from markov import EvaluationRecorder

evaluation_recorder = EvaluationRecorder(
    name=f"RedditData Eval on Best Bert Model",
    notes=f"Testing evaluation with MarkovML",
    model_id="CicGC79zMJUWrrzQMZ",  # or my_model.model_id
    dataset_id="4Sjrh3yito3qKjujy"

)

evaluation_recorder.register()

ℹ Creating EvaluationRecorder object RedditData Eval on Best Bert Model. Call recorder_object.register() to register with MarkovML backend.
✔ Evaluation recorder creation for EvaluationRecording(name='RedditData Eval on Best Bert Model', model_id='CicGC79zMJUWrrzQMZ', note='Testing evaluation with MarkovML', dataset_id='4Sjrh3yito3qKjujy', info={}) successful.


<markov.api.recording.evaluations.evaluation_recorder.EvaluationRecorder at 0x790caa4354e0>

In [None]:
from markov.api.schemas.model_recording import SingleTagInferenceRecord,RecordMetaType,RecordCustomMetric
from markov import EvaluationRecorder
import random
def _get_cost(inferred, actual):
    if actual == inferred:
        return 0
    else:
        return random.randint(2, 5)

urid = 1
for prob, pred, orig, txt in zip(orig_copy, y_pred, labels_test, features_test):
    urid=urid+1
    mi_record = SingleTagInferenceRecord(
        inferred=float(pred[0]),
        actual=float(orig),
        urid=urid,
        score=float(prob[0]),
        custom_metrics=[
            RecordCustomMetric(label="Cost", value=_get_cost(float(pred[0]), float(orig))),
            RecordCustomMetric(label="Probability", value=float(prob[0]))
        ]
    )
    evaluation_recorder.add_record(mi_record)
outcome = evaluation_recorder.finish()
print(outcome)

Upload Progress : 100%|██████████| 2/2 [00:10<00:00,  5.05s/batch]


EvaluationRecordingFinishResponse(count=1318, recording_id='6xUA9PmXiusr7SdjdVYDxwQ', return_code='OK', message='', run_id='8evmz9w56vbeQbDmXw8')
