In [None]:
!pip install transformers
!pip install datasets

In [43]:
import os
import matplotlib.pyplot as plt
import copy

import random
import numpy as np
import pandas as pd
import tensorflow as tf
from sklearn.model_selection import train_test_split
from tqdm import tqdm
from transformers import AutoTokenizer, TFAutoModelForSequenceClassification
from datasets import load_dataset
from tensorflow.keras.optimizers.schedules import PolynomialDecay
from tensorflow.keras.losses import SparseCategoricalCrossentropy
from tensorflow.keras.optimizers import Adam
from datasets import Dataset, DatasetDict

### 1. Load Dataset

In [44]:
# Load the question-pair training CSV and preview its first three rows.
df = pd.read_csv(r"/content/drive/MyDrive/Deep_Learning/NLP/Duplicate_question/train.csv")
df.head(3)

Unnamed: 0,id,qid1,qid2,question1,question2,is_duplicate
0,0,1,2,What is the step by step guide to invest in sh...,What is the step by step guide to invest in sh...,0
1,1,3,4,What is the story of Kohinoor (Koh-i-Noor) Dia...,What would happen if the Indian government sto...,0
2,2,5,6,How can I increase the speed of my internet co...,How can Internet speed be increased by hacking...,0


In [45]:
# Drop the identifier columns; only the two question texts and the label are kept.
# errors='ignore' keeps this cell re-runnable: without it, executing the cell a
# second time raises KeyError because the columns have already been removed.
df = df.drop(columns=['id','qid1','qid2'], errors='ignore')

In [46]:
# Drop every row that has a missing value in any of the remaining columns.
df = df.dropna()

In [47]:
## Convert the cleaned DataFrame to a Hugging Face Dataset and split it 80/20
from datasets import Dataset , DatasetDict

# NOTE: the DataFrame index is carried over as an extra `__index_level_0__`
# column (it shows up in the features printed by this cell).
train_data = Dataset.from_pandas(df)

# test_size=0.20 holds out 20% of the rows; the seed is fixed at 20.
data = train_data.train_test_split(test_size=0.20 , seed=20)
data

DatasetDict({
    train: Dataset({
        features: ['question1', 'question2', 'is_duplicate', '__index_level_0__'],
        num_rows: 323429
    })
    test: Dataset({
        features: ['question1', 'question2', 'is_duplicate', '__index_level_0__'],
        num_rows: 80858
    })
})

### 2. Tokenization

In [48]:
## Checkpoint name (used for both the tokenizer and the model) and batch size
model_checkpoint = "bert-base-cased"
batch_size = 32

## Load the tokenizer that belongs to the checkpoint
from transformers import AutoTokenizer
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

In [None]:
## Sanity-check the tokenizer on the first two `question1` entries
tokenizer(df['question1'].iloc[:2].tolist())

{'input_ids': [[101, 1327, 1110, 1103, 2585, 1118, 2585, 6388, 1106, 17557, 1107, 2934, 2319, 1107, 1107, 7168, 136, 102], [101, 1327, 1110, 1103, 1642, 1104, 19892, 21918, 1766, 113, 19892, 1324, 118, 178, 118, 1302, 1766, 114, 8549, 136, 102]], 'token_type_ids': [[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]], 'attention_mask': [[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]]}

In [None]:
## Function for tokenizing datasets 
def preprocess_function(records):
    """Tokenize a batch of question pairs as (question1, question2) sequence pairs.

    Args:
        records: Mapping with 'question1' and 'question2' entries, as passed
            by ``Dataset.map``.

    Returns:
        Whatever the module-level ``tokenizer`` returns for the pair, called
        with truncation enabled, token type ids requested and max_length=75.
    """
    first_questions = records['question1']
    second_questions = records['question2']
    return tokenizer(
        first_questions,
        second_questions,
        truncation=True,
        return_token_type_ids=True,
        max_length=75,
    )


In [None]:
# Remember which columns exist before tokenization, so the columns added by
# the tokenizer can be identified afterwards by set difference.
pre_tokenizer_columns = set(data["train"].features)
pre_tokenizer_columns

{'__index_level_0__', 'is_duplicate', 'question1', 'question2'}

In [None]:
## Tokenize both splits; batched=True passes batches of rows to preprocess_function
encoded_dataset = data.map(preprocess_function, batched=True)

Map:   0%|          | 0/323429 [00:00<?, ? examples/s]

Map:   0%|          | 0/80858 [00:00<?, ? examples/s]

In [None]:
# Columns present after tokenization but not before, i.e. the model inputs.
# The list comes from a set difference, so its order is not guaranteed.
tokenizer_columns = list(set(encoded_dataset["train"].features) - pre_tokenizer_columns)
print("Columns added by tokenizer:", tokenizer_columns)

Columns added by tokenizer: ['attention_mask', 'token_type_ids', 'input_ids']


### 3. Data Padding

In [None]:
from transformers import DataCollatorWithPadding

# Collator that pads the tokenized examples of a batch and returns TensorFlow tensors.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer, return_tensors="tf",)


# Training split: tokenizer outputs as features, `is_duplicate` as the label,
# shuffled, batched with `batch_size` and padded by the collator.
tf_train_dataset = encoded_dataset["train"].to_tf_dataset(
    columns=tokenizer_columns,
    label_cols=["is_duplicate"],
    shuffle=True,
    batch_size=batch_size,
    collate_fn=data_collator,
)
# Validation split: same configuration, but not shuffled.
tf_validation_dataset = encoded_dataset["test"].to_tf_dataset(
    columns=tokenizer_columns,
    label_cols=["is_duplicate"],
    shuffle=False,
    batch_size=batch_size,
    collate_fn=data_collator,
)

Old behaviour: columns=['a'], labels=['labels'] -> (tf.Tensor, tf.Tensor)  
             : columns='a', labels='labels' -> (tf.Tensor, tf.Tensor)  
New behaviour: columns=['a'],labels=['labels'] -> ({'a': tf.Tensor}, {'labels': tf.Tensor})  
             : columns='a', labels='labels' -> (tf.Tensor, tf.Tensor) 
You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


In [None]:
## Test the data collator: take one training batch and decode its first example
# sample[0] is the features dict of the batch; [PAD] tokens in the decoded text
# show the padding that was applied.
sample = next(iter(tf_train_dataset))
tokenizer.decode(sample[0]['input_ids'][0])

'[CLS] What would people define as beauty, average, and below average? [SEP] Why is preference given to beautiful / good looking people over the average looking ones? [SEP] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD]'

### 4. Model Training

In [None]:
## Load the pretrained checkpoint with a 2-label sequence-classification head
# (the load message reports the 'classifier' layer as newly initialized, so the
# model must be fine-tuned before it is used for predictions).
model = TFAutoModelForSequenceClassification.from_pretrained(model_checkpoint, num_labels = 2)

Downloading tf_model.h5:   0%|          | 0.00/527M [00:00<?, ?B/s]

All model checkpoint layers were used when initializing TFBertForSequenceClassification.

Some layers of TFBertForSequenceClassification were not initialized from the model checkpoint at bert-base-cased and are newly initialized: ['classifier']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
## Build the optimizer and learning-rate schedule
## (num_warmup_steps=0 below, so no warm-up phase is actually applied)
from transformers import create_optimizer
import tensorflow as tf

# Optional: uncomment the next line to train in mixed-precision float16.
# Leave it commented out if your GPU will not benefit from this.
#tf.keras.mixed_precision.set_global_policy("mixed_float16")

# The number of training steps is the number of samples in the dataset, divided by the batch size then multiplied
# by the total number of epochs. Note that the tf_train_dataset here is a batched tf.data.Dataset,
# not the original Hugging Face Dataset, so its len() is already num_samples // batch_size.
num_epochs = 3
num_train_steps = len(tf_train_dataset) * num_epochs
# Keep only the part after the last "/" of the checkpoint name; it is used
# later to build the output directory of the push-to-hub callback.
model_name = model_checkpoint.split("/")[-1]

optimizer, schedule = create_optimizer(
    init_lr=2e-5,
    num_warmup_steps=0,
    num_train_steps=num_train_steps,
    weight_decay_rate=0.01,
)
# No loss is passed, so the model's internal loss computation is used
# (see the message printed by this cell).
model.compile(optimizer=optimizer)

No loss specified in compile() - the model's internal loss computation will be used as the loss. Don't panic - this is a common way to train TensorFlow models in Transformers! To disable this behaviour please pass a loss argument, or explicitly pass `loss=None` if you do not want your model to compute a loss.


In [None]:
## Log in to the Hugging Face Hub so the model can be pushed to it
from huggingface_hub import notebook_login

notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [None]:
from transformers.keras_callbacks import PushToHubCallback

## Callback that pushes the model (and tokenizer) to the Hugging Face Hub
callback = PushToHubCallback(
    output_dir=f"{model_name}-finetuned-on-duplicate-Q-A",
    tokenizer=tokenizer,
)

# Fine-tune for `num_epochs` epochs, validating on the held-out split.
model.fit(
    tf_train_dataset,
    validation_data=tf_validation_dataset,
    epochs=num_epochs,
    callbacks=[callback],
)

Cloning https://huggingface.co/VinayakMane47/bert-base-cased-finetuned-on-duplicate-Q-A into local empty directory.


Epoch 1/3
Epoch 2/3
Epoch 3/3


<keras.callbacks.History at 0x7f2b44d07520>

In [None]:
## Save the fine-tuned model to the project directory
# Only the model is saved here; the tokenizer is not written to this path.
model.save_pretrained("/content/drive/MyDrive/Deep_Learning/NLP/Duplicate_question/")

### 5. Inference

In [51]:
## Function for making inferencing 
def check_similarity(question1, question2,model, debug = 0):
  """Classify a pair of questions as duplicate (1) or not duplicate (0).

  The pair is encoded with the module-level ``tokenizer`` using the same
  settings as ``preprocess_function`` (truncation, token type ids,
  max_length=75), and the class with the highest logit is taken as the
  prediction.

  Args:
    question1: First question text.
    question2: Second question text.
    model: Callable model that returns a mapping with a "logits" entry.
    debug: When truthy, also print a human-readable verdict.

  Returns:
    1 if the predicted class id is 1, otherwise 0.
  """
  encoded_pair = tokenizer(
      question1,
      question2,
      truncation=True,
      return_token_type_ids=True,
      max_length = 75,
      return_tensors = 'tf',
  )
  outputs = model(**encoded_pair)
  best_class = tf.math.argmax(outputs["logits"], axis=-1)
  # Only one pair is encoded, so the first entry is the prediction.
  is_duplicate = int(best_class[0]) == 1
  if debug:
    print("Both questions mean the same" if is_duplicate else "Both the questions are different.")
  return 1 if is_duplicate else 0

In [54]:
# Reload the fine-tuned model from the directory it was saved to with save_pretrained.
trained_model = TFAutoModelForSequenceClassification.from_pretrained("/content/drive/MyDrive/Deep_Learning/NLP/Duplicate_question/",num_labels =2)

Some layers from the model checkpoint at /content/drive/MyDrive/Deep_Learning/NLP/Duplicate_question/ were not used when initializing TFBertForSequenceClassification: ['dropout_37']
- This IS expected if you are initializing TFBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
All the layers of TFBertForSequenceClassification were initialized from the model checkpoint at /content/drive/MyDrive/Deep_Learning/NLP/Duplicate_question/.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertForSequenceClassification for predictions without further

In [55]:
# Quick check on one hand-written pair; debug=1 also prints the verdict.
check_similarity("Who is prime minister of India ? " , "What is name of Indias pm?",debug=1,model=trained_model)

Both the questions are different.


0

In [None]:
import numpy as np

# Create a function that applies check_similarity to a row of data and returns a dictionary
def apply_check_similarity(row):
    """Run ``check_similarity`` on one dataset row using ``trained_model``.

    Args:
        row: Mapping with 'question1', 'question2' and 'is_duplicate' entries.

    Returns:
        Dict with the model prediction under 'y_pred' and the row's
        'is_duplicate' label under 'y_true'.
    """
    prediction = check_similarity(row['question1'], row['question2'], model=trained_model)
    return {'y_pred': prediction, 'y_true': row['is_duplicate']}


# Apply apply_check_similarity to each row of the test split; map() adds the
# returned 'y_pred' and 'y_true' entries as new columns of the dataset.
y_pred_dataset = data['test'].map(apply_check_similarity)

# Read the two new columns straight from the Hugging Face Dataset.
# (`as_numpy_iterator()` is a tf.data.Dataset method and does not exist on a
# Hugging Face Dataset, and an array of row dicts cannot be indexed by a
# string key, so the columns are converted to numpy arrays directly.)
y_pred = np.array(y_pred_dataset['y_pred'])
y_true = np.array(y_pred_dataset['y_true'])





Map:   0%|          | 0/80858 [00:00<?, ? examples/s]

In [56]:
# Display the held-out split (its features and number of rows).
data['test']

Dataset({
    features: ['question1', 'question2', 'is_duplicate', '__index_level_0__'],
    num_rows: 80858
})