In [None]:
# Colab-only setup: mount Google Drive at /content/drive so the dataset and
# model paths used throughout this notebook (under /content/drive/MyDrive) resolve.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


# D1 Train-Test Split

In [None]:
import csv

import pandas as pd

# Column names applied to both D1 files (the files' own header row is skipped).
MSRP_COLUMNS = ['label', 'id1', 'id2', 'sentence1', 'sentence2']


def load_msrp(path):
    """Read one tab-separated D1 (MSR paraphrase) split into a DataFrame.

    Args:
        path: Location of the tab-separated file; its first line is a header
            and is replaced by MSRP_COLUMNS.

    Returns:
        DataFrame with an integer 'label' column and string 'sentence1' /
        'sentence2' columns (plus 'id1' and 'id2' as parsed).
    """
    frame = pd.read_csv(
        path,
        delimiter='\t',
        names=MSRP_COLUMNS,
        skiprows=1,
        # The file is plain TSV, not quoted CSV. With pandas' default quoting a
        # literal double quote inside a sentence opens a "quoted field" that
        # swallows the following tabs/newlines; the resulting malformed rows
        # were then silently discarded by on_bad_lines='skip'.
        # QUOTE_NONE makes pandas treat quote characters as ordinary text.
        quoting=csv.QUOTE_NONE,
        # Kept as a safety net; with quoting disabled it should no longer fire.
        on_bad_lines='skip',
    )
    # Integer labels for the metrics; sentences forced to str so that a missing
    # value cannot reach the string-based classifiers as a float.
    return frame.astype({'label': int, 'sentence1': str, 'sentence2': str})


train_df = load_msrp("/content/drive/MyDrive/A2-VA/D1/msr_paraphrase_train.txt")
test_df = load_msrp("/content/drive/MyDrive/A2-VA/D1/msr_paraphrase_test.txt")

# Print dataset sizes
# NOTE(review): the published MSRP split is, to my knowledge, 4,076 train /
# 1,725 test pairs; confirm the sizes printed here match after re-running.
print(f"Training Set Size: {len(train_df)}, Test Set Size: {len(test_df)}")


Training Set Size: 3938, Test Set Size: 1639


# Load D2

In [None]:
# Load the D2 sentence pairs and display the frame. Downstream cells read the
# 'sentence1', 'sentence2' and 'label' columns.
# NOTE(review): the 'Unnamed: 0' column looks like a previously saved index;
# index_col=0 would presumably drop it. Confirm before changing.
df_d2 = pd.read_csv('/content/drive/MyDrive/A2-VA/D2/D2.csv')
df_d2

Unnamed: 0,intent,sentence1,sentence2,label
0,Set a Timer,Set a timer for 10 minutes.,Start a 10-minute countdown.,1
1,Set a Timer,Set a timer for 10 minutes.,Please time 10 minutes for me.,1
2,Set a Timer,Set a timer for 10 minutes.,Create a 10-minute timer.,0
3,Set a Timer,Set a timer for 10 minutes.,Begin a countdown for 10 minutes.,0
4,Set a Timer,Set a timer for 10 minutes.,Can you activate a timer for 10 minutes?,0
...,...,...,...,...
2170,Turn on the Lights,Turn on the chandelier.,Switch on the garden lights.,0
2171,Turn on the Lights,Turn on the chandelier.,Turn on all indoor lights.,1
2172,Turn on the Lights,Activate the smart bulbs.,Switch on the garden lights.,0
2173,Turn on the Lights,Activate the smart bulbs.,Turn on all indoor lights.,0


In [None]:
from sklearn.model_selection import train_test_split
# Hold out 20% of D2 rows as the test split; random_state is fixed so the same
# rows are selected on every run.
# NOTE(review): the split is row-wise, and the displayed frame shows the same
# sentence1 repeated across many rows, so identical sentences can presumably
# appear in both splits. Confirm whether a group-wise split is wanted.
df_d2_train, df_d2_test = train_test_split(df_d2, test_size=0.2, random_state=42)
print(f"Training Set Size: {len(df_d2_train)}, Test Set Size: {len(df_d2_test)}")

Training Set Size: 1740, Test Set Size: 435


# Baseline Model

In [None]:
# Baseline classifier (Exact Match)
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
def baseline_classifier(sentence1, sentence2):
    """Exact-match paraphrase baseline.

    Both sentences are stripped of surrounding whitespace and lower-cased
    before comparison.

    Returns:
        1 when the normalised sentences are identical, otherwise 0.
    """
    normalized_first = sentence1.strip().lower()
    normalized_second = sentence2.strip().lower()
    return int(normalized_first == normalized_second)

# Evaluate Baseline Model
def evaluate_baseline(df):
    """Score the exact-match baseline on a labelled sentence-pair frame.

    Args:
        df: DataFrame with string columns 'sentence1' and 'sentence2' and a
            binary 'label' column.

    Prints accuracy, precision, recall and F1 (binary averaging) on one line.
    Returns None.
    """
    predictions = [
        baseline_classifier(first, second)
        for first, second in zip(df['sentence1'], df['sentence2'])
    ]
    accuracy = accuracy_score(df['label'], predictions)
    # The exact-match rule may predict no positives at all, which leaves
    # precision (and F1) undefined. zero_division=0 reports them as 0.0
    # explicitly instead of emitting an UndefinedMetricWarning.
    precision, recall, f1, _ = precision_recall_fscore_support(
        df['label'], predictions, average='binary', zero_division=0
    )
    print(f'Baseline Model - Accuracy: {accuracy:.4f}, Precision: {precision:.4f}, Recall: {recall:.4f}, F1-score: {f1:.4f}')



## Test on D1

In [None]:
# Exact-match baseline on the D1 test split.
evaluate_baseline(test_df)

Baseline Model - Accuracy: 0.3362, Precision: 0.0000, Recall: 0.0000, F1-score: 0.0000


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


## Test on D2

In [None]:
# Exact-match baseline on the held-out 20% of D2.
evaluate_baseline(df_d2_test)

Baseline Model - Accuracy: 0.5241, Precision: 0.0000, Recall: 0.0000, F1-score: 0.0000


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


# Pretrained model: paraphrase-mpnet-base-v2

## Test on D1

In [None]:
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from torch.nn.functional import softmax
from sentence_transformers import SentenceTransformer
import numpy as np
from numpy.linalg import norm
from tqdm import tqdm


# Load Pre-trained Transformer Model
def load_transformer_model():
    """Load the pretrained sentence-embedding model and its tokenizer.

    Returns:
        (tokenizer, model): the AutoTokenizer and the SentenceTransformer, both
        loaded from the "sentence-transformers/paraphrase-mpnet-base-v2"
        checkpoint.
    """
    checkpoint = "sentence-transformers/paraphrase-mpnet-base-v2"
    embedding_model = SentenceTransformer(checkpoint)
    return AutoTokenizer.from_pretrained(checkpoint), embedding_model

def calculate_embeddings(sentence1, sentence2, tokenizer, embedding_model):
  """Encode a sentence pair with `embedding_model`.

  Each sentence is passed to `embedding_model.encode` as a one-element list.
  `tokenizer` is accepted but not used here.

  Returns:
    (embedding of sentence1, embedding of sentence2), each exactly as returned
    by `encode`.
  """
  first_encoded = embedding_model.encode([sentence1])
  second_encoded = embedding_model.encode([sentence2])
  return first_encoded, second_encoded

def transformer_classifier(query_embedding, passage_embeddings, similarity_threshold):
  """Label a pair as paraphrase (1) or not (0) from its embedding similarity.

  The similarity is the dot product of the two embeddings divided by the
  product of their norms; element [0][0] of that result is compared against
  the threshold.

  Returns:
    1 if the similarity is strictly greater than `similarity_threshold`, else 0.
  """
  scale = norm(query_embedding) * norm(passage_embeddings)
  cosine = (np.dot(query_embedding, passage_embeddings.T) / scale)[0][0]
  return int(cosine > similarity_threshold)


# Evaluate Transformer Model
def evaluate_transformer(df, tokenizer, model):
  """Sweep similarity thresholds for the embedding model and print metrics for each.

  Args:
    df: DataFrame with 'sentence1', 'sentence2' and a binary 'label' column.
    tokenizer: Forwarded to `calculate_embeddings`.
    model: Embedding model forwarded to `calculate_embeddings`.

  Every pair is embedded once; the embeddings are then reused for each
  threshold from 0.5 to 0.85 in steps of 0.05. One metrics line is printed per
  threshold. Returns None.
  """
  embedding_pairs = [
      calculate_embeddings(first, second, tokenizer, model)
      for first, second in tqdm(zip(df['sentence1'], df['sentence2']), total=len(df))
  ]
  # np.arange with a fractional step accumulates float error, which surfaced as
  # thresholds such as 0.6000000000000001. Rounding pins the grid to the
  # intended two-decimal values.
  for threshold in np.round(np.arange(0.5, 0.9, 0.05), 2):
    predictions = [transformer_classifier(e1, e2, threshold) for e1, e2 in embedding_pairs]
    accuracy = accuracy_score(df['label'], predictions)
    # zero_division=0: a high threshold may yield no positive predictions;
    # report the undefined precision/F1 as 0.0 instead of warning.
    precision, recall, f1, _ = precision_recall_fscore_support(
        df['label'], predictions, average='binary', zero_division=0
    )
    print(f'Transformer Model - Threshold: {threshold}, Accuracy: {accuracy:.4f}, Precision: {precision:.4f}, Recall: {recall:.4f}, F1-score: {f1:.4f}')

# Load the embedding model once (the D2 cell below reuses `tokenizer` and
# `model`) and run the threshold sweep on the D1 test split.
tokenizer, model = load_transformer_model()
evaluate_transformer(test_df, tokenizer, model)



Evaluating Transformer Model on Test Set...


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/229 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/122 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/3.73k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/594 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/438M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/1.19k [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

1_Pooling%2Fconfig.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

100%|██████████| 1639/1639 [01:05<00:00, 24.97it/s]


Transformer Model - Threshold: 0.5, Accuracy: 0.6919, Precision: 0.6867, Recall: 0.9853, F1-score: 0.8094
Transformer Model - Threshold: 0.55, Accuracy: 0.7077, Precision: 0.6997, Recall: 0.9807, F1-score: 0.8167
Transformer Model - Threshold: 0.6000000000000001, Accuracy: 0.7303, Precision: 0.7197, Recall: 0.9724, F1-score: 0.8272
Transformer Model - Threshold: 0.6500000000000001, Accuracy: 0.7413, Precision: 0.7378, Recall: 0.9467, F1-score: 0.8293
Transformer Model - Threshold: 0.7000000000000002, Accuracy: 0.7492, Precision: 0.7570, Recall: 0.9164, F1-score: 0.8291
Transformer Model - Threshold: 0.7500000000000002, Accuracy: 0.7523, Precision: 0.7851, Recall: 0.8631, F1-score: 0.8222
Transformer Model - Threshold: 0.8000000000000003, Accuracy: 0.7224, Precision: 0.8100, Recall: 0.7601, F1-score: 0.7843
Transformer Model - Threshold: 0.8500000000000003, Accuracy: 0.6736, Precision: 0.8469, Recall: 0.6204, F1-score: 0.7162


## Test on D2

In [None]:
# Same threshold sweep on the held-out D2 split, reusing the already loaded model.
evaluate_transformer(df_d2_test, tokenizer, model)

100%|██████████| 435/435 [00:11<00:00, 38.25it/s]


Transformer Model - Threshold: 0.5, Accuracy: 0.4897, Precision: 0.4785, Recall: 0.8068, F1-score: 0.6007
Transformer Model - Threshold: 0.55, Accuracy: 0.4874, Precision: 0.4739, Recall: 0.7005, F1-score: 0.5653
Transformer Model - Threshold: 0.6000000000000001, Accuracy: 0.4851, Precision: 0.4641, Recall: 0.5314, F1-score: 0.4955
Transformer Model - Threshold: 0.6500000000000001, Accuracy: 0.4966, Precision: 0.4659, Recall: 0.3961, F1-score: 0.4282
Transformer Model - Threshold: 0.7000000000000002, Accuracy: 0.5103, Precision: 0.4750, Recall: 0.2754, F1-score: 0.3486
Transformer Model - Threshold: 0.7500000000000002, Accuracy: 0.5218, Precision: 0.4912, Recall: 0.1353, F1-score: 0.2121
Transformer Model - Threshold: 0.8000000000000003, Accuracy: 0.5287, Precision: 0.5333, Recall: 0.0773, F1-score: 0.1350
Transformer Model - Threshold: 0.8500000000000003, Accuracy: 0.5379, Precision: 0.7500, Recall: 0.0435, F1-score: 0.0822


# Cross Encoder Model

## Training function

In [None]:
from sentence_transformers import CrossEncoder, InputExample
from torch.utils.data import DataLoader
import torch

def train_cross_encoder(training_data, model_path,
                        base_model='cross-encoder/stsb-roberta-base',
                        epochs=10, batch_size=32):
  """Fine-tune a pretrained cross-encoder on labelled sentence pairs and save it.

  Args:
    training_data: DataFrame with 'sentence1', 'sentence2' and 'label' columns;
      other columns are ignored.
    model_path: Directory the fine-tuned model is saved to.
    base_model: Name or path of the pretrained cross-encoder to start from.
    epochs: Number of passes over the training data.
    batch_size: Number of pairs per training batch.

  Returns:
    The fine-tuned CrossEncoder (the same object that was saved).
  """
  # Load a pre-trained cross-encoder model
  model = CrossEncoder(base_model)

  # One InputExample per row: the two sentences as texts, the row's label as target.
  train_samples = [
      InputExample(texts=[first, second], label=label)
      for first, second, label in zip(
          training_data['sentence1'], training_data['sentence2'], training_data['label']
      )
  ]

  # Shuffled batches for training
  train_dataloader = DataLoader(train_samples, shuffle=True, batch_size=batch_size)

  # Train the model
  model.fit(train_dataloader=train_dataloader, epochs=epochs, show_progress_bar=True)

  # Save the trained model
  model.save(model_path)
  return model


## Evaluator Function

In [None]:
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from torch.nn.functional import softmax
from sentence_transformers import SentenceTransformer, CrossEncoder, InputExample
from torch.utils.data import DataLoader
import numpy as np
from numpy.linalg import norm
from tqdm import tqdm

def evaluate_saved_cross_encoder(df, model_path, threshold=0.5):
    """Load a saved cross-encoder, score `df`, and print classification metrics.

    Args:
        df: DataFrame with 'sentence1', 'sentence2' and a binary 'label' column.
        model_path: Directory of a model previously written by `CrossEncoder.save`.
        threshold: Scores greater than or equal to this value are labelled 1,
            all others 0.

    Prints accuracy, precision, recall and F1 (binary averaging) on one line.
    Returns None, so calling it as the last line of a cell echoes nothing extra.
    """
    # Load the saved cross-encoder model
    model = CrossEncoder(model_path)

    # Score every (sentence1, sentence2) pair in row order
    sentence_pairs = list(zip(df['sentence1'], df['sentence2']))
    predictions = model.predict(sentence_pairs)

    # Convert scores to class labels (0 or 1)
    predicted_labels = [1 if score >= threshold else 0 for score in predictions]

    # Evaluate the model
    accuracy = accuracy_score(df['label'], predicted_labels)
    # zero_division=0: if no pair is predicted positive, precision/F1 are
    # undefined; report them as 0.0 instead of warning.
    precision, recall, f1, _ = precision_recall_fscore_support(
        df['label'], predicted_labels, average='binary', zero_division=0
    )

    print(f"Cross-Encoder Model - Accuracy: {accuracy:.4f}, Precision: {precision:.4f}, Recall: {recall:.4f}, F1-score: {f1:.4f}")



## Train on D1

In [None]:
# Fine-tune the cross-encoder on the D1 training split and save it to Drive.
# `model_d1` is the fine-tuned model returned by the training function.
model_d1 = train_cross_encoder(train_df, "/content/drive/MyDrive/A2-VA/D1/trained_cross_encoder_model_d1")


config.json:   0%|          | 0.00/608 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/499M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/142 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/772 [00:00<?, ?B/s]

Epoch:   0%|          | 0/10 [00:00<?, ?it/s]

Iteration:   0%|          | 0/124 [00:00<?, ?it/s]

Iteration:   0%|          | 0/124 [00:00<?, ?it/s]

Iteration:   0%|          | 0/124 [00:00<?, ?it/s]

Iteration:   0%|          | 0/124 [00:00<?, ?it/s]

Iteration:   0%|          | 0/124 [00:00<?, ?it/s]

Iteration:   0%|          | 0/124 [00:00<?, ?it/s]

Iteration:   0%|          | 0/124 [00:00<?, ?it/s]

Iteration:   0%|          | 0/124 [00:00<?, ?it/s]

Iteration:   0%|          | 0/124 [00:00<?, ?it/s]

Iteration:   0%|          | 0/124 [00:00<?, ?it/s]

### Test on D1

In [None]:
# D1-trained cross-encoder scored on the D1 test split.
evaluate_saved_cross_encoder(test_df, "/content/drive/MyDrive/A2-VA/D1/trained_cross_encoder_model_d1")

Cross-Encoder Model - Accuracy: 0.8700, Precision: 0.8981, Recall: 0.9072, F1-score: 0.9026


### Test on D2

In [None]:
# D1-trained cross-encoder scored on the D2 test split (cross-dataset check).
evaluate_saved_cross_encoder(df_d2_test, "/content/drive/MyDrive/A2-VA/D1/trained_cross_encoder_model_d1")

Saved Cross-Encoder Model - Accuracy: 0.5241, Precision: 0.5000, Recall: 0.3720, F1-score: 0.4266


## Train on D2

In [None]:
# Fine-tune the cross-encoder on the D2 training split (the 80% side of the
# split) and save it to Drive. `model_d2` is the returned fine-tuned model.
model_d2 = train_cross_encoder(df_d2_train, "/content/drive/MyDrive/A2-VA/D2/trained_cross_encoder_model_d2")


Epoch:   0%|          | 0/10 [00:00<?, ?it/s]

Iteration:   0%|          | 0/55 [00:00<?, ?it/s]

Iteration:   0%|          | 0/55 [00:00<?, ?it/s]

Iteration:   0%|          | 0/55 [00:00<?, ?it/s]

Iteration:   0%|          | 0/55 [00:00<?, ?it/s]

Iteration:   0%|          | 0/55 [00:00<?, ?it/s]

Iteration:   0%|          | 0/55 [00:00<?, ?it/s]

Iteration:   0%|          | 0/55 [00:00<?, ?it/s]

Iteration:   0%|          | 0/55 [00:00<?, ?it/s]

Iteration:   0%|          | 0/55 [00:00<?, ?it/s]

Iteration:   0%|          | 0/55 [00:00<?, ?it/s]

### Test on D1

In [None]:
# D2-trained cross-encoder scored on the D1 test split (cross-dataset check).
evaluate_saved_cross_encoder(test_df, "/content/drive/MyDrive/A2-VA/D2/trained_cross_encoder_model_d2")

Saved Cross-Encoder Model - Accuracy: 0.7688, Precision: 0.7597, Recall: 0.9531, F1-score: 0.8455


### Test on D2

In [None]:
# D2-trained cross-encoder scored on the D2 test split.
evaluate_saved_cross_encoder(df_d2_test, "/content/drive/MyDrive/A2-VA/D2/trained_cross_encoder_model_d2")

Saved Cross-Encoder Model - Accuracy: 0.5172, Precision: 0.4914, Recall: 0.4155, F1-score: 0.4503


## Train on D1+D2

In [None]:
# Stack the D1 and D2 training splits into one frame with a fresh 0..n-1 index,
# then fine-tune a single cross-encoder on the union and save it to Drive.
# The two frames do not share all columns; the training function only reads
# 'sentence1', 'sentence2' and 'label', which both provide.
df_d1_d2 = pd.concat([train_df, df_d2_train], ignore_index=True)
model_d1_d2 = train_cross_encoder(df_d1_d2, "/content/drive/MyDrive/A2-VA/D1_D2/trained_cross_encoder_model_d1_d2")

Epoch:   0%|          | 0/10 [00:00<?, ?it/s]

Iteration:   0%|          | 0/178 [00:00<?, ?it/s]

Iteration:   0%|          | 0/178 [00:00<?, ?it/s]

Iteration:   0%|          | 0/178 [00:00<?, ?it/s]

Iteration:   0%|          | 0/178 [00:00<?, ?it/s]

Iteration:   0%|          | 0/178 [00:00<?, ?it/s]

Iteration:   0%|          | 0/178 [00:00<?, ?it/s]

Iteration:   0%|          | 0/178 [00:00<?, ?it/s]

Iteration:   0%|          | 0/178 [00:00<?, ?it/s]

Iteration:   0%|          | 0/178 [00:00<?, ?it/s]

Iteration:   0%|          | 0/178 [00:00<?, ?it/s]

### Test on D1

In [None]:
# Cross-encoder trained on D1+D2, scored on the D1 test split.
evaluate_saved_cross_encoder(test_df, "/content/drive/MyDrive/A2-VA/D1_D2/trained_cross_encoder_model_d1_d2")

Saved Cross-Encoder Model - Accuracy: 0.8737, Precision: 0.9045, Recall: 0.9053, F1-score: 0.9049


### Test on D2

In [None]:
# Cross-encoder trained on D1+D2, scored on the D2 test split.
evaluate_saved_cross_encoder(df_d2_test, "/content/drive/MyDrive/A2-VA/D1_D2/trained_cross_encoder_model_d1_d2")

Saved Cross-Encoder Model - Accuracy: 0.5103, Precision: 0.4828, Recall: 0.4058, F1-score: 0.4409


# Misclassified Examples

In [None]:
# `test_predictions_d2` was not assigned anywhere in the notebook, so this cell
# only worked from leftover kernel state. Compute the scores here so the cell
# runs on a fresh kernel.
# NOTE(review): the D2-trained model is assumed, by analogy with the D1 cell
# below, which scores D1 with the D1-trained model. Confirm which model the
# original analysis used.
model_path_d2 = "/content/drive/MyDrive/A2-VA/D2/trained_cross_encoder_model_d2"
model_d2 = CrossEncoder(model_path_d2)
test_predictions_d2 = model_d2.predict(list(zip(df_d2_test['sentence1'], df_d2_test['sentence2'])))

# Threshold the scores once (>= 0.5 -> 1) and reuse the labels below.
predicted_labels_d2 = [1 if score >= 0.5 else 0 for score in test_predictions_d2]

# Positions (0-based, in df_d2_test row order) where prediction and label disagree.
misclassified_indices = [i for i, (pred, true) in enumerate(zip(predicted_labels_d2, df_d2_test['label'])) if pred != true]

for i in misclassified_indices:
  row = df_d2_test.iloc[i]
  print(f"Index: {i}")
  print(f"Sentence 1: {row['sentence1']}")
  print(f"Sentence 2: {row['sentence2']}")
  print(f"Predicted Label: {predicted_labels_d2[i]}")
  print(f"True Label: {row['label']}")
  print("---")


Index: 1
Sentence 1: Let John know I’m on my way.
Sentence 2: Text John and ask if he's free.
Predicted Label: 0
True Label: 1
---
Index: 5
Sentence 1: Tell John I’ll call him later.
Sentence 2: Tell John to call me when he's free.
Predicted Label: 1
True Label: 0
---
Index: 11
Sentence 1: Create a 10-minute timer.
Sentence 2: Put on a countdown for 10 minutes.
Predicted Label: 1
True Label: 0
---
Index: 12
Sentence 1: Text John and say hello.
Sentence 2: Tell John I’ll text him later.
Predicted Label: 0
True Label: 1
---
Index: 13
Sentence 1: Can you brighten the room?
Sentence 2: Turn on the chandelier.
Predicted Label: 0
True Label: 1
---
Index: 15
Sentence 1: Let John know I’m on my way.
Sentence 2: Write a text message to John.
Predicted Label: 0
True Label: 1
---
Index: 17
Sentence 1: Remind me in 10 minutes.
Sentence 2: Set an alarm for 10 minutes from now.
Predicted Label: 0
True Label: 1
---
Index: 18
Sentence 1: Play upbeat tracks.
Sentence 2: Turn on some tunes.
Predicted La

In [None]:
# Reload the D1-trained cross-encoder from Drive and score every D1 test pair.
model_path_d1 = "/content/drive/MyDrive/A2-VA/D1/trained_cross_encoder_model_d1"
model_d1 = CrossEncoder(model_path_d1)
d1_test_pairs = [(row['sentence1'], row['sentence2']) for _, row in test_df.iterrows()]
test_predictions_d1 = model_d1.predict(d1_test_pairs)

# Scores at or above 0.5 count as a predicted paraphrase (1), anything else as 0.
predicted_labels_d1 = [int(score >= 0.5) for score in test_predictions_d1]

# Positions (0-based, in test_df row order) where prediction and label disagree.
misclassified_indices_d1 = [
    position
    for position, (predicted, actual) in enumerate(zip(predicted_labels_d1, test_df['label']))
    if predicted != actual
]

# Print each misclassified pair with its predicted and true label.
for i in misclassified_indices_d1:
  example = test_df.iloc[i]
  print(f"Index: {i}")
  print(f"Sentence 1: {example['sentence1']}")
  print(f"Sentence 2: {example['sentence2']}")
  print(f"Predicted Label: {predicted_labels_d1[i]}")
  print(f"True Label: {example['label']}")
  print("---")


Index: 3
Sentence 1: A tropical storm rapidly developed in the Gulf of Mexico Sunday and was expected to hit somewhere along the Texas or Louisiana coasts by Monday night.
Sentence 2: A tropical storm rapidly developed in the Gulf of Mexico on Sunday and could have hurricane-force winds when it hits land somewhere along the Louisiana coast Monday night.
Predicted Label: 1
True Label: 0
---
Index: 15
Sentence 2: In the memo, Ballmer reiterated the open-source threat to Microsoft.
Predicted Label: 1
True Label: 0
---
Index: 22
Sentence 1: Senator Clinton should be ashamed of herself for playing politics with the important issue of homeland security funding, he said.
Sentence 2: She should be ashamed of herself for playing politics with this important issue, said state budget division spokesman Andrew Rush.
Predicted Label: 0
True Label: 1
---
Index: 41
Sentence 1: Thanks to the euro's rise against the Japanese currency, the dollar was at 117.24 yen, well above the overnight 10-month low 