In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

A. Basic Text Entailment using simple rule - based methods 

In [1]:
import pandas as pd
import nltk
from sklearn.metrics import accuracy_score
nltk.download('punkt')
data = pd.read_csv('/kaggle/input/textual-entailment-dataset/validation.csv')
def preprocess(text):
    return nltk.word_tokenize(text.lower())

data['sentence1_tokens'] = data['text1'].apply(preprocess)
data['sentence2_tokens'] = data['text2'].apply(preprocess)

def simple_rule_based_entailment(s1, s2):
    return set(s2).issubset(set(s1))


data['prediction'] = data.apply(lambda row: simple_rule_based_entailment(row['sentence1_tokens'], row['sentence2_tokens']), axis=1)
accuracy = accuracy_score(data['label'], data['prediction'])
print(f'Accuracy: {accuracy}')


# Sample Input:
sentence1 = "The cat is on the mat."
sentence2 = "The mat has a cat."

# Preprocess the sample input
sentence1_tokens = preprocess(sentence1)
sentence2_tokens = preprocess(sentence2)

# Check if entailment is predicted
is_entailment = simple_rule_based_entailment(sentence1_tokens, sentence2_tokens)
print(f'Entailment: {is_entailment}')


[nltk_data] Downloading package punkt to /usr/share/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
Accuracy: 0.33876719307182884
Entailment: False


B. Natural Language Inference with BERT 

In [2]:
from datasets import load_dataset
from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments
import torch

# Step 2: Load the Dataset
dataset = load_dataset('snli')

# Check the first few examples to understand the structure
print(dataset['train'].features)  # Check the features of the training dataset
print(dataset['train'][0:5])       # Print the first 5 examples from the training dataset

# Step 3: Preprocess the Data
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Step 4: Load the Pre-Trained BERT Model
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=3)  # Initialize the model here

# Check if a GPU is available and move the model to GPU if it is
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
model.to(device)  # Now the model is defined, so this won't throw an error

def preprocess_function(examples):
    return tokenizer(examples['premise'], examples['hypothesis'], truncation=True, padding='max_length', max_length=128)

# Apply preprocessing to the dataset (train, validation, and test splits)
encoded_dataset = dataset.map(preprocess_function, batched=True)

# Check the structure of the dataset again
print(encoded_dataset)

# Step 5: Inspect the label column directly to understand its structure
print("Label examples:")
print(encoded_dataset['train']['label'][0:5])  # Print the first 5 labels

# Step 6: Identify unique labels
unique_labels = set(encoded_dataset['train']['label'])
print(f"Unique labels in the dataset: {unique_labels}")

# Step 7: Define label mapping and handle unexpected labels
label_dict = {0: 0, 1: 1, 2: 2}  # Adjust this as necessary based on your labels

# Step 8: Map the labels correctly, handle unexpected labels
def map_labels(example):
    # Use the label_dict for mapping, and set a default for unexpected labels
    label = example['label']
    return {'labels': label_dict.get(label, -1)}  # Map to -1 if the label is unexpected

encoded_dataset = encoded_dataset.map(map_labels)

# Set the format for PyTorch
encoded_dataset.set_format(type='torch', columns=['input_ids', 'attention_mask', 'labels'])

# Step 9: Set Up Training Arguments and Trainer
training_args = TrainingArguments(
    output_dir='./results',          # Output directory
    evaluation_strategy='epoch',     # Evaluation during each epoch
    per_device_train_batch_size=16,  # Batch size for training
    per_device_eval_batch_size=16,   # Batch size for evaluation
    num_train_epochs=3,              # Number of training epochs
    weight_decay=0.01,               # Strength of L2 regularization
    logging_dir='./logs',            # Directory for logs
)

# Initialize the Trainer with the model, training arguments, and datasets
trainer = Trainer(
    model=model,                         # The BERT model for training
    args=training_args,                  # Training arguments
    train_dataset=encoded_dataset['train'],  # Training dataset
    eval_dataset=encoded_dataset['validation'],  # Validation dataset
)

# Step 10: Train the Model
trainer.train()

# Step 11: Evaluate the Model
eval_results = trainer.evaluate()
print(f"Evaluation Results: {eval_results}")

# Step 12: Make Predictions
premise = "A man inspects the uniform of a figure in some East Asian country."
hypothesis = "The man is sleeping."

# Tokenize the input example
inputs = tokenizer(premise, hypothesis, return_tensors='pt', padding=True, truncation=True, max_length=128).to(device)  # Move inputs to the same device as the model

# Get model prediction
model.eval()
with torch.no_grad():
    outputs = model(**inputs)
    predicted_label = torch.argmax(outputs.logits).item()

# Convert prediction to human-readable label
label_map = {0: 'entailment', 1: 'contradiction', 2: 'neutral'}
print(f"Predicted Label: {label_map[predicted_label]}")


{'premise': Value(dtype='string', id=None), 'hypothesis': Value(dtype='string', id=None), 'label': ClassLabel(names=['entailment', 'neutral', 'contradiction'], id=None)}
{'premise': ['A person on a horse jumps over a broken down airplane.', 'A person on a horse jumps over a broken down airplane.', 'A person on a horse jumps over a broken down airplane.', 'Children smiling and waving at camera', 'Children smiling and waving at camera'], 'hypothesis': ['A person is training his horse for a competition.', 'A person is at a diner, ordering an omelette.', 'A person is outdoors, on a horse.', 'They are smiling at their parents', 'There are children present'], 'label': [1, 2, 0, 1, 0]}


model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Map:   0%|          | 0/10000 [00:00<?, ? examples/s]

Map:   0%|          | 0/10000 [00:00<?, ? examples/s]

Map:   0%|          | 0/550152 [00:00<?, ? examples/s]

DatasetDict({
    test: Dataset({
        features: ['premise', 'hypothesis', 'label', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 10000
    })
    validation: Dataset({
        features: ['premise', 'hypothesis', 'label', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 10000
    })
    train: Dataset({
        features: ['premise', 'hypothesis', 'label', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 550152
    })
})
Label examples:
[1, 2, 0, 1, 0]
Unique labels in the dataset: {0, 1, 2, -1}


Map:   0%|          | 0/10000 [00:00<?, ? examples/s]

Map:   0%|          | 0/10000 [00:00<?, ? examples/s]

Map:   0%|          | 0/550152 [00:00<?, ? examples/s]

[34m[1mwandb[0m: Using wandb-core as the SDK backend. Please refer to https://wandb.me/wandb-core for more information.
[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
[34m[1mwandb[0m: Paste an API key from your profile and hit enter, or press ctrl+c to quit:

  ········································


[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


VBox(children=(Label(value='Waiting for wandb.init()...\r'), FloatProgress(value=0.011113265377778387, max=1.0…

Epoch,Training Loss,Validation Loss


IndexError: Target -1 is out of bounds.

C. Sentence Pair classification using Siamese Network 

In [4]:
pip install sentence_transformers 

Collecting sentence_transformers
  Downloading sentence_transformers-3.2.0-py3-none-any.whl.metadata (10 kB)
Downloading sentence_transformers-3.2.0-py3-none-any.whl (255 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m255.2/255.2 kB[0m [31m6.8 MB/s[0m eta [36m0:00:00[0mta [36m0:00:01[0m
[?25hInstalling collected packages: sentence_transformers
Successfully installed sentence_transformers-3.2.0
Note: you may need to restart the kernel to use updated packages.


In [5]:
from sentence_transformers import SentenceTransformer
from tensorflow.keras import layers, models, optimizers
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
import numpy as np

# 1. Preprocess the Data: Load dataset and generate embeddings
data = pd.read_csv('/kaggle/input/task-finding-semantic-textual-similarity/Text_Similarity_Dataset.csv')

# Load a pre-trained Sentence Transformer model for generating sentence embeddings
model = SentenceTransformer('paraphrase-MiniLM-L6-v2')

# Get sentence embeddings
data['sentence1_embedding'] = data['sentence1'].apply(lambda x: model.encode(x))
data['sentence2_embedding'] = data['sentence2'].apply(lambda x: model.encode(x))

# Convert embeddings to numpy array
X1 = np.array(data['sentence1_embedding'].tolist())
X2 = np.array(data['sentence2_embedding'].tolist())
y = data['similarity_label'].values  # Assuming the dataset has a column for similarity labels

# Split the data into training and test sets
X1_train, X1_test, X2_train, X2_test, y_train, y_test = train_test_split(X1, X2, y, test_size=0.2, random_state=42)

# 2. Define the Siamese Network Architecture
# Input layers for the two sentences
input1 = layers.Input(shape=(X1.shape[1],))
input2 = layers.Input(shape=(X1.shape[1],))

# Dense layers for feature extraction
dense_layer = layers.Dense(128, activation='relu')
encoded1 = dense_layer(input1)
encoded2 = dense_layer(input2)

# Concatenate the extracted features and add a final dense layer for output
merged = layers.concatenate([encoded1, encoded2])
output = layers.Dense(1, activation='sigmoid')(merged)

# Define the Siamese model
siamese_model = models.Model(inputs=[input1, input2], outputs=output)

# Compile the model
siamese_model.compile(optimizer=optimizers.Adam(learning_rate=0.001), loss='binary_crossentropy', metrics=['accuracy'])

# 3. Train the model
siamese_model.fit([X1_train, X2_train], y_train, epochs=5, batch_size=32, validation_split=0.2)

# 4. Evaluate the Model
# Predict on the test set
y_pred = siamese_model.predict([X1_test, X2_test])

# Convert predictions to binary (similar or not similar)
y_pred = (y_pred > 0.5).astype(int)

# Calculate accuracy
accuracy = accuracy_score(y_test, y_pred)
print(f'Test Accuracy: {accuracy:.4f}')

modules.json:   0%|          | 0.00/229 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/122 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/3.73k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/629 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/314 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]



1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

KeyError: 'sentence1'