In [1]:
# Packages Installation
import pandas as pd
from transformers import Trainer, TrainingArguments, T5ForConditionalGeneration, T5Tokenizer
from datasets import load_metric
from sklearn.model_selection import train_test_split
import re

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
Metadata = pd.read_csv('C:\\24592774_Dataset_Discovery_Using_LLM\\MetaData_Creation\\MetaData_Notebooks\\Prepared_MetaData_DataSet.csv')

In [3]:
MetaData_DS = Metadata.copy()

In [4]:
MetaData_DS['text'] = MetaData_DS['title']+MetaData_DS['description']+MetaData_DS['summary']+MetaData_DS['tags']

In [5]:
import torch
import torch.nn.functional as F
from transformers import T5Tokenizer, T5Model, AdamW
from sklearn.model_selection import train_test_split
from torch.utils.data import DataLoader, TensorDataset
import torch.nn as nn
from sklearn.metrics import precision_recall_fscore_support
from torch.optim.lr_scheduler import ReduceLROnPlateau

# Initialize T5 tokenizer
tokenizer = T5Tokenizer.from_pretrained('t5-base')

# Define maximum sequence length
max_length = 128  

# Tokenize dataset descriptions
tokenized_descriptions = tokenizer(list(MetaData_DS['description']), truncation=True, padding=True, max_length=max_length, return_tensors='pt')

# Convert labels to numerical format
labels = [1 if url else 0 for url in MetaData_DS['dataset_url']]

# Extract input_ids and attention_mask
input_ids = tokenized_descriptions['input_ids']
attention_mask = tokenized_descriptions['attention_mask']

# Split data into training and validation sets
train_ids, val_ids, train_masks, val_masks, train_labels, val_labels = train_test_split(input_ids, attention_mask, labels, test_size=0.2, random_state=42)

# Convert tokenized texts and labels to PyTorch tensors
train_ids = torch.tensor(train_ids)
val_ids = torch.tensor(val_ids)
train_masks = torch.tensor(train_masks)
val_masks = torch.tensor(val_masks)
train_labels = torch.tensor(train_labels)
val_labels = torch.tensor(val_labels)

# Create TensorDataset for training and validation sets
train_dataset = TensorDataset(train_ids, train_masks, train_labels)
val_dataset = TensorDataset(val_ids, val_masks, val_labels)

# Define batch size
batch_size = 16

# Create DataLoader for training and validation sets
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)

# Define the T5-based model with increased dropout
class T5Classifier(nn.Module):
    def __init__(self, num_classes):
        super(T5Classifier, self).__init__()
        self.t5 = T5Model.from_pretrained('t5-base')
        self.dropout = nn.Dropout(0.3)  # Dropout rate
        self.linear = nn.Linear(self.t5.config.d_model, num_classes)
    
    def forward(self, input_ids, attention_mask):
        outputs = self.t5.encoder(input_ids=input_ids, attention_mask=attention_mask)
        hidden_state = outputs.last_hidden_state[:, 0, :]  # Take [CLS] token representation
        hidden_state = self.dropout(hidden_state)
        logits = self.linear(hidden_state)
        return logits

# Instantiate the T5 model
num_classes = 2  
model = T5Classifier(num_classes)

# Define the optimizer with weight decay and gradient clipping
optimizer = AdamW(model.parameters(), lr=2e-5, weight_decay=0.01)
scheduler = ReduceLROnPlateau(optimizer, mode='min', factor=0.1, patience=1, verbose=True)
criterion = nn.CrossEntropyLoss()

# Training loop with early stopping
epochs = 10
patience = 2
best_loss = float('inf')
trigger_times = 0

for epoch in range(epochs):
    model.train()  # Set the model to training mode
    total_loss = 0
    total_correct = 0
    total_samples = 0
    for batch in train_loader:
        inputs, attention_masks, labels = batch

        # Clear gradients
        optimizer.zero_grad()

        # Forward pass
        outputs = model(inputs, attention_masks)
        # Compute loss
        loss = criterion(outputs, labels)

        # Backpropagation
        loss.backward()

        # Gradient clipping
        torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)

        # Update model parameters
        optimizer.step()

        # Accumulate the total loss
        total_loss += loss.item()

        # Calculate accuracy for this batch
        _, predicted = torch.max(outputs, dim=1)
        total_correct += (predicted == labels).sum().item()
        total_samples += labels.size(0)

    avg_train_loss = total_loss / len(train_loader)
    train_accuracy = total_correct / total_samples

    # Validation loop
    model.eval()  # Set the model to evaluation mode
    val_loss = 0
    total_correct = 0
    total_samples = 0
    all_predictions = []
    all_labels = []
    with torch.no_grad():
        for batch in val_loader:
            inputs, attention_masks, labels = batch

            # Forward pass
            outputs = model(inputs, attention_masks)
            loss = criterion(outputs, labels)
            val_loss += loss.item()
            
            # Compute predictions
            _, predicted = torch.max(outputs, dim=1)

            # Update total samples and total correct predictions
            total_samples += labels.size(0)
            total_correct += (predicted == labels).sum().item()

            # Save predictions and labels for metric calculation
            all_predictions.extend(predicted.cpu().numpy())
            all_labels.extend(labels.cpu().numpy())

    avg_val_loss = val_loss / len(val_loader)
    accuracy = total_correct / total_samples
    scheduler.step(avg_val_loss)
    
    precision, recall, f1, _ = precision_recall_fscore_support(all_labels, all_predictions, average='binary')

    print(f"Epoch {epoch + 1}/{epochs}")
    print(f"Train Loss: {avg_train_loss}, Train Accuracy: {train_accuracy}")
    print(f"Validation Loss: {avg_val_loss}, Validation Accuracy: {accuracy}")
    print(f"Validation Precision: {precision}, Validation Recall: {recall}, Validation F1 Score: {f1}")

    # Early stopping
    if avg_val_loss < best_loss:
        best_loss = avg_val_loss
        trigger_times = 0
    else:
        trigger_times += 1
        if trigger_times >= patience:
            print('Early stopping!')
            break


You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
  train_ids = torch.tensor(train_ids)
  val_ids = torch.tensor(val_ids)
  train_masks = torch.tensor(train_masks)
  val_masks = torch.tensor(val_masks)


Epoch 1/10
Train Loss: 0.43093207478523254, Train Accuracy: 1.0
Validation Loss: 0.3218614310026169, Validation Accuracy: 1.0
Validation Precision: 1.0, Validation Recall: 1.0, Validation F1 Score: 1.0
Epoch 2/10
Train Loss: 0.363043999671936, Train Accuracy: 0.9701492537313433
Validation Loss: 0.24489563703536987, Validation Accuracy: 1.0
Validation Precision: 1.0, Validation Recall: 1.0, Validation F1 Score: 1.0
Epoch 3/10
Train Loss: 0.29445854425430296, Train Accuracy: 1.0
Validation Loss: 0.1874486580491066, Validation Accuracy: 1.0
Validation Precision: 1.0, Validation Recall: 1.0, Validation F1 Score: 1.0
Epoch 4/10
Train Loss: 0.2625820904970169, Train Accuracy: 1.0
Validation Loss: 0.14426851272583008, Validation Accuracy: 1.0
Validation Precision: 1.0, Validation Recall: 1.0, Validation F1 Score: 1.0
Epoch 5/10
Train Loss: 0.21354144513607026, Train Accuracy: 1.0
Validation Loss: 0.11172188073396683, Validation Accuracy: 1.0
Validation Precision: 1.0, Validation Recall: 1.0, 

In [6]:
import torch
import torch.nn.functional as F
from transformers import T5Tokenizer, T5Model, AdamW
from sklearn.model_selection import train_test_split
from torch.utils.data import DataLoader, TensorDataset
import torch.nn as nn
from sklearn.metrics import precision_recall_fscore_support
from torch.optim.lr_scheduler import ReduceLROnPlateau

# Initialize T5 tokenizer
tokenizer = T5Tokenizer.from_pretrained('t5-base')

# Define maximum sequence length
max_length = 128  

# Tokenize dataset descriptions
tokenized_descriptions = tokenizer(list(MetaData_DS['text']), truncation=True, padding=True, max_length=max_length, return_tensors='pt')

# Convert labels to numerical format
labels = [1 if url else 0 for url in MetaData_DS['dataset_url']]

# Extract input_ids and attention_mask
input_ids = tokenized_descriptions['input_ids']
attention_mask = tokenized_descriptions['attention_mask']

# Split data into training and validation sets
train_ids, val_ids, train_masks, val_masks, train_labels, val_labels = train_test_split(input_ids, attention_mask, labels, test_size=0.2, random_state=42)

# Convert tokenized texts and labels to PyTorch tensors
train_ids = torch.tensor(train_ids)
val_ids = torch.tensor(val_ids)
train_masks = torch.tensor(train_masks)
val_masks = torch.tensor(val_masks)
train_labels = torch.tensor(train_labels)
val_labels = torch.tensor(val_labels)

# Create TensorDataset for training and validation sets
train_dataset = TensorDataset(train_ids, train_masks, train_labels)
val_dataset = TensorDataset(val_ids, val_masks, val_labels)

# Define batch size
batch_size = 16

# Create DataLoader for training and validation sets
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)

# Define the T5-based model with increased dropout
class T5Classifier(nn.Module):
    def __init__(self, num_classes):
        super(T5Classifier, self).__init__()
        self.t5 = T5Model.from_pretrained('t5-base')
        self.dropout = nn.Dropout(0.3)  # Dropout rate
        self.linear = nn.Linear(self.t5.config.d_model, num_classes)
    
    def forward(self, input_ids, attention_mask):
        outputs = self.t5.encoder(input_ids=input_ids, attention_mask=attention_mask)
        hidden_state = outputs.last_hidden_state[:, 0, :]  # Take [CLS] token representation
        hidden_state = self.dropout(hidden_state)
        logits = self.linear(hidden_state)
        return logits

# Instantiate the T5 model
num_classes = 2 
model = T5Classifier(num_classes)

# Define the optimizer with weight decay and gradient clipping
optimizer = AdamW(model.parameters(), lr=2e-5, weight_decay=0.01)
scheduler = ReduceLROnPlateau(optimizer, mode='min', factor=0.1, patience=1, verbose=True)
criterion = nn.CrossEntropyLoss()

# Training loop with early stopping
epochs = 10
patience = 2
best_loss = float('inf')
trigger_times = 0

for epoch in range(epochs):
    model.train()  # Set the model to training mode
    total_loss = 0
    total_correct = 0
    total_samples = 0
    for batch in train_loader:
        inputs, attention_masks, labels = batch

        # Clear gradients
        optimizer.zero_grad()

        # Forward pass
        outputs = model(inputs, attention_masks)
        # Compute loss
        loss = criterion(outputs, labels)

        # Backpropagation
        loss.backward()

        # Gradient clipping
        torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)

        # Update model parameters
        optimizer.step()

        # Accumulate the total loss
        total_loss += loss.item()

        # Calculate accuracy for this batch
        _, predicted = torch.max(outputs, dim=1)
        total_correct += (predicted == labels).sum().item()
        total_samples += labels.size(0)

    avg_train_loss = total_loss / len(train_loader)
    train_accuracy = total_correct / total_samples

    # Validation loop
    model.eval()  # Set the model to evaluation mode
    val_loss = 0
    total_correct = 0
    total_samples = 0
    all_predictions = []
    all_labels = []
    with torch.no_grad():
        for batch in val_loader:
            inputs, attention_masks, labels = batch

            # Forward pass
            outputs = model(inputs, attention_masks)
            loss = criterion(outputs, labels)
            val_loss += loss.item()
            
            # Compute predictions
            _, predicted = torch.max(outputs, dim=1)

            # Update total samples and total correct predictions
            total_samples += labels.size(0)
            total_correct += (predicted == labels).sum().item()

            # Save predictions and labels for metric calculation
            all_predictions.extend(predicted.cpu().numpy())
            all_labels.extend(labels.cpu().numpy())

    avg_val_loss = val_loss / len(val_loader)
    accuracy = total_correct / total_samples
    scheduler.step(avg_val_loss)
    
    precision, recall, f1, _ = precision_recall_fscore_support(all_labels, all_predictions, average='binary')

    print(f"Epoch {epoch + 1}/{epochs}")
    print(f"Train Loss: {avg_train_loss}, Train Accuracy: {train_accuracy}")
    print(f"Validation Loss: {avg_val_loss}, Validation Accuracy: {accuracy}")
    print(f"Validation Precision: {precision}, Validation Recall: {recall}, Validation F1 Score: {f1}")

    # Early stopping
    if avg_val_loss < best_loss:
        best_loss = avg_val_loss
        trigger_times = 0
    else:
        trigger_times += 1
        if trigger_times >= patience:
            print('Early stopping!')
            break


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
  train_ids = torch.tensor(train_ids)
  val_ids = torch.tensor(val_ids)
  train_masks = torch.tensor(train_masks)
  val_masks = torch.tensor(val_masks)


Epoch 1/10
Train Loss: 0.714879310131073, Train Accuracy: 0.417910447761194
Validation Loss: 0.5631105601787567, Validation Accuracy: 1.0
Validation Precision: 1.0, Validation Recall: 1.0, Validation F1 Score: 1.0
Epoch 2/10
Train Loss: 0.5656410336494446, Train Accuracy: 0.8507462686567164
Validation Loss: 0.4415822774171829, Validation Accuracy: 1.0
Validation Precision: 1.0, Validation Recall: 1.0, Validation F1 Score: 1.0
Epoch 3/10
Train Loss: 0.45174430012702943, Train Accuracy: 0.9850746268656716
Validation Loss: 0.34335102140903473, Validation Accuracy: 1.0
Validation Precision: 1.0, Validation Recall: 1.0, Validation F1 Score: 1.0
Epoch 4/10
Train Loss: 0.34579399824142454, Train Accuracy: 1.0
Validation Loss: 0.25897257030010223, Validation Accuracy: 1.0
Validation Precision: 1.0, Validation Recall: 1.0, Validation F1 Score: 1.0
Epoch 5/10
Train Loss: 0.2665411353111267, Train Accuracy: 1.0
Validation Loss: 0.1863223910331726, Validation Accuracy: 1.0
Validation Precision: 1.

In [7]:
from transformers import T5Tokenizer, T5Model
import torch
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

# Load T5 tokenizer and model
tokenizer = T5Tokenizer.from_pretrained('t5-base')
model = T5Model.from_pretrained('t5-base')
model.eval()

# Function to get T5 embedding
def get_t5_embedding(text, model, tokenizer, max_length=128):
    inputs = tokenizer(text, return_tensors='pt', truncation=True, padding=True, max_length=max_length)
    with torch.no_grad():
        outputs = model.encoder(**inputs)
    # Use the last hidden state of the [CLS] token as the embedding
    return outputs.last_hidden_state[:, 0, :].squeeze().numpy()

dataset_descriptions = MetaData_DS['description']

# Generate embeddings for all dataset descriptions
dataset_embeddings = []
for description in dataset_descriptions:
    emb = get_t5_embedding(description, model, tokenizer)
    dataset_embeddings.append(emb)

# Function to predict dataset URLs based on search text
def predict_dataset_url(search_text):
    search_emb = get_t5_embedding(search_text, model, tokenizer)
    similarities = [cosine_similarity([search_emb], [dataset_emb])[0][0] for dataset_emb in dataset_embeddings]
    
    # Find the maximum and minimum similarity values
    max_similarity = max(similarities)
    min_similarity = min(similarities)
    
    # Calculate the 90% range
    range_90 = 0.90 * (max_similarity - min_similarity)
    
    # Find the threshold value
    threshold = min_similarity + range_90
    
    # Find the indices of datasets with similarity above the threshold
    similar_indices = [i for i, sim in enumerate(similarities) if sim > threshold]
    
    results = []
    for idx in similar_indices:
        dataset_info = {
            "title": MetaData_DS.iloc[idx]['title'],
            "dataset_url": MetaData_DS.iloc[idx]['dataset_url'],
            "cosine_similarity": similarities[idx]
        }
        results.append(dataset_info)
    
    return results

# Example usage
search_text = "electric vehicles"
results = predict_dataset_url(search_text)

# Display results
if not results:
    print("No datasets found above the 90% threshold.")
else:
    for result in results:
        print(f"Title: {result['title']}")
        print(f"Dataset URL: {result['dataset_url']}")
        print(f"Cosine Similarity: {result['cosine_similarity']}\n")


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Title: ['test1234']
Dataset URL: https://data.world/kgs1/test1234
Cosine Similarity: 0.16620910167694092

Title: ['legalmap']
Dataset URL: https://data.world/h0tftw/legalmap
Cosine Similarity: 0.16620910167694092

Title: ['iptv', 'subscription', 'service', 'go']
Dataset URL: https://data.world/freemotion/which-iptv-subscription-service-should-you-go-for
Cosine Similarity: 0.16620910167694092

Title: ['recognized', 'sports']
Dataset URL: https://data.world/sports/recognized-sports
Cosine Similarity: 0.16620910167694092

Title: ['sports', 'illustrated', 'covers']
Dataset URL: https://data.world/crowdflower/sports-illustrated-covers
Cosine Similarity: 0.16620910167694092



In [8]:
from transformers import T5Tokenizer, T5Model
import torch
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

# Load T5 tokenizer and model
tokenizer = T5Tokenizer.from_pretrained('t5-base')
model = T5Model.from_pretrained('t5-base')
model.eval()

# Function to get T5 embedding
def get_t5_embedding(text, model, tokenizer, max_length=128):
    inputs = tokenizer(text, return_tensors='pt', truncation=True, padding=True, max_length=max_length)
    with torch.no_grad():
        outputs = model.encoder(**inputs)
    # Use the last hidden state of the [CLS] token as the embedding
    return outputs.last_hidden_state[:, 0, :].squeeze().numpy()

dataset_descriptions = MetaData_DS['text']

# Generate embeddings for all dataset descriptions
dataset_embeddings = []
for description in dataset_descriptions:
    emb = get_t5_embedding(description, model, tokenizer)
    dataset_embeddings.append(emb)

# Function to predict dataset URLs based on search text
def predict_dataset_url(search_text):
    search_emb = get_t5_embedding(search_text, model, tokenizer)
    similarities = [cosine_similarity([search_emb], [dataset_emb])[0][0] for dataset_emb in dataset_embeddings]
    
    # Find the maximum and minimum similarity values
    max_similarity = max(similarities)
    min_similarity = min(similarities)
    
    # Calculate the 90% range
    range_90 = 0.90 * (max_similarity - min_similarity)
    
    # Find the threshold value
    threshold = min_similarity + range_90
    
    # Find the indices of datasets with similarity above the threshold
    similar_indices = [i for i, sim in enumerate(similarities) if sim > threshold]
    
    results = []
    for idx in similar_indices:
        dataset_info = {
            "title": MetaData_DS.iloc[idx]['title'],
            "dataset_url": MetaData_DS.iloc[idx]['dataset_url'],
            "cosine_similarity": similarities[idx]
        }
        results.append(dataset_info)
    
    return results

# Example usage
search_text = "electric vehicles"
results = predict_dataset_url(search_text)

# Display results
if not results:
    print("No datasets found above the 90% threshold.")
else:
    for result in results:
        print(f"Title: {result['title']}")
        print(f"Dataset URL: {result['dataset_url']}")
        print(f"Cosine Similarity: {result['cosine_similarity']}\n")


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Title: ['test1234']
Dataset URL: https://data.world/kgs1/test1234
Cosine Similarity: 0.13548383116722107

Title: ['legalmap']
Dataset URL: https://data.world/h0tftw/legalmap
Cosine Similarity: 0.1409246027469635

