In [10]:
# Packages Installation
import pandas as pd
from transformers import Trainer, TrainingArguments, T5ForConditionalGeneration, T5Tokenizer
from datasets import load_metric
from sklearn.model_selection import train_test_split
import re

In [11]:
Metadata = pd.read_csv('C:\\24592774_Dataset_Discovery_Using_LLM\\MetaData_Creation\\MetaData_Notebooks\\Prepared_MetaData_DataSet.csv')

In [12]:
Metadata_DS = Metadata.copy()

In [13]:
Metadata_DS['text'] = Metadata_DS['title']+Metadata_DS['description']+Metadata_DS['summary']+Metadata_DS['tags']

In [7]:
import torch
import torch.nn.functional as F
from transformers import ElectraTokenizer, ElectraModel, AdamW
from sklearn.model_selection import train_test_split
from torch.utils.data import DataLoader, TensorDataset
import torch.nn as nn
from sklearn.metrics import precision_recall_fscore_support
from torch.optim.lr_scheduler import ReduceLROnPlateau

# Assuming MetaData_DS['description'] contains the dataset descriptions
MetaData_DS = Metadata.copy()

# Initialize ELECTRA tokenizer
tokenizer = ElectraTokenizer.from_pretrained('google/electra-base-discriminator')

# Define maximum sequence length
max_length = 128  # You can adjust this according to your dataset and memory constraints

# Tokenize dataset descriptions
tokenized_descriptions = tokenizer(list(MetaData_DS['description']), truncation=True, padding=True, max_length=max_length, return_tensors='pt')

# Convert labels to numerical format
labels = [1 if url else 0 for url in MetaData_DS['dataset_url']]

# Extract input_ids and attention_mask
input_ids = tokenized_descriptions['input_ids']
attention_mask = tokenized_descriptions['attention_mask']

# Split data into training and validation sets
train_ids, val_ids, train_masks, val_masks, train_labels, val_labels = train_test_split(input_ids, attention_mask, labels, test_size=0.2, random_state=42)

# Convert tokenized texts and labels to PyTorch tensors
train_ids = torch.tensor(train_ids)
val_ids = torch.tensor(val_ids)
train_masks = torch.tensor(train_masks)
val_masks = torch.tensor(val_masks)
train_labels = torch.tensor(train_labels)
val_labels = torch.tensor(val_labels)

# Create TensorDataset for training and validation sets
train_dataset = TensorDataset(train_ids, train_masks, train_labels)
val_dataset = TensorDataset(val_ids, val_masks, val_labels)

# Define batch size
batch_size = 16

# Create DataLoader for training and validation sets
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)

# Define the ELECTRA-based model with increased dropout
class ElectraClassifier(nn.Module):
    def __init__(self, num_classes):
        super(ElectraClassifier, self).__init__()
        self.electra = ElectraModel.from_pretrained('google/electra-base-discriminator')
        self.dropout = nn.Dropout(0.3)  # Dropout rate
        self.linear = nn.Linear(self.electra.config.hidden_size, num_classes)
    
    def forward(self, input_ids, attention_mask):
        outputs = self.electra(input_ids=input_ids, attention_mask=attention_mask)
        hidden_state = outputs.last_hidden_state[:, 0]  # Take [CLS] token representation
        hidden_state = self.dropout(hidden_state)
        logits = self.linear(hidden_state)
        return logits

# Instantiate the ELECTRA model
num_classes = 2  # Adjust according to your task
model = ElectraClassifier(num_classes)

# Define the optimizer with weight decay and gradient clipping
optimizer = AdamW(model.parameters(), lr=2e-5, weight_decay=0.01)
scheduler = ReduceLROnPlateau(optimizer, mode='min', factor=0.1, patience=1, verbose=True)
criterion = nn.CrossEntropyLoss()

# Training loop with early stopping
epochs = 10
patience = 2
best_loss = float('inf')
trigger_times = 0

for epoch in range(epochs):
    model.train()  # Set the model to training mode
    total_loss = 0
    total_correct = 0
    total_samples = 0
    for batch in train_loader:
        inputs, attention_masks, labels = batch

        # Clear gradients
        optimizer.zero_grad()

        # Forward pass
        outputs = model(inputs, attention_masks)
        # Compute loss
        loss = criterion(outputs, labels)

        # Backpropagation
        loss.backward()

        # Gradient clipping
        torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)

        # Update model parameters
        optimizer.step()

        # Accumulate the total loss
        total_loss += loss.item()

        # Calculate accuracy for this batch
        _, predicted = torch.max(outputs, dim=1)
        total_correct += (predicted == labels).sum().item()
        total_samples += labels.size(0)

    avg_train_loss = total_loss / len(train_loader)
    train_accuracy = total_correct / total_samples

    # Validation loop
    model.eval()  # Set the model to evaluation mode
    val_loss = 0
    total_correct = 0
    total_samples = 0
    all_predictions = []
    all_labels = []
    with torch.no_grad():
        for batch in val_loader:
            inputs, attention_masks, labels = batch

            # Forward pass
            outputs = model(inputs, attention_masks)
            loss = criterion(outputs, labels)
            val_loss += loss.item()
            
            # Compute predictions
            _, predicted = torch.max(outputs, dim=1)

            # Update total samples and total correct predictions
            total_samples += labels.size(0)
                        # Update total samples and total correct predictions
            total_samples += labels.size(0)
            total_correct += (predicted == labels).sum().item()

            # Save predictions and labels for metric calculation
            all_predictions.extend(predicted.cpu().numpy())
            all_labels.extend(labels.cpu().numpy())

    avg_val_loss = val_loss / len(val_loader)
    accuracy = total_correct / total_samples
    scheduler.step(avg_val_loss)
    
    precision, recall, f1, _ = precision_recall_fscore_support(all_labels, all_predictions, average='binary')

    print(f"Epoch {epoch + 1}/{epochs}")
    print(f"Train Loss: {avg_train_loss}, Train Accuracy: {train_accuracy}")
    print(f"Validation Loss: {avg_val_loss}, Validation Accuracy: {accuracy}")
    print(f"Validation Precision: {precision}, Validation Recall: {recall}, Validation F1 Score: {f1}")

    # Early stopping
    if avg_val_loss < best_loss:
        best_loss = avg_val_loss
        trigger_times = 0
    else:
        trigger_times += 1
        if trigger_times >= patience:
            print('Early stopping!')
            break



  train_ids = torch.tensor(train_ids)
  val_ids = torch.tensor(val_ids)
  train_masks = torch.tensor(train_masks)
  val_masks = torch.tensor(val_masks)


Epoch 1/10
Train Loss: 0.24539896696805955, Train Accuracy: 0.9701492537313433
Validation Loss: 0.057477476075291634, Validation Accuracy: 0.5
Validation Precision: 1.0, Validation Recall: 1.0, Validation F1 Score: 1.0
Epoch 2/10
Train Loss: 0.04670276567339897, Train Accuracy: 1.0
Validation Loss: 0.010904028546065092, Validation Accuracy: 0.5
Validation Precision: 1.0, Validation Recall: 1.0, Validation F1 Score: 1.0
Epoch 3/10
Train Loss: 0.018700627610087395, Train Accuracy: 1.0
Validation Loss: 0.0038369958056136966, Validation Accuracy: 0.5
Validation Precision: 1.0, Validation Recall: 1.0, Validation F1 Score: 1.0
Epoch 4/10
Train Loss: 0.014899472892284393, Train Accuracy: 1.0
Validation Loss: 0.0019913959549739957, Validation Accuracy: 0.5
Validation Precision: 1.0, Validation Recall: 1.0, Validation F1 Score: 1.0
Epoch 5/10
Train Loss: 0.003919217688962817, Train Accuracy: 1.0
Validation Loss: 0.0011060097021982074, Validation Accuracy: 0.5
Validation Precision: 1.0, Validati

In [None]:
Meta

In [21]:
from transformers import ElectraTokenizer, ElectraModel
import torch
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

# Load ELECTRA tokenizer and model
tokenizer = ElectraTokenizer.from_pretrained('google/electra-base-discriminator')
model = ElectraModel.from_pretrained('google/electra-base-discriminator')
model.eval()

# Function to get ELECTRA embedding
def get_electra_embedding(text, model, tokenizer, max_length=128):
    inputs = tokenizer(text, return_tensors='pt', truncation=True, padding=True, max_length=max_length)
    with torch.no_grad():
        outputs = model(**inputs)
    # Use the last hidden state of the [CLS] token as the embedding
    return outputs.last_hidden_state[:, 0, :].squeeze().numpy()

# Assuming MetaData_DS['description'] contains the dataset descriptions
dataset_descriptions = Metadata['description']

# Generate embeddings for all dataset descriptions
dataset_embeddings = []
for description in dataset_descriptions:
    emb = get_electra_embedding(description, model, tokenizer)
    dataset_embeddings.append(emb)

# Function to predict dataset URLs based on search text
def predict_dataset_url(search_text):
    search_emb = get_electra_embedding(search_text, model, tokenizer)
    similarities = [cosine_similarity([search_emb], [dataset_emb])[0][0] for dataset_emb in dataset_embeddings]
    
    # Find the maximum and minimum similarity values
    max_similarity = max(similarities)
    min_similarity = min(similarities)
    
    # Calculate the 90% range
    range_90 = 0.90 * (max_similarity - min_similarity)
    
    # Find the threshold value
    threshold = min_similarity + range_90
    
    # Find the indices of datasets with similarity above the threshold
    similar_indices = [i for i, sim in enumerate(similarities) if sim > threshold]
    
    results = []
    for idx in similar_indices:
        dataset_info = {
            "title": Metadata.iloc[idx]['title'],
            "dataset_url": Metadata.iloc[idx]['dataset_url'],
            "cosine_similarity": similarities[idx]
        }
        results.append(dataset_info)
    
    return results

# Example usage
search_text = "electric vehicles"
results = predict_dataset_url(search_text)

# Display results
if not results:
    print("No datasets found above the 90% threshold.")
else:
    for result in results:
        print(f"Title: {result['title']}")
        print(f"Dataset URL: {result['dataset_url']}")
        print(f"Cosine Similarity: {result['cosine_similarity']}\n")


Title: ['json', 'repository']
Dataset URL: https://data.world/hdx/e66dbc70-17fe-4230-b9d6-855d192fc05c
Cosine Similarity: 0.7494983673095703

Title: ['houseboats']
Dataset URL: https://data.world/datagov-uk/0cd0d5c0-f170-4899-ba45-e7d227bbd0e4
Cosine Similarity: 0.7750325798988342

Title: ['thurrock', 'outdoor', 'sports']
Dataset URL: https://data.world/datagov-uk/17c44e3a-804c-487b-a07f-b90298685e2a
Cosine Similarity: 0.7471963167190552



In [22]:
from transformers import ElectraTokenizer, ElectraModel
import torch
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

# Load ELECTRA tokenizer and model
tokenizer = ElectraTokenizer.from_pretrained('google/electra-base-discriminator')
model = ElectraModel.from_pretrained('google/electra-base-discriminator')
model.eval()

# Function to get ELECTRA embedding
def get_electra_embedding(text, model, tokenizer, max_length=128):
    inputs = tokenizer(text, return_tensors='pt', truncation=True, padding=True, max_length=max_length)
    with torch.no_grad():
        outputs = model(**inputs)
    # Use the last hidden state of the [CLS] token as the embedding
    return outputs.last_hidden_state[:, 0, :].squeeze().numpy()

# Assuming MetaData_DS['description'] contains the dataset descriptions
dataset_descriptions = Metadata_DS['text']

# Generate embeddings for all dataset descriptions
dataset_embeddings = []
for description in dataset_descriptions:
    emb = get_electra_embedding(description, model, tokenizer)
    dataset_embeddings.append(emb)

# Function to predict dataset URLs based on search text
def predict_dataset_url(search_text):
    search_emb = get_electra_embedding(search_text, model, tokenizer)
    similarities = [cosine_similarity([search_emb], [dataset_emb])[0][0] for dataset_emb in dataset_embeddings]
    
    # Find the maximum and minimum similarity values
    max_similarity = max(similarities)
    min_similarity = min(similarities)
    
    # Calculate the 90% range
    range_90 = 0.90 * (max_similarity - min_similarity)
    
    # Find the threshold value
    threshold = min_similarity + range_90
    
    # Find the indices of datasets with similarity above the threshold
    similar_indices = [i for i, sim in enumerate(similarities) if sim > threshold]
    
    results = []
    for idx in similar_indices:
        dataset_info = {
            "title": Metadata_DS.iloc[idx]['title'],
            "dataset_url": Metadata_DS.iloc[idx]['dataset_url'],
            "cosine_similarity": similarities[idx]
        }
        results.append(dataset_info)
    
    return results

# Example usage
search_text = "electric vehicles"
results = predict_dataset_url(search_text)

# Display results
if not results:
    print("No datasets found above the 90% threshold.")
else:
    for result in results:
        print(f"Title: {result['title']}")
        print(f"Dataset URL: {result['dataset_url']}")
        print(f"Cosine Similarity: {result['cosine_similarity']}\n")


Title: ['impact', 'uncoordinated', 'plugin', 'electric', 'vehicle', 'charging']
Dataset URL: https://data.world/us-doe-gov/8ae7e117-313b-40b1-b146-83add97d400b
Cosine Similarity: 0.7266891002655029

Title: ['impact', 'uncoordinated', 'plugin', 'electric', 'vehicle', 'charging']
Dataset URL: https://data.world/us-doe-gov/3f032a3c-7dc0-4f54-9f51-534b2e248f80
Cosine Similarity: 0.7266891002655029

Title: ['coronavirus', 'daily', 'data']
Dataset URL: https://data.world/markmarkoh/coronavirus-data
Cosine Similarity: 0.7453581094741821

Title: ['fashion', 'images', 'dataset']
Dataset URL: https://data.world/crawlfeeds/fashion-images-dataset
Cosine Similarity: 0.74165940284729

Title: ['site', 'g03', 'gasconade', 'river', 'bathymetry', 'structure', 'a1411', '89']
Dataset URL: https://data.world/us-doi-gov/12749821-f445-4f35-8dee-81eb7a56f07d
Cosine Similarity: 0.7338127493858337

Title: ['twitter', 'dataset', '100', 'million', 'tweets', 'related', 'covid19']
Dataset URL: https://data.world/rt