In [1]:
import pandas as pd
from transformers import Trainer, TrainingArguments, T5ForConditionalGeneration, T5Tokenizer
from datasets import load_metric
from sklearn.model_selection import train_test_split
import re

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
Metadata = pd.read_csv('C:\\24592774_Dataset_Discovery_Using_LLM\\MetaData_Creation\\MetaData_Notebooks\\Prepared_MetaData_DataSet.csv')

In [3]:
Metadata.head()


Unnamed: 0,title,description,summary,tags,dataset_url,available_formats
0,"['2023w7', 'global', 'electric', 'vehicle', 'm...","['global', 'electric', 'vehicle', 'market', 's...","['editorsimple', 'original', 'visualization', ...","['makeover monday', 'cars', 'vehicles', 'elect...",https://data.world/makeovermonday/2023w7,['xlsx']
1,"['connected', 'electric', 'autonomous', 'vehic...","['connected', 'electric', 'autonomous', 'vehic...","['rolling', 'snapshot', 'data', 'collected', '...",['autonomous'],https://data.world/smartcolumbusos/650b7e59-af...,['csv']
2,"['electric', 'vehicle', 'charging', 'stations']","['dataset', 'contains', 'session', 'details', ...","['dataset', 'contains', 'session', 'details', ...","['electric vehicle', 'environment', 'energy']",https://data.world/townofcary/electric-vehicle...,"['dbf', 'csv', 'shx', 'shp', 'json', 'prj']"
3,"['nyserda', 'electric', 'vehicle', 'drive', 'c...","['new', 'york', 'state', '’', 'charge', 'ny', ...","['original', 'title', 'nyserda', 'electric', '...","['ev', 'electric vehicle', 'bev', 'phev', 'ghg...",https://data.world/data-ny-gov/thd2-fu8y,['csv']
4,"['impact', 'uncoordinated', 'plugin', 'electri...","['impact', 'uncoordinated', 'plugin', 'electri...","['original', 'title', 'impact', 'uncoordinated...","['battery', 'consumption', 'data', 'energy', '...",https://data.world/us-doe-gov/8ae7e117-313b-40...,['xlsx']


In [4]:
import pandas as pd

# Check for null values in each column
null_values = Metadata.isnull().sum()
print("Null Values:")
print(null_values)

# Check for NA values in each column
na_values = Metadata.isna().sum()
print("\nNA Values:")
print(na_values)

# Check for NaN values in each column
nan_values = Metadata.isna().sum()
print("\nNaN Values:")
print(nan_values)


Null Values:
title                0
description          0
summary              0
tags                 0
dataset_url          0
available_formats    0
dtype: int64

NA Values:
title                0
description          0
summary              0
tags                 0
dataset_url          0
available_formats    0
dtype: int64

NaN Values:
title                0
description          0
summary              0
tags                 0
dataset_url          0
available_formats    0
dtype: int64


In [5]:
import torch
import torch.nn.functional as F
from transformers import AlbertTokenizer, AlbertModel, AdamW
from sklearn.model_selection import train_test_split
from torch.utils.data import DataLoader, TensorDataset
import torch.nn as nn
from sklearn.metrics import precision_recall_fscore_support
from torch.optim.lr_scheduler import ReduceLROnPlateau

MetaData_DS = Metadata.copy()

# ALBERT tokenizer
tokenizer = AlbertTokenizer.from_pretrained('albert-base-v2')

max_length = 128  

# Tokenize dataset descriptions
tokenized_descriptions = tokenizer(list(MetaData_DS['description']), truncation=True, padding=True, max_length=max_length, return_tensors='pt')

labels = [1 if url else 0 for url in MetaData_DS['dataset_url']]

# Extractions
input_ids = tokenized_descriptions['input_ids']
attention_mask = tokenized_descriptions['attention_mask']

# training and validation sets splitting
train_ids, val_ids, train_masks, val_masks, train_labels, val_labels = train_test_split(input_ids, attention_mask, labels, test_size=0.2, random_state=42)

train_ids = torch.tensor(train_ids)
val_ids = torch.tensor(val_ids)
train_masks = torch.tensor(train_masks)
val_masks = torch.tensor(val_masks)
train_labels = torch.tensor(train_labels)
val_labels = torch.tensor(val_labels)

#  TensorDataset for training and validation sets creation
train_dataset = TensorDataset(train_ids, train_masks, train_labels)
val_dataset = TensorDataset(val_ids, val_masks, val_labels)


batch_size = 16

train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)

# Define the ALBERT-based model 
class AlbertClassifier(nn.Module):
    def __init__(self, num_classes):
        super(AlbertClassifier, self).__init__()
        self.albert = AlbertModel.from_pretrained('albert-base-v2')
        self.dropout = nn.Dropout(0.3)  # Dropout rate
        self.linear = nn.Linear(self.albert.config.hidden_size, num_classes)
    
    def forward(self, input_ids, attention_mask):
        outputs = self.albert(input_ids=input_ids, attention_mask=attention_mask)
        hidden_state = outputs.last_hidden_state[:, 0]  # Take [CLS] token representation
        hidden_state = self.dropout(hidden_state)
        logits = self.linear(hidden_state)
        return logits

#  ALBERT model
num_classes = 2  
model = AlbertClassifier(num_classes)

# Define the optimizer with weight decay and gradient clipping
optimizer = AdamW(model.parameters(), lr=2e-5, weight_decay=0.01)
scheduler = ReduceLROnPlateau(optimizer, mode='min', factor=0.1, patience=1, verbose=True)
criterion = nn.CrossEntropyLoss()

# Training loop with early stopping
epochs = 10
patience = 2
best_loss = float('inf')
trigger_times = 0

for epoch in range(epochs):
    model.train()  # Set the model to training mode
    total_loss = 0
    total_correct = 0
    total_samples = 0
    for batch in train_loader:
        inputs, attention_masks, labels = batch

        # Clear gradients
        optimizer.zero_grad()

        # Forward pass
        outputs = model(inputs, attention_masks)
        # Compute loss
        loss = criterion(outputs, labels)

        # Backpropagation
        loss.backward()

        # Gradient clipping
        torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)

        # Update model parameters
        optimizer.step()

        # Accumulate the total loss
        total_loss += loss.item()

        # Calculate accuracy for this batch
        _, predicted = torch.max(outputs, dim=1)
        total_correct += (predicted == labels).sum().item()
        total_samples += labels.size(0)

    avg_train_loss = total_loss / len(train_loader)
    train_accuracy = total_correct / total_samples

    # Validation loop
    model.eval()  # Set the model to evaluation mode
    val_loss = 0
    total_correct = 0
    total_samples = 0
    all_predictions = []
    all_labels = []
    with torch.no_grad():
        for batch in val_loader:
            inputs, attention_masks, labels = batch

            # Forward pass
            outputs = model(inputs, attention_masks)
            loss = criterion(outputs, labels)
            val_loss += loss.item()
            
            # Compute predictions
            _, predicted = torch.max(outputs, dim=1)

            # Update total samples and total correct predictions
            total_samples += labels.size(0)
            total_correct += (predicted == labels).sum().item()

            # Save predictions and labels for metric calculation
            all_predictions.extend(predicted.cpu().numpy())
            all_labels.extend(labels.cpu().numpy())

    avg_val_loss = val_loss / len(val_loader)
    accuracy = total_correct / total_samples
    scheduler.step(avg_val_loss)
    
    precision, recall, f1, _ = precision_recall_fscore_support(all_labels, all_predictions, average='binary')

    print(f"Epoch {epoch + 1}/{epochs}")
    print(f"Train Loss: {avg_train_loss}, Train Accuracy: {train_accuracy}")
    print(f"Validation Loss: {avg_val_loss}, Validation Accuracy: {accuracy}")
    print(f"Validation Precision: {precision}, Validation Recall: {recall}, Validation F1 Score: {f1}")

    # Early stopping
    if avg_val_loss < best_loss:
        best_loss = avg_val_loss
        trigger_times = 0
    else:
        trigger_times += 1
        if trigger_times >= patience:
            print('Early stopping!')
            break


  train_ids = torch.tensor(train_ids)
  val_ids = torch.tensor(val_ids)
  train_masks = torch.tensor(train_masks)
  val_masks = torch.tensor(val_masks)


Epoch 1/10
Train Loss: 0.2304468871850986, Train Accuracy: 0.8059701492537313
Validation Loss: 0.00011329429253237322, Validation Accuracy: 1.0
Validation Precision: 1.0, Validation Recall: 1.0, Validation F1 Score: 1.0
Epoch 2/10
Train Loss: 0.00014587936893804, Train Accuracy: 1.0
Validation Loss: 8.16580268292455e-06, Validation Accuracy: 1.0
Validation Precision: 1.0, Validation Recall: 1.0, Validation F1 Score: 1.0
Epoch 3/10
Train Loss: 1.3201191723055673e-05, Train Accuracy: 1.0
Validation Loss: 4.436810968400096e-06, Validation Accuracy: 1.0
Validation Precision: 1.0, Validation Recall: 1.0, Validation F1 Score: 1.0
Epoch 4/10
Train Loss: 6.776504778827075e-06, Train Accuracy: 1.0
Validation Loss: 3.699206217788742e-06, Validation Accuracy: 1.0
Validation Precision: 1.0, Validation Recall: 1.0, Validation F1 Score: 1.0
Epoch 5/10
Train Loss: 5.848165801580762e-06, Train Accuracy: 1.0
Validation Loss: 3.397458840481704e-06, Validation Accuracy: 1.0
Validation Precision: 1.0, Val

In [7]:
MetaData_DS['text'] = MetaData_DS['title']+MetaData_DS['description']+MetaData_DS['summary']+MetaData_DS['tags']

In [8]:
import torch
import torch.nn.functional as F
from transformers import AlbertTokenizer, AlbertModel, AdamW
from sklearn.model_selection import train_test_split
from torch.utils.data import DataLoader, TensorDataset
import torch.nn as nn
from sklearn.metrics import precision_recall_fscore_support
from torch.optim.lr_scheduler import ReduceLROnPlateau


# Initialize ALBERT tokenizer
tokenizer = AlbertTokenizer.from_pretrained('albert-base-v2')

max_length = 128  
# Tokenize dataset descriptions
tokenized_descriptions = tokenizer(list(MetaData_DS['text']), truncation=True, padding=True, max_length=max_length, return_tensors='pt')

# Convert labels to numerical format
labels = [1 if url else 0 for url in MetaData_DS['dataset_url']]

# Extract input_ids and attention_mask
input_ids = tokenized_descriptions['input_ids']
attention_mask = tokenized_descriptions['attention_mask']

# Split data into training and validation sets
train_ids, val_ids, train_masks, val_masks, train_labels, val_labels = train_test_split(input_ids, attention_mask, labels, test_size=0.2, random_state=42)

# Convert tokenized texts and labels to PyTorch tensors
train_ids = torch.tensor(train_ids)
val_ids = torch.tensor(val_ids)
train_masks = torch.tensor(train_masks)
val_masks = torch.tensor(val_masks)
train_labels = torch.tensor(train_labels)
val_labels = torch.tensor(val_labels)

# Create TensorDataset for training and validation sets
train_dataset = TensorDataset(train_ids, train_masks, train_labels)
val_dataset = TensorDataset(val_ids, val_masks, val_labels)

# Define batch size
batch_size = 16

# Create DataLoader for training and validation sets
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)

# Define the ALBERT-based model with increased dropout
class AlbertClassifier(nn.Module):
    def __init__(self, num_classes):
        super(AlbertClassifier, self).__init__()
        self.albert = AlbertModel.from_pretrained('albert-base-v2')
        self.dropout = nn.Dropout(0.3)  # Dropout rate
        self.linear = nn.Linear(self.albert.config.hidden_size, num_classes)
    
    def forward(self, input_ids, attention_mask):
        outputs = self.albert(input_ids=input_ids, attention_mask=attention_mask)
        hidden_state = outputs.last_hidden_state[:, 0]  # Take [CLS] token representation
        hidden_state = self.dropout(hidden_state)
        logits = self.linear(hidden_state)
        return logits

# Instantiate the ALBERT model
num_classes = 2  
model = AlbertClassifier(num_classes)

# Define the optimizer with weight decay and gradient clipping
optimizer = AdamW(model.parameters(), lr=2e-5, weight_decay=0.01)
scheduler = ReduceLROnPlateau(optimizer, mode='min', factor=0.1, patience=1, verbose=True)
criterion = nn.CrossEntropyLoss()

# Training loop with early stopping
epochs = 10
patience = 2
best_loss = float('inf')
trigger_times = 0

for epoch in range(epochs):
    model.train()  # Set the model to training mode
    total_loss = 0
    total_correct = 0
    total_samples = 0
    for batch in train_loader:
        inputs, attention_masks, labels = batch

        # Clear gradients
        optimizer.zero_grad()

        # Forward pass
        outputs = model(inputs, attention_masks)
        # Compute loss
        loss = criterion(outputs, labels)

        # Backpropagation
        loss.backward()

        # Gradient clipping
        torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)

        # Update model parameters
        optimizer.step()

        # Accumulate the total loss
        total_loss += loss.item()

        # Calculate accuracy for this batch
        _, predicted = torch.max(outputs, dim=1)
        total_correct += (predicted == labels).sum().item()
        total_samples += labels.size(0)

    avg_train_loss = total_loss / len(train_loader)
    train_accuracy = total_correct / total_samples

    # Validation loop
    model.eval()  # Set the model to evaluation mode
    val_loss = 0
    total_correct = 0
    total_samples = 0
    all_predictions = []
    all_labels = []
    with torch.no_grad():
        for batch in val_loader:
            inputs, attention_masks, labels = batch

            # Forward pass
            outputs = model(inputs, attention_masks)
            loss = criterion(outputs, labels)
            val_loss += loss.item()
            
            # Compute predictions
            _, predicted = torch.max(outputs, dim=1)

            # Update total samples and total correct predictions
            total_samples += labels.size(0)
            total_correct += (predicted == labels).sum().item()

            # Save predictions and labels for metric calculation
            all_predictions.extend(predicted.cpu().numpy())
            all_labels.extend(labels.cpu().numpy())

    avg_val_loss = val_loss / len(val_loader)
    accuracy = total_correct / total_samples
    scheduler.step(avg_val_loss)
    
    precision, recall, f1, _ = precision_recall_fscore_support(all_labels, all_predictions, average='binary')

    print(f"Epoch {epoch + 1}/{epochs}")
    print(f"Train Loss: {avg_train_loss}, Train Accuracy: {train_accuracy}")
    print(f"Validation Loss: {avg_val_loss}, Validation Accuracy: {accuracy}")
    print(f"Validation Precision: {precision}, Validation Recall: {recall}, Validation F1 Score: {f1}")

    # Early stopping
    if avg_val_loss < best_loss:
        best_loss = avg_val_loss
        trigger_times = 0
    else:
        trigger_times += 1
        if trigger_times >= patience:
            print('Early stopping!')
            break


  train_ids = torch.tensor(train_ids)
  val_ids = torch.tensor(val_ids)
  train_masks = torch.tensor(train_masks)
  val_masks = torch.tensor(val_masks)


Epoch 1/10
Train Loss: 0.32286162620875986, Train Accuracy: 0.7761194029850746
Validation Loss: 0.00014088406169321388, Validation Accuracy: 1.0
Validation Precision: 1.0, Validation Recall: 1.0, Validation F1 Score: 1.0
Epoch 2/10
Train Loss: 7.695886652072658e-05, Train Accuracy: 1.0
Validation Loss: 5.006777200833312e-06, Validation Accuracy: 1.0
Validation Precision: 1.0, Validation Recall: 1.0, Validation F1 Score: 1.0
Epoch 3/10
Train Loss: 4.995345625502523e-06, Train Accuracy: 1.0
Validation Loss: 2.082434832573199e-06, Validation Accuracy: 1.0
Validation Precision: 1.0, Validation Recall: 1.0, Validation F1 Score: 1.0
Epoch 4/10
Train Loss: 3.4694752685027196e-06, Train Accuracy: 1.0
Validation Loss: 1.4379608614945028e-06, Validation Accuracy: 1.0
Validation Precision: 1.0, Validation Recall: 1.0, Validation F1 Score: 1.0
Epoch 5/10
Train Loss: 1.8805239733410418e-06, Train Accuracy: 1.0
Validation Loss: 1.1362127452230197e-06, Validation Accuracy: 1.0
Validation Precision: 1

In [9]:
import torch
import torch.nn.functional as F
from transformers import AlbertTokenizer, AlbertModel, AdamW
from sklearn.model_selection import train_test_split
from torch.utils.data import DataLoader, TensorDataset
import torch.nn as nn
from sklearn.metrics import precision_recall_fscore_support
from torch.optim.lr_scheduler import ReduceLROnPlateau

# Initialize ALBERT tokenizer
tokenizer = AlbertTokenizer.from_pretrained('albert-base-v2')

max_length = 128  

# Tokenize dataset descriptions
tokenized_descriptions = tokenizer(list(MetaData_DS['text']), truncation=True, padding=True, max_length=max_length, return_tensors='pt')

# Convert labels to numerical format
labels = [1 if url else 0 for url in MetaData_DS['dataset_url']]

# Extract input_ids and attention_mask
input_ids = tokenized_descriptions['input_ids']
attention_mask = tokenized_descriptions['attention_mask']

# Split data into training and validation sets
train_ids, val_ids, train_masks, val_masks, train_labels, val_labels = train_test_split(input_ids, attention_mask, labels, test_size=0.2, random_state=42)

# Convert tokenized texts and labels to PyTorch tensors
train_ids = torch.tensor(train_ids)
val_ids = torch.tensor(val_ids)
train_masks = torch.tensor(train_masks)
val_masks = torch.tensor(val_masks)
train_labels = torch.tensor(train_labels)
val_labels = torch.tensor(val_labels)

# Create TensorDataset for training and validation sets
train_dataset = TensorDataset(train_ids, train_masks, train_labels)
val_dataset = TensorDataset(val_ids, val_masks, val_labels)

# Define batch size
batch_size = 16

# Create DataLoader for training and validation sets
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)

# Define the ALBERT-based model with increased dropout
class AlbertClassifier(nn.Module):
    def __init__(self, num_classes):
        super(AlbertClassifier, self).__init__()
        self.albert = AlbertModel.from_pretrained('albert-base-v2')
        self.dropout = nn.Dropout(0.5)  
        self.linear = nn.Linear(self.albert.config.hidden_size, num_classes)
    
    def forward(self, input_ids, attention_mask):
        outputs = self.albert(input_ids=input_ids, attention_mask=attention_mask)
        pooled_output = outputs.pooler_output  # Use pooler_output instead of last_hidden_state
        pooled_output = self.dropout(pooled_output)
        logits = self.linear(pooled_output)
        return logits


# Instantiate the ALBERT model
num_classes = 2  
model = AlbertClassifier(num_classes)

# Define the optimizer with weight decay and gradient clipping
optimizer = AdamW(model.parameters(), lr=2e-5, weight_decay=0.01)
scheduler = ReduceLROnPlateau(optimizer, mode='min', factor=0.1, patience=1, verbose=True)
criterion = nn.CrossEntropyLoss()

# Training loop with early stopping
epochs = 3
patience = 2
best_loss = float('inf')
trigger_times = 0

for epoch in range(epochs):
    model.train()  # Set the model to training mode
    total_loss = 0
    total_correct = 0
    total_samples = 0
    for batch in train_loader:
        inputs, attention_masks, labels = batch

        # Clear gradients
        optimizer.zero_grad()

        # Forward pass
        outputs = model(inputs, attention_masks)
        # Compute loss
        loss = criterion(outputs, labels)

        # Backpropagation
        loss.backward()

        # Gradient clipping
        torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)

        # Update model parameters
        optimizer.step()

        # Accumulate the total loss
        total_loss += loss.item()

        # Calculate accuracy for this batch
        _, predicted = torch.max(outputs, dim=1)
        total_correct += (predicted == labels).sum().item()
        total_samples += labels.size(0)

    avg_train_loss = total_loss / len(train_loader)
    train_accuracy = total_correct / total_samples

    # Validation loop
    model.eval()  # Set the model to evaluation mode
    val_loss = 0
    total_correct = 0
    total_samples = 0
    all_predictions = []
    all_labels = []
    with torch.no_grad():
        for batch in val_loader:
            inputs, attention_masks, labels = batch

            # Forward pass
            outputs = model(inputs, attention_masks)
            loss = criterion(outputs, labels)
            val_loss += loss.item()
            
            # Compute predictions
            _, predicted = torch.max(outputs, dim=1)

            # Update total samples and total correct predictions
            total_samples += labels.size(0)
            total_correct += (predicted == labels).sum().item()

            # Save predictions and labels for metric calculation
            all_predictions.extend(predicted.cpu().numpy())
            all_labels.extend(labels.cpu().numpy())

    avg_val_loss = val_loss / len(val_loader)
    accuracy = total_correct / total_samples
    scheduler.step(avg_val_loss)
    
    precision, recall, f1, _ = precision_recall_fscore_support(all_labels, all_predictions, average='binary')

    print(f"Epoch {epoch + 1}/{epochs}")
    print(f"Train Loss: {avg_train_loss}, Train Accuracy: {train_accuracy}")
    print(f"Validation Loss: {avg_val_loss}, Validation Accuracy: {accuracy}")
    print(f"Validation Precision: {precision}, Validation Recall: {recall}, Validation F1 Score: {f1}")

    # Early stopping
    if avg_val_loss < best_loss:
        best_loss = avg_val_loss
        trigger_times = 0
    else:
        trigger_times += 1
        if trigger_times >= patience:
            print('Early stopping!')
            break


  train_ids = torch.tensor(train_ids)
  val_ids = torch.tensor(val_ids)
  train_masks = torch.tensor(train_masks)
  val_masks = torch.tensor(val_masks)


Epoch 1/3
Train Loss: 0.41791629791259766, Train Accuracy: 0.8208955223880597
Validation Loss: 0.07898492366075516, Validation Accuracy: 1.0
Validation Precision: 1.0, Validation Recall: 1.0, Validation F1 Score: 1.0
Epoch 2/3
Train Loss: 0.05128904189914465, Train Accuracy: 1.0
Validation Loss: 0.011473944410681725, Validation Accuracy: 1.0
Validation Precision: 1.0, Validation Recall: 1.0, Validation F1 Score: 1.0
Epoch 3/3
Train Loss: 0.0091940822545439, Train Accuracy: 1.0
Validation Loss: 0.0038527388824149966, Validation Accuracy: 1.0
Validation Precision: 1.0, Validation Recall: 1.0, Validation F1 Score: 1.0


In [10]:
import torch
import torch.nn.functional as F
from transformers import AlbertTokenizer, AlbertModel, AdamW
from sklearn.model_selection import train_test_split
from torch.utils.data import DataLoader, TensorDataset
import torch.nn as nn
from sklearn.metrics import precision_recall_fscore_support
from torch.optim.lr_scheduler import ReduceLROnPlateau


MetaData_DS = Metadata.copy()

# Initialize ALBERT tokenizer
tokenizer = AlbertTokenizer.from_pretrained('albert-base-v2')

max_length = 128  

# Tokenize dataset descriptions
tokenized_descriptions = tokenizer(list(MetaData_DS['description']), truncation=True, padding=True, max_length=max_length, return_tensors='pt')

# Convert labels to numerical format
labels = [1 if url else 0 for url in MetaData_DS['dataset_url']]

# Extract input_ids and attention_mask
input_ids = tokenized_descriptions['input_ids']
attention_mask = tokenized_descriptions['attention_mask']

# Split data into training and validation sets
train_ids, val_ids, train_masks, val_masks, train_labels, val_labels = train_test_split(input_ids, attention_mask, labels, test_size=0.2, random_state=42)

# Convert tokenized texts and labels to PyTorch tensors
train_ids = torch.tensor(train_ids)
val_ids = torch.tensor(val_ids)
train_masks = torch.tensor(train_masks)
val_masks = torch.tensor(val_masks)
train_labels = torch.tensor(train_labels)
val_labels = torch.tensor(val_labels)

# Create TensorDataset for training and validation sets
train_dataset = TensorDataset(train_ids, train_masks, train_labels)
val_dataset = TensorDataset(val_ids, val_masks, val_labels)

# Define batch size
batch_size = 16

# Create DataLoader for training and validation sets
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)

# Define the ALBERT-based model with increased dropout
class AlbertClassifier(nn.Module):
    def __init__(self, num_classes):
        super(AlbertClassifier, self).__init__()
        self.albert = AlbertModel.from_pretrained('albert-base-v2')
        self.dropout = nn.Dropout(0.3)  # Dropout rate
        self.linear = nn.Linear(self.albert.config.hidden_size, num_classes)
    
    def forward(self, input_ids, attention_mask):
        outputs = self.albert(input_ids=input_ids, attention_mask=attention_mask)
        hidden_state = outputs.last_hidden_state[:, 0]  # Take [CLS] token representation
        hidden_state = self.dropout(hidden_state)
        logits = self.linear(hidden_state)
        return logits

# Instantiate the ALBERT model
num_classes = 2 
model = AlbertClassifier(num_classes)

# Define the optimizer with weight decay and gradient clipping
optimizer = AdamW(model.parameters(), lr=2e-5, weight_decay=0.01)
scheduler = ReduceLROnPlateau(optimizer, mode='min', factor=0.1, patience=1, verbose=True)
criterion = nn.CrossEntropyLoss()

# Training loop with early stopping
epochs = 10
patience = 2
best_loss = float('inf')
trigger_times = 0

for epoch in range(epochs):
    model.train()  # Set the model to training mode
    total_loss = 0
    total_correct = 0
    total_samples = 0
    for batch in train_loader:
        inputs, attention_masks, labels = batch

        # Clear gradients
        optimizer.zero_grad()

        # Forward pass
        outputs = model(inputs, attention_masks)
        # Compute loss
        loss = criterion(outputs, labels)

        # Backpropagation
        loss.backward()

        # Gradient clipping
        torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)

        # Update model parameters
        optimizer.step()

        # Accumulate the total loss
        total_loss += loss.item()

        # Calculate accuracy for this batch
        _, predicted = torch.max(outputs, dim=1)
        total_correct += (predicted == labels).sum().item()
        total_samples += labels.size(0)

    avg_train_loss = total_loss / len(train_loader)
    train_accuracy = total_correct / total_samples

    # Validation loop
    model.eval()  # Set the model to evaluation mode
    val_loss = 0
    total_correct = 0
    total_samples = 0
    all_predictions = []
    all_labels = []
    with torch.no_grad():
        for batch in val_loader:
            inputs, attention_masks, labels = batch

            # Forward pass
            outputs = model(inputs, attention_masks)
            loss = criterion(outputs, labels)
            val_loss += loss.item()
            
            # Compute predictions
            _, predicted = torch.max(outputs, dim=1)

            # Update total samples and total correct predictions
            total_samples += labels.size(0)
            total_correct += (predicted == labels).sum().item()

            # Save predictions and labels for metric calculation
            all_predictions.extend(predicted.cpu().numpy())
            all_labels.extend(labels.cpu().numpy())

    avg_val_loss = val_loss / len(val_loader)
    accuracy = total_correct / total_samples
    scheduler.step(avg_val_loss)
    
    precision, recall, f1, _ = precision_recall_fscore_support(all_labels, all_predictions, average='binary')

    print(f"Epoch {epoch + 1}/{epochs}")
    print(f"Train Loss: {avg_train_loss}, Train Accuracy: {train_accuracy}")
    print(f"Validation Loss: {avg_val_loss}, Validation Accuracy: {accuracy}")
    print(f"Validation Precision: {precision}, Validation Recall: {recall}, Validation F1 Score: {f1}")

    # Early stopping
    if avg_val_loss < best_loss:
        best_loss = avg_val_loss
        trigger_times = 0
    else:
        trigger_times += 1
        if trigger_times >= patience:
            print('Early stopping!')
            break


  train_ids = torch.tensor(train_ids)
  val_ids = torch.tensor(val_ids)
  train_masks = torch.tensor(train_masks)
  val_masks = torch.tensor(val_masks)


Epoch 1/10
Train Loss: 0.33168293490889483, Train Accuracy: 0.7761194029850746
Validation Loss: 0.00020222133025527, Validation Accuracy: 1.0
Validation Precision: 1.0, Validation Recall: 1.0, Validation F1 Score: 1.0
Epoch 2/10
Train Loss: 0.0002504504878743319, Train Accuracy: 1.0
Validation Loss: 1.4580677088815719e-05, Validation Accuracy: 1.0
Validation Precision: 1.0, Validation Recall: 1.0, Validation F1 Score: 1.0
Epoch 3/10
Train Loss: 1.4306911361927633e-05, Train Accuracy: 1.0
Validation Loss: 9.890595265460433e-06, Validation Accuracy: 1.0
Validation Precision: 1.0, Validation Recall: 1.0, Validation F1 Score: 1.0
Epoch 4/10
Train Loss: 1.2289830101508414e-05, Train Accuracy: 1.0
Validation Loss: 7.040772743494017e-06, Validation Accuracy: 1.0
Validation Precision: 1.0, Validation Recall: 1.0, Validation F1 Score: 1.0
Epoch 5/10
Train Loss: 9.123415384237888e-06, Train Accuracy: 1.0
Validation Loss: 5.092458422950585e-06, Validation Accuracy: 1.0
Validation Precision: 1.0, 

In [11]:
from transformers import AlbertTokenizer, AlbertModel
import torch
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

# Load ALBERT tokenizer and model
tokenizer = AlbertTokenizer.from_pretrained('albert-base-v2')
model = AlbertModel.from_pretrained('albert-base-v2')
model.eval()

# Function to get ALBERT embedding
def get_albert_embedding(text, model, tokenizer, max_length=128):
    inputs = tokenizer(text, return_tensors='pt', truncation=True, padding=True, max_length=max_length)
    with torch.no_grad():
        outputs = model(**inputs)
    # Use the last hidden state of the [CLS] token as the embedding
    return outputs.last_hidden_state[:, 0, :].squeeze().numpy()


In [12]:
from transformers import AlbertTokenizer, AlbertModel
import torch
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

# Load ALBERT tokenizer and model
tokenizer = AlbertTokenizer.from_pretrained('albert-base-v2')
model = AlbertModel.from_pretrained('albert-base-v2')
model.eval()

# Function to get ALBERT embedding
def get_albert_embedding(text, model, tokenizer, max_length=128):
    inputs = tokenizer(text, return_tensors='pt', truncation=True, padding=True, max_length=max_length)
    with torch.no_grad():
        outputs = model(**inputs)
    # Use the last hidden state of the [CLS] token as the embedding
    return outputs.last_hidden_state[:, 0, :].squeeze().numpy()


In [13]:
dataset_descriptions = Metadata['description']

# Generate embeddings for all dataset descriptions
dataset_embeddings = []
for description in dataset_descriptions:
    emb = get_albert_embedding(description, model, tokenizer)
    dataset_embeddings.append(emb)


In [14]:
# Function to predict dataset URLs based on search text
def predict_dataset_url(search_text):
    search_emb = get_albert_embedding(search_text, model, tokenizer)
    similarities = [cosine_similarity([search_emb], [dataset_emb])[0][0] for dataset_emb in dataset_embeddings]
    
    # Find the maximum and minimum similarity values
    max_similarity = max(similarities)
    min_similarity = min(similarities)
    
    # Calculate the 90% range
    range_90 = 0.50 * (max_similarity - min_similarity)
    
    # Find the threshold value
    threshold = min_similarity + range_90
    
    # Find the indices of datasets with similarity above the threshold
    similar_indices = [i for i, sim in enumerate(similarities) if sim > threshold]
    
    results = []
    for idx in similar_indices:
        dataset_info = {
            "title": Metadata.iloc[idx]['title'],
            "dataset_url": Metadata.iloc[idx]['dataset_url'],
            "cosine_similarity": similarities[idx]
        }
        results.append(dataset_info)
    
    return results


search_text = "electric charging points"
results = predict_dataset_url(search_text)

# Display results
if not results:
    print("No datasets found above the 90% threshold.")
else:
    for result in results:
        print(f"Title: {result['title']}")
        print(f"Dataset URL: {result['dataset_url']}")
        print(f"Cosine Similarity: {result['cosine_similarity']}\n")


Title: ['nyserda', 'electric', 'vehicle', 'drive', 'clean', 'rebate', 'data', '2017']
Dataset URL: https://data.world/data-ny-gov/thd2-fu8y
Cosine Similarity: 0.7907650470733643

Title: ['hawaii', 'public', 'electric', 'vehicle', 'charging', 'stations']
Dataset URL: https://data.world/johnsnowlabs/hawaii-public-electric-vehicle-charging-stations
Cosine Similarity: 0.7595440149307251

Title: ['romanian', 'new', 'car', 'registration', '2023']
Dataset URL: https://data.world/romanian-data/romanian-new-car-registration-in-2023
Cosine Similarity: 0.7753380537033081

Title: ['site', 'g03', 'gasconade', 'river', 'bathymetry', 'structure', 'a1411', '89']
Dataset URL: https://data.world/us-doi-gov/12749821-f445-4f35-8dee-81eb7a56f07d
Cosine Similarity: 0.8071860074996948

Title: ['gpm', 'ground', 'validation', 'twodimensional', 'video', 'disdromete', 'v1']
Dataset URL: https://data.world/us-nasa-gov/47e3817a-65aa-4af4-8d40-f0ed0f655bda
Cosine Similarity: 0.7865433096885681

Title: ['un', 'popul