In [1]:
# prompt: mount drive

from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


# Import Required Libraries

In [2]:
# Import necessary libraries
import pandas as pd
from sklearn.model_selection import train_test_split
from transformers import BertTokenizer, BertForSequenceClassification
import torch
from torch.utils.data import DataLoader, TensorDataset
from torch.optim import AdamW
from torch.nn import BCEWithLogitsLoss
from sklearn.preprocessing import MultiLabelBinarizer
import ast


# Load and Preprocess Dataset

In [3]:
# Load the dataset
df = pd.read_csv('/content/drive/MyDrive/arxiv_data.csv')

# Combine title and summary into a single text column
df['text'] = df['titles'] + ' ' + df['summaries']
df = df.drop(["titles", "summaries"], axis=1)

# Convert 'terms' column from string representation of lists to actual lists
df['terms'] = df['terms'].apply(lambda x: ast.literal_eval(x))

# One-hot encode the terms (multi-label)
mlb = MultiLabelBinarizer()
encoded_labels = mlb.fit_transform(df['terms'])


In [5]:
df.head()

Unnamed: 0,terms,text
0,"[cs.CV, cs.LG]",Survey on Semantic Stereo Matching / Semantic ...
1,"[cs.CV, cs.AI, cs.LG]",FUTURE-AI: Guiding Principles and Consensus Re...
2,"[cs.CV, cs.AI]",Enforcing Mutual Consistency of Hard Regions f...
3,[cs.CV],Parameter Decoupling Strategy for Semi-supervi...
4,"[cs.CV, cs.LG]",Background-Foreground Segmentation for Interio...


# Split Dataset and Initialize Tokenizer

In [4]:
# Split the dataset into train, validation, and test sets
train_df, test_df = train_test_split(df, test_size=0.2, random_state=42)
train_df, val_df = train_test_split(train_df, test_size=0.25, random_state=42)  # 0.25 * 0.8 = 0.2

# Initialize the BERT tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Preprocessing function for tokenizing text
def preprocess_data(df, tokenizer, max_length=128):
    inputs = tokenizer(df['text'].tolist(), max_length=max_length, padding=True, truncation=True, return_tensors='pt')
    labels = torch.tensor(mlb.transform(df['terms'].tolist()))
    return inputs['input_ids'], inputs['attention_mask'], labels


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]



# Preprocess Data and Create DataLoaders

In [6]:
# Preprocess data
train_input_ids, train_attention_mask, train_labels = preprocess_data(train_df, tokenizer)
val_input_ids, val_attention_mask, val_labels = preprocess_data(val_df, tokenizer)
test_input_ids, test_attention_mask, test_labels = preprocess_data(test_df, tokenizer)

# Create datasets and dataloaders
train_dataset = TensorDataset(train_input_ids, train_attention_mask, train_labels)
val_dataset = TensorDataset(val_input_ids, val_attention_mask, val_labels)
test_dataset = TensorDataset(test_input_ids, test_attention_mask, test_labels)

train_dataloader = DataLoader(train_dataset, batch_size=16, shuffle=True)
val_dataloader = DataLoader(val_dataset, batch_size=16, shuffle=False)
test_dataloader = DataLoader(test_dataset, batch_size=16, shuffle=False)

# Initialize Model, Optimizer, and Loss Function

In [7]:
# Load the BERT model
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=len(mlb.classes_))

# Move the model to GPU if available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# Define optimizer and loss function
optimizer = AdamW(model.parameters(), lr=5e-5)
loss_fn = BCEWithLogitsLoss()

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


# Training Loop with Early Stopping

In [8]:
# Training loop with early stopping
epochs = 5
best_val_loss = float('inf')
patience = 2
counter = 0

for epoch in range(epochs):
    model.train()
    total_loss = 0

    for batch in train_dataloader:
        input_ids_batch, attention_mask_batch, labels_batch = [item.to(device) for item in batch]

        optimizer.zero_grad()
        outputs = model(input_ids_batch, attention_mask=attention_mask_batch)
        logits = outputs.logits
        loss = loss_fn(logits, labels_batch.float())
        loss.backward()
        optimizer.step()

        total_loss += loss.item()

    avg_train_loss = total_loss / len(train_dataloader)
    print(f"Epoch {epoch+1}, Training Loss: {avg_train_loss}")

    # Validation
    model.eval()
    val_loss = 0
    with torch.no_grad():
        for batch in val_dataloader:
            input_ids_batch, attention_mask_batch, labels_batch = [item.to(device) for item in batch]
            outputs = model(input_ids_batch, attention_mask=attention_mask_batch)
            logits = outputs.logits
            loss = loss_fn(logits, labels_batch.float())
            val_loss += loss.item()

    avg_val_loss = val_loss / len(val_dataloader)
    print(f"Epoch {epoch+1}, Validation Loss: {avg_val_loss}")

    # Early stopping
    if avg_val_loss < best_val_loss:
        best_val_loss = avg_val_loss
        counter = 0
    else:
        counter += 1
        if counter >= patience:
            print("Early stopping!")
            break


Epoch 1, Training Loss: 0.024173482049646932
Epoch 1, Validation Loss: 0.004921348966704106
Epoch 2, Training Loss: 0.004667154988072559
Epoch 2, Validation Loss: 0.004600630971792809
Epoch 3, Training Loss: 0.00437974931074433
Epoch 3, Validation Loss: 0.003794123420246049
Epoch 4, Training Loss: 0.0036812445464022227
Epoch 4, Validation Loss: 0.0036484481451348806
Epoch 5, Training Loss: 0.004228294597750187
Epoch 5, Validation Loss: 0.003682516513458442


# Model Evaluation

In [9]:
# Evaluation on the test set
model.eval()
test_loss = 0
predictions = []
true_labels = []

with torch.no_grad():
    for batch in test_dataloader:
        input_ids_batch, attention_mask_batch, labels_batch = [item.to(device) for item in batch]
        outputs = model(input_ids_batch, attention_mask=attention_mask_batch)
        logits = outputs.logits
        loss = loss_fn(logits, labels_batch.float())
        test_loss += loss.item()

        predictions.append(logits.cpu().numpy())
        true_labels.append(labels_batch.cpu().numpy())

avg_test_loss = test_loss / len(test_dataloader)
print(f"Test Loss: {avg_test_loss}")

# Flatten predictions and labels
predictions = [item for sublist in predictions for item in sublist]
true_labels = [item for sublist in true_labels for item in sublist]

# Threshold predictions
threshold = 0.5
predicted_labels = [[1 if p >= threshold else 0 for p in pred] for pred in predictions]

# Evaluate the model
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

accuracy = accuracy_score(true_labels, predicted_labels)
precision = precision_score(true_labels, predicted_labels, average='micro')
recall = recall_score(true_labels, predicted_labels, average='micro')
f1 = f1_score(true_labels, predicted_labels, average='micro')

print(f"Accuracy: {accuracy}")
print(f"Precision: {precision}")
print(f"Recall: {recall}")
print(f"F1-Score: {f1}")

Test Loss: 0.0036780598130002388
Accuracy: 0.41062288749396425
Precision: 0.8701328447091159
Recall: 0.5439056981960485
F1-Score: 0.6693879948314342


# Prediction for New Input Text

In [10]:
def predict_category(title, summary):
    # Combine title and summary into a single text input
    input_text = title + " " + summary

    # Preprocess the input text
    inputs = tokenizer(input_text, max_length=128, padding=True, truncation=True, return_tensors='pt')
    input_ids = inputs['input_ids'].to(device)
    attention_mask = inputs['attention_mask'].to(device)

    # Make prediction
    model.eval()
    with torch.no_grad():
        outputs = model(input_ids, attention_mask=attention_mask)
        logits = outputs.logits
        predictions = torch.sigmoid(logits)

    # Convert to binary predictions (0 or 1)
    predicted_labels = (predictions > 0.5).int().cpu().numpy()

    # Decode the predicted terms (categories)
    predicted_terms = mlb.inverse_transform(predicted_labels)

    # Return the predicted categories
    return predicted_terms

In [11]:
# Get user input for title and summary
user_title = input("Enter the title of the research paper: ")
user_summary = input("Enter the summary (abstract) of the research paper: ")

# Predict and print the categories
predicted_categories = predict_category(user_title, user_summary)
print("Predicted categories:", predicted_categories)

Enter the title of the research paper: Attention-Based 3D Seismic Fault Segmentation Training by a Few 2D Slice Labels
Enter the summary (abstract) of the research paper: Detection faults in seismic data is a crucial step for seismic structural interpretation, reservoir characterization and well placement. Some recent works regard it as an image segmentation task. The task of image segmentation requires huge labels, especially 3D seismic data, which has a complex structure and lots of noise. Therefore, its annotation requires expert experience and a huge workload. In this study, we present lambda-BCE and lambda-smooth L1loss to effectively train 3D-CNN by some slices from 3D seismic data, so that the model can learn the segmentation of 3D seismic data from a few 2D slices. In order to fully extract information from limited data and suppress seismic noise, we propose an attention module that can be used for active supervision training and embedded in the network. The attention heatmap l

# Model deploymwnt and Access

In [12]:
!pip install huggingface_hub
!huggingface-cli login


    _|    _|  _|    _|    _|_|_|    _|_|_|  _|_|_|  _|      _|    _|_|_|      _|_|_|_|    _|_|      _|_|_|  _|_|_|_|
    _|    _|  _|    _|  _|        _|          _|    _|_|    _|  _|            _|        _|    _|  _|        _|
    _|_|_|_|  _|    _|  _|  _|_|  _|  _|_|    _|    _|  _|  _|  _|  _|_|      _|_|_|    _|_|_|_|  _|        _|_|_|
    _|    _|  _|    _|  _|    _|  _|    _|    _|    _|    _|_|  _|    _|      _|        _|    _|  _|        _|
    _|    _|    _|_|      _|_|_|    _|_|_|  _|_|_|  _|      _|    _|_|_|      _|        _|    _|    _|_|_|  _|_|_|_|

    To login, `huggingface_hub` requires a token generated from https://huggingface.co/settings/tokens .
Enter your token (input will not be visible): 
Add token as git credential? (Y/n) Y
Token is valid (permission: write).
[1m[31mCannot authenticate through git-credential as no helper is defined on your machine.
You might have to re-authenticate when pushing to the Hugging Face Hub.
Run the following command in your ter

In [14]:
from huggingface_hub import HfApi, Repository
from transformers import BertForSequenceClassification, BertTokenizer

# Define the repository name
repo_name = "Suyash07/MLC_BERT"  # Replace with your Hugging Face username and desired model name

# Save the model and tokenizer to a local directory
save_directory = "./model_to_push"
model.save_pretrained(save_directory)
tokenizer.save_pretrained(save_directory)

# Push the model to Hugging Face Hub
model.push_to_hub(repo_name)
tokenizer.push_to_hub(repo_name)

print(f"Model and tokenizer pushed to Hugging Face Hub at https://huggingface.co/{repo_name}")


model.safetensors:   0%|          | 0.00/441M [00:00<?, ?B/s]

README.md:   0%|          | 0.00/5.17k [00:00<?, ?B/s]

Model and tokenizer pushed to Hugging Face Hub at https://huggingface.co/Suyash07/MLC_BERT


In [16]:
from huggingface_hub import HfApi

# Save the mlb object locally
import joblib
mlb_filename = 'mlb.pkl'
joblib.dump(mlb, mlb_filename)

# Define repository information
repo_name = "Suyash07/MLC_BERT"  # Replace with your Hugging Face repo name
api = HfApi()

# Push the mlb.pkl file to the repository
api.upload_file(
    path_or_fileobj=mlb_filename,  # Path to the local file
    path_in_repo="mlb.pkl",  # Path where the file will be saved in the repo
    repo_id=repo_name,  # Your repo name
)


mlb.pkl:   0%|          | 0.00/53.6k [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/Suyash07/MLC_BERT/commit/fde53b5883da3a71fbe931d4d20f4adefea39231', commit_message='Upload mlb.pkl with huggingface_hub', commit_description='', oid='fde53b5883da3a71fbe931d4d20f4adefea39231', pr_url=None, pr_revision=None, pr_num=None)