In [1]:
!pip install pinecone langchain_openai langchain_pinecone langchain-community hdbscan umap-learn datasets


Collecting pinecone
  Downloading pinecone-5.4.2-py3-none-any.whl.metadata (19 kB)
Collecting langchain_openai
  Downloading langchain_openai-0.3.0-py3-none-any.whl.metadata (2.7 kB)
Collecting langchain_pinecone
  Downloading langchain_pinecone-0.2.2-py3-none-any.whl.metadata (1.6 kB)
Collecting langchain-community
  Downloading langchain_community-0.3.14-py3-none-any.whl.metadata (2.9 kB)
Collecting hdbscan
  Downloading hdbscan-0.8.40-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (15 kB)
Collecting umap-learn
  Downloading umap_learn-0.5.7-py3-none-any.whl.metadata (21 kB)
Collecting datasets
  Downloading datasets-3.2.0-py3-none-any.whl.metadata (20 kB)
Collecting pinecone-plugin-inference<4.0.0,>=2.0.0 (from pinecone)
  Downloading pinecone_plugin_inference-3.1.0-py3-none-any.whl.metadata (2.2 kB)
Collecting pinecone-plugin-interface<0.0.8,>=0.0.7 (from pinecone)
  Downloading pinecone_plugin_interface-0.0.7-py3-none-any.whl.metadata (1.2 kB)
Collecting tikto

In [3]:
import pandas as pd

# Read the paper abstracts from the CSV file papers_changed.csv into a DataFrame df

df = pd.read_csv('papers_changed.csv')

In [4]:
# Discard rows that have no value in the Abstract column
df = df.dropna(subset=['Abstract'])

In [6]:
# Report the frequency of each value in the Conference column of df
df['Conference'].value_counts()

Unnamed: 0_level_0,count
Conference,Unnamed: 1_level_1
KDD,150
NIPS,150
EMNLP,150
CVPR,150
TMLR,150


In [5]:
# Keep only rows with a real abstract, i.e. drop the 'No abstract found' placeholder

df = df.loc[df['Abstract'].ne('No abstract found')]

In [7]:
# Draw a fixed number of abstracts from each conference: KDD, NIPS, EMNLP, CVPR, TMLR.
# (NeurIPS papers are matched through the label 'NIPS' in the Conference column.)
# NOTE(review): the later cell that builds the training data reads `df`, not
# `final_df` — confirm which frame is meant to be used.

SAMPLES_PER_CONFERENCE = 150  # rows drawn per conference
SAMPLE_SEED = 42  # fixed seed so re-running the cell draws the same rows in the same order

kdd_mask = (df['Conference'] == 'KDD')
neurips_mask = (df['Conference'] == 'NIPS')
emnlp_mask = (df['Conference'] == 'EMNLP')
cvpr_mask = (df['Conference'] == 'CVPR')
tmlr_mask = (df['Conference'] == 'TMLR')

# .sample raises ValueError if a conference has fewer rows than requested,
# which is kept on purpose so a short class is noticed rather than hidden.
kdd_df = df.loc[kdd_mask].sample(SAMPLES_PER_CONFERENCE, random_state=SAMPLE_SEED)
neurips_df = df.loc[neurips_mask].sample(SAMPLES_PER_CONFERENCE, random_state=SAMPLE_SEED)
emnlp_df = df.loc[emnlp_mask].sample(SAMPLES_PER_CONFERENCE, random_state=SAMPLE_SEED)
cvpr_df = df.loc[cvpr_mask].sample(SAMPLES_PER_CONFERENCE, random_state=SAMPLE_SEED)
tmlr_df = df.loc[tmlr_mask].sample(SAMPLES_PER_CONFERENCE, random_state=SAMPLE_SEED)

final_df = pd.concat([kdd_df, neurips_df, emnlp_df, cvpr_df, tmlr_df])
final_df.describe()

Unnamed: 0,Abstract,Conference
count,750,750
unique,750,5
top,Algorithmic recourse is a process that leverag...,KDD
freq,1,150


In [8]:
# Display the current contents of df
df

Unnamed: 0,Abstract,Conference
0,Social Recommendation (SR) typically exploits ...,KDD
1,We propose a neuralized undirected graphical m...,KDD
2,The recent success of pre-trained language mod...,KDD
3,Ratings of a user to most items in recommender...,KDD
4,There are many applications for which we want ...,KDD
...,...,...
745,Interest is rising in Physics-Informed Neural ...,TMLR
746,"In this work, we highlight and perform a compr...",TMLR
747,Camera images are ubiquitous in machine learni...,TMLR
748,Capsule networks are a class of neural network...,TMLR


In [7]:
import getpass
import os

# Never keep the Pinecone API key as a literal in the notebook: take it from the
# PINECONE_API_KEY environment variable and prompt (without echo) only when it
# is not set. The variable keeps the name `pinecone` because the client below
# is constructed from it.
pinecone = os.environ.get("PINECONE_API_KEY") or getpass.getpass("Enter API key for Pinecone: ")

In [8]:
import getpass
import os
import time
from pinecone import Pinecone, ServerlessSpec

# Create the Pinecone client from the API key held in the `pinecone` variable
pc = Pinecone(api_key=pinecone)

In [9]:
import time

index_name = "all-conferences-150-changed-abstract-test1"  # change if desired

# Gather the names of the indexes that already exist for this client
existing_indexes = []
for index_info in pc.list_indexes():
    existing_indexes.append(index_info["name"])

# Create the index only when it is not there yet, then wait until it is ready
if not (index_name in existing_indexes):
    pc.create_index(
        name=index_name,
        dimension=1536,
        metric="cosine",
        spec=ServerlessSpec(cloud="aws", region="us-east-1"),
    )
    while True:
        if pc.describe_index(index_name).status["ready"]:
            break
        time.sleep(1)

# Handle used for all later operations on the index
index = pc.Index(index_name)

In [12]:
import getpass
import os


# Prompt for the OpenAI key only when it is not already in the environment, so
# re-running the cell (or running the notebook non-interactively with the
# variable exported) neither blocks on input nor overwrites a valid key.
if not os.environ.get("OPENAI_API_KEY"):
    os.environ["OPENAI_API_KEY"] = getpass.getpass("Enter API key for OpenAI: ")

from langchain_openai import OpenAIEmbeddings

# Embedding model used by the vector store.
# NOTE(review): the Pinecone index is created with dimension=1536 — confirm it
# matches this model's output size if the model name is changed.
embeddings = OpenAIEmbeddings(model="text-embedding-3-small")

Enter API key for OpenAI: ··········


In [13]:
from langchain_pinecone import PineconeVectorStore

# Build a LangChain vector store from the Pinecone index handle and the embedding model.
# NOTE(review): no later cell visible here reads `vector_store` — confirm it is still needed.
vector_store = PineconeVectorStore(index=index, embedding=embeddings)

In [12]:
import pandas as pd
from typing import Any
from datasets import Dataset
from transformers import BartTokenizer
from typing import List, Dict, Optional, Tuple, Any
from tqdm.auto import tqdm
from collections import defaultdict
from typing import Dict, List
import numpy as np

def extract_texts_and_labels_from_df(df: pd.DataFrame) -> List[Tuple[str, str]]:
    """
    Collect (abstract, conference) pairs from a DataFrame, skipping incomplete rows.

    Args:
        df: Pandas DataFrame that has 'Abstract' and 'Conference' columns.

    Returns:
        List of (abstract, label) tuples in row order. A row is left out when
        either its abstract or its conference label is null.
    """
    # Pull the two fields of interest out of every row, lazily
    candidates = (
        (row["Abstract"], row["Conference"])
        for _, row in df.iterrows()
    )

    # Keep a pair only when both the text and its label are present
    return [
        (abstract, label)
        for abstract, label in candidates
        if pd.notna(abstract) and pd.notna(label)
    ]


# Extract (abstract, conference) pairs from the DataFrame
data = extract_texts_and_labels_from_df(df)

# Split the pairs into parallel tuples of texts and conference names
abstracts, labels = zip(*data)

# Sort the unique conference names so every conference gets the same integer id
# on every run. The order of list(set(...)) depends on string hashing, which
# differs between Python processes, so without sorting the id <-> name mapping
# could change after a kernel restart and no longer match a saved model.
label_classes = sorted(set(labels))  # Unique conferences, e.g. ["CVPR", "EMNLP", "KDD"]
label_to_id = {label: idx for idx, label in enumerate(label_classes)}  # Map conference to ID
encoded_labels = [label_to_id[label] for label in labels]

# Hugging Face Dataset for training
dataset = Dataset.from_dict({"text": abstracts, "label": encoded_labels})
tokenizer = BartTokenizer.from_pretrained("facebook/bart-base")
print(f"Sample Data: {data[:5]}")


config.json:   0%|          | 0.00/1.72k [00:00<?, ?B/s]

Sample Data: [("Social Recommendation (SR) typically exploits neighborhood influence in the social network to enhance user preference modeling. However, users' intricate social behaviors may introduce noisy social connections for user modeling and harm the models' robustness. Existing solutions to alleviate social noise either filter out the noisy connections or generate new potential social connections. Due to the absence of labels, the former approaches may retain uncertain connections for user preference modeling while the latter methods may introduce additional social noise. Through data analysis, we discover that (1) social noise likely comes from the connected users with low preference similarity; and (2) Opinion Leaders (OLs) play a pivotal role in influence dissemination, surpassing high-similarity neighbors, regardless of their preference similarity with trusting peers. Guided by these observations, we propose a novel Self-Supervised Denoising approach through Independent Casc

In [13]:
# Load the BART tokenizer for the checkpoint named in model_name
model_name = "facebook/bart-base"
tokenizer = BartTokenizer.from_pretrained(model_name)

def preprocess_function(examples):
    """Tokenize the "text" field of a batch, padding and truncating each example to 512 tokens."""
    texts = examples["text"]
    return tokenizer(
        texts,
        padding="max_length",
        truncation=True,
        max_length=512,
    )

# Build the dataset from the abstracts and their encoded labels, then tokenize it batch-wise
dataset = Dataset.from_dict(dict(text=abstracts, label=encoded_labels))
tokenized_dataset = dataset.map(preprocess_function, batched=True)


Map:   0%|          | 0/750 [00:00<?, ? examples/s]

In [14]:
from transformers import BartForSequenceClassification, Trainer, TrainingArguments

# Load pre-trained BART with a classification head sized to the number of conferences
num_labels = len(label_classes)
model = BartForSequenceClassification.from_pretrained(model_name, num_labels=num_labels)

# Training arguments.
# The evaluation strategy is left at its default ("no"), which matches the fact
# that no eval dataset is passed to the Trainer. The former
# `evaluation_strategy` keyword is deprecated in favour of `eval_strategy`, so
# omitting it keeps this cell working across transformers versions.
training_args = TrainingArguments(
    output_dir="./results",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    num_train_epochs=3,
    weight_decay=0.01,
    logging_dir='./logs',
    save_strategy="epoch"  # writes a checkpoint to output_dir after every epoch
)

# Define Trainer (training data only; no evaluation set is configured)
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset,
)

# Train the model
trainer.train()


model.safetensors:   0%|          | 0.00/558M [00:00<?, ?B/s]

Some weights of BartForSequenceClassification were not initialized from the model checkpoint at facebook/bart-base and are newly initialized: ['classification_head.dense.bias', 'classification_head.dense.weight', 'classification_head.out_proj.bias', 'classification_head.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.


<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


Step,Training Loss


Non-default generation parameters: {'early_stopping': True, 'num_beams': 4, 'no_repeat_ngram_size': 3, 'forced_bos_token_id': 0}
Non-default generation parameters: {'early_stopping': True, 'num_beams': 4, 'no_repeat_ngram_size': 3, 'forced_bos_token_id': 0}
Non-default generation parameters: {'early_stopping': True, 'num_beams': 4, 'no_repeat_ngram_size': 3, 'forced_bos_token_id': 0}
Non-default generation parameters: {'early_stopping': True, 'num_beams': 4, 'no_repeat_ngram_size': 3, 'forced_bos_token_id': 0}


TrainOutput(global_step=141, training_loss=1.2811722992159795, metrics={'train_runtime': 318.2444, 'train_samples_per_second': 7.07, 'train_steps_per_second': 0.443, 'total_flos': 690062264064000.0, 'train_loss': 1.2811722992159795, 'epoch': 3.0})

In [17]:
import torch

# Check if GPU is available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

# Move the model to the correct device
model = model.to(device)

# The model has just been fine-tuned: put it in evaluation mode so dropout is
# disabled and the prediction for a given text is deterministic.
model.eval()

# Predict on a sample abstract
sample_text = "This paper introduces a new video recognition dataset and focuses on object detection."
inputs = tokenizer(sample_text, return_tensors="pt", truncation=True)

# Move inputs to the same device as the model
inputs = {key: value.to(device) for key, value in inputs.items()}

# Perform inference
with torch.no_grad():  # Disable gradient computation for inference
    outputs = model(**inputs)
    # Arg-max over the class axis; the batch holds a single example, so .item() is safe
    predicted_class = outputs.logits.argmax(dim=-1).item()

# Map the class id back to its conference name
print(f"Predicted Conference: {label_classes[predicted_class]}")


Using device: cuda
Predicted Conference: TMLR


In [None]:
import torch
from tqdm.auto import tqdm
import pandas as pd

# Pick the GPU when one is available, otherwise fall back to the CPU
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
print(f"Using device: {device}")

# Make sure the model lives on the selected device
model = model.to(device)

def predict_and_evaluate(df):
    """
    Predict a conference label for every abstract in a DataFrame and compute accuracy.

    Each abstract is tokenized (truncated to 512 tokens) and passed through the
    module-level `model` one row at a time. The arg-max logit is mapped back to
    a conference name through `label_classes`.

    Args:
        df: Pandas DataFrame with 'Abstract' and 'Label' columns. A
            'Predicted_Label' column is added to this DataFrame in place.

    Returns:
        Tuple: the same DataFrame (now with 'Predicted_Label') and the accuracy,
        i.e. the fraction of rows whose 'Label' equals 'Predicted_Label'
        (NaN when the DataFrame has no rows).

    Raises:
        KeyError: if 'Abstract' or 'Label' is missing from df.
    """
    # Fail fast: without this check a missing 'Label' column would only surface
    # after the whole (slow) inference loop had already run.
    missing = [col for col in ("Abstract", "Label") if col not in df.columns]
    if missing:
        raise KeyError(f"DataFrame is missing required column(s): {missing}")

    # Evaluation mode disables dropout so predictions are deterministic
    model.eval()

    predictions = []

    for abstract in tqdm(df["Abstract"], total=len(df), desc="Predicting labels"):
        inputs = tokenizer(abstract, return_tensors="pt", truncation=True, max_length=512)
        inputs = {key: value.to(device) for key, value in inputs.items()}  # Move inputs to same device as model

        # Perform inference without tracking gradients
        with torch.no_grad():
            outputs = model(**inputs)

        # One example per forward pass, so the arg-max over the class axis is a single id
        predicted_class = outputs.logits.argmax(dim=-1).item()
        predictions.append(label_classes[predicted_class])  # Convert to label name

    # Add predictions to the DataFrame
    df["Predicted_Label"] = predictions

    # Accuracy = share of rows where the prediction matches the given label
    accuracy = (df["Label"] == df["Predicted_Label"]).mean()

    print(f"\nAccuracy: {accuracy:.2%}")
    return df, accuracy


# Load the labelled evaluation set (predict_and_evaluate reads its 'Abstract' and 'Label' columns)
eval_df = pd.read_csv('/kaggle/input/text-class-hackathon/labelled-given-test.csv')

# Run inference over every row and compute accuracy
predictions_df, accuracy = predict_and_evaluate(eval_df)

# Preview the predictions with the notebook's rich table display instead of
# printing the whole DataFrame as plain text
predictions_df.head()