# Final CSV creation

## Task 1 classification

In [None]:
from transformers import BertTokenizer
from transformers import BertModel
import re
from nltk.tokenize import sent_tokenize
from PyPDF2 import PdfReader
from sklearn.metrics.pairwise import cosine_similarity
import statistics

# Shared pretrained BERT tokenizer and encoder, used as module-level globals
# by get_coherence_list below.
tokenizer, model = (
    BertTokenizer.from_pretrained("bert-base-uncased"),
    BertModel.from_pretrained("bert-base-uncased"),
)

def get_coherence(s, tokenizer, model):
    """
    Score how similar the token embeddings of a piece of text are to each other.

    The text is encoded with `tokenizer` (truncation enabled), passed through
    `model`, and the mean of the pairwise cosine-similarity matrix of the token
    embeddings is returned. The mean is taken over the whole matrix, so it
    includes each token's similarity with itself.

    Args:
        s (str): Text to score; callers in this file pass one sentence at a time.
        tokenizer: Callable that turns text into keyword inputs for `model`.
        model: Model whose output exposes `last_hidden_state`.

    Returns:
        float: Mean pairwise cosine similarity of the token embeddings, or 0.0
        when nothing is left after dropping the first and last positions.
    """
    import torch  # local import: only needed for no_grad, not imported at file level

    encoded = tokenizer(s, return_tensors="pt", padding=True, truncation=True)

    # Inference only: do not build the autograd graph for every sentence.
    with torch.no_grad():
        outputs = model(**encoded)

    # Drop the first and last positions (the [CLS] and [SEP] special tokens).
    token_embeddings = outputs.last_hidden_state.squeeze(0)[1:-1]

    # An empty embedding matrix cannot be passed to cosine_similarity.
    if token_embeddings.shape[0] == 0:
        return 0.0

    # .cpu() keeps this working if the model was moved to another device.
    similarity_matrix = cosine_similarity(token_embeddings.detach().cpu().numpy())

    # Return a plain Python float rather than a NumPy scalar.
    return float(similarity_matrix.mean())

def get_coherence_list(sent):
    """
    Compute a coherence score for each entry of `sent`.

    Each entry is scored with get_coherence using the module-level
    `tokenizer` and `model`.

    Args:
        sent (iterable): Sentences to score.

    Returns:
        list: One coherence score per sentence, in input order.
    """
    return [get_coherence(sentence, tokenizer, model) for sentence in sent]

def get_sent_len_list(sent):
    """
    Return the length of every entry of `sent`.

    Args:
        sent (iterable): Sized items; for strings the length is in characters.

    Returns:
        list: `len(item)` for each item, in input order.
    """
    return list(map(len, sent))

def extract_all_from_pdf(pdf_path, include_coherence=True):
    """
    Extract the Task 1 features from a PDF.

    Args:
        pdf_path (str): Path of the PDF to read.
        include_coherence (bool): When true, also compute the mean coherence
            score of the retained sentences (one model pass per sentence).

    Returns:
        tuple: `(mean_sentence_length, symbol_density)`, or
        `(mean_sentence_length, symbol_density, mean_coherence)` when
        `include_coherence` is true.

        - mean_sentence_length: mean character count of the retained sentences.
        - symbol_density: digits plus maths symbols divided by the total
          number of extracted characters (a ratio, not a percentage).
        - mean_coherence: mean of the per-sentence coherence scores.

        The two means are 0.0 when no sentence survives filtering, and the
        density is 0.0 when no characters were extracted.
    """
    reader = PdfReader(pdf_path)

    # Every page's text is followed by one space. Joining once avoids repeated
    # string concatenation, and `or ""` tolerates a page whose extract_text()
    # yields None or an empty string.
    full_text = "".join((page.extract_text() or "") + " " for page in reader.pages)

    # Character classes counted towards the symbol density.
    numerical_pattern = r'[0-9]'
    math_pattern = r'[σ∑∫π√∞Δθλ+\-=*/^<>%∂µˆΓαγδθλϵ(){}]'

    # Counts are taken on the raw text, including spaces and newlines.
    total_characters = len(full_text)
    numerical_count = len(re.findall(numerical_pattern, full_text))
    math_count = len(re.findall(math_pattern, full_text))
    density = (math_count + numerical_count) / total_characters if total_characters else 0.0

    # Basic cleaning before sentence splitting.
    cleaned_text = re.sub(r"(\n|\\n)+", " ", full_text)  # newlines (real or escaped) -> space
    cleaned_text = re.sub(r"[^\w\s.,!?-]", "", cleaned_text)  # keep word chars, whitespace and . , ! ? -
    cleaned_text = re.sub(r"\b[A-Z]{2,}\b", "", cleaned_text)  # drop all-uppercase words (headings, acronyms)

    sentences = sent_tokenize(cleaned_text)

    # Drop equation-like sentences: any operator character, or at least as
    # many digits as half the word count.
    # NOTE(review): the cleaning step above already removed = + * / ^, so in
    # practice only "-" can still trigger the operator test.
    sentences = [
        sentence.strip()
        for sentence in sentences
        if not re.search(r"[=+\-*/^]", sentence) and len(re.findall(r"\d", sentence)) < len(sentence.split()) // 2
    ]

    # statistics.mean raises on an empty sequence, so fall back to 0.0 when
    # nothing survived (e.g. a scanned PDF with no text layer).
    sent_len = statistics.mean(get_sent_len_list(sentences)) if sentences else 0.0

    if not include_coherence:
        return sent_len, density

    coherence = statistics.mean(get_coherence_list(sentences)) if sentences else 0.0
    return sent_len, density, coherence
    
def classify_pdf(pdf_path, length_range=[0,160], density_range=[0,1.8], coherence_range=[0.40, 1], include_coherence=True):
    """
    Decide whether a PDF counts as publishable from its extracted features.

    Args:
        pdf_path (str): Path of the PDF to classify.
        length_range (sequence): Inclusive [low, high] bounds for the mean sentence length.
        density_range (sequence): Inclusive [low, high] bounds for the symbol density.
        coherence_range (sequence): Inclusive [low, high] bounds for the mean
            coherence; only used when `include_coherence` is true.
        include_coherence (bool): Whether the coherence feature is computed and checked.

    Returns:
        bool: True when every checked feature lies inside its range, else False.

    NOTE(review): extract_all_from_pdf returns the density as a ratio of
    character counts, so the default upper bound of 1.8 looks like it can
    never exclude a paper - confirm the intended units.
    """
    def _inside(bounds, value):
        # Inclusive range test on a [low, high] pair.
        return bounds[0] <= value <= bounds[1]

    if not include_coherence:
        sentence_length, density = extract_all_from_pdf(pdf_path, include_coherence=False)
        return bool(_inside(length_range, sentence_length) and _inside(density_range, density))

    sentence_length, density, coherence = extract_all_from_pdf(pdf_path, include_coherence=include_coherence)
    return bool(
        _inside(length_range, sentence_length)
        and _inside(density_range, density)
        and _inside(coherence_range, coherence)
    )

In [None]:
import os
# Classify every PDF in the papers folder, recording the file name and the
# Task 1 verdict in two parallel lists.
root_folder = "Data/Papers"
file_names = []
publishable = []
for file_name in os.listdir(root_folder):
    file_path = os.path.join(root_folder, file_name)
    print(f"file_name: {file_name}")

    # Anything that is not a PDF is listed above but otherwise skipped.
    if not file_name.endswith(".pdf"):
        continue

    file_names.append(file_name)
    publishable.append(classify_pdf(file_path))

In [None]:
import pandas as pd

# Build the results table: one row per paper, indexed by the file name
# without its ".pdf" suffix, with the Task 1 verdict stored as 0/1.
df = pd.DataFrame(
    {'Publishable': list(map(int, publishable))},
    index=pd.Index([name[:-4] for name in file_names], name='Paper ID'),
)

df.head()

## Task 2 classification

### Classification model function

In [None]:
import os
from groq import Groq
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
from collections import Counter

def find_n_closest_vectors(training_vectors, new_vector, n=1):
    """
    Find the indices of the n training vectors most similar to a new vector.

    Similarity is cosine similarity; indices are returned from most to least
    similar.

    Args:
        training_vectors (list or np.ndarray): Training vectors, one per row.
        new_vector (np.ndarray): The vector to compare against the training vectors.
        n (int): Number of closest vectors to find.

    Returns:
        list: Indices of the n closest training vectors. Empty when `n` is not
        positive or there are no training vectors; all indices when `n`
        exceeds the number of training vectors.
    """
    training_vectors = np.array(training_vectors)  # Ensure it's a NumPy array

    # Without this guard n == 0 would slice with [-0:] and return every index,
    # and a negative n would drop the *best* matches instead of selecting them.
    if n <= 0 or training_vectors.size == 0:
        return []

    new_vector = np.array(new_vector).reshape(1, -1)  # Single row, to match the training rows
    similarities = cosine_similarity(training_vectors, new_vector).flatten()

    # argsort is ascending: take the last n entries and reverse them.
    closest_indices = np.argsort(similarities)[-n:][::-1]
    return closest_indices.tolist()



def most_frequent_element(lst):
    """
    Return the element that occurs most often in a list.

    Args:
        lst (list): Input list of hashable elements.

    Returns:
        The most frequent element, or None when the list is empty.
    """
    if lst:
        # most_common(1) yields a single (element, count) pair.
        (winner, _count), = Counter(lst).most_common(1)
        return winner
    return None  # Empty input has no most frequent element

def extract_abstract(pdf_path):
    """
    Extract the abstract from a PDF.

    The abstract is the text between the line 'Abstract' (matched as
    '\\nAbstract\\n') and the first following occurrence of '1 Introduction'.

    Args:
        pdf_path (str): Path of the PDF to read.

    Returns:
        str: The abstract with surrounding whitespace stripped, or "" when
        either marker cannot be found.
    """
    reader = PdfReader(pdf_path)

    # Pages are concatenated without a separator; `or ""` tolerates a page
    # whose extract_text() yields None or an empty string.
    text = "".join(page.extract_text() or "" for page in reader.pages)

    start_marker = "\nAbstract\n"
    marker_pos = text.find(start_marker)
    # Test the raw find() result: once the marker length has been added the
    # value can no longer be -1, so a missing marker would go unnoticed.
    if marker_pos == -1:
        return ""
    abstract_start = marker_pos + len(start_marker)

    # Search only after the abstract heading, so an earlier occurrence of the
    # end marker cannot produce an empty or inverted slice.
    abstract_end = text.find("1 Introduction", abstract_start)
    if abstract_end == -1:
        return ""

    return text[abstract_start:abstract_end].strip()


def get_response_from_LLM(prompt, api_key):
    """
    Send a prompt to the Groq chat-completions API and return the reply text.

    The prompt is sent as a single user message to the
    "llama-3.3-70b-versatile" model.

    Args:
        prompt (str): Text of the user message.
        api_key (str): Groq API key used to create the client.

    Returns:
        The message content of the first choice in the completion.
    """
    user_message = {
        "role": "user",
        "content": prompt,
    }

    client = Groq(api_key=api_key)
    completion = client.chat.completions.create(
        messages=[user_message],
        model="llama-3.3-70b-versatile",
    )

    first_choice = completion.choices[0]
    return first_choice.message.content

def get_rationale_from_LLM(abstract, conference, api_key, n_words=100):
    """
    Ask the LLM why a paper suits the given conference.

    Args:
        abstract (str): Abstract of the paper, inserted into the prompt.
        conference (str): Conference name, inserted into the prompt.
        api_key (str): API key passed on to get_response_from_LLM.
        n_words (int): Word limit mentioned in the prompt.

    Returns:
        The LLM response returned by get_response_from_LLM.
    """
    question = f'Why is the research paper with abstract: {abstract} best suited to be published in {conference} conference in less than {n_words} words'
    return get_response_from_LLM(question, api_key)

from sentence_transformers import SentenceTransformer
import functools


@functools.lru_cache(maxsize=1)
def _load_conference_resources():
    """Load the KNN reference data and the sentence encoder once and reuse them across calls."""
    vectors = np.load('KNN_train_data.npy')  # Vector store can be used here
    labels = np.load('KNN_train_labels.npy')
    encoder = SentenceTransformer('all-mpnet-base-v2')
    return vectors, labels, encoder


def predict_conference_and_get_rationale(pdf_path, api_key, n_neighbors=21):
    """
    Predict the target conference of a paper and ask the LLM for a rationale.

    The paper's abstract is embedded with the sentence encoder, the
    `n_neighbors` most similar reference vectors are looked up, and the most
    frequent label among them selects the conference.

    The reference arrays and the encoder are loaded on the first call and
    cached, so they are not reloaded for every PDF. Call
    `_load_conference_resources.cache_clear()` if the .npy files change.

    Args:
        pdf_path (str): Path of the paper's PDF.
        api_key (str): API key passed on to get_rationale_from_LLM.
        n_neighbors (int): Number of nearest reference vectors that vote on
            the conference.

    Returns:
        tuple: `(conference, rationale)` - the predicted conference name and
        the LLM response explaining the choice.
    """
    # The stored labels are used as indices into this list.
    class_label = ['CVPR', 'EMNLP', 'KDD', 'NeurIPS', 'TMLR']

    reference_vectors, reference_labels, encoder = _load_conference_resources()

    abstract = extract_abstract(pdf_path)
    abstract_embed = encoder.encode(abstract)

    # Majority vote over the labels of the nearest reference vectors.
    neighbor_indices = find_n_closest_vectors(reference_vectors, abstract_embed, n=n_neighbors)
    neighbor_labels = [reference_labels[i] for i in neighbor_indices]
    conference_class = most_frequent_element(neighbor_labels)

    # int() turns the NumPy label into a plain list index.
    conference = class_label[int(conference_class)]
    rationale = get_rationale_from_LLM(abstract, conference, api_key)
    return conference, rationale

In [None]:
# Read the Groq API key from the GROQ_API_KEY environment variable instead of
# committing it to the notebook; a KeyError here means the variable is not set.
# NOTE(review): the key that used to be hard-coded on this line has been
# exposed in the file history and should be revoked.
sample_groq_api_key = os.environ["GROQ_API_KEY"]

# Build the path with os.path.join so it also resolves outside Windows.
pdf_path = os.path.join("Data", "Reference", "Publishable", "TMLR", "R015.pdf")

# Smoke test on one reference paper: print the predicted conference and the rationale.
conf, ration = predict_conference_and_get_rationale(pdf_path, sample_groq_api_key)
print(conf)
print(ration)

### Run for all test files

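Strictly, this is only needed for the papers classified as 1 in the `Publishable` column of the dataframe; here it is run for every file, and the results for non-publishable papers are blanked out afterwards.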

In [None]:
import os
root_folder = "Data/Papers"
# Read the Groq API key from the GROQ_API_KEY environment variable instead of
# committing it to the notebook; a KeyError here means the variable is not set.
# NOTE(review): the key that used to be hard-coded on this line has been
# exposed in the file history and should be revoked.
sample_groq_api_key = os.environ["GROQ_API_KEY"]
file_names = []
conferences = []
rationales = []
for file_name in os.listdir(root_folder):
    file_path = os.path.join(root_folder, file_name)
    print(f"file_name: {file_name}")

    # Only PDFs are processed; anything else in the folder is skipped.
    if file_name.endswith(".pdf"):
        file_names.append(file_name)
        conference, rationale = predict_conference_and_get_rationale(file_path, sample_groq_api_key)
        conferences.append(conference)
        rationales.append(rationale)

Add the `Conference` and `Rationale` columns to the existing dataframe

In [None]:
# Attach the Task 2 results by Paper ID (file name without ".pdf") rather than
# by position, so the rows still match if the two directory listings returned
# the files in a different order.
paper_ids = [name[:-4] for name in file_names]
df['Conference'] = pd.Series(conferences, index=paper_ids)
df['Rationale'] = pd.Series(rationales, index=paper_ids)

In [None]:
# Papers judged non-publishable get no conference and no rationale.
df.loc[df['Publishable'].eq(0), ['Conference', 'Rationale']] = np.nan

In [None]:
# Preview the first five rows of the combined results.
df.head(n=5)

In [None]:
# Write the final table, keeping the Paper ID index as the first CSV column.
df.to_csv(path_or_buf='results.csv', index=True)