In [2]:
# Import necessary modules
from marker.converters.pdf import PdfConverter
from marker.models import create_model_dict
from marker.output import text_from_rendered
from PIL import Image

# Destination for the extracted text: one line per input PDF, in paper order.
output_file = "extracted_full_texts.txt"

# Build the model dictionary and the converter a single time so that every
# PDF processed below reuses them.
model_dict = create_model_dict()
converter = PdfConverter(artifact_dict=model_dict)

with open(output_file, "w", encoding="utf-8") as output:
    for i in range(1, 136):
        # Inputs are numbered P001.pdf through P135.pdf.
        file_path = f"input-pdfs/P{i:03}.pdf"

        # Convert the PDF and pull the text out of the rendered result.
        rendered = converter(file_path)
        text, _, images = text_from_rendered(rendered)

        # Collapse every whitespace run (newlines included) into a single
        # space so the whole paper fits on one line of the output file.
        text = " ".join(text.split())

        print(text.strip(), file=output)


Loaded layout model datalab-to/surya_layout on device cuda with dtype torch.float16
Loaded texify model to cuda with torch.float16 dtype
Loaded recognition model vikp/surya_rec2 on device cuda with dtype torch.float16
Loaded table recognition model vikp/surya_tablerec on device cuda with dtype torch.float16
Loaded detection model vikp/surya_det3 on device cuda with dtype torch.float16


Recognizing layout: 100%|██████████| 1/1 [00:02<00:00,  2.73s/it]
100%|██████████| 1/1 [00:00<00:00, 31.73it/s]
Detecting bboxes: 0it [00:00, ?it/s]
Recognizing equations: 0it [00:00, ?it/s]
Recognizing tables: 100%|██████████| 1/1 [00:00<00:00,  3.65it/s]
Recognizing layout: 100%|██████████| 3/3 [00:01<00:00,  1.89it/s]
100%|██████████| 3/3 [00:00<00:00, 163.98it/s]
Detecting bboxes: 0it [00:00, ?it/s]
Recognizing equations: 0it [00:00, ?it/s]
Recognizing tables: 100%|██████████| 1/1 [00:00<00:00,  6.93it/s]
Recognizing layout: 100%|██████████| 1/1 [00:00<00:00,  1.32it/s]
100%|██████████| 1/1 [00:00<00:00, 112.09it/s]
Detecting bboxes: 0it [00:00, ?it/s]
Recognizing equations: 0it [00:00, ?it/s]
Recognizing tables: 100%|██████████| 1/1 [00:00<00:00,  5.51it/s]
Recognizing layout: 100%|██████████| 2/2 [00:01<00:00,  1.44it/s]
100%|██████████| 2/2 [00:00<00:00, 124.71it/s]
Detecting bboxes: 0it [00:00, ?it/s]
Recognizing equations: 100%|██████████| 3/3 [00:05<00:00,  1.72s/it]
Recogniz

In [5]:
import re

# Regex that captures everything between the "# Abstract" heading and the next
# numbered heading ("# 1", "# 2", ...).
pattern = r'(?<=# Abstract)(.*?)(?=# \d)'

# The full-text file is written with encoding="utf-8", so it is read the same
# way here instead of relying on the platform default. The abstracts file is
# written as UTF-8 too, so non-ASCII characters survive the round trip.
with open('/content/extracted_full_texts.txt', 'r', encoding='utf-8') as infile, \
        open('extracted_full_abstracts.txt', 'w', encoding='utf-8') as outfile:
    for line in infile:
        # Each input line holds one whole paper.
        match = re.search(pattern, line, re.DOTALL)
        if match:
            abstract = match.group(0).strip()
        else:
            abstract = "Abstract not found"  # Placeholder if no abstract is found

        # Exactly one output line per input line, so line N of the abstracts
        # file corresponds to line N of the full-text file.
        outfile.write(abstract + '\n')


# Task - 1

In [13]:
import os
import time
import google.generativeai as genai

def read_all_lines(filename="/content/extracted_full_texts.txt"):
    """Return every line of ``filename`` with surrounding whitespace removed.

    Blank lines are kept (as empty strings). If the file does not exist, an
    error message is printed and an empty list is returned.
    """
    try:
        handle = open(filename, 'r', encoding='utf-8')
    except FileNotFoundError:
        print(f"Error: {filename} not found")
        return []

    with handle:
        return [raw_line.strip() for raw_line in handle]

def process_paper(model, text, min_interval=7):
    """Send one paper to the model in a fresh chat and return the reply text.

    Args:
        model: Object exposing ``start_chat(history=[])``; the returned chat
            must expose ``send_message(text)`` whose result has a ``.text``.
        text: The paper text to send.
        min_interval: Minimum number of seconds this call should take. If the
            request finishes sooner, the function sleeps for the remainder.

    Returns:
        The reply text, or ``None`` if the request raised an exception (the
        error is printed).
    """
    start_time = time.time()
    try:
        chat = model.start_chat(history=[])
        response = chat.send_message(text)
        return response.text
    except Exception as e:
        print(f"API Error: {str(e)}")
        return None
    finally:
        # Wait out the rest of the interval on failures as well as successes;
        # otherwise a failed request is followed immediately by the next one,
        # defeating the rate limit.
        remaining = min_interval - (time.time() - start_time)
        if remaining > 0:
            print(f"Waiting {remaining:.2f} seconds to maintain rate limit...")
            time.sleep(remaining)

def save_results(results, output_file="gemini2_output.txt"):
    """Write each item of ``results`` to ``output_file``, one per line.

    Items are formatted with an f-string, so non-string values (including
    ``None``) are written using their string form. The file is overwritten.
    """
    with open(output_file, 'w', encoding='utf-8') as handle:
        handle.writelines(f"{entry}\n" for entry in results)

def main():
    """Score every paper with Gemini and save the raw responses.

    Reads the papers with ``read_all_lines()``, sends each one through
    ``process_paper()`` and writes a header line ("For the doc number N the
    result is") followed by the model's reply for every paper via
    ``save_results()``.

    The API key is taken from the ``GEMINI_API_KEY`` or ``GOOGLE_API_KEY``
    environment variable; if neither is set, it is requested interactively.
    """
    # Imported locally because only this function needs it.
    import getpass

    # Configure Gemini without embedding the key in the notebook.
    api_key = (
        os.environ.get("GEMINI_API_KEY")
        or os.environ.get("GOOGLE_API_KEY")
        or getpass.getpass("Enter API key for Gemini: ")
    )
    genai.configure(api_key=api_key)

    # Setup model
    generation_config = {
        "temperature": 0.1,
        "top_p": 0.95,
        "top_k": 40,
        "max_output_tokens": 8192,
    }

    # NOTE(review): the ```json fence opened near the end of this prompt is
    # never closed. Left unchanged because the text shapes the model's output
    # format — confirm whether that is intentional.
    sys_prompt = """

You are an expert evaluator of academic research papers. Your task is to analyze the provided research paper text and assess it for three key areas: inappropriate methodologies, incoherent arguments, and unsubstantiated claims. You will provide a score from 1 to 5 for each area, with a focus on strongly penalizing severe flaws but making it more achievable for a paper with minor, infrequent issues to get a high score. **Avoid using the middle score of 3 unless the paper truly falls equally between flawed and sound.**

Specifically:

*   **Inappropriate Methodologies:**
    *   **1:** The methodology is **catastrophically flawed**, invalid, and completely inappropriate for the research question. It renders the entire study utterly unreliable, and its conclusions are almost certainly false. The approach is demonstrably unsound and indicative of significant errors.
    *   **2:** The methodology has **major, crippling flaws** that seriously undermine the validity of the findings. Significant and obvious concerns exist about the data collection, analysis, or experimental design. The methods chosen are clearly inadequate.
    *   **3:** The methodology has some noticeable flaws or weaknesses that may affect the interpretation or generalizability of results. However, it falls between having serious issues and being completely sound. Use this option only if the paper truly balances between flawed and well-done.
    *   **4:** The methodology is **generally strong and well-reasoned**, with only *minor and infrequent* issues or limitations that *do not significantly impact* the validity of the study. A few minor issues are acceptable for this score.
    *   **5:** The methodology is **highly appropriate and well-executed**, demonstrating a strong understanding of research design. The methods are sound and are very suitable for addressing the research question. The approach is generally well-implemented even if not absolutely perfect.

*   **Incoherent Arguments:**
    *   **1:** The arguments are **completely nonsensical, utterly illogical**, and lack *any* coherent connection between evidence and conclusions. The reasoning is impossible to follow and demonstrates a complete absence of logical thinking.
    *   **2:** The arguments suffer from **severe, fundamental logical flaws**, major inconsistencies, and a complete lack of clear connections between evidence and claims. Reasoning is often incoherent and makes no sense.
    *   **3:** The arguments are somewhat unclear or contain logical leaps, making it challenging to fully follow the reasoning. There are inconsistencies and gaps in logic, and use this option if the paper truly balances on being coherent and incoherent.
    *   **4:** The arguments are **mostly clear, logical, and well-structured,** with only *minor and infrequent* ambiguities. Reasoning is generally solid, and any minor issues do not significantly impede the overall clarity. A few small inconsistencies are acceptable at this score.
    *   **5:** The arguments are **highly coherent, logical, and well-presented**, demonstrating a strong command of argumentation and a clear connection between evidence and conclusions. Reasoning is easy to follow and is well-supported.

*   **Unsubstantiated Claims:**
    *   **1:** The paper is **filled with outrageous and preposterous claims** lacking any support whatsoever. Assertions are demonstrably false, contradict established knowledge, and reveal a reckless disregard for evidence. The paper is essentially an exercise in unsubstantiated fabrication.
    *   **2:** The paper makes a **great number of unsubstantiated and poorly supported claims.** There is little to no evidence presented to support major assertions, and key statements are made with a complete absence of supporting facts.
    *   **3:** The paper makes some unsubstantiated claims and lacks adequate support for key assertions. Some claims require additional evidence. Use this option if the paper truly falls between being well-supported and unsubstantiated.
    *   **4:**  **Most claims are well-supported** by evidence, with only *minor and infrequent* claims that could use additional substantiation. The core assertions are backed by data or established knowledge, and a few unproven assertions are acceptable.
    *   **5:** **The claims are highly substantiated and well-supported**, demonstrating a thorough effort to provide backing for all assertions. A strong effort to support all claims with data and knowledge is apparent, even if it's not perfect.

Your output MUST be a JSON object with the following structure:

```json
{
  "scores": {
    "inappropriate_methodologies": <integer between 1 and 5>,
    "incoherent_arguments": <integer between 1 and 5>,
    "unsubstantiated_claims": <integer between 1 and 5>
    }
}

"""

    model = genai.GenerativeModel(
        model_name="gemini-2.0-flash-exp",
        generation_config=generation_config,
        system_instruction=sys_prompt
    )

    papers = read_all_lines()
    results = []

    for i, paper in enumerate(papers, 1):
        print(f"\nProcessing paper {i} of {len(papers)}")
        result = process_paper(model, paper)
        # The header line carries the paper number so each reply can be
        # matched back to its paper when the output file is read.
        results.append(f"For the doc number {i} the result is")
        results.append(result)
        print(f"Processed paper {i}")

    save_results(results)
    print("\nResults saved to gemini2_output.txt")

if __name__ == "__main__":
    main()


Processing paper 1 of 135
Waiting 3.25 seconds to maintain rate limit...
Processed paper 1

Processing paper 2 of 135
Waiting 4.67 seconds to maintain rate limit...
Processed paper 2

Processing paper 3 of 135
Waiting 5.18 seconds to maintain rate limit...
Processed paper 3

Processing paper 4 of 135
Waiting 4.67 seconds to maintain rate limit...
Processed paper 4

Processing paper 5 of 135
Waiting 5.25 seconds to maintain rate limit...
Processed paper 5

Processing paper 6 of 135
Waiting 5.10 seconds to maintain rate limit...
Processed paper 6

Processing paper 7 of 135
Waiting 4.95 seconds to maintain rate limit...
Processed paper 7

Processing paper 8 of 135
Waiting 5.20 seconds to maintain rate limit...
Processed paper 8

Processing paper 9 of 135
Waiting 5.27 seconds to maintain rate limit...
Processed paper 9

Processing paper 10 of 135
Waiting 4.95 seconds to maintain rate limit...
Processed paper 10

Processing paper 11 of 135
Waiting 4.90 seconds to maintain rate limit...
Pro

In [14]:
import pandas as pd
import json

import re

# Path to the text file
file_path = 'gemini2_output.txt'


def _read_documents(path):
    """Split the results file into ``(doc_number, raw_text)`` pairs.

    A new document starts at every line beginning with "For the doc number".
    The number is read from that line itself, so it stays correct even when
    another document's output cannot be parsed. If a header carries no number,
    the running count is used instead.
    """
    documents = []
    with open(path, 'r', encoding='utf-8') as file:
        for line in file:
            line = line.strip()
            if line.startswith("For the doc number"):
                number = re.search(r"\d+", line)
                doc_number = int(number.group()) if number else len(documents) + 1
                documents.append((doc_number, []))
            elif line and documents:
                documents[-1][1].append(line)
    return [(doc_number, "\n".join(lines)) for doc_number, lines in documents]


def _scores_row(doc_number, payload):
    """Build one table row from a document's raw output, or ``None`` if unusable.

    The JSON object is taken as the text from the first "{" to the last "}",
    which handles both single-line and multi-line JSON as well as surrounding
    text such as Markdown code fences.
    """
    start, end = payload.find("{"), payload.rfind("}")
    if start == -1 or end < start:
        return None
    try:
        scores = json.loads(payload[start:end + 1])["scores"]
        inappropriate = scores["inappropriate_methodologies"]
        incoherent = scores["incoherent_arguments"]
        unsubstantiated = scores["unsubstantiated_claims"]
        total = inappropriate + incoherent + unsubstantiated
    except (ValueError, KeyError, TypeError):
        # Malformed JSON, missing keys, or non-numeric scores.
        return None
    return {
        "Doc Number": doc_number,
        "Inappropriate Methodologies": inappropriate,
        "Incoherent Arguments": incoherent,
        "Unsubstantiated Claims": unsubstantiated,
        "Total Score": total,
    }


# Initialize list to store the rows
rows = []
unparsed_docs = []

for doc_number, payload in _read_documents(file_path):
    row = _scores_row(doc_number, payload)
    if row is None:
        unparsed_docs.append(doc_number)
    else:
        rows.append(row)

# Report dropped documents instead of silently renumbering the rest.
if unparsed_docs:
    print(f"Warning: no valid scores found for doc number(s) {unparsed_docs}; they are omitted.")

# Create a DataFrame and display it
df = pd.DataFrame(rows)
df

Unnamed: 0,Doc Number,Inappropriate Methodologies,Incoherent Arguments,Unsubstantiated Claims,Total Score
0,1,2,3,2,7
1,2,1,1,1,3
2,3,2,2,2,6
3,4,4,4,4,12
4,5,4,4,4,12
...,...,...,...,...,...
130,131,4,4,4,12
131,132,2,2,1,5
132,133,4,4,4,12
133,134,1,1,1,3


In [18]:
# A paper is flagged publishable (1) when its total score is at least 11,
# otherwise 0.
df['Publishable'] = df['Total Score'].ge(11).astype('int64')
df

Unnamed: 0,Doc Number,Inappropriate Methodologies,Incoherent Arguments,Unsubstantiated Claims,Total Score,Publishable
0,1,2,3,2,7,0
1,2,1,1,1,3,0
2,3,2,2,2,6,0
3,4,4,4,4,12,1
4,5,4,4,4,12,1
...,...,...,...,...,...,...
130,131,4,4,4,12,1
131,132,2,2,1,5,0
132,133,4,4,4,12,1
133,134,1,1,1,3,0


In [20]:
# Build the ID from the document number stored in each row rather than from
# the row position, so IDs stay correct if any document is missing from df.
df['Paper ID'] = 'P' + df['Doc Number'].astype(str).str.zfill(3)

df

Unnamed: 0,Doc Number,Inappropriate Methodologies,Incoherent Arguments,Unsubstantiated Claims,Total Score,Publishable,Paper ID
0,1,2,3,2,7,0,P001
1,2,1,1,1,3,0,P002
2,3,2,2,2,6,0,P003
3,4,4,4,4,12,1,P004
4,5,4,4,4,12,1,P005
...,...,...,...,...,...,...,...
130,131,4,4,4,12,1,P131
131,132,2,2,1,5,0,P132
132,133,4,4,4,12,1,P133
133,134,1,1,1,3,0,P134


In [24]:
# Rows that are not publishable get 'na' in both columns; every other row
# starts out with an empty string.
not_publishable = df['Publishable'].eq(0)
for column in ('Conference', 'Rationale'):
    df[column] = ''
    df.loc[not_publishable, column] = 'na'
df

Unnamed: 0,Doc Number,Inappropriate Methodologies,Incoherent Arguments,Unsubstantiated Claims,Total Score,Publishable,Paper ID,Conference,Rationale
0,1,2,3,2,7,0,P001,na,na
1,2,1,1,1,3,0,P002,na,na
2,3,2,2,2,6,0,P003,na,na
3,4,4,4,4,12,1,P004,,
4,5,4,4,4,12,1,P005,,
...,...,...,...,...,...,...,...,...,...
130,131,4,4,4,12,1,P131,,
131,132,2,2,1,5,0,P132,na,na
132,133,4,4,4,12,1,P133,,
133,134,1,1,1,3,0,P134,na,na


In [26]:
import pandas as pd


# Read explicitly as UTF-8 instead of the platform default.
with open('/content/extracted_full_abstracts.txt', 'r', encoding='utf-8') as file:
    abstracts = [line.strip() for line in file]

if len(abstracts) != len(df):
    print("Warning: Number of abstracts does not match the number of rows in the DataFrame; aligning by 'Doc Number'.")

# Line N of the abstracts file belongs to paper N, so pick each row's abstract
# by its 1-based document number. Truncating the list to len(df) would attach
# the wrong abstract to every row after a missing document.
df['Abstract'] = [
    abstracts[doc_number - 1] if 0 < doc_number <= len(abstracts) else ''
    for doc_number in df['Doc Number'].astype(int)
]

# Now df contains the new 'Abstract' column
print(df.head())

   Doc Number  Inappropriate Methodologies  Incoherent Arguments  \
0           1                            2                     3   
1           2                            1                     1   
2           3                            2                     2   
3           4                            4                     4   
4           5                            4                     4   

   Unsubstantiated Claims  Total Score  Publishable Paper ID Conference  \
0                       2            7            0     P001         na   
1                       1            3            0     P002         na   
2                       2            6            0     P003         na   
3                       4           12            1     P004              
4                       4           12            1     P005              

  Rationale                                           Abstract  
0        na  Drone tracking and localization are essential ...  
1        n

In [21]:
# Keep only the paper identifier and the publishability flag.
result_df = df.loc[:, ['Paper ID', 'Publishable']]
result_df

Unnamed: 0,Paper ID,Publishable
0,P001,0
1,P002,0
2,P003,0
3,P004,1
4,P005,1
...,...,...
130,P131,1
131,P132,0
132,P133,1
133,P134,0


# Task - 2

In [23]:
import getpass
import os

from pinecone import Pinecone, ServerlessSpec

# Take the key from the environment, or ask for it interactively, instead of
# embedding it in the notebook (same approach as the OpenAI key cell).
pinecone = os.environ.get("PINECONE_API_KEY") or getpass.getpass("Enter API key for Pinecone: ")

pc = Pinecone(api_key=pinecone)

In [25]:
import time

index_name = "all-conferences-450"  # change if desired

# Names of the indexes that already exist for this Pinecone client.
existing_indexes = [info["name"] for info in pc.list_indexes()]

if index_name not in existing_indexes:
    pc.create_index(
        name=index_name,
        dimension=1536,
        metric="cosine",
        spec=ServerlessSpec(cloud="aws", region="us-east-1"),
    )
    # Poll once per second until the new index reports that it is ready.
    while True:
        if pc.describe_index(index_name).status["ready"]:
            break
        time.sleep(1)

index = pc.Index(index_name)

In [28]:
import getpass
import os
from langchain_openai import OpenAIEmbeddings

# Reuse the key from the environment when one is set; otherwise ask for it
# interactively (the prompt is only shown when needed).
os.environ["OPENAI_API_KEY"] = (
    os.environ.get("OPENAI_API_KEY")
    or getpass.getpass("Enter API key for OpenAI: ")
)

embeddings = OpenAIEmbeddings(model="text-embedding-3-small")

Enter API key for OpenAI: ··········


In [30]:
from langchain_pinecone import PineconeVectorStore

# Wrap the Pinecone index handle and the embeddings object in a vector store;
# this is the object later passed to get_conference_predictions().
vector_store = PineconeVectorStore(index=index, embedding=embeddings)

In [31]:
def get_conference_predictions(text, vector_store, k=15, single_conf_threshold=8, top_n=3):
    """
    Get conference predictions based on similarity search
    Args:
        text: Input text to analyze
        vector_store: Initialized vector store for similarity search
        k: Number of nearest neighbours to retrieve
        single_conf_threshold: If the most frequent conference appears at
            least this many times among the neighbours, only that conference
            is returned
        top_n: Number of conferences returned when the threshold is not met
    Returns:
        tuple: (list of top conferences, highest frequency number);
        ``([], 0)`` when the search returns nothing
    """
    # Perform similarity search
    results = vector_store.similarity_search(text, k=k)
    if not results:
        return [], 0

    # Read the conference label straight from each document's metadata rather
    # than coercing the documents into a DataFrame and indexing by position,
    # which depends on the documents' field order.
    conferences = pd.Series([doc.metadata['conference'] for doc in results])
    freq_counts = conferences.value_counts()

    # Get max frequency (as a plain int)
    max_freq = int(freq_counts.max())

    # Get conference list based on threshold
    if max_freq >= single_conf_threshold:
        conf_list = [freq_counts.index[0]]  # Only top conference
    else:
        conf_list = freq_counts.index[:top_n].tolist()  # Top conferences

    return conf_list, max_freq

In [32]:
# prompt: send abstract of every row in the df whose value of Publishable column is 1 to get_conference_predictions. it will give two outputs, conf_list, max_freq --  put these values in new columns : Maxfreq and Conflist. create these new columns for the whole dataset. leave the ones with Publishable 0 as blanks

# Assuming 'df' and 'get_conference_predictions' are defined as in the provided code.

df['Maxfreq'] = ''
df['Conflist'] = ''

# The loop variable is deliberately not called `index`: that name holds the
# Pinecone index handle and must not be overwritten.
for row_idx, row in df.iterrows():
    if row['Publishable'] == 1:
        abstract = row['Abstract']
        conf_list, max_freq = get_conference_predictions(abstract, vector_store)
        df.loc[row_idx, 'Conflist'] = str(conf_list)  # Store as string
        df.loc[row_idx, 'Maxfreq'] = max_freq

print(df.head())

   Doc Number  Inappropriate Methodologies  Incoherent Arguments  \
0           1                            2                     3   
1           2                            1                     1   
2           3                            2                     2   
3           4                            4                     4   
4           5                            4                     4   

   Unsubstantiated Claims  Total Score  Publishable Paper ID Conference  \
0                       2            7            0     P001         na   
1                       1            3            0     P002         na   
2                       2            6            0     P003         na   
3                       4           12            1     P004              
4                       4           12            1     P005              

  Rationale                                           Abstract Maxfreq  \
0        na  Drone tracking and localization are essential ...    

In [38]:
# prompt: if Maxfreq value of a row is >=8, update the  Conference column of that row to the name of conference Conflist contains. (there will be only 1 name, one of TMLR ,NIPS  ,KDD ,
# CVPR ,EMNLP  )

# Assuming 'df' and other necessary variables are defined as in the provided code.

import ast

# Update 'Conference' column based on 'Maxfreq' and 'Conflist'.
# The loop variable is deliberately not called `index`: that name holds the
# Pinecone index handle and must not be overwritten.
for row_idx, row in df.iterrows():
    if row['Maxfreq'] == '' or row['Publishable'] != 1:
        continue
    if int(row['Maxfreq']) >= 8:
        # 'Conflist' holds the string form of a list; literal_eval parses it
        # back without executing arbitrary code the way eval() would.
        conf_list = ast.literal_eval(row['Conflist'])
        if conf_list:
            df.loc[row_idx, 'Conference'] = conf_list[0]  # only one name in the list

print(df.head())

   Doc Number  Inappropriate Methodologies  Incoherent Arguments  \
0           1                            2                     3   
1           2                            1                     1   
2           3                            2                     2   
3           4                            4                     4   
4           5                            4                     4   

   Unsubstantiated Claims  Total Score  Publishable Paper ID Conference  \
0                       2            7            0     P001         na   
1                       1            3            0     P002         na   
2                       2            6            0     P003         na   
3                       4           12            1     P004        KDD   
4                       4           12            1     P005       CVPR   

  Rationale                                           Abstract Maxfreq  \
0        na  Drone tracking and localization are essential ...    

In [40]:
# Description of TMLR used as prompt text. Corrected: TMLR is a sister journal
# to JMLR and is not affiliated with the ICML conference.
TMLR = """
The Transactions on Machine Learning Research (TMLR) is a relatively new journal-style publication venue launched as a sister journal to the Journal of Machine Learning Research (JMLR). It operates as an open access, continuous publication journal rather than a traditional conference with deadlines.

**Key Characteristics:**

*   **Journal Format:** TMLR papers undergo a rigorous peer-review process, similar to journals, and accepted papers are continuously published online, rather than being presented at a specific conference.
*   **Open Access:** All TMLR publications are freely available to read and download.
*   **Focus:** TMLR aims to publish high-quality research in all areas of machine learning, with an emphasis on rigorous methodology and impactful contributions.
*   **Review Process:** The review process is thorough and can take several rounds of revision, leading to well-vetted, high-quality publications. The reviewers tend to be experts in the respective fields.
*   **JMLR Affiliation:** TMLR is a standalone publication that complements JMLR; it is not tied to any single conference, although some accepted papers may be invited for presentation at major machine learning conferences.
*   **Emphasis on Clarity and Reproducibility:** TMLR encourages clear writing, code sharing, and proper evaluation to facilitate reproducibility.
*   **Continuous Publication:** Unlike conferences with fixed deadlines and schedules, TMLR publishes accepted papers on an ongoing basis.
*   **Not a Conference:** TMLR does not have a traditional conference component like presentations or posters. Accepted papers are published online, and there's no physical or virtual gathering for presenting.

**In summary, TMLR is a prestigious, peer-reviewed journal-style publication venue focusing on high-quality machine learning research with a rigorous review process and open access policy. It prioritizes clarity, reproducibility, and impactful contributions to the field and is a sister publication to JMLR.**
"""

# Description of NeurIPS (formerly NIPS) used as prompt text. Corrected: the
# reviewing process is double-blind, not single-blind.
NIPS = """
The Conference on Neural Information Processing Systems (NeurIPS, formerly NIPS) is one of the most prestigious and highly selective conferences in the field of artificial intelligence and machine learning.

**Key Characteristics:**

*   **Highly Competitive:** NeurIPS is known for its extremely low acceptance rates, making it very difficult to get a paper accepted. This also means the accepted papers are typically of very high quality and significant impact.
*   **Broad Coverage:** NeurIPS covers a wide range of topics within machine learning, including deep learning, reinforcement learning, optimization, theoretical foundations, probabilistic methods, and more.
*   **Strong Focus on Deep Learning:** With the recent surge in popularity of deep learning, NeurIPS has become a major venue for showcasing cutting-edge research in this subfield.
*   **Poster and Oral Presentations:** Accepted papers are typically presented as posters, and a smaller subset are selected for oral presentations.
*   **Workshops and Tutorials:** NeurIPS features various workshops and tutorials covering a broad spectrum of topics in AI and machine learning. These workshops provide an in-depth view into specialized areas and future research directions.
*   **Large and Diverse Audience:** The conference attracts a diverse audience from academia, industry, and government research labs.
*   **High Visibility and Impact:** NeurIPS papers often have a significant impact on the field, and they are frequently cited by other researchers.
*   **Double-Blind Reviewing:** The reviewing process is double-blind, meaning that reviewers do not know the authors' identities, and the authors do not know the reviewers' identities.
*   **Focus on Novel Contributions:** The conference focuses on original and innovative research contributions.
*   **Annual Event:** NeurIPS is an annual conference that takes place each year, usually in December.

**In summary, NeurIPS is a leading global conference in AI and machine learning, renowned for its high standards, breadth of coverage, and its impact on shaping the future direction of the field. It's a crucial venue for researchers to share cutting-edge findings, network with experts, and stay updated on the latest advancements.**
"""

# Markdown-formatted description of the KDD conference, stored in a variable
# named after the conference. NOTE(review): presumably inserted into the
# classifier prompt as conference details — confirm against the prompt-building cell.
KDD = """
The ACM SIGKDD Conference on Knowledge Discovery and Data Mining (KDD) is a premier international conference focused on knowledge discovery, data mining, and machine learning.

**Key Characteristics:**

*   **Focus on Knowledge Discovery:** KDD is distinguished by its emphasis on the full knowledge discovery process, from data preprocessing and cleaning to the deployment and evaluation of models.
*   **Real-World Applications:** While theoretical research is welcome, KDD has a strong focus on practical applications and how data mining can solve real-world problems.
*   **Industry Relevance:** KDD attracts a significant number of attendees from the industry, and many papers showcase industry-oriented applications and solutions.
*   **Broad Topics:** The conference covers a diverse set of topics, including machine learning, data mining, data science, database management, big data, and social network analysis.
*   **Research and Applied Tracks:** KDD features both research tracks, where the contributions are theoretical and algorithmic, and applied tracks, where the papers focus on real-world case studies and deployments.
*   **Workshops and Tutorials:** KDD offers various workshops and tutorials covering specialized topics and practical techniques.
*   **KDD Cup:** The conference also includes the KDD Cup, a data mining competition that attracts researchers and practitioners to solve challenging problems.
*   **Emphasis on Scalability and Efficiency:** Given the focus on real-world data, KDD emphasizes the scalability and efficiency of algorithms and solutions.
*   **Large and Established Conference:** KDD is one of the longest-running and most well-known conferences in data mining and knowledge discovery.
*   **Double-Blind Reviewing:** The review process is typically double-blind, meaning that both the reviewers and authors are anonymous.

**In summary, KDD is a highly respected and impactful conference focused on bridging the gap between research and practical applications of data mining and knowledge discovery. It provides a valuable platform for researchers, practitioners, and industry professionals to share their work, learn from each other, and advance the field.**
"""

# Markdown-formatted description of the CVPR conference, stored in a variable
# named after the conference. NOTE(review): presumably inserted into the
# classifier prompt as conference details — confirm against the prompt-building cell.
CVPR = """
The Conference on Computer Vision and Pattern Recognition (CVPR) is one of the most prestigious and competitive conferences in the field of computer vision.

**Key Characteristics:**

*   **Premier Computer Vision Venue:** CVPR is widely regarded as a top venue for publishing research in computer vision, attracting submissions from top research labs worldwide.
*   **Broad Range of Topics:** The conference covers a diverse set of topics within computer vision, including image recognition, object detection, segmentation, 3D vision, video analysis, medical image analysis, and robotics vision.
*   **Focus on Visual Data Analysis:** CVPR is primarily concerned with the analysis, interpretation, and understanding of visual data, such as images and videos.
*   **Rigorous Peer Review:** The review process is typically rigorous, with a large number of submissions and a low acceptance rate.
*   **Emphasis on Technical Innovation:** CVPR papers typically demonstrate technical innovation and advance the state-of-the-art in the field.
*   **Poster and Oral Presentations:** Accepted papers are presented as posters and a smaller subset is selected for oral presentation.
*   **Tutorials and Workshops:** CVPR features numerous tutorials and workshops focused on specialized areas of computer vision.
*   **Large and International Community:** The conference attracts a large and international audience from academia and industry, highlighting its wide reach.
*   **Annual Event:** CVPR is an annual conference that typically takes place in the summer.
*   **Double-Blind Reviewing:** The review process is double-blind, ensuring a fair assessment of the work, where reviewers are not aware of the author's identities and vice-versa.
*   **Publicly Available Code:** It is increasingly expected that papers will also make their code public, contributing to reproducibility and open science.

**In summary, CVPR is a leading international conference in computer vision, highly regarded for its rigorous review process, strong focus on technical innovation, broad coverage of topics, and significant impact on the field. It is a crucial venue for researchers to disseminate their latest findings, engage with the computer vision community, and advance the field forward.**
"""

# Markdown-formatted description of the EMNLP conference, stored in a variable
# named after the conference. NOTE(review): presumably inserted into the
# classifier prompt as conference details — confirm against the prompt-building cell.
EMNLP = """
The Conference on Empirical Methods in Natural Language Processing (EMNLP) is a leading international conference in the field of natural language processing (NLP).

**Key Characteristics:**

*   **Focus on Empirical NLP:** EMNLP emphasizes empirical research, focusing on methods and models that are tested and evaluated using real-world data.
*   **Broad Range of Topics:** The conference covers a wide array of topics within NLP, including text classification, machine translation, language modeling, information extraction, dialogue systems, summarization, question answering, and more.
*   **Data-Driven Approach:** EMNLP papers typically use a data-driven approach, focusing on using datasets to train and evaluate NLP models.
*   **Focus on Evaluation Metrics:** Rigorous evaluation is a key component of EMNLP research, emphasizing the use of standardized metrics to assess the performance of NLP models.
*   **Theoretical and Applied Research:** EMNLP welcomes both theoretical and applied research, as long as the research is empirically grounded.
*   **Strong Community:** The conference has a strong and supportive community of NLP researchers and practitioners.
*   **Poster and Oral Presentations:** Accepted papers are typically presented as posters, with a smaller subset selected for oral presentations.
*   **Workshops and Tutorials:** EMNLP features various workshops and tutorials covering specialized topics within NLP.
*   **Annual Event:** EMNLP is an annual conference that takes place each year.
*   **Double-Blind Reviewing:** The review process is double-blind, ensuring anonymity of both authors and reviewers.
*  **Emphasis on Reproducibility:** Like other top tier ML conferences, EMNLP encourages reproducible research by encouraging code sharing.

**In summary, EMNLP is a premier conference for empirical research in natural language processing, focused on data-driven approaches, rigorous evaluation, and addressing real-world problems using NLP techniques. It serves as an important platform for researchers to share their work, collaborate with peers, and contribute to the advancement of the NLP field.**
"""


In [39]:
import pandas as pd


# Read the extracted full texts: one paper per line, expected in the same order as df's rows.
# The extraction cell writes this file as UTF-8, so read it back with the same encoding.
with open('/content/extracted_full_texts.txt', 'r', encoding='utf-8') as file:
    Fulltext = [line.strip() for line in file]

# Add the 'Fulltext' column to the DataFrame
if len(Fulltext) == len(df):
    df['Fulltext'] = Fulltext
else:
    print("Error: Number of Fulltext does not match the number of rows in the DataFrame.")
    # Truncate extra lines or pad missing ones with empty strings so the assignment
    # always has exactly len(df) values, and write them to 'Fulltext' -- not to
    # 'Abstract', which already holds the abstracts and must not be overwritten.
    df['Fulltext'] = (Fulltext + [''] * len(df))[:len(df)]

# Now df contains the new 'Fulltext' column
print(df.head())

   Doc Number  Inappropriate Methodologies  Incoherent Arguments  \
0           1                            2                     3   
1           2                            1                     1   
2           3                            2                     2   
3           4                            4                     4   
4           5                            4                     4   

   Unsubstantiated Claims  Total Score  Publishable Paper ID Conference  \
0                       2            7            0     P001         na   
1                       1            3            0     P002         na   
2                       2            6            0     P003         na   
3                       4           12            1     P004        KDD   
4                       4           12            1     P005       CVPR   

  Rationale                                           Abstract Maxfreq  \
0        na  Drone tracking and localization are essential ...    

In [46]:
import ast

# Build the classifier prompt for every publishable paper whose conference is still
# undecided (Publishable == 1 and Conference == ''). All other rows keep an empty
# prompt_classifier.
# The prompt contains the paper's Fulltext followed by the description of each
# candidate conference named in the row's Conflist.
# Assumes df, TMLR, NIPS, KDD, CVPR, and EMNLP are defined by earlier cells.

# Description text for each conference, keyed by the names that appear in Conflist.
conference_descriptions = {
    'TMLR': TMLR,
    'NIPS': NIPS,
    'KDD': KDD,
    'CVPR': CVPR,
    'EMNLP': EMNLP,
    # Add more conference details as needed
}

df['prompt_classifier'] = ''

for index, row in df.iterrows():
    if row['Publishable'] == 1 and row['Conference'] == '':
        full_text = row['Fulltext']
        # Conflist holds the string form of a Python list. literal_eval parses
        # literals only, so cell content can never be executed as code the way
        # eval() would allow.
        conf_names = ast.literal_eval(row['Conflist'])
        # One "<name>: <description>" line per known conference; names without a
        # description are skipped.
        conference_details = ''.join(
            f'{conf_name}: {conference_descriptions[conf_name]}\n'
            for conf_name in conf_names
            if conf_name in conference_descriptions
        )

        prompt = f"""Here is the full text of a research paper:
{full_text}
Here are the descriptions of the conferences you should consider:
{conference_details}
Based on the provided information, which conference is the best fit for this paper?"""

        df.loc[index, 'prompt_classifier'] = prompt

print(df.head())

   Doc Number  Inappropriate Methodologies  Incoherent Arguments  \
0           1                            2                     3   
1           2                            1                     1   
2           3                            2                     2   
3           4                            4                     4   
4           5                            4                     4   

   Unsubstantiated Claims  Total Score  Publishable Paper ID Conference  \
0                       2            7            0     P001         na   
1                       1            3            0     P002         na   
2                       2            6            0     P003         na   
3                       4           12            1     P004        KDD   
4                       4           12            1     P005       CVPR   

  Rationale                                           Abstract Maxfreq  \
0        na  Drone tracking and localization are essential ...    

In [49]:
import time
import google.generativeai as genai

def predict_conference(text, api_key="####"):
    """
    Predict the most suitable conference for a given research paper text.

    Sends `text` as a single chat message to a Gemini model whose system
    instruction asks for only the conference name. Every call is throttled to
    take at least 7 seconds, whether the request succeeds or fails.

    Args:
        text (str): Prompt to send (paper text plus conference descriptions)
        api_key (str): Gemini API key; the default "####" is a redacted placeholder
    Returns:
        str | None: The model's reply with surrounding whitespace stripped,
        or None if the request (or reading the reply) raised an exception
    """
    # Minimum wall-clock duration of one call, used for rate limiting.
    min_seconds_per_call = 7

    # Configure Gemini
    genai.configure(api_key=api_key)

    # Setup model config
    generation_config = {
        "temperature": 0.1,
        "top_p": 0.95,
        "top_k": 40,
        "max_output_tokens": 8192,
    }

    # System prompt
    sys_prompt = """
    You are an expert research paper classifier.
    Your task is to analyze a research paper's full text and classify it into one of a given set of conferences.
    You will be provided with the full text of a research paper, as well as detailed descriptions of the conferences you should classify to.
    You must output only the name of the conference that best fits the research paper's topic and methodology. Do not provide any additional information, only one word: the conference name.
    If absolutely no conference matches, return the closest conference.
    """

    # Initialize model
    model = genai.GenerativeModel(
        model_name="gemini-2.0-flash-exp",
        generation_config=generation_config,
        system_instruction=sys_prompt
    )

    # Process with rate limiting
    start_time = time.time()
    try:
        chat = model.start_chat(history=[])
        response = chat.send_message(text)
        return response.text.strip()
    except Exception as e:
        # Best effort: report the failure and let the caller decide what to do with None.
        print(f"API Error: {str(e)}")
        return None
    finally:
        # Rate limiting runs on the error path too; otherwise a failing request
        # would let the caller's loop fire the next request immediately.
        elapsed = time.time() - start_time
        if elapsed < min_seconds_per_call:
            time.sleep(min_seconds_per_call - elapsed)


In [52]:
# For each publishable paper that has no conference yet (Publishable == 1 and
# Conference == ''), send its prompt_classifier text to predict_conference and store
# the answer in the Conference column. Rows whose prediction fails are left unchanged.
# Relies on df and predict_conference from earlier cells.

needs_conference = (df['Publishable'] == 1) & (df['Conference'] == '')

for index, prompt in df.loc[needs_conference, 'prompt_classifier'].items():
    print(f"yes {index}")
    predicted_conf = predict_conference(prompt)
    if not predicted_conf:
        # The prediction failed (e.g., API error): report it and skip the row.
        # A default such as 'unknown' could be written here instead.
        print(f"Prediction failed for index {index}")
        continue
    df.loc[index, 'Conference'] = predicted_conf


print(df.head())

yes 102
   Doc Number  Inappropriate Methodologies  Incoherent Arguments  \
0           1                            2                     3   
1           2                            1                     1   
2           3                            2                     2   
3           4                            4                     4   
4           5                            4                     4   

   Unsubstantiated Claims  Total Score  Publishable Paper ID Conference  \
0                       2            7            0     P001         na   
1                       1            3            0     P002         na   
2                       2            6            0     P003         na   
3                       4           12            1     P004        KDD   
4                       4           12            1     P005       CVPR   

  Rationale                                           Abstract Maxfreq  \
0        na  Drone tracking and localization are essential

In [54]:
# Fold 'NeurIPS' into the 'NIPS' label, then show how many papers carry each
# conference label (the last expression is rendered as the cell's output).
df['Conference'] = df['Conference'].replace({'NeurIPS': 'NIPS'})
df['Conference'].value_counts()


Unnamed: 0_level_0,count
Conference,Unnamed: 1_level_1
na,41
CVPR,26
NIPS,26
EMNLP,23
KDD,10
TMLR,9


In [59]:
import time
import google.generativeai as genai

def generate_conference_reasoning(full_text, final_conference, reference_papers, api_key="####"):
    """
    Generate reasoning for why a paper fits a conference.

    Builds one prompt from the paper text, the chosen conference and the
    reference papers, and sends it as a single chat message to a Gemini model.
    Every call is throttled to take at least 7.2 seconds, whether the request
    succeeds or fails.

    Args:
        full_text (str): Research paper text
        final_conference (str): Selected conference
        reference_papers (str): Reference papers text
        api_key (str): Gemini API key; the default "####" is a redacted placeholder
    Returns:
        str | None: The model's reply with surrounding whitespace stripped, or
        None if the request (or reading the reply) raised an exception. The
        100-word limit is only requested in the prompts; it is not enforced here.
    """
    # Minimum wall-clock duration of one call, used for rate limiting.
    min_seconds_per_call = 7.2

    # Configure Gemini
    genai.configure(api_key=api_key)

    # Setup model config
    generation_config = {
        "temperature": 0.1,
        "top_p": 0.95,
        "top_k": 40,
        "max_output_tokens": 8192,
    }

    # System prompt
    sys_prompt = """
    You are an expert research paper classifier and reasoning generator.
    Your task is to analyze a research paper's full text, a set of reference papers,
    and a given conference to generate a short reasoning why the paper fits that conference.
    You must output a reasoning of maximum 100 words. The reasoning should justify why the
    paper fits into the specified conference, and include a sentence of comparisons to the reference papers.
    """

    # Initialize model
    model = genai.GenerativeModel(
        model_name="gemini-2.0-flash-exp",
        generation_config=generation_config,
        system_instruction=sys_prompt
    )

    # Process with rate limiting
    start_time = time.time()
    try:
        chat = model.start_chat(history=[])

        user_prompt = f"""
        Here is the full text of a research paper:
        {full_text}

        The final conference selection is: {final_conference}

        Here are the reference papers:
        {reference_papers}

        Provide a reasoning of maximum 100 words for why this paper best fits the selected conference.
        Add a sentence to justify your answer by comparing it to the provided reference papers.
        """

        response = chat.send_message(user_prompt)
        return response.text.strip()
    except Exception as e:
        # Best effort: report the failure and let the caller decide what to do with None.
        print(f"API Error: {str(e)}")
        return None
    finally:
        # Rate limiting runs on the error path too; otherwise a failing request
        # would let the caller's loop fire the next request immediately.
        elapsed = time.time() - start_time
        if elapsed < min_seconds_per_call:
            time.sleep(min_seconds_per_call - elapsed)



In [56]:

# Pull the reference-paper text for each conference out of the labeled-texts file.
# Each reference is read from a fixed line position (0-based: 5, 7, 9, 11, 13).
with open('/content/labeled_texts_marker.txt', 'r') as f:
    lines = list(f)

CVPR_reference, EMNLP_reference, KDD_reference, NIPS_reference, TMLR_reference = [
    lines[position].strip() for position in (5, 7, 9, 11, 13)
]

In [60]:
# For every row of df with Publishable == 1, call generate_conference_reasoning with
# the row's Fulltext, its Conference, and that conference's reference text, and store
# the result in the Reasoning column.
# Relies on df, generate_conference_reasoning and the *_reference strings from earlier cells.

# Reference-paper text per conference label; any other label gets an empty reference.
references_by_conference = {
    'CVPR': CVPR_reference,
    'EMNLP': EMNLP_reference,
    'KDD': KDD_reference,
    'NIPS': NIPS_reference,
    'TMLR': TMLR_reference,
}

for index, row in df.iterrows():
    if row['Publishable'] != 1:
        continue

    # Progress marker: 1-based position derived from the row label
    print(f'processing {index+1}')
    final_conference = row['Conference']

    df.loc[index, 'Reasoning'] = generate_conference_reasoning(
        row['Fulltext'],
        final_conference,
        references_by_conference.get(final_conference, ""),
    )

processing 4
processing 5
processing 7
processing 8
processing 9
processing 10
processing 11
processing 12
processing 13
processing 14
processing 15
processing 17
processing 18
processing 19
processing 21
processing 23
processing 24
processing 25
processing 27
processing 28
processing 29
processing 30
processing 31
processing 33
processing 34
processing 37
processing 40
processing 42
processing 44
processing 45
processing 46
processing 49
processing 50
processing 51
processing 52
processing 54
processing 55
processing 57
processing 58
processing 59
processing 60
processing 61
processing 62
processing 63
processing 64
processing 65
processing 66
processing 67
processing 68
processing 71
processing 72
processing 74
processing 75
processing 79
processing 80
processing 82
processing 83
processing 84
processing 85
processing 87
processing 88
processing 89
processing 90
processing 91
processing 92
processing 93
processing 95
processing 99
processing 101
processing 102
processing 103
processi

In [64]:
# Show the DataFrame as this cell's output to inspect every column, including Reasoning.
df

Unnamed: 0,Doc Number,Inappropriate Methodologies,Incoherent Arguments,Unsubstantiated Claims,Total Score,Publishable,Paper ID,Conference,Rationale,Abstract,Maxfreq,Conflist,Fulltext,prompt_classifier,Reasoning
0,1,2,3,2,7,0,P001,na,na,Drone tracking and localization are essential ...,,,# Leveraging Clustering Techniques for Enhance...,,na
1,2,1,1,1,3,0,P002,na,na,Virus transmission is intricately linked to th...,,,# Virus Propagation and their Far-Reaching Imp...,,na
2,3,2,2,2,6,0,P003,na,na,Explainable reinforcement learning has emerged...,,,# Explainable Reinforcement Learning for Finan...,,na
3,4,4,4,4,12,1,P004,KDD,,This study introduces a novel concept of train...,9,['KDD'],# Graph Neural Networks Without Training: Harn...,,This paper introduces a novel training-free gr...
4,5,4,4,4,12,1,P005,CVPR,,This research introduces a comprehensive cloth...,11,['CVPR'],# Collaborative Clothing Segmentation and Iden...,,The paper presents a novel clothing co-parsing...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
130,131,4,4,4,12,1,P131,CVPR,,This paper details our submission for stage 2 ...,5,"['TMLR', 'CVPR', 'KDD']",# Enhancing Disentanglement through Learned Ag...,Here is the full text of a research paper:\n# ...,This paper presents a method for enhancing dis...
131,132,2,2,1,5,0,P132,na,na,This study presents a groundbreaking approach ...,,,# Analyzing Fermentation Patterns with Multi-M...,,na
132,133,4,4,4,12,1,P133,EMNLP,,This paper reduces discontinuous parsing to se...,5,"['EMNLP', 'KDD', 'NIPS']",# Discontinuous Constituent Parsing as Sequenc...,Here is the full text of a research paper:\n# ...,This paper presents a novel sequence labeling ...
133,134,1,1,1,3,0,P134,na,na,The quintessential nature of DNA is intertwine...,,,# Unraveling the Enigmatic Parallels Between D...,,na


In [65]:
# Keep only the columns that go into the results file. .copy() gives an independent
# frame, so later edits to final_results_df do not touch df.
result_columns = ['Paper ID', 'Publishable', 'Conference', 'Reasoning']
final_results_df = df[result_columns].copy()
print(final_results_df.head())

  Paper ID  Publishable Conference  \
0     P001            0         na   
1     P002            0         na   
2     P003            0         na   
3     P004            1        KDD   
4     P005            1       CVPR   

                                           Reasoning  
0                                                 na  
1                                                 na  
2                                                 na  
3  This paper introduces a novel training-free gr...  
4  The paper presents a novel clothing co-parsing...  


In [66]:
# Normalise the conference labels for the results file: lower-case them, then rename
# 'nips' to 'neurips'. Both steps are chained on the same Series so the rename sees
# the lower-cased values. Assigning from df['Conference'] a second time would discard
# the lower-casing, and 'nips' would never match the upper-case 'NIPS' label.
final_results_df['Conference'] = (
    final_results_df['Conference'].str.lower().replace('nips', 'neurips')
)

In [67]:
from google.colab import files
# Write the results frame to a CSV file (utf-8-sig encoding), then hand that file
# to Colab's download helper.
results_filename = 'results.csv'
final_results_df.to_csv(results_filename, encoding='utf-8-sig')
files.download(results_filename)

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>