In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


gemini - DONOT USE CAUSE WE WILL HAVE TO PAY I JUST LEFT IT FOR TESTING

In [7]:
import torch
from transformers import AutoTokenizer, AutoModel
from pathlib import Path
import fitz
import numpy as np
from dataclasses import dataclass
from typing import Dict, List, Tuple
import logging
import json
import google.generativeai as genai
import os

@dataclass
class Paper:
    content: str
    path: Path
    is_reference: bool
    label: str = None
    conference: str = None

class PaperClassifier:
    def __init__(self):
        self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
        self.conferences = ["TMLR", "CVPR", "EMNLP", "NeurIPS", "KDD"]
        self._setup_models()

    def _setup_models(self):
        # Set up SciBERT
        self.scibert = AutoModel.from_pretrained("allenai/scibert_scivocab_uncased").to(self.device)
        self.tokenizer = AutoTokenizer.from_pretrained("allenai/scibert_scivocab_uncased")

        # Set up Gemini
        API_KEY = os.getenv("API_KEY", "KEY")
        genai.configure(api_key=API_KEY)

        # Create Gemini Pro model
        self.gemini = genai.GenerativeModel('gemini-pro')

    def _generate_rationale(self, paper_content: str, is_publishable: bool, conference: str = None, scores: dict = None) -> str:
      if is_publishable:
          prompt = f"""
          Analyze this research paper for {conference} conference.
          Paper scores:
          - Technical similarity: {scores.get('technical', 0):.2f}
          - Methodology strength: {scores.get('method', 0):.2f}
          - Innovation level: {scores.get('innovation', 0):.2f}

          Evaluate based on:
          1. Technical alignment with {conference}
          2. Research methodology quality
          3. Innovation and contribution value

          Paper excerpt: {paper_content[:2000]}...

          Provide a concise rationale (max 100 words) explaining why this paper should be published at {conference}:
          """
      else:
          prompt = f"""
          Analyze this research paper that was classified as non-publishable.
          Paper scores:
          - Technical quality: {scores.get('technical', 0):.2f}
          - Methodology: {scores.get('method', 0):.2f}
          - Innovation: {scores.get('innovation', 0):.2f}

          Evaluate critical issues in:
          1. Technical soundness
          2. Research methodology
          3. Novel contribution

          Paper excerpt: {paper_content[:2000]}...

          Provide a concise rationale (max 100 words) explaining why this paper needs improvement before publication:
          """

      try:
          response = self.gemini.generate_content(prompt)
          return response.text[:300].strip()
      except Exception as e:
          logging.error(f"Error generating rationale: {e}")
          return "Error generating rationale"

    def load_dataset(self, base_path: str) -> Tuple[List[Paper], List[Paper]]:
        base_path = Path(base_path)
        reference_papers = []
        papers_to_classify = []

        # Load reference papers
        ref_path = base_path / "Reference"

        # Load publishable reference papers
        pub_path = ref_path / "Publishable"
        for conf in self.conferences:
            conf_path = pub_path / conf
            for pdf_path in conf_path.glob("*.pdf"):
                content = self._extract_pdf_content(pdf_path)
                reference_papers.append(Paper(
                    content=content,
                    path=pdf_path,
                    is_reference=True,
                    label="Publishable",
                    conference=conf
                ))

        # Load non-publishable reference papers
        nonpub_path = ref_path / "Non-Publishable"
        for pdf_path in nonpub_path.glob("*.pdf"):
            content = self._extract_pdf_content(pdf_path)
            reference_papers.append(Paper(
                content=content,
                path=pdf_path,
                is_reference=True,
                label="NonPublishable"
            ))

        # Load papers to classify
        papers_path = base_path / "Papers"
        for pdf_path in papers_path.glob("*.pdf"):
            content = self._extract_pdf_content(pdf_path)
            papers_to_classify.append(Paper(
                content=content,
                path=pdf_path,
                is_reference=False
            ))

        return reference_papers, papers_to_classify

    def _extract_pdf_content(self, pdf_path: Path) -> str:
        try:
            doc = fitz.open(pdf_path)
            text = ""
            for page in doc:
                text += page.get_text()
            return text.strip()
        except Exception as e:
            logging.error(f"Error extracting content from {pdf_path}: {e}")
            return ""

    def _setup_models(self):
        try:
            self.scibert = AutoModel.from_pretrained("allenai/scibert_scivocab_uncased").to(self.device)
            self.tokenizer = AutoTokenizer.from_pretrained("allenai/scibert_scivocab_uncased")
            self.tokenizer.pad_token = self.tokenizer.eos_token
            self.tokenizer.pad_token_id = self.tokenizer.eos_token_id

            # Set up Gemini
            API_KEY = os.getenv("API_KEY", "AIzaSyABSVGA83cLKJ-RhSFFbzMJkzG3aYE_YWE")
            genai.configure(api_key=API_KEY)
            self.gemini = genai.GenerativeModel('gemini-pro')
        except Exception as e:
            logging.error(f"Error setting up models: {e}")
            raise

    def _get_embedding(self, text: str) -> torch.Tensor:
        if not text or not isinstance(text, str):
            raise ValueError("Invalid input text for embedding generation")

        try:
            inputs = self.tokenizer(
                text,
                max_length=512,
                truncation=True,
                padding='max_length',
                return_tensors="pt",
                return_attention_mask=True
            )

            # Move to device
            inputs = {k: v.to(self.device) for k, v in inputs.items()}

            with torch.no_grad():
                outputs = self.scibert(**inputs)

            if outputs.last_hidden_state is None:
                raise ValueError("Model output is None")

            # Use attention mask for weighted mean
            mask = inputs['attention_mask'].unsqueeze(-1).expand(outputs.last_hidden_state.size())
            masked_embeddings = outputs.last_hidden_state * mask
            sum_embeddings = torch.sum(masked_embeddings, dim=1)
            sum_mask = torch.clamp(mask.sum(dim=1), min=1e-9)
            return sum_embeddings / sum_mask

        except Exception as e:
            logging.error(f"Error generating embedding: {e}")
            raise

    def _cosine_similarity(self, a: torch.Tensor, b: torch.Tensor) -> float:
        return torch.cosine_similarity(a, b, dim=1).cpu().numpy()[0]

    def classify_papers(self, reference_papers: List[Paper], papers: List[Paper]) -> List[Dict]:
        results = []
        ref_embeddings = {
            "publishable": [],
            "conferences": {},
            "nonpublishable": []
        }

        # Validate inputs
        if not reference_papers or not papers:
            raise ValueError("Empty input papers list")

        # Process reference papers
        try:
            for ref in reference_papers:
                if not ref.content:
                    logging.warning(f"Empty content in reference paper: {ref.path}")
                    continue

                emb = self._get_embedding(ref.content)
                if ref.label == "Publishable":
                    ref_embeddings["publishable"].append(emb)
                    if ref.conference not in ref_embeddings["conferences"]:
                        ref_embeddings["conferences"][ref.conference] = []
                    ref_embeddings["conferences"][ref.conference].append(emb)
                else:
                    ref_embeddings["nonpublishable"].append(emb)
        except Exception as e:
            logging.error(f"Error processing reference papers: {e}")
            raise

        # Process papers to classify
        for paper in papers:
            try:
                if not paper.content:
                    raise ValueError(f"Empty content in paper: {paper.path}")

                emb = self._get_embedding(paper.content)

                pub_scores = [self._cosine_similarity(emb, ref) for ref in ref_embeddings["publishable"]]
                nonpub_scores = [self._cosine_similarity(emb, ref) for ref in ref_embeddings["nonpublishable"]]

                # Add threshold and weighting
                pub_threshold = 0.65
                is_publishable = (np.mean(pub_scores) * 1.2) > np.mean(nonpub_scores) and np.mean(pub_scores) > pub_threshold

                scores = {
                    'technical': np.mean(pub_scores),
                    'method': np.max(pub_scores),
                    'innovation': np.mean(pub_scores) if is_publishable else np.mean(nonpub_scores)
                }

                if is_publishable:
                    conf_scores = {
                        conf: np.mean([self._cosine_similarity(emb, ref) for ref in refs])
                        for conf, refs in ref_embeddings["conferences"].items()
                    }
                    best_conf = max(conf_scores.items(), key=lambda x: x[1])[0]

                    rationale = self._generate_rationale(paper.content, True, best_conf, scores)

                    results.append({
                        "paper_id": paper.path.stem,
                        "publishable": 1,
                        "conference": best_conf,
                        "rationale": rationale
                    })
                else:
                    rationale = self._generate_rationale(paper.content, False, scores=scores)
                    results.append({
                        "paper_id": paper.path.stem,
                        "publishable": 0,
                        "conference": "na",
                        "rationale": rationale
                    })

            except Exception as e:
                logging.error(f"Error processing paper {paper.path}: {e}")
                results.append({
                    "paper_id": paper.path.stem,
                    "publishable": 0,
                    "conference": "error",
                    "rationale": f"Error: {str(e)}"
                })

        return results

    def save_results_csv(self, results: List[Dict], output_path: str = "results.csv"):
        import pandas as pd
        df = pd.DataFrame(results)
        df.to_csv(output_path, index=False)
        print(f"Results saved to {output_path}")

def main():
    # Configure logging
    logging.basicConfig(level=logging.INFO)

    classifier = PaperClassifier()
    reference_papers, papers = classifier.load_dataset("/content/drive/MyDrive/KDSH_2025_Dataset")
    results = classifier.classify_papers(reference_papers, papers)

    # Save results in both JSON and CSV formats
    with open("results.json", "w") as f:
        json.dump(results, f, indent=2)

    classifier.save_results_csv(results)

if __name__ == "__main__":
    main()

ERROR:root:Error generating rationale: ("Invalid operation: The `response.text` quick accessor requires the response to contain a valid `Part`, but none were returned. The candidate's [finish_reason](https://ai.google.dev/api/generate-content#finishreason) is 3. The candidate's safety_ratings are: [category: HARM_CATEGORY_SEXUALLY_EXPLICIT\nprobability: NEGLIGIBLE\n, category: HARM_CATEGORY_HATE_SPEECH\nprobability: NEGLIGIBLE\n, category: HARM_CATEGORY_HARASSMENT\nprobability: NEGLIGIBLE\n, category: HARM_CATEGORY_DANGEROUS_CONTENT\nprobability: MEDIUM\n].", [category: HARM_CATEGORY_SEXUALLY_EXPLICIT
probability: NEGLIGIBLE
, category: HARM_CATEGORY_HATE_SPEECH
probability: NEGLIGIBLE
, category: HARM_CATEGORY_HARASSMENT
probability: NEGLIGIBLE
, category: HARM_CATEGORY_DANGEROUS_CONTENT
probability: MEDIUM
])
ERROR:root:Error generating rationale: 429 POST https://generativelanguage.googleapis.com/v1beta/models/gemini-pro:generateContent?%24alt=json%3Benum-encoding%3Dint: Resource ha

Results saved to results.csv


PROMPT -1 WITH FLAN T5


In [9]:
import torch
from transformers import AutoTokenizer, AutoModel, AutoModelForSeq2SeqLM
from pathlib import Path
import fitz
import numpy as np
from dataclasses import dataclass
from typing import Dict, List, Tuple
import logging
import json

@dataclass
class Paper:
    content: str
    path: Path
    is_reference: bool
    label: str = None
    conference: str = None

class PaperClassifier:
    def __init__(self):
        self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
        self.conferences = ["TMLR", "CVPR", "EMNLP", "NeurIPS", "KDD"]
        self._setup_models()

    def _setup_models(self):
        # SciBERT for embeddings
        self.scibert = AutoModel.from_pretrained("allenai/scibert_scivocab_uncased").to(self.device)
        self.tokenizer = AutoTokenizer.from_pretrained("allenai/scibert_scivocab_uncased")

        # FLAN-T5 for rationale generation
        self.rationale_model = AutoModelForSeq2SeqLM.from_pretrained("google/flan-t5-base").to(self.device)
        self.rationale_tokenizer = AutoTokenizer.from_pretrained("google/flan-t5-base")

    def _generate_rationale(self, paper_content: str, conference: str) -> str:
        # Create a structured prompt for the model
        prompt = (
            f"Given this research paper abstract: {paper_content[:500]}...\n\n"
            f"Explain in 2-3 sentences why this paper would be suitable for the {conference} conference. "
            "Focus on the paper's main contribution and methodology."
        )

        inputs = self.rationale_tokenizer(
            prompt,
            max_length=512,
            truncation=True,
            padding=True,
            return_tensors="pt"
        ).to(self.device)

        with torch.no_grad():
            outputs = self.rationale_model.generate(
                input_ids=inputs.input_ids,
                attention_mask=inputs.attention_mask,
                max_length=150,
                num_return_sequences=1,
                temperature=0.7,
                do_sample=True,
                no_repeat_ngram_size=2,
                top_k=50,
                top_p=0.95
            )

        rationale = self.rationale_tokenizer.decode(outputs[0], skip_special_tokens=True)
        return rationale

    def load_dataset(self, base_path: str) -> Tuple[List[Paper], List[Paper]]:
        # [Previous load_dataset implementation remains the same]
        base_path = Path(base_path)
        reference_papers = []
        papers_to_classify = []

        # Load reference papers
        ref_path = base_path / "Reference"

        # Load publishable reference papers
        pub_path = ref_path / "Publishable"
        for conf in self.conferences:
            conf_path = pub_path / conf
            for pdf_path in conf_path.glob("*.pdf"):
                content = self._extract_pdf_content(pdf_path)
                reference_papers.append(Paper(
                    content=content,
                    path=pdf_path,
                    is_reference=True,
                    label="Publishable",
                    conference=conf
                ))

        # Load non-publishable reference papers
        nonpub_path = ref_path / "Non-Publishable"
        for pdf_path in nonpub_path.glob("*.pdf"):
            content = self._extract_pdf_content(pdf_path)
            reference_papers.append(Paper(
                content=content,
                path=pdf_path,
                is_reference=True,
                label="NonPublishable"
            ))

        # Load papers to classify
        papers_path = base_path / "Papers"
        for pdf_path in papers_path.glob("*.pdf"):
            content = self._extract_pdf_content(pdf_path)
            papers_to_classify.append(Paper(
                content=content,
                path=pdf_path,
                is_reference=False
            ))

        return reference_papers, papers_to_classify

    def _extract_pdf_content(self, pdf_path: Path) -> str:
        try:
            doc = fitz.open(pdf_path)
            text = ""
            for page in doc:
                text += page.get_text()
            return text.strip()
        except Exception as e:
            logging.error(f"Error extracting content from {pdf_path}: {e}")
            return ""

    def _get_embedding(self, text: str) -> torch.Tensor:
        inputs = self.tokenizer(
            text,
            max_length=512,
            truncation=True,
            padding=True,
            return_tensors="pt"
        ).to(self.device)

        with torch.no_grad():
            outputs = self.scibert(**inputs)
        return outputs.last_hidden_state.mean(dim=1)

    def _cosine_similarity(self, a: torch.Tensor, b: torch.Tensor) -> float:
        return torch.cosine_similarity(a, b, dim=1).cpu().numpy()[0]

    def classify_papers(self, reference_papers: List[Paper], papers: List[Paper]) -> List[Dict]:
        results = []
        ref_embeddings = {
            "publishable": [],
            "conferences": {},
            "nonpublishable": []
        }

        # Prepare reference embeddings
        for ref in reference_papers:
            emb = self._get_embedding(ref.content)
            if ref.label == "Publishable":
                ref_embeddings["publishable"].append(emb)
                if ref.conference not in ref_embeddings["conferences"]:
                    ref_embeddings["conferences"][ref.conference] = []
                ref_embeddings["conferences"][ref.conference].append(emb)
            else:
                ref_embeddings["nonpublishable"].append(emb)

        for paper in papers:
            try:
                emb = self._get_embedding(paper.content)

                pub_scores = [self._cosine_similarity(emb, ref) for ref in ref_embeddings["publishable"]]
                nonpub_scores = [self._cosine_similarity(emb, ref) for ref in ref_embeddings["nonpublishable"]]

                is_publishable = np.mean(pub_scores) > np.mean(nonpub_scores)

                if is_publishable:
                    conf_scores = {}
                    for conf, conf_embs in ref_embeddings["conferences"].items():
                        scores = [self._cosine_similarity(emb, ref) for ref in conf_embs]
                        conf_scores[conf] = np.mean(scores)

                    best_conf = max(conf_scores.items(), key=lambda x: x[1])[0]
                    rationale = self._generate_rationale(paper.content, best_conf)

                    results.append({
                        "paper_id": paper.path.stem,
                        "publishable": 1,
                        "conference": best_conf,
                        "rationale": rationale
                    })
                else:
                    results.append({
                        "paper_id": paper.path.stem,
                        "publishable": 0,
                        "conference": "na",
                        "rationale": "na"
                    })

            except Exception as e:
                logging.error(f"Error processing {paper.path}: {e}")
                results.append({
                    "paper_id": paper.path.stem,
                    "publishable": 0,
                    "conference": "error",
                    "rationale": str(e)
                })

        return results

def main():
    classifier = PaperClassifier()
    reference_papers, papers = classifier.load_dataset("/content/drive/MyDrive/KDSH_2025_Dataset")
    results = classifier.classify_papers(reference_papers, papers)

    with open("results.json", "w") as f:
        json.dump(results, f, indent=2)

if __name__ == "__main__":
    main()

config.json:   0%|          | 0.00/1.40k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/990M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/2.54k [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.42M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/2.20k [00:00<?, ?B/s]

PROMPT 2 WITH FLAN-T5

In [12]:
import torch
from transformers import AutoTokenizer, AutoModel, AutoModelForSeq2SeqLM
from pathlib import Path
import fitz
import numpy as np
from dataclasses import dataclass
from typing import Dict, List, Tuple
import logging
import json

@dataclass
class Paper:
    content: str
    path: Path
    is_reference: bool
    label: str = None
    conference: str = None

class PaperClassifier:
    def __init__(self):
        self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
        self.conferences = ["TMLR", "CVPR", "EMNLP", "NeurIPS", "KDD"]
        self._setup_models()

    def _setup_models(self):
        # SciBERT for embeddings
        self.scibert = AutoModel.from_pretrained("allenai/scibert_scivocab_uncased").to(self.device)
        self.tokenizer = AutoTokenizer.from_pretrained("allenai/scibert_scivocab_uncased")

        # FLAN-T5 for rationale generation
        self.rationale_model = AutoModelForSeq2SeqLM.from_pretrained("google/flan-t5-base").to(self.device)
        self.rationale_tokenizer = AutoTokenizer.from_pretrained("google/flan-t5-base")

    def _generate_rationale(self, paper_content: str, conference: str) -> str:
         # Create a more targeted prompt that addresses conference fit
        conference_descriptions = {
            "CVPR": "Computer Vision and Pattern Recognition - focuses on computer vision, deep learning for visual tasks, and image processing innovations",
            "NeurIPS": "Neural Information Processing Systems - emphasizes machine learning, neural networks, and artificial intelligence advances",
            "EMNLP": "Empirical Methods in Natural Language Processing - centers on NLP, computational linguistics, and language understanding",
            "TMLR": "Transactions on Machine Learning Research - covers broad machine learning research with emphasis on thorough empirical validation",
            "KDD": "Knowledge Discovery and Data Mining - focuses on data mining, large-scale data analytics, and practical applications"
        }

        prompt = (
            f"Analyze this research paper and explain its suitability for {conference} conference.\n\n"
            f"Conference scope: {conference_descriptions[conference]}\n\n"
            f"Paper abstract: {paper_content[:500]}...\n\n"
            "Provide a 2-3 sentence rationale that addresses:\n"
            "1. How the paper's methodology and findings align with the conference's focus areas\n"
            "2. The significance of the research contribution to the field\n"
            "3. Why this specific conference is the most appropriate venue for this work"
        )

        inputs = self.rationale_tokenizer(
            prompt,
            max_length=512,
            truncation=True,
            padding=True,
            return_tensors="pt"
        ).to(self.device)

        with torch.no_grad():
            outputs = self.rationale_model.generate(
                input_ids=inputs.input_ids,
                attention_mask=inputs.attention_mask,
                max_length=150,
                num_return_sequences=1,
                temperature=0.7,
                do_sample=True,
                no_repeat_ngram_size=2,
                top_k=50,
                top_p=0.95
            )

        rationale = self.rationale_tokenizer.decode(outputs[0], skip_special_tokens=True)
        return rationale

    def load_dataset(self, base_path: str) -> Tuple[List[Paper], List[Paper]]:
        # [Previous load_dataset implementation remains the same]
        base_path = Path(base_path)
        reference_papers = []
        papers_to_classify = []

        # Load reference papers
        ref_path = base_path / "Reference"

        # Load publishable reference papers
        pub_path = ref_path / "Publishable"
        for conf in self.conferences:
            conf_path = pub_path / conf
            for pdf_path in conf_path.glob("*.pdf"):
                content = self._extract_pdf_content(pdf_path)
                reference_papers.append(Paper(
                    content=content,
                    path=pdf_path,
                    is_reference=True,
                    label="Publishable",
                    conference=conf
                ))

        # Load non-publishable reference papers
        nonpub_path = ref_path / "Non-Publishable"
        for pdf_path in nonpub_path.glob("*.pdf"):
            content = self._extract_pdf_content(pdf_path)
            reference_papers.append(Paper(
                content=content,
                path=pdf_path,
                is_reference=True,
                label="NonPublishable"
            ))

        # Load papers to classify
        papers_path = base_path / "Papers"
        for pdf_path in papers_path.glob("*.pdf"):
            content = self._extract_pdf_content(pdf_path)
            papers_to_classify.append(Paper(
                content=content,
                path=pdf_path,
                is_reference=False
            ))

        return reference_papers, papers_to_classify

    def _extract_pdf_content(self, pdf_path: Path) -> str:
        try:
            doc = fitz.open(pdf_path)
            text = ""
            for page in doc:
                text += page.get_text()
            return text.strip()
        except Exception as e:
            logging.error(f"Error extracting content from {pdf_path}: {e}")
            return ""

    def _get_embedding(self, text: str) -> torch.Tensor:
        inputs = self.tokenizer(
            text,
            max_length=512,
            truncation=True,
            padding=True,
            return_tensors="pt"
        ).to(self.device)

        with torch.no_grad():
            outputs = self.scibert(**inputs)
        return outputs.last_hidden_state.mean(dim=1)

    def _cosine_similarity(self, a: torch.Tensor, b: torch.Tensor) -> float:
        return torch.cosine_similarity(a, b, dim=1).cpu().numpy()[0]

    def classify_papers(self, reference_papers: List[Paper], papers: List[Paper]) -> List[Dict]:
        results = []
        ref_embeddings = {
            "publishable": [],
            "conferences": {},
            "nonpublishable": []
        }

        # Prepare reference embeddings
        for ref in reference_papers:
            emb = self._get_embedding(ref.content)
            if ref.label == "Publishable":
                ref_embeddings["publishable"].append(emb)
                if ref.conference not in ref_embeddings["conferences"]:
                    ref_embeddings["conferences"][ref.conference] = []
                ref_embeddings["conferences"][ref.conference].append(emb)
            else:
                ref_embeddings["nonpublishable"].append(emb)

        for paper in papers:
            try:
                emb = self._get_embedding(paper.content)

                pub_scores = [self._cosine_similarity(emb, ref) for ref in ref_embeddings["publishable"]]
                nonpub_scores = [self._cosine_similarity(emb, ref) for ref in ref_embeddings["nonpublishable"]]

                is_publishable = np.mean(pub_scores) > np.mean(nonpub_scores)

                if is_publishable:
                    conf_scores = {}
                    for conf, conf_embs in ref_embeddings["conferences"].items():
                        scores = [self._cosine_similarity(emb, ref) for ref in conf_embs]
                        conf_scores[conf] = np.mean(scores)

                    best_conf = max(conf_scores.items(), key=lambda x: x[1])[0]
                    rationale = self._generate_rationale(paper.content, best_conf)

                    results.append({
                        "paper_id": paper.path.stem,
                        "publishable": 1,
                        "conference": best_conf,
                        "rationale": rationale
                    })
                else:
                    results.append({
                        "paper_id": paper.path.stem,
                        "publishable": 0,
                        "conference": "na",
                        "rationale": "na"
                    })

            except Exception as e:
                logging.error(f"Error processing {paper.path}: {e}")
                results.append({
                    "paper_id": paper.path.stem,
                    "publishable": 0,
                    "conference": "error",
                    "rationale": str(e)
                })

        return results

def main():
    classifier = PaperClassifier()
    reference_papers, papers = classifier.load_dataset("/content/drive/MyDrive/KDSH_2025_Dataset")
    results = classifier.classify_papers(reference_papers, papers)

    with open("results.json", "w") as f:
        json.dump(results, f, indent=2)

if __name__ == "__main__":
    main()

PROMPT 3 FLANT T5


In [None]:
import torch
from transformers import AutoTokenizer, AutoModel, AutoModelForSeq2SeqLM
from pathlib import Path
import fitz
import numpy as np
from dataclasses import dataclass
from typing import Dict, List, Tuple
import logging
import json

@dataclass
class Paper:
    content: str
    path: Path
    is_reference: bool
    label: str = None
    conference: str = None

class PaperClassifier:
    def __init__(self):
        self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
        self.conferences = ["TMLR", "CVPR", "EMNLP", "NeurIPS", "KDD"]
        self._setup_models()

    def _setup_models(self):
        # SciBERT for embeddings
        self.scibert = AutoModel.from_pretrained("allenai/scibert_scivocab_uncased").to(self.device)
        self.tokenizer = AutoTokenizer.from_pretrained("allenai/scibert_scivocab_uncased")

        # FLAN-T5 for rationale generation
        self.rationale_model = AutoModelForSeq2SeqLM.from_pretrained("google/flan-t5-base").to(self.device)
        self.rationale_tokenizer = AutoTokenizer.from_pretrained("google/flan-t5-base")

    def _generate_rationale(self, paper_content: str, conference: str) -> str:
         # Create a more targeted prompt that addresses conference fit
        conference_descriptions = {
            "CVPR": "Computer Vision and Pattern Recognition - focuses on computer vision, deep learning for visual tasks, and image processing innovations",
            "NeurIPS": "Neural Information Processing Systems - emphasizes machine learning, neural networks, and artificial intelligence advances",
            "EMNLP": "Empirical Methods in Natural Language Processing - centers on NLP, computational linguistics, and language understanding",
            "TMLR": "Transactions on Machine Learning Research - covers broad machine learning research with emphasis on thorough empirical validation",
            "KDD": "Knowledge Discovery and Data Mining - focuses on data mining, large-scale data analytics, and practical applications"
        }

        prompt = (
            f"Analyze this research paper and explain its suitability for {conference} conference.\n\n"
            f"Conference scope: {conference_descriptions[conference]}\n\n"
            f"Paper abstract: {paper_content[:500]}...\n\n"
            "Provide a 2-3 sentence rationale that addresses:\n"
            "1. How the paper's methodology and findings align with the conference's focus areas\n"
            "2. The significance of the research contribution to the field\n"
            "3. Why this specific conference is the most appropriate venue for this work"
        )

        inputs = self.rationale_tokenizer(
            prompt,
            max_length=512,
            truncation=True,
            padding=True,
            return_tensors="pt"
        ).to(self.device)

        with torch.no_grad():
            outputs = self.rationale_model.generate(
                input_ids=inputs.input_ids,
                attention_mask=inputs.attention_mask,
                max_length=150,
                num_return_sequences=1,
                temperature=0.7,
                do_sample=True,
                no_repeat_ngram_size=2,
                top_k=50,
                top_p=0.95
            )

        rationale = self.rationale_tokenizer.decode(outputs[0], skip_special_tokens=True)
        return rationale

    def load_dataset(self, base_path: str) -> Tuple[List[Paper], List[Paper]]:
        # [Previous load_dataset implementation remains the same]
        base_path = Path(base_path)
        reference_papers = []
        papers_to_classify = []

        # Load reference papers
        ref_path = base_path / "Reference"

        # Load publishable reference papers
        pub_path = ref_path / "Publishable"
        for conf in self.conferences:
            conf_path = pub_path / conf
            for pdf_path in conf_path.glob("*.pdf"):
                content = self._extract_pdf_content(pdf_path)
                reference_papers.append(Paper(
                    content=content,
                    path=pdf_path,
                    is_reference=True,
                    label="Publishable",
                    conference=conf
                ))

        # Load non-publishable reference papers
        nonpub_path = ref_path / "Non-Publishable"
        for pdf_path in nonpub_path.glob("*.pdf"):
            content = self._extract_pdf_content(pdf_path)
            reference_papers.append(Paper(
                content=content,
                path=pdf_path,
                is_reference=True,
                label="NonPublishable"
            ))

        # Load papers to classify
        papers_path = base_path / "Papers"
        for pdf_path in papers_path.glob("*.pdf"):
            content = self._extract_pdf_content(pdf_path)
            papers_to_classify.append(Paper(
                content=content,
                path=pdf_path,
                is_reference=False
            ))

        return reference_papers, papers_to_classify

    def _extract_pdf_content(self, pdf_path: Path) -> str:
        try:
            doc = fitz.open(pdf_path)
            text = ""
            for page in doc:
                text += page.get_text()
            return text.strip()
        except Exception as e:
            logging.error(f"Error extracting content from {pdf_path}: {e}")
            return ""

    def _get_embedding(self, text: str) -> torch.Tensor:
        inputs = self.tokenizer(
            text,
            max_length=512,
            truncation=True,
            padding=True,
            return_tensors="pt"
        ).to(self.device)

        with torch.no_grad():
            outputs = self.scibert(**inputs)
        return outputs.last_hidden_state.mean(dim=1)

    def _cosine_similarity(self, a: torch.Tensor, b: torch.Tensor) -> float:
        return torch.cosine_similarity(a, b, dim=1).cpu().numpy()[0]

    def classify_papers(self, reference_papers: List[Paper], papers: List[Paper]) -> List[Dict]:
        results = []
        ref_embeddings = {
            "publishable": [],
            "conferences": {},
            "nonpublishable": []
        }

        # Prepare reference embeddings
        for ref in reference_papers:
            emb = self._get_embedding(ref.content)
            if ref.label == "Publishable":
                ref_embeddings["publishable"].append(emb)
                if ref.conference not in ref_embeddings["conferences"]:
                    ref_embeddings["conferences"][ref.conference] = []
                ref_embeddings["conferences"][ref.conference].append(emb)
            else:
                ref_embeddings["nonpublishable"].append(emb)

        for paper in papers:
            try:
                emb = self._get_embedding(paper.content)

                pub_scores = [self._cosine_similarity(emb, ref) for ref in ref_embeddings["publishable"]]
                nonpub_scores = [self._cosine_similarity(emb, ref) for ref in ref_embeddings["nonpublishable"]]

                is_publishable = np.mean(pub_scores) > np.mean(nonpub_scores)

                if is_publishable:
                    conf_scores = {}
                    for conf, conf_embs in ref_embeddings["conferences"].items():
                        scores = [self._cosine_similarity(emb, ref) for ref in conf_embs]
                        conf_scores[conf] = np.mean(scores)

                    best_conf = max(conf_scores.items(), key=lambda x: x[1])[0]
                    rationale = self._generate_rationale(paper.content, best_conf)

                    results.append({
                        "paper_id": paper.path.stem,
                        "publishable": 1,
                        "conference": best_conf,
                        "rationale": rationale
                    })
                else:
                    results.append({
                        "paper_id": paper.path.stem,
                        "publishable": 0,
                        "conference": "na",
                        "rationale": "na"
                    })

            except Exception as e:
                logging.error(f"Error processing {paper.path}: {e}")
                results.append({
                    "paper_id": paper.path.stem,
                    "publishable": 0,
                    "conference": "error",
                    "rationale": str(e)
                })

        return results

def main():
    classifier = PaperClassifier()
    reference_papers, papers = classifier.load_dataset("/content/drive/MyDrive/KDSH_2025_Dataset")
    results = classifier.classify_papers(reference_papers, papers)

    with open("results.json", "w") as f:
        json.dump(results, f, indent=2)

if __name__ == "__main__":
    main()