In [None]:
# Mount Google Drive into the Colab filesystem at /content/drive so that
# later cells can read files stored under that mount point.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
pip install PyMuPDF

Collecting PyMuPDF
  Downloading pymupdf-1.25.1-cp39-abi3-manylinux2014_x86_64.manylinux_2_17_x86_64.whl.metadata (3.4 kB)
Downloading pymupdf-1.25.1-cp39-abi3-manylinux2014_x86_64.manylinux_2_17_x86_64.whl (20.0 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m20.0/20.0 MB[0m [31m56.1 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: PyMuPDF
Successfully installed PyMuPDF-1.25.1


Gemini variant - the API is paid, hence not used in the final pipeline (kept only as a test)


In [None]:
import torch
from transformers import AutoTokenizer, AutoModel
from pathlib import Path
import fitz
import numpy as np
from dataclasses import dataclass
from typing import Dict, List, Tuple
import logging
import json
import google.generativeai as genai
import os

@dataclass
class Paper:
    """One PDF paper together with the labels known for it (if any)."""
    content: str  # text extracted from the PDF; "" when extraction failed
    path: Path  # location of the source PDF file
    is_reference: bool  # True for labelled reference papers, False for papers to classify
    label: str = None  # "Publishable" / "NonPublishable" on reference papers, else None
    conference: str = None  # conference folder name for publishable references, else None

class PaperClassifier:
    """Classify papers as publishable or not via SciBERT similarity to references.

    Every paper is embedded with SciBERT (only the first 512 tokens are used)
    and compared by cosine similarity with labelled reference papers.
    Publishable papers are also assigned the most similar conference, and a
    short rationale is requested from Gemini for every classified paper.
    """

    SCIBERT_NAME = "allenai/scibert_scivocab_uncased"
    GEMINI_MODEL_NAME = "gemini-pro"
    # Returned instead of a rationale whenever Gemini cannot be used.
    RATIONALE_ERROR = "Error generating rationale"

    def __init__(self):
        """Pick the compute device, record the target conferences, load the models."""
        self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
        self.conferences = ["TMLR", "CVPR", "EMNLP", "NeurIPS", "KDD"]
        self._setup_models()

    def _setup_models(self):
        """Load SciBERT and configure the Gemini client.

        The Gemini key is read from the ``API_KEY`` environment variable only;
        a key must never be committed to the notebook. Without a key the
        classifier still works, but rationales fall back to RATIONALE_ERROR.

        Raises:
            Exception: any model loading error is logged and re-raised.
        """
        try:
            self.scibert = AutoModel.from_pretrained(self.SCIBERT_NAME).to(self.device)
            # The SciBERT tokenizer already defines [PAD]. BERT tokenizers have
            # no EOS token, so copying eos_token into pad_token would unset the
            # padding token and break padding in _get_embedding.
            self.tokenizer = AutoTokenizer.from_pretrained(self.SCIBERT_NAME)

            api_key = os.getenv("API_KEY")
            if api_key:
                genai.configure(api_key=api_key)
                self.gemini = genai.GenerativeModel(self.GEMINI_MODEL_NAME)
            else:
                logging.warning("API_KEY is not set; Gemini rationales are disabled")
                self.gemini = None
        except Exception as e:
            logging.error(f"Error setting up models: {e}")
            raise

    def _generate_rationale(self, paper_content: str, is_publishable: bool, conference: str = None, scores: dict = None) -> str:
        """Ask Gemini for a short rationale supporting the classification.

        Args:
            paper_content: full paper text; only the first 2000 characters are sent.
            is_publishable: selects the "publish" or the "needs improvement" prompt.
            conference: target conference, used only by the publishable prompt.
            scores: optional mapping with 'technical', 'method' and 'innovation'
                values; missing entries (or a missing mapping) are shown as 0.

        Returns:
            The response text cut to 300 characters and stripped, or
            RATIONALE_ERROR when Gemini is unavailable or the request fails.
        """
        scores = scores or {}

        # The prompt text below is kept exactly as before, including the
        # leading spaces inside the triple-quoted strings.
        if is_publishable:
            prompt = f"""
          Analyze this research paper for {conference} conference.
          Paper scores:
          - Technical similarity: {scores.get('technical', 0):.2f}
          - Methodology strength: {scores.get('method', 0):.2f}
          - Innovation level: {scores.get('innovation', 0):.2f}

          Evaluate based on:
          1. Technical alignment with {conference}
          2. Research methodology quality
          3. Innovation and contribution value

          Paper excerpt: {paper_content[:2000]}...

          Provide a concise rationale (max 100 words) explaining why this paper should be published at {conference}:
          """
        else:
            prompt = f"""
          Analyze this research paper that was classified as non-publishable.
          Paper scores:
          - Technical quality: {scores.get('technical', 0):.2f}
          - Methodology: {scores.get('method', 0):.2f}
          - Innovation: {scores.get('innovation', 0):.2f}

          Evaluate critical issues in:
          1. Technical soundness
          2. Research methodology
          3. Novel contribution

          Paper excerpt: {paper_content[:2000]}...

          Provide a concise rationale (max 100 words) explaining why this paper needs improvement before publication:
          """

        if getattr(self, "gemini", None) is None:
            logging.error("Error generating rationale: Gemini is not configured")
            return self.RATIONALE_ERROR

        try:
            response = self.gemini.generate_content(prompt)
            return response.text[:300].strip()
        except Exception as e:
            logging.error(f"Error generating rationale: {e}")
            return self.RATIONALE_ERROR

    def _load_folder(self, folder: Path, is_reference: bool, label: str = None, conference: str = None) -> List[Paper]:
        """Build one Paper per ``*.pdf`` file directly inside ``folder``.

        Files are visited in sorted order so repeated runs give the same
        result order. A missing folder is logged and yields an empty list.
        """
        if not folder.is_dir():
            logging.warning(f"Dataset folder not found: {folder}")
            return []
        return [
            Paper(
                content=self._extract_pdf_content(pdf_path),
                path=pdf_path,
                is_reference=is_reference,
                label=label,
                conference=conference
            )
            for pdf_path in sorted(folder.glob("*.pdf"))
        ]

    def load_dataset(self, base_path: str) -> Tuple[List[Paper], List[Paper]]:
        """Read the reference papers and the papers to classify.

        Expected layout under ``base_path``:
        ``Reference/Publishable/<conference>/*.pdf``,
        ``Reference/Non-Publishable/*.pdf`` and ``Papers/*.pdf``.

        Returns:
            ``(reference_papers, papers_to_classify)``.
        """
        base_path = Path(base_path)
        ref_path = base_path / "Reference"

        reference_papers = []
        for conf in self.conferences:
            reference_papers.extend(self._load_folder(
                ref_path / "Publishable" / conf,
                is_reference=True,
                label="Publishable",
                conference=conf
            ))
        reference_papers.extend(self._load_folder(
            ref_path / "Non-Publishable",
            is_reference=True,
            label="NonPublishable"
        ))

        papers_to_classify = self._load_folder(base_path / "Papers", is_reference=False)
        return reference_papers, papers_to_classify

    def _extract_pdf_content(self, pdf_path: Path) -> str:
        """Return the stripped text of all pages, or "" if the PDF cannot be read."""
        try:
            # The context manager closes the document even if a page fails.
            with fitz.open(pdf_path) as doc:
                return "".join(page.get_text() for page in doc).strip()
        except Exception as e:
            logging.error(f"Error extracting content from {pdf_path}: {e}")
            return ""

    def _get_embedding(self, text: str) -> torch.Tensor:
        """Embed ``text`` as the attention-masked mean of SciBERT's last hidden states.

        Input is truncated to 512 tokens, so only the start of a long paper
        contributes to the embedding.

        Raises:
            ValueError: if ``text`` is empty or not a string, or the model
                returns no hidden states.
        """
        if not text or not isinstance(text, str):
            raise ValueError("Invalid input text for embedding generation")

        try:
            inputs = self.tokenizer(
                text,
                max_length=512,
                truncation=True,
                padding='max_length',
                return_tensors="pt",
                return_attention_mask=True
            )
            inputs = {k: v.to(self.device) for k, v in inputs.items()}

            with torch.no_grad():
                outputs = self.scibert(**inputs)

            hidden = outputs.last_hidden_state
            if hidden is None:
                raise ValueError("Model output is None")

            # Padding positions get weight 0 so they do not dilute the mean.
            mask = inputs['attention_mask'].unsqueeze(-1).to(hidden.dtype)
            token_sum = (hidden * mask).sum(dim=1)
            token_count = torch.clamp(mask.sum(dim=1), min=1e-9)
            return token_sum / token_count

        except Exception as e:
            logging.error(f"Error generating embedding: {e}")
            raise

    def _cosine_similarity(self, a: torch.Tensor, b: torch.Tensor) -> float:
        """Cosine similarity of the first rows of ``a`` and ``b`` as a Python float."""
        return torch.cosine_similarity(a, b, dim=1)[0].item()

    def _embed_references(self, reference_papers: List[Paper]) -> Dict:
        """Embed reference papers and group them by label and conference.

        References with empty content are skipped with a warning. Any
        embedding error is logged and re-raised.
        """
        ref_embeddings = {
            "publishable": [],
            "conferences": {},
            "nonpublishable": []
        }
        try:
            for ref in reference_papers:
                if not ref.content:
                    logging.warning(f"Empty content in reference paper: {ref.path}")
                    continue

                emb = self._get_embedding(ref.content)
                if ref.label == "Publishable":
                    ref_embeddings["publishable"].append(emb)
                    ref_embeddings["conferences"].setdefault(ref.conference, []).append(emb)
                else:
                    ref_embeddings["nonpublishable"].append(emb)
        except Exception as e:
            logging.error(f"Error processing reference papers: {e}")
            raise
        return ref_embeddings

    def classify_papers(self, reference_papers: List[Paper], papers: List[Paper],
                        pub_threshold: float = 0.65, pub_weight: float = 1.2) -> List[Dict]:
        """Classify every paper and attach a conference and rationale.

        A paper is publishable when its mean similarity to the publishable
        references, multiplied by ``pub_weight``, exceeds its mean similarity
        to the non-publishable references, and that unweighted mean is also
        above ``pub_threshold``.

        Args:
            reference_papers: labelled reference papers.
            papers: papers to classify.
            pub_threshold: minimum mean similarity to publishable references.
            pub_weight: factor applied to the publishable mean before comparison.

        Returns:
            One dict per paper with keys ``paper_id``, ``publishable``,
            ``conference`` and ``rationale``. A paper that fails is reported
            with ``conference == "error"`` instead of aborting the run.

        Raises:
            ValueError: if either input list is empty, or no usable
                publishable / non-publishable reference remains.
        """
        if not reference_papers or not papers:
            raise ValueError("Empty input papers list")

        ref_embeddings = self._embed_references(reference_papers)
        # Fail early with a clear message rather than producing NaN means.
        if not ref_embeddings["publishable"] or not ref_embeddings["nonpublishable"]:
            raise ValueError(
                "Need at least one readable publishable and one readable "
                "non-publishable reference paper"
            )

        results = []
        for paper in papers:
            try:
                if not paper.content:
                    raise ValueError(f"Empty content in paper: {paper.path}")

                emb = self._get_embedding(paper.content)

                pub_scores = [self._cosine_similarity(emb, ref) for ref in ref_embeddings["publishable"]]
                nonpub_scores = [self._cosine_similarity(emb, ref) for ref in ref_embeddings["nonpublishable"]]
                mean_pub = np.mean(pub_scores)
                mean_nonpub = np.mean(nonpub_scores)

                is_publishable = bool(mean_pub * pub_weight > mean_nonpub and mean_pub > pub_threshold)

                scores = {
                    'technical': mean_pub,
                    'method': np.max(pub_scores),
                    'innovation': mean_pub if is_publishable else mean_nonpub
                }

                if is_publishable:
                    conf_scores = {
                        conf: np.mean([self._cosine_similarity(emb, ref) for ref in refs])
                        for conf, refs in ref_embeddings["conferences"].items()
                    }
                    best_conf = max(conf_scores, key=conf_scores.get)
                    rationale = self._generate_rationale(paper.content, True, best_conf, scores)
                    results.append({
                        "paper_id": paper.path.stem,
                        "publishable": 1,
                        "conference": best_conf,
                        "rationale": rationale
                    })
                else:
                    rationale = self._generate_rationale(paper.content, False, scores=scores)
                    results.append({
                        "paper_id": paper.path.stem,
                        "publishable": 0,
                        "conference": "na",
                        "rationale": rationale
                    })

            except Exception as e:
                logging.error(f"Error processing paper {paper.path}: {e}")
                results.append({
                    "paper_id": paper.path.stem,
                    "publishable": 0,
                    "conference": "error",
                    "rationale": f"Error: {str(e)}"
                })

        return results

    def save_results_csv(self, results: List[Dict], output_path: str = "results.csv"):
        """Write the result dicts to ``output_path`` as CSV without an index column."""
        import pandas as pd
        pd.DataFrame(results).to_csv(output_path, index=False)
        print(f"Results saved to {output_path}")

def main(dataset_path: str = "/content/drive/MyDrive/KDSH_2025_Dataset",
         json_path: str = "results.json",
         csv_path: str = "results.csv"):
    """Run the whole pipeline and save the results as JSON and CSV.

    Args:
        dataset_path: dataset root passed to ``PaperClassifier.load_dataset``.
        json_path: where the JSON results are written.
        csv_path: where the CSV results are written.

    Returns:
        The list of result dicts produced by ``classify_papers``.
    """
    # Configure logging
    logging.basicConfig(level=logging.INFO)

    classifier = PaperClassifier()
    reference_papers, papers = classifier.load_dataset(dataset_path)
    results = classifier.classify_papers(reference_papers, papers)

    # Save results in both JSON and CSV formats
    with open(json_path, "w", encoding="utf-8") as f:
        json.dump(results, f, indent=2)

    classifier.save_results_csv(results, csv_path)
    return results

if __name__ == "__main__":
    main()

UnicodeDecodeError: 'utf-8' codec can't decode byte 0xd0 in position 0: invalid continuation byte

PROMPT 1 WITH FLAN-T5


In [None]:
import torch
from transformers import AutoTokenizer, AutoModel, AutoModelForSeq2SeqLM
from pathlib import Path
import fitz
import numpy as np
from dataclasses import dataclass
from typing import Dict, List, Tuple
import logging
import json

@dataclass
class Paper:
    """One PDF paper together with the labels known for it (if any)."""
    content: str  # text extracted from the PDF; "" when extraction failed
    path: Path  # location of the source PDF file
    is_reference: bool  # True for labelled reference papers, False for papers to classify
    label: str = None  # "Publishable" / "NonPublishable" on reference papers, else None
    conference: str = None  # conference folder name for publishable references, else None

class PaperClassifier:
    """Classify papers by SciBERT similarity and explain the result with FLAN-T5.

    Every paper is embedded with SciBERT (only the first 512 tokens are used)
    and compared by cosine similarity with labelled reference papers.
    Publishable papers are assigned the most similar conference and get a
    generated rationale.
    """

    def __init__(self):
        """Pick the compute device, record the target conferences, load the models."""
        self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
        self.conferences = ["TMLR", "CVPR", "EMNLP", "NeurIPS", "KDD"]
        self._setup_models()

    def _setup_models(self):
        """Load SciBERT (embeddings) and FLAN-T5 (rationales) onto ``self.device``."""
        # SciBERT for embeddings
        self.scibert = AutoModel.from_pretrained("allenai/scibert_scivocab_uncased").to(self.device)
        self.tokenizer = AutoTokenizer.from_pretrained("allenai/scibert_scivocab_uncased")

        # FLAN-T5 for rationale generation
        self.rationale_model = AutoModelForSeq2SeqLM.from_pretrained("google/flan-t5-base").to(self.device)
        self.rationale_tokenizer = AutoTokenizer.from_pretrained("google/flan-t5-base")

    def _generate_rationale(self, paper_content: str, conference: str) -> str:
        """Generate a short text on why the paper suits ``conference``.

        Only the first 500 characters of the paper are put into the prompt.
        Generation samples (``do_sample=True``), so repeated calls can differ.
        """
        prompt = (
            f"Given this research paper abstract: {paper_content[:500]}...\n\n"
            f"Explain in 2-3 sentences why this paper would be suitable for the {conference} conference. "
            "Focus on the paper's main contribution and methodology."
        )

        inputs = self.rationale_tokenizer(
            prompt,
            max_length=512,
            truncation=True,
            padding=True,
            return_tensors="pt"
        ).to(self.device)

        with torch.no_grad():
            outputs = self.rationale_model.generate(
                input_ids=inputs.input_ids,
                attention_mask=inputs.attention_mask,
                max_length=150,
                num_return_sequences=1,
                temperature=0.7,
                do_sample=True,
                no_repeat_ngram_size=2,
                top_k=50,
                top_p=0.95
            )

        return self.rationale_tokenizer.decode(outputs[0], skip_special_tokens=True)

    def _load_folder(self, folder: Path, is_reference: bool, label: str = None, conference: str = None) -> List[Paper]:
        """Build one Paper per ``*.pdf`` file directly inside ``folder``.

        Files are visited in sorted order so repeated runs give the same
        result order. A missing folder is logged and yields an empty list.
        """
        if not folder.is_dir():
            logging.warning(f"Dataset folder not found: {folder}")
            return []
        return [
            Paper(
                content=self._extract_pdf_content(pdf_path),
                path=pdf_path,
                is_reference=is_reference,
                label=label,
                conference=conference
            )
            for pdf_path in sorted(folder.glob("*.pdf"))
        ]

    def load_dataset(self, base_path: str) -> Tuple[List[Paper], List[Paper]]:
        """Read the reference papers and the papers to classify.

        Expected layout under ``base_path``:
        ``Reference/Publishable/<conference>/*.pdf``,
        ``Reference/Non-Publishable/*.pdf`` and ``Papers/*.pdf``.

        Returns:
            ``(reference_papers, papers_to_classify)``.
        """
        base_path = Path(base_path)
        ref_path = base_path / "Reference"

        reference_papers = []
        for conf in self.conferences:
            reference_papers.extend(self._load_folder(
                ref_path / "Publishable" / conf,
                is_reference=True,
                label="Publishable",
                conference=conf
            ))
        reference_papers.extend(self._load_folder(
            ref_path / "Non-Publishable",
            is_reference=True,
            label="NonPublishable"
        ))

        papers_to_classify = self._load_folder(base_path / "Papers", is_reference=False)
        return reference_papers, papers_to_classify

    def _extract_pdf_content(self, pdf_path: Path) -> str:
        """Return the stripped text of all pages, or "" if the PDF cannot be read."""
        try:
            # The context manager closes the document even if a page fails.
            with fitz.open(pdf_path) as doc:
                return "".join(page.get_text() for page in doc).strip()
        except Exception as e:
            logging.error(f"Error extracting content from {pdf_path}: {e}")
            return ""

    def _get_embedding(self, text: str) -> torch.Tensor:
        """Embed ``text`` as the mean of SciBERT's last hidden states.

        Input is truncated to 512 tokens. A single sequence is encoded per
        call, so ``padding=True`` adds no padding and a plain mean is used.
        """
        inputs = self.tokenizer(
            text,
            max_length=512,
            truncation=True,
            padding=True,
            return_tensors="pt"
        ).to(self.device)

        with torch.no_grad():
            outputs = self.scibert(**inputs)
        return outputs.last_hidden_state.mean(dim=1)

    def _cosine_similarity(self, a: torch.Tensor, b: torch.Tensor) -> float:
        """Cosine similarity of the first rows of ``a`` and ``b`` as a Python float."""
        return torch.cosine_similarity(a, b, dim=1)[0].item()

    def _embed_references(self, reference_papers: List[Paper]) -> Dict:
        """Embed reference papers and group them by label and conference.

        References whose text could not be extracted are skipped with a
        warning, because embedding an empty string would only add noise.
        """
        ref_embeddings = {
            "publishable": [],
            "conferences": {},
            "nonpublishable": []
        }
        for ref in reference_papers:
            if not ref.content:
                logging.warning(f"Empty content in reference paper: {ref.path}")
                continue

            emb = self._get_embedding(ref.content)
            if ref.label == "Publishable":
                ref_embeddings["publishable"].append(emb)
                ref_embeddings["conferences"].setdefault(ref.conference, []).append(emb)
            else:
                ref_embeddings["nonpublishable"].append(emb)
        return ref_embeddings

    def classify_papers(self, reference_papers: List[Paper], papers: List[Paper]) -> List[Dict]:
        """Classify every paper and attach a conference and rationale.

        A paper is publishable when its mean similarity to the publishable
        references exceeds its mean similarity to the non-publishable ones.

        Returns:
            One dict per paper with keys ``paper_id``, ``publishable``,
            ``conference`` and ``rationale``. A paper that fails (including
            one with no extracted text) is reported with
            ``conference == "error"`` instead of aborting the run.

        Raises:
            ValueError: if there are papers to classify but no usable
                publishable or non-publishable reference.
        """
        if not papers:
            return []

        ref_embeddings = self._embed_references(reference_papers)
        # Fail early with a clear message: the mean of an empty score list is
        # NaN, which would silently mark every paper as non-publishable.
        if not ref_embeddings["publishable"] or not ref_embeddings["nonpublishable"]:
            raise ValueError(
                "Need at least one readable publishable and one readable "
                "non-publishable reference paper"
            )

        results = []
        for paper in papers:
            try:
                if not paper.content:
                    raise ValueError(f"Empty content in paper: {paper.path}")

                emb = self._get_embedding(paper.content)

                pub_scores = [self._cosine_similarity(emb, ref) for ref in ref_embeddings["publishable"]]
                nonpub_scores = [self._cosine_similarity(emb, ref) for ref in ref_embeddings["nonpublishable"]]

                is_publishable = np.mean(pub_scores) > np.mean(nonpub_scores)

                if is_publishable:
                    conf_scores = {
                        conf: np.mean([self._cosine_similarity(emb, ref) for ref in conf_embs])
                        for conf, conf_embs in ref_embeddings["conferences"].items()
                    }
                    best_conf = max(conf_scores, key=conf_scores.get)
                    rationale = self._generate_rationale(paper.content, best_conf)

                    results.append({
                        "paper_id": paper.path.stem,
                        "publishable": 1,
                        "conference": best_conf,
                        "rationale": rationale
                    })
                else:
                    results.append({
                        "paper_id": paper.path.stem,
                        "publishable": 0,
                        "conference": "na",
                        "rationale": "na"
                    })

            except Exception as e:
                logging.error(f"Error processing {paper.path}: {e}")
                results.append({
                    "paper_id": paper.path.stem,
                    "publishable": 0,
                    "conference": "error",
                    "rationale": str(e)
                })

        return results

def main(dataset_path: str = "/content/drive/MyDrive/KDSH_2025_Dataset",
         output_path: str = "results.json"):
    """Run the whole pipeline and save the results as JSON.

    Args:
        dataset_path: dataset root passed to ``PaperClassifier.load_dataset``.
        output_path: where the JSON results are written.

    Returns:
        The list of result dicts produced by ``classify_papers``.
    """
    # Make INFO-level messages visible alongside warnings and errors.
    logging.basicConfig(level=logging.INFO)

    classifier = PaperClassifier()
    reference_papers, papers = classifier.load_dataset(dataset_path)
    results = classifier.classify_papers(reference_papers, papers)

    with open(output_path, "w", encoding="utf-8") as f:
        json.dump(results, f, indent=2)
    return results

if __name__ == "__main__":
    main()

config.json:   0%|          | 0.00/1.40k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/990M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/2.54k [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.42M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/2.20k [00:00<?, ?B/s]

PROMPT 2 WITH FLAN-T5

In [None]:
import torch
from transformers import AutoTokenizer, AutoModel, AutoModelForSeq2SeqLM
from pathlib import Path
import fitz
import numpy as np
from dataclasses import dataclass
from typing import Dict, List, Tuple
import logging
import json

@dataclass
class Paper:
    """One PDF paper together with the labels known for it (if any)."""
    content: str  # text extracted from the PDF; "" when extraction failed
    path: Path  # location of the source PDF file
    is_reference: bool  # True for labelled reference papers, False for papers to classify
    label: str = None  # "Publishable" / "NonPublishable" on reference papers, else None
    conference: str = None  # conference folder name for publishable references, else None

class PaperClassifier:
    """Classify papers by SciBERT similarity and explain the result with FLAN-T5.

    Every paper is embedded with SciBERT (only the first 512 tokens are used)
    and compared by cosine similarity with labelled reference papers.
    Publishable papers are assigned the most similar conference and get a
    generated rationale whose prompt includes the conference's scope.
    """

    # Scope text inserted into the rationale prompt, keyed by conference name.
    CONFERENCE_DESCRIPTIONS = {
        "CVPR": "Computer Vision and Pattern Recognition - focuses on computer vision, deep learning for visual tasks, and image processing innovations",
        "NeurIPS": "Neural Information Processing Systems - emphasizes machine learning, neural networks, and artificial intelligence advances",
        "EMNLP": "Empirical Methods in Natural Language Processing - centers on NLP, computational linguistics, and language understanding",
        "TMLR": "Transactions on Machine Learning Research - covers broad machine learning research with emphasis on thorough empirical validation",
        "KDD": "Knowledge Discovery and Data Mining - focuses on data mining, large-scale data analytics, and practical applications"
    }

    def __init__(self):
        """Pick the compute device, record the target conferences, load the models."""
        self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
        self.conferences = ["TMLR", "CVPR", "EMNLP", "NeurIPS", "KDD"]
        self._setup_models()

    def _setup_models(self):
        """Load SciBERT (embeddings) and FLAN-T5 (rationales) onto ``self.device``."""
        # SciBERT for embeddings
        self.scibert = AutoModel.from_pretrained("allenai/scibert_scivocab_uncased").to(self.device)
        self.tokenizer = AutoTokenizer.from_pretrained("allenai/scibert_scivocab_uncased")

        # FLAN-T5 for rationale generation
        self.rationale_model = AutoModelForSeq2SeqLM.from_pretrained("google/flan-t5-base").to(self.device)
        self.rationale_tokenizer = AutoTokenizer.from_pretrained("google/flan-t5-base")

    def _generate_rationale(self, paper_content: str, conference: str) -> str:
        """Generate a short text on why the paper suits ``conference``.

        Only the first 500 characters of the paper are put into the prompt.
        A conference without an entry in CONFERENCE_DESCRIPTIONS uses its own
        name as the scope text instead of raising ``KeyError``. Generation
        samples (``do_sample=True``), so repeated calls can differ.
        """
        scope = self.CONFERENCE_DESCRIPTIONS.get(conference, conference)

        prompt = (
            f"Analyze this research paper and explain its suitability for {conference} conference.\n\n"
            f"Conference scope: {scope}\n\n"
            f"Paper abstract: {paper_content[:500]}...\n\n"
            "Provide a 2-3 sentence rationale that addresses:\n"
            "1. How the paper's methodology and findings align with the conference's focus areas\n"
            "2. The significance of the research contribution to the field\n"
            "3. Why this specific conference is the most appropriate venue for this work"
        )

        inputs = self.rationale_tokenizer(
            prompt,
            max_length=512,
            truncation=True,
            padding=True,
            return_tensors="pt"
        ).to(self.device)

        with torch.no_grad():
            outputs = self.rationale_model.generate(
                input_ids=inputs.input_ids,
                attention_mask=inputs.attention_mask,
                max_length=150,
                num_return_sequences=1,
                temperature=0.7,
                do_sample=True,
                no_repeat_ngram_size=2,
                top_k=50,
                top_p=0.95
            )

        return self.rationale_tokenizer.decode(outputs[0], skip_special_tokens=True)

    def _load_folder(self, folder: Path, is_reference: bool, label: str = None, conference: str = None) -> List[Paper]:
        """Build one Paper per ``*.pdf`` file directly inside ``folder``.

        Files are visited in sorted order so repeated runs give the same
        result order. A missing folder is logged and yields an empty list.
        """
        if not folder.is_dir():
            logging.warning(f"Dataset folder not found: {folder}")
            return []
        return [
            Paper(
                content=self._extract_pdf_content(pdf_path),
                path=pdf_path,
                is_reference=is_reference,
                label=label,
                conference=conference
            )
            for pdf_path in sorted(folder.glob("*.pdf"))
        ]

    def load_dataset(self, base_path: str) -> Tuple[List[Paper], List[Paper]]:
        """Read the reference papers and the papers to classify.

        Expected layout under ``base_path``:
        ``Reference/Publishable/<conference>/*.pdf``,
        ``Reference/Non-Publishable/*.pdf`` and ``Papers/*.pdf``.

        Returns:
            ``(reference_papers, papers_to_classify)``.
        """
        base_path = Path(base_path)
        ref_path = base_path / "Reference"

        reference_papers = []
        for conf in self.conferences:
            reference_papers.extend(self._load_folder(
                ref_path / "Publishable" / conf,
                is_reference=True,
                label="Publishable",
                conference=conf
            ))
        reference_papers.extend(self._load_folder(
            ref_path / "Non-Publishable",
            is_reference=True,
            label="NonPublishable"
        ))

        papers_to_classify = self._load_folder(base_path / "Papers", is_reference=False)
        return reference_papers, papers_to_classify

    def _extract_pdf_content(self, pdf_path: Path) -> str:
        """Return the stripped text of all pages, or "" if the PDF cannot be read."""
        try:
            # The context manager closes the document even if a page fails.
            with fitz.open(pdf_path) as doc:
                return "".join(page.get_text() for page in doc).strip()
        except Exception as e:
            logging.error(f"Error extracting content from {pdf_path}: {e}")
            return ""

    def _get_embedding(self, text: str) -> torch.Tensor:
        """Embed ``text`` as the mean of SciBERT's last hidden states.

        Input is truncated to 512 tokens. A single sequence is encoded per
        call, so ``padding=True`` adds no padding and a plain mean is used.
        """
        inputs = self.tokenizer(
            text,
            max_length=512,
            truncation=True,
            padding=True,
            return_tensors="pt"
        ).to(self.device)

        with torch.no_grad():
            outputs = self.scibert(**inputs)
        return outputs.last_hidden_state.mean(dim=1)

    def _cosine_similarity(self, a: torch.Tensor, b: torch.Tensor) -> float:
        """Cosine similarity of the first rows of ``a`` and ``b`` as a Python float."""
        return torch.cosine_similarity(a, b, dim=1)[0].item()

    def _embed_references(self, reference_papers: List[Paper]) -> Dict:
        """Embed reference papers and group them by label and conference.

        References whose text could not be extracted are skipped with a
        warning, because embedding an empty string would only add noise.
        """
        ref_embeddings = {
            "publishable": [],
            "conferences": {},
            "nonpublishable": []
        }
        for ref in reference_papers:
            if not ref.content:
                logging.warning(f"Empty content in reference paper: {ref.path}")
                continue

            emb = self._get_embedding(ref.content)
            if ref.label == "Publishable":
                ref_embeddings["publishable"].append(emb)
                ref_embeddings["conferences"].setdefault(ref.conference, []).append(emb)
            else:
                ref_embeddings["nonpublishable"].append(emb)
        return ref_embeddings

    def classify_papers(self, reference_papers: List[Paper], papers: List[Paper]) -> List[Dict]:
        """Classify every paper and attach a conference and rationale.

        A paper is publishable when its mean similarity to the publishable
        references exceeds its mean similarity to the non-publishable ones.

        Returns:
            One dict per paper with keys ``paper_id``, ``publishable``,
            ``conference`` and ``rationale``. A paper that fails (including
            one with no extracted text) is reported with
            ``conference == "error"`` instead of aborting the run.

        Raises:
            ValueError: if there are papers to classify but no usable
                publishable or non-publishable reference.
        """
        if not papers:
            return []

        ref_embeddings = self._embed_references(reference_papers)
        # Fail early with a clear message: the mean of an empty score list is
        # NaN, which would silently mark every paper as non-publishable.
        if not ref_embeddings["publishable"] or not ref_embeddings["nonpublishable"]:
            raise ValueError(
                "Need at least one readable publishable and one readable "
                "non-publishable reference paper"
            )

        results = []
        for paper in papers:
            try:
                if not paper.content:
                    raise ValueError(f"Empty content in paper: {paper.path}")

                emb = self._get_embedding(paper.content)

                pub_scores = [self._cosine_similarity(emb, ref) for ref in ref_embeddings["publishable"]]
                nonpub_scores = [self._cosine_similarity(emb, ref) for ref in ref_embeddings["nonpublishable"]]

                is_publishable = np.mean(pub_scores) > np.mean(nonpub_scores)

                if is_publishable:
                    conf_scores = {
                        conf: np.mean([self._cosine_similarity(emb, ref) for ref in conf_embs])
                        for conf, conf_embs in ref_embeddings["conferences"].items()
                    }
                    best_conf = max(conf_scores, key=conf_scores.get)
                    rationale = self._generate_rationale(paper.content, best_conf)

                    results.append({
                        "paper_id": paper.path.stem,
                        "publishable": 1,
                        "conference": best_conf,
                        "rationale": rationale
                    })
                else:
                    results.append({
                        "paper_id": paper.path.stem,
                        "publishable": 0,
                        "conference": "na",
                        "rationale": "na"
                    })

            except Exception as e:
                logging.error(f"Error processing {paper.path}: {e}")
                results.append({
                    "paper_id": paper.path.stem,
                    "publishable": 0,
                    "conference": "error",
                    "rationale": str(e)
                })

        return results

def main(dataset_path: str = "/content/drive/MyDrive/KDSH_2025_Dataset",
         output_path: str = "results.json"):
    """Run the whole pipeline and save the results as JSON.

    Args:
        dataset_path: dataset root passed to ``PaperClassifier.load_dataset``.
        output_path: where the JSON results are written.

    Returns:
        The list of result dicts produced by ``classify_papers``.
    """
    # Make INFO-level messages visible alongside warnings and errors.
    logging.basicConfig(level=logging.INFO)

    classifier = PaperClassifier()
    reference_papers, papers = classifier.load_dataset(dataset_path)
    results = classifier.classify_papers(reference_papers, papers)

    with open(output_path, "w", encoding="utf-8") as f:
        json.dump(results, f, indent=2)
    return results

if __name__ == "__main__":
    main()

PROMPT 3 WITH FLAN-T5


In [None]:
import torch
from transformers import AutoTokenizer, AutoModel, AutoModelForSeq2SeqLM
from pathlib import Path
import fitz
import numpy as np
from dataclasses import dataclass
from typing import Dict, List, Tuple
import logging
import json

@dataclass
class Paper:
    """One PDF paper together with the labels known for it (if any)."""
    content: str  # text of the paper
    path: Path  # location of the source PDF file
    is_reference: bool  # presumably True for labelled reference papers - confirm against the loader
    label: str = None  # optional label; defaults to None
    conference: str = None  # optional conference name; defaults to None

class PaperClassifier:
    """Decide whether papers are publishable and recommend a conference.

    Each paper is embedded with SciBERT and compared (cosine similarity) with
    labelled reference papers. For papers judged publishable, FLAN-T5 writes a
    short rationale for the best-matching conference.
    """

    def __init__(self):
        """Pick the compute device, fix the conference list and load the models."""
        self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
        self.conferences = ["TMLR", "CVPR", "EMNLP", "NeurIPS", "KDD"]
        self._setup_models()

    def _setup_models(self):
        """Load the SciBERT encoder and the FLAN-T5 generator onto ``self.device``."""
        # SciBERT for embeddings
        self.scibert = AutoModel.from_pretrained("allenai/scibert_scivocab_uncased").to(self.device)
        self.tokenizer = AutoTokenizer.from_pretrained("allenai/scibert_scivocab_uncased")

        # FLAN-T5 for rationale generation
        self.rationale_model = AutoModelForSeq2SeqLM.from_pretrained("google/flan-t5-base").to(self.device)
        self.rationale_tokenizer = AutoTokenizer.from_pretrained("google/flan-t5-base")

    def _generate_rationale(self, paper_content: str, conference: str) -> str:
        """Ask FLAN-T5 why the paper suits ``conference``.

        Args:
            paper_content: Full paper text; only the first 500 characters are
                placed in the prompt.
            conference: One of the keys of ``conference_descriptions`` below.

        Returns:
            The decoded model output. Sampling is enabled, so the text differs
            between runs unless the caller seeds the torch RNG.

        Raises:
            KeyError: If ``conference`` has no description.
        """
        # Create a more targeted prompt that addresses conference fit
        conference_descriptions = {
            "CVPR": "Computer Vision and Pattern Recognition - focuses on computer vision, deep learning for visual tasks, and image processing innovations",
            "NeurIPS": "Neural Information Processing Systems - emphasizes machine learning, neural networks, and artificial intelligence advances",
            "EMNLP": "Empirical Methods in Natural Language Processing - centers on NLP, computational linguistics, and language understanding",
            "TMLR": "Transactions on Machine Learning Research - covers broad machine learning research with emphasis on thorough empirical validation",
            "KDD": "Knowledge Discovery and Data Mining - focuses on data mining, large-scale data analytics, and practical applications"
        }

        prompt = (
            f"Analyze this research paper and explain its suitability for {conference} conference.\n\n"
            f"Conference scope: {conference_descriptions[conference]}\n\n"
            f"Paper abstract: {paper_content[:500]}...\n\n"
            "Provide a 2-3 sentence rationale that addresses:\n"
            "1. How the paper's methodology and findings align with the conference's focus areas\n"
            "2. The significance of the research contribution to the field\n"
            "3. Why this specific conference is the most appropriate venue for this work"
        )

        inputs = self.rationale_tokenizer(
            prompt,
            max_length=512,
            truncation=True,
            padding=True,
            return_tensors="pt"
        ).to(self.device)

        with torch.no_grad():
            outputs = self.rationale_model.generate(
                input_ids=inputs["input_ids"],
                attention_mask=inputs["attention_mask"],
                max_length=150,
                num_return_sequences=1,
                temperature=0.7,
                do_sample=True,
                no_repeat_ngram_size=2,
                top_k=50,
                top_p=0.95
            )

        return self.rationale_tokenizer.decode(outputs[0], skip_special_tokens=True)

    def load_dataset(self, base_path: str) -> Tuple[List["Paper"], List["Paper"]]:
        """Read every PDF of the dataset into ``Paper`` objects.

        Expected layout under ``base_path``::

            Reference/Publishable/<conference>/*.pdf
            Reference/Non-Publishable/*.pdf
            Papers/*.pdf

        Args:
            base_path: Root directory of the dataset.

        Returns:
            ``(reference_papers, papers_to_classify)``.
        """
        root = Path(base_path)
        reference_dir = root / "Reference"

        reference_papers = []
        for conf in self.conferences:
            reference_papers.extend(self._load_papers(
                reference_dir / "Publishable" / conf,
                is_reference=True,
                label="Publishable",
                conference=conf
            ))
        reference_papers.extend(self._load_papers(
            reference_dir / "Non-Publishable",
            is_reference=True,
            label="NonPublishable"
        ))

        papers_to_classify = self._load_papers(root / "Papers", is_reference=False)
        return reference_papers, papers_to_classify

    def _load_papers(self, directory: Path, is_reference: bool,
                     label: str = None, conference: str = None) -> List["Paper"]:
        """Wrap every ``*.pdf`` directly inside ``directory`` in a ``Paper``."""
        if not directory.is_dir():
            # A missing folder yields no papers; say so instead of staying silent.
            logging.warning(f"Dataset directory not found: {directory}")
        # Sorted so that repeated runs list the papers in the same order.
        return [
            Paper(
                content=self._extract_pdf_content(pdf_path),
                path=pdf_path,
                is_reference=is_reference,
                label=label,
                conference=conference
            )
            for pdf_path in sorted(directory.glob("*.pdf"))
        ]

    def _extract_pdf_content(self, pdf_path: Path) -> str:
        """Return the stripped text of all pages, or ``""`` if the PDF cannot be read."""
        try:
            # The context manager closes the document even if a page fails.
            with fitz.open(pdf_path) as doc:
                return "".join(page.get_text() for page in doc).strip()
        except Exception as e:
            logging.error(f"Error extracting content from {pdf_path}: {e}")
            return ""

    def _get_embedding(self, text: str) -> torch.Tensor:
        """Embed ``text`` with SciBERT.

        The input is truncated to 512 tokens, so only the beginning of a long
        paper contributes. The result is the last hidden state averaged over
        the token dimension.
        """
        inputs = self.tokenizer(
            text,
            max_length=512,
            truncation=True,
            padding=True,
            return_tensors="pt"
        ).to(self.device)

        with torch.no_grad():
            outputs = self.scibert(**inputs)
        return outputs.last_hidden_state.mean(dim=1)

    def _cosine_similarity(self, a: torch.Tensor, b: torch.Tensor) -> float:
        """Cosine similarity of the first rows of ``a`` and ``b`` as a Python float."""
        return torch.cosine_similarity(a, b, dim=1)[0].item()

    def _mean_similarity(self, emb: torch.Tensor, references: List[torch.Tensor]) -> float:
        """Average cosine similarity between ``emb`` and each reference embedding."""
        return float(np.mean([self._cosine_similarity(emb, ref) for ref in references]))

    def classify_papers(self, reference_papers: List["Paper"], papers: List["Paper"]) -> List[Dict]:
        """Label each paper and, when publishable, choose a conference.

        A paper is publishable when its mean similarity to the publishable
        references exceeds its mean similarity to the non-publishable ones.
        The conference with the highest mean similarity is then selected.

        Args:
            reference_papers: Labelled papers (``label`` and ``conference`` set).
            papers: Papers to classify.

        Returns:
            One dict per paper with keys ``paper_id``, ``publishable``,
            ``conference`` and ``rationale``. A paper whose processing fails
            gets ``conference="error"`` and the error text as rationale.

        Raises:
            ValueError: If either reference class is empty. The comparison
                would otherwise run on NaN means and silently mark every
                paper as non-publishable.
        """
        if not papers:
            return []

        ref_embeddings = {
            "publishable": [],
            "conferences": {},
            "nonpublishable": []
        }

        # Prepare reference embeddings
        for ref in reference_papers:
            emb = self._get_embedding(ref.content)
            if ref.label == "Publishable":
                ref_embeddings["publishable"].append(emb)
                ref_embeddings["conferences"].setdefault(ref.conference, []).append(emb)
            else:
                ref_embeddings["nonpublishable"].append(emb)

        if not ref_embeddings["publishable"] or not ref_embeddings["nonpublishable"]:
            raise ValueError(
                "Both publishable and non-publishable reference papers are required for classification"
            )

        results = []
        for paper in papers:
            try:
                emb = self._get_embedding(paper.content)

                pub_score = self._mean_similarity(emb, ref_embeddings["publishable"])
                nonpub_score = self._mean_similarity(emb, ref_embeddings["nonpublishable"])

                if pub_score > nonpub_score:
                    conf_scores = {
                        conf: self._mean_similarity(emb, conf_embs)
                        for conf, conf_embs in ref_embeddings["conferences"].items()
                    }
                    best_conf = max(conf_scores.items(), key=lambda x: x[1])[0]
                    rationale = self._generate_rationale(paper.content, best_conf)

                    results.append({
                        "paper_id": paper.path.stem,
                        "publishable": 1,
                        "conference": best_conf,
                        "rationale": rationale
                    })
                else:
                    results.append({
                        "paper_id": paper.path.stem,
                        "publishable": 0,
                        "conference": "na",
                        "rationale": "na"
                    })

            except Exception as e:
                # Keep going: one bad paper must not abort the whole batch.
                logging.error(f"Error processing {paper.path}: {e}")
                results.append({
                    "paper_id": paper.path.stem,
                    "publishable": 0,
                    "conference": "error",
                    "rationale": str(e)
                })

        return results

def main(dataset_dir: str = "/content/drive/MyDrive/KDSH_2025_Dataset",
         output_path: str = "results.json"):
    """Run the whole pipeline: load the dataset, classify the papers, save JSON.

    Args:
        dataset_dir: Root folder of the dataset. Defaults to the mounted
            Google Drive location used by this notebook.
        output_path: File the list of per-paper result dicts is written to.
    """
    classifier = PaperClassifier()
    reference_papers, papers = classifier.load_dataset(dataset_dir)
    results = classifier.classify_papers(reference_papers, papers)

    # Explicit encoding keeps the output independent of the platform default.
    with open(output_path, "w", encoding="utf-8") as f:
        json.dump(results, f, indent=2)

if __name__ == "__main__":
    main()

PROMPT 4


In [None]:
import torch
from transformers import AutoTokenizer, AutoModel, AutoModelForSeq2SeqLM
from pathlib import Path
import fitz
import numpy as np
from dataclasses import dataclass
from typing import Dict, List, Tuple
import logging
import json

@dataclass
class Paper:
    """One PDF of the dataset: its extracted text plus whatever labels are known."""
    content: str  # text extracted from the PDF ("" when extraction failed)
    path: Path  # location of the PDF; its stem is used as the paper id in the results
    is_reference: bool  # True for labelled reference papers, False for papers to classify
    label: str = None  # "Publishable" or "NonPublishable" for references, otherwise None
    conference: str = None  # conference folder name for publishable references, otherwise None

class PaperClassifier:
    """Decide whether papers are publishable and recommend a conference.

    Each paper is embedded with SciBERT and compared (cosine similarity) with
    labelled reference papers. For papers judged publishable, FLAN-T5 writes a
    short rationale for the best-matching conference.
    """

    def __init__(self):
        """Pick the compute device, fix the conference list and load the models."""
        self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
        self.conferences = ["TMLR", "CVPR", "EMNLP", "NeurIPS", "KDD"]
        self._setup_models()

    def _setup_models(self):
        """Load the SciBERT encoder and the FLAN-T5 generator onto ``self.device``."""
        # SciBERT for embeddings
        self.scibert = AutoModel.from_pretrained("allenai/scibert_scivocab_uncased").to(self.device)
        self.tokenizer = AutoTokenizer.from_pretrained("allenai/scibert_scivocab_uncased")

        # FLAN-T5 for rationale generation
        self.rationale_model = AutoModelForSeq2SeqLM.from_pretrained("google/flan-t5-base").to(self.device)
        self.rationale_tokenizer = AutoTokenizer.from_pretrained("google/flan-t5-base")

    def _generate_rationale(self, paper_content: str, conference: str) -> str:
        """Ask FLAN-T5 why the paper is relevant to ``conference``.

        Args:
            paper_content: Full paper text; only the first 500 characters are
                placed in the prompt.
            conference: Name of the conference to justify.

        Returns:
            The decoded model output. Sampling is enabled, so the text differs
            between runs unless the caller seeds the torch RNG.
        """
        # Create a structured prompt for the model.
        # Every piece that interpolates a value must be an f-string; without the
        # prefix the model would receive the literal text "{conference}".
        prompt = (
            f"Given this research paper abstract: {paper_content[:500]}...\n\n"
            f"Provide a rationale explaining why this paper is relevant to the {conference} conference. "
            "Be specific about the paper's contributions and how they align with the conference's themes."
        )

        inputs = self.rationale_tokenizer(
            prompt,
            max_length=512,
            truncation=True,
            padding=True,
            return_tensors="pt"
        ).to(self.device)

        with torch.no_grad():
            outputs = self.rationale_model.generate(
                input_ids=inputs["input_ids"],
                attention_mask=inputs["attention_mask"],
                max_length=150,
                num_return_sequences=1,
                temperature=0.7,
                do_sample=True,
                no_repeat_ngram_size=2,
                top_k=50,
                top_p=0.95
            )

        return self.rationale_tokenizer.decode(outputs[0], skip_special_tokens=True)

    def load_dataset(self, base_path: str) -> Tuple[List["Paper"], List["Paper"]]:
        """Read every PDF of the dataset into ``Paper`` objects.

        Expected layout under ``base_path``::

            Reference/Publishable/<conference>/*.pdf
            Reference/Non-Publishable/*.pdf
            Papers/*.pdf

        Args:
            base_path: Root directory of the dataset.

        Returns:
            ``(reference_papers, papers_to_classify)``.
        """
        root = Path(base_path)
        reference_dir = root / "Reference"

        reference_papers = []
        for conf in self.conferences:
            reference_papers.extend(self._load_papers(
                reference_dir / "Publishable" / conf,
                is_reference=True,
                label="Publishable",
                conference=conf
            ))
        reference_papers.extend(self._load_papers(
            reference_dir / "Non-Publishable",
            is_reference=True,
            label="NonPublishable"
        ))

        papers_to_classify = self._load_papers(root / "Papers", is_reference=False)
        return reference_papers, papers_to_classify

    def _load_papers(self, directory: Path, is_reference: bool,
                     label: str = None, conference: str = None) -> List["Paper"]:
        """Wrap every ``*.pdf`` directly inside ``directory`` in a ``Paper``."""
        if not directory.is_dir():
            # A missing folder yields no papers; say so instead of staying silent.
            logging.warning(f"Dataset directory not found: {directory}")
        # Sorted so that repeated runs list the papers in the same order.
        return [
            Paper(
                content=self._extract_pdf_content(pdf_path),
                path=pdf_path,
                is_reference=is_reference,
                label=label,
                conference=conference
            )
            for pdf_path in sorted(directory.glob("*.pdf"))
        ]

    def _extract_pdf_content(self, pdf_path: Path) -> str:
        """Return the stripped text of all pages, or ``""`` if the PDF cannot be read."""
        try:
            # The context manager closes the document even if a page fails.
            with fitz.open(pdf_path) as doc:
                return "".join(page.get_text() for page in doc).strip()
        except Exception as e:
            logging.error(f"Error extracting content from {pdf_path}: {e}")
            return ""

    def _get_embedding(self, text: str) -> torch.Tensor:
        """Embed ``text`` with SciBERT.

        The input is truncated to 512 tokens, so only the beginning of a long
        paper contributes. The result is the last hidden state averaged over
        the token dimension.
        """
        inputs = self.tokenizer(
            text,
            max_length=512,
            truncation=True,
            padding=True,
            return_tensors="pt"
        ).to(self.device)

        with torch.no_grad():
            outputs = self.scibert(**inputs)
        return outputs.last_hidden_state.mean(dim=1)

    def _cosine_similarity(self, a: torch.Tensor, b: torch.Tensor) -> float:
        """Cosine similarity of the first rows of ``a`` and ``b`` as a Python float."""
        return torch.cosine_similarity(a, b, dim=1)[0].item()

    def _mean_similarity(self, emb: torch.Tensor, references: List[torch.Tensor]) -> float:
        """Average cosine similarity between ``emb`` and each reference embedding."""
        return float(np.mean([self._cosine_similarity(emb, ref) for ref in references]))

    def classify_papers(self, reference_papers: List["Paper"], papers: List["Paper"]) -> List[Dict]:
        """Label each paper and, when publishable, choose a conference.

        A paper is publishable when its mean similarity to the publishable
        references exceeds its mean similarity to the non-publishable ones.
        The conference with the highest mean similarity is then selected.

        Args:
            reference_papers: Labelled papers (``label`` and ``conference`` set).
            papers: Papers to classify.

        Returns:
            One dict per paper with keys ``paper_id``, ``publishable``,
            ``conference`` and ``rationale``. A paper whose processing fails
            gets ``conference="error"`` and the error text as rationale.

        Raises:
            ValueError: If either reference class is empty. The comparison
                would otherwise run on NaN means and silently mark every
                paper as non-publishable.
        """
        if not papers:
            return []

        ref_embeddings = {
            "publishable": [],
            "conferences": {},
            "nonpublishable": []
        }

        # Prepare reference embeddings
        for ref in reference_papers:
            emb = self._get_embedding(ref.content)
            if ref.label == "Publishable":
                ref_embeddings["publishable"].append(emb)
                ref_embeddings["conferences"].setdefault(ref.conference, []).append(emb)
            else:
                ref_embeddings["nonpublishable"].append(emb)

        if not ref_embeddings["publishable"] or not ref_embeddings["nonpublishable"]:
            raise ValueError(
                "Both publishable and non-publishable reference papers are required for classification"
            )

        results = []
        for paper in papers:
            try:
                emb = self._get_embedding(paper.content)

                pub_score = self._mean_similarity(emb, ref_embeddings["publishable"])
                nonpub_score = self._mean_similarity(emb, ref_embeddings["nonpublishable"])

                if pub_score > nonpub_score:
                    conf_scores = {
                        conf: self._mean_similarity(emb, conf_embs)
                        for conf, conf_embs in ref_embeddings["conferences"].items()
                    }
                    best_conf = max(conf_scores.items(), key=lambda x: x[1])[0]
                    rationale = self._generate_rationale(paper.content, best_conf)

                    results.append({
                        "paper_id": paper.path.stem,
                        "publishable": 1,
                        "conference": best_conf,
                        "rationale": rationale
                    })
                else:
                    results.append({
                        "paper_id": paper.path.stem,
                        "publishable": 0,
                        "conference": "na",
                        "rationale": "na"
                    })

            except Exception as e:
                # Keep going: one bad paper must not abort the whole batch.
                logging.error(f"Error processing {paper.path}: {e}")
                results.append({
                    "paper_id": paper.path.stem,
                    "publishable": 0,
                    "conference": "error",
                    "rationale": str(e)
                })

        return results

def main(dataset_dir: str = "/content/drive/MyDrive/KDSH_2025_Dataset",
         output_path: str = "results.json"):
    """Run the whole pipeline: load the dataset, classify the papers, save JSON.

    Args:
        dataset_dir: Root folder of the dataset. Defaults to the mounted
            Google Drive location used by this notebook.
        output_path: File the list of per-paper result dicts is written to.
    """
    classifier = PaperClassifier()
    reference_papers, papers = classifier.load_dataset(dataset_dir)
    results = classifier.classify_papers(reference_papers, papers)

    # Explicit encoding keeps the output independent of the platform default.
    with open(output_path, "w", encoding="utf-8") as f:
        json.dump(results, f, indent=2)

if __name__ == "__main__":
    main()

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/385 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/442M [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/228k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.40k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/990M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/2.54k [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.42M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/2.20k [00:00<?, ?B/s]

BART
summary cnn model


In [None]:
import torch
from transformers import AutoTokenizer, AutoModel, AutoModelForSeq2SeqLM
from pathlib import Path
import fitz
import numpy as np
from dataclasses import dataclass
from typing import Dict, List, Tuple
import logging
import json

@dataclass
class Paper:
    """One PDF of the dataset: its extracted text plus whatever labels are known."""
    content: str  # text extracted from the PDF ("" when extraction failed)
    path: Path  # location of the PDF; its stem is used as the paper id in the results
    is_reference: bool  # True for labelled reference papers, False for papers to classify
    label: str = None  # "Publishable" or "NonPublishable" for references, otherwise None
    conference: str = None  # conference folder name for publishable references, otherwise None

class PaperClassifier:
    """Decide whether papers are publishable and recommend a conference.

    Each paper is embedded with SciBERT and compared (cosine similarity) with
    labelled reference papers. For papers judged publishable, the
    ``facebook/bart-large-cnn`` model produces the rationale text.
    """

    def __init__(self):
        """Pick the compute device, fix the conference list and load the models."""
        self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
        self.conferences = ["TMLR", "CVPR", "EMNLP", "NeurIPS", "KDD"]
        self._setup_models()

    def _setup_models(self):
        """Load the SciBERT encoder and the BART generator onto ``self.device``."""
        # SciBERT for embeddings
        self.scibert = AutoModel.from_pretrained("allenai/scibert_scivocab_uncased").to(self.device)
        self.tokenizer = AutoTokenizer.from_pretrained("allenai/scibert_scivocab_uncased")

        # BART for rationale generation
        self.rationale_model = AutoModelForSeq2SeqLM.from_pretrained("facebook/bart-large-cnn").to(self.device)
        self.rationale_tokenizer = AutoTokenizer.from_pretrained("facebook/bart-large-cnn")

    def _generate_rationale(self, paper_content: str, conference: str) -> str:
        """Generate text explaining why the paper suits ``conference``.

        Args:
            paper_content: Full paper text; only the first 500 characters are
                placed in the prompt.
            conference: Name of the conference to justify.

        Returns:
            The decoded model output. Sampling is enabled, so the text differs
            between runs unless the caller seeds the torch RNG.
        """
        # Create a structured prompt for the model
        prompt = (
            f"Given this research paper abstract: {paper_content[:500]}...\n\n"
            f"Explain in 2-3 sentences why this paper is suitable for the {conference} conference. "
            f"Start the rationale with 'It is relevant to {conference} because' and describe the main contribution of the paper and its connection to the focus of {conference}. "
            f"Keep it concise and clear."
        )

        inputs = self.rationale_tokenizer(
            prompt,
            max_length=1024,
            truncation=True,
            padding=True,
            return_tensors="pt"
        ).to(self.device)

        with torch.no_grad():
            outputs = self.rationale_model.generate(
                input_ids=inputs["input_ids"],
                attention_mask=inputs["attention_mask"],
                max_length=150,
                num_return_sequences=1,
                temperature=0.7,
                do_sample=True,
                no_repeat_ngram_size=2,
                top_k=50,
                top_p=0.95
            )

        return self.rationale_tokenizer.decode(outputs[0], skip_special_tokens=True)

    def load_dataset(self, base_path: str) -> Tuple[List["Paper"], List["Paper"]]:
        """Read every PDF of the dataset into ``Paper`` objects.

        Expected layout under ``base_path``::

            Reference/Publishable/<conference>/*.pdf
            Reference/Non-Publishable/*.pdf
            Papers/*.pdf

        Args:
            base_path: Root directory of the dataset.

        Returns:
            ``(reference_papers, papers_to_classify)``.
        """
        root = Path(base_path)
        reference_dir = root / "Reference"

        reference_papers = []
        for conf in self.conferences:
            reference_papers.extend(self._load_papers(
                reference_dir / "Publishable" / conf,
                is_reference=True,
                label="Publishable",
                conference=conf
            ))
        reference_papers.extend(self._load_papers(
            reference_dir / "Non-Publishable",
            is_reference=True,
            label="NonPublishable"
        ))

        papers_to_classify = self._load_papers(root / "Papers", is_reference=False)
        return reference_papers, papers_to_classify

    def _load_papers(self, directory: Path, is_reference: bool,
                     label: str = None, conference: str = None) -> List["Paper"]:
        """Wrap every ``*.pdf`` directly inside ``directory`` in a ``Paper``."""
        if not directory.is_dir():
            # A missing folder yields no papers; say so instead of staying silent.
            logging.warning(f"Dataset directory not found: {directory}")
        # Sorted so that repeated runs list the papers in the same order.
        return [
            Paper(
                content=self._extract_pdf_content(pdf_path),
                path=pdf_path,
                is_reference=is_reference,
                label=label,
                conference=conference
            )
            for pdf_path in sorted(directory.glob("*.pdf"))
        ]

    def _extract_pdf_content(self, pdf_path: Path) -> str:
        """Return the stripped text of all pages, or ``""`` if the PDF cannot be read."""
        try:
            # The context manager closes the document even if a page fails.
            with fitz.open(pdf_path) as doc:
                return "".join(page.get_text() for page in doc).strip()
        except Exception as e:
            logging.error(f"Error extracting content from {pdf_path}: {e}")
            return ""

    def _get_embedding(self, text: str) -> torch.Tensor:
        """Embed ``text`` with SciBERT.

        The input is truncated to 512 tokens, so only the beginning of a long
        paper contributes. The result is the last hidden state averaged over
        the token dimension.
        """
        inputs = self.tokenizer(
            text,
            max_length=512,
            truncation=True,
            padding=True,
            return_tensors="pt"
        ).to(self.device)

        with torch.no_grad():
            outputs = self.scibert(**inputs)
        return outputs.last_hidden_state.mean(dim=1)

    def _cosine_similarity(self, a: torch.Tensor, b: torch.Tensor) -> float:
        """Cosine similarity of the first rows of ``a`` and ``b`` as a Python float."""
        return torch.cosine_similarity(a, b, dim=1)[0].item()

    def _mean_similarity(self, emb: torch.Tensor, references: List[torch.Tensor]) -> float:
        """Average cosine similarity between ``emb`` and each reference embedding."""
        return float(np.mean([self._cosine_similarity(emb, ref) for ref in references]))

    def classify_papers(self, reference_papers: List["Paper"], papers: List["Paper"]) -> List[Dict]:
        """Label each paper and, when publishable, choose a conference.

        A paper is publishable when its mean similarity to the publishable
        references exceeds its mean similarity to the non-publishable ones.
        The conference with the highest mean similarity is then selected.

        Args:
            reference_papers: Labelled papers (``label`` and ``conference`` set).
            papers: Papers to classify.

        Returns:
            One dict per paper with keys ``paper_id``, ``publishable``,
            ``conference`` and ``rationale``. A paper whose processing fails
            gets ``conference="error"`` and the error text as rationale.

        Raises:
            ValueError: If either reference class is empty. The comparison
                would otherwise run on NaN means and silently mark every
                paper as non-publishable.
        """
        if not papers:
            return []

        ref_embeddings = {
            "publishable": [],
            "conferences": {},
            "nonpublishable": []
        }

        # Prepare reference embeddings
        for ref in reference_papers:
            emb = self._get_embedding(ref.content)
            if ref.label == "Publishable":
                ref_embeddings["publishable"].append(emb)
                ref_embeddings["conferences"].setdefault(ref.conference, []).append(emb)
            else:
                ref_embeddings["nonpublishable"].append(emb)

        if not ref_embeddings["publishable"] or not ref_embeddings["nonpublishable"]:
            raise ValueError(
                "Both publishable and non-publishable reference papers are required for classification"
            )

        results = []
        for paper in papers:
            try:
                emb = self._get_embedding(paper.content)

                pub_score = self._mean_similarity(emb, ref_embeddings["publishable"])
                nonpub_score = self._mean_similarity(emb, ref_embeddings["nonpublishable"])

                if pub_score > nonpub_score:
                    conf_scores = {
                        conf: self._mean_similarity(emb, conf_embs)
                        for conf, conf_embs in ref_embeddings["conferences"].items()
                    }
                    best_conf = max(conf_scores.items(), key=lambda x: x[1])[0]
                    rationale = self._generate_rationale(paper.content, best_conf)

                    results.append({
                        "paper_id": paper.path.stem,
                        "publishable": 1,
                        "conference": best_conf,
                        "rationale": rationale
                    })
                else:
                    results.append({
                        "paper_id": paper.path.stem,
                        "publishable": 0,
                        "conference": "na",
                        "rationale": "na"
                    })

            except Exception as e:
                # Keep going: one bad paper must not abort the whole batch.
                logging.error(f"Error processing {paper.path}: {e}")
                results.append({
                    "paper_id": paper.path.stem,
                    "publishable": 0,
                    "conference": "error",
                    "rationale": str(e)
                })

        return results

def main(dataset_dir: str = "/content/drive/MyDrive/KDSH_2025_Dataset",
         output_path: str = "results.json"):
    """Run the whole pipeline: load the dataset, classify the papers, save JSON.

    Args:
        dataset_dir: Root folder of the dataset. Defaults to the mounted
            Google Drive location used by this notebook.
        output_path: File the list of per-paper result dicts is written to.
    """
    classifier = PaperClassifier()
    reference_papers, papers = classifier.load_dataset(dataset_dir)
    results = classifier.classify_papers(reference_papers, papers)

    # Explicit encoding keeps the output independent of the platform default.
    with open(output_path, "w", encoding="utf-8") as f:
        json.dump(results, f, indent=2)

if __name__ == "__main__":
    main()

config.json:   0%|          | 0.00/1.58k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.63G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/363 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

RoBERTa Q/A model


In [None]:
import torch
from transformers import AutoTokenizer, AutoModel, AutoModelForQuestionAnswering
from pathlib import Path
import fitz
import numpy as np
from dataclasses import dataclass
from typing import Dict, List, Tuple
import logging
import json

@dataclass
class Paper:
    """One PDF of the dataset: its extracted text plus whatever labels are known."""
    content: str  # text extracted from the PDF ("" when extraction failed)
    path: Path  # location of the PDF; its stem is used as the paper id in the results
    is_reference: bool  # True for labelled reference papers, False for papers to classify
    label: str = None  # "Publishable" or "NonPublishable" for references, otherwise None
    conference: str = None  # conference folder name for publishable references, otherwise None

class PaperClassifier:
    """Decide whether papers are publishable and recommend a conference.

    Each paper is embedded with SciBERT and compared (cosine similarity) with
    labelled reference papers. For papers judged publishable, a RoBERTa
    question-answering model extracts spans that are joined into a rationale.
    """

    def __init__(self):
        """Pick the compute device, fix the conference list and load the models."""
        self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
        self.conferences = ["TMLR", "CVPR", "EMNLP", "NeurIPS", "KDD"]
        self._setup_models()

    def _setup_models(self):
        """Load the SciBERT encoder and the RoBERTa QA model onto ``self.device``."""
        # SciBERT for embeddings
        self.scibert = AutoModel.from_pretrained("allenai/scibert_scivocab_uncased").to(self.device)
        self.tokenizer = AutoTokenizer.from_pretrained("allenai/scibert_scivocab_uncased")

        # RoBERTa for rationale generation
        self.qa_model = AutoModelForQuestionAnswering.from_pretrained("deepset/roberta-base-squad2").to(self.device)
        self.qa_tokenizer = AutoTokenizer.from_pretrained("deepset/roberta-base-squad2")

    def _generate_rationale(self, paper_content: str, conference: str) -> str:
        """Build a rationale from spans the QA model extracts from the paper.

        Three questions about the paper's fit are asked against a context made
        of the paper's opening sentences and a conference description. Up to
        two distinct answers are joined into the rationale.

        Args:
            paper_content: Full paper text; only the first 1000 characters are used.
            conference: One of the keys of ``conference_contexts`` below.

        Returns:
            ``"This paper is relevant to <conference> because ..."`` when at
            least one usable answer was found, otherwise a fallback sentence
            built from the title.

        Raises:
            KeyError: If ``conference`` has no context description.
        """
        # Create conference-specific context
        conference_contexts = {
            "TMLR": "TMLR is a machine learning research conference focusing on theoretical advances, algorithms, and methodological innovations in machine learning.",
            "CVPR": "CVPR is a premier computer vision conference focusing on visual processing, recognition, understanding, and generation.",
            "EMNLP": "EMNLP is a natural language processing conference focusing on computational linguistics, language understanding, and text processing.",
            "NeurIPS": "NeurIPS is a conference focusing on neural information processing systems, machine learning theory, and artificial intelligence.",
            "KDD": "KDD is a conference focusing on data mining, knowledge discovery, and large-scale data analytics."
        }

        # Extract title and first part of abstract: text up to the first period
        # is treated as the title, the next two sentences as the abstract.
        # str.split never raises, so no exception handling is needed here.
        content_preview = paper_content[:1000].replace('\n', ' ').strip()
        sentences = content_preview.split('.')
        title = sentences[0]
        abstract = ' '.join(sentences[1:3])

        # Construct the context with paper info and conference details
        context = (
            f"Paper Title: {title}\n"
            f"Abstract: {abstract}\n\n"
            f"Conference Information: {conference_contexts[conference]}\n\n"
            "A paper is relevant to a conference if its technical contributions and research focus align with the conference's main themes. "
            "The explanation should describe specific technical aspects of the paper that match the conference's focus areas."
        )

        # Define questions to get different aspects of the rationale
        questions = [
            f"What specific technical contributions make this paper relevant to {conference}?",
            f"How does this paper's methodology align with {conference}'s main themes?",
            f"What is the main innovation of this paper that fits {conference}'s focus?"
        ]

        answers = []
        for question in questions:
            # No padding: a single sequence needs none, and padded positions
            # could otherwise be picked as answer boundaries. Only the context
            # is truncated so the question always stays intact.
            inputs = self.qa_tokenizer(
                question,
                context,
                max_length=386,
                truncation="only_second",
                return_tensors="pt"
            ).to(self.device)

            with torch.no_grad():
                outputs = self.qa_model(**inputs)

            # Most likely start and end token of the answer span
            start_idx = int(torch.argmax(outputs.start_logits))
            end_idx = int(torch.argmax(outputs.end_logits))

            input_ids = inputs["input_ids"][0]
            # The first separator closes the question segment.
            question_end = input_ids.tolist().index(self.qa_tokenizer.sep_token_id)
            if start_idx <= question_end or end_idx < start_idx:
                # Span starts on the leading token or in the question, or is
                # inverted: treat it as "no answer".
                continue

            answer = self.qa_tokenizer.decode(
                input_ids[start_idx:end_idx + 1],
                skip_special_tokens=True
            ).strip()

            if len(answer) > 10:  # Only keep meaningful answers
                answers.append(answer)

        # Combine answers into a coherent rationale
        if answers:
            # Remove potential duplicates and very similar answers
            unique_answers = []
            for ans in answers:
                if not any(self._similar_strings(ans, existing) for existing in unique_answers):
                    unique_answers.append(ans)

            # Construct the final rationale
            return f"This paper is relevant to {conference} because " + ". ".join(unique_answers[:2])

        return f"This paper appears relevant to {conference} based on its focus on {title}"

    def _similar_strings(self, str1: str, str2: str, threshold: float = 0.7) -> bool:
        """Tell whether two strings mostly agree character by character.

        Characters are compared position-wise (case-insensitively); the share
        of matching positions relative to the longer string must exceed
        ``threshold``. Strings shorter than 10 characters are never similar.
        """
        if len(str1) < 10 or len(str2) < 10:
            return False
        common = sum(1 for a, b in zip(str1.lower(), str2.lower()) if a == b)
        return common / max(len(str1), len(str2)) > threshold

    def load_dataset(self, base_path: str) -> Tuple[List["Paper"], List["Paper"]]:
        """Read every PDF of the dataset into ``Paper`` objects.

        Expected layout under ``base_path``::

            Reference/Publishable/<conference>/*.pdf
            Reference/Non-Publishable/*.pdf
            Papers/*.pdf

        Args:
            base_path: Root directory of the dataset.

        Returns:
            ``(reference_papers, papers_to_classify)``.
        """
        root = Path(base_path)
        reference_dir = root / "Reference"

        reference_papers = []
        for conf in self.conferences:
            reference_papers.extend(self._load_papers(
                reference_dir / "Publishable" / conf,
                is_reference=True,
                label="Publishable",
                conference=conf
            ))
        reference_papers.extend(self._load_papers(
            reference_dir / "Non-Publishable",
            is_reference=True,
            label="NonPublishable"
        ))

        papers_to_classify = self._load_papers(root / "Papers", is_reference=False)
        return reference_papers, papers_to_classify

    def _load_papers(self, directory: Path, is_reference: bool,
                     label: str = None, conference: str = None) -> List["Paper"]:
        """Wrap every ``*.pdf`` directly inside ``directory`` in a ``Paper``."""
        if not directory.is_dir():
            # A missing folder yields no papers; say so instead of staying silent.
            logging.warning(f"Dataset directory not found: {directory}")
        # Sorted so that repeated runs list the papers in the same order.
        return [
            Paper(
                content=self._extract_pdf_content(pdf_path),
                path=pdf_path,
                is_reference=is_reference,
                label=label,
                conference=conference
            )
            for pdf_path in sorted(directory.glob("*.pdf"))
        ]

    def _extract_pdf_content(self, pdf_path: Path) -> str:
        """Return the stripped text of all pages, or ``""`` if the PDF cannot be read."""
        try:
            # The context manager closes the document even if a page fails.
            with fitz.open(pdf_path) as doc:
                return "".join(page.get_text() for page in doc).strip()
        except Exception as e:
            logging.error(f"Error extracting content from {pdf_path}: {e}")
            return ""

    def _get_embedding(self, text: str) -> torch.Tensor:
        """Embed ``text`` with SciBERT.

        The input is truncated to 512 tokens, so only the beginning of a long
        paper contributes. The result is the last hidden state averaged over
        the token dimension.
        """
        inputs = self.tokenizer(
            text,
            max_length=512,
            truncation=True,
            padding=True,
            return_tensors="pt"
        ).to(self.device)

        with torch.no_grad():
            outputs = self.scibert(**inputs)
        return outputs.last_hidden_state.mean(dim=1)

    def _cosine_similarity(self, a: torch.Tensor, b: torch.Tensor) -> float:
        """Cosine similarity of the first rows of ``a`` and ``b`` as a Python float."""
        return torch.cosine_similarity(a, b, dim=1)[0].item()

    def _mean_similarity(self, emb: torch.Tensor, references: List[torch.Tensor]) -> float:
        """Average cosine similarity between ``emb`` and each reference embedding."""
        return float(np.mean([self._cosine_similarity(emb, ref) for ref in references]))

    def classify_papers(self, reference_papers: List["Paper"], papers: List["Paper"]) -> List[Dict]:
        """Label each paper and, when publishable, choose a conference.

        A paper is publishable when its mean similarity to the publishable
        references exceeds its mean similarity to the non-publishable ones.
        The conference with the highest mean similarity is then selected.

        Args:
            reference_papers: Labelled papers (``label`` and ``conference`` set).
            papers: Papers to classify.

        Returns:
            One dict per paper with keys ``paper_id``, ``publishable``,
            ``conference`` and ``rationale``. A paper whose processing fails
            gets ``conference="error"`` and the error text as rationale.

        Raises:
            ValueError: If either reference class is empty. The comparison
                would otherwise run on NaN means and silently mark every
                paper as non-publishable.
        """
        if not papers:
            return []

        ref_embeddings = {
            "publishable": [],
            "conferences": {},
            "nonpublishable": []
        }

        # Prepare reference embeddings
        for ref in reference_papers:
            emb = self._get_embedding(ref.content)
            if ref.label == "Publishable":
                ref_embeddings["publishable"].append(emb)
                ref_embeddings["conferences"].setdefault(ref.conference, []).append(emb)
            else:
                ref_embeddings["nonpublishable"].append(emb)

        if not ref_embeddings["publishable"] or not ref_embeddings["nonpublishable"]:
            raise ValueError(
                "Both publishable and non-publishable reference papers are required for classification"
            )

        results = []
        for paper in papers:
            try:
                emb = self._get_embedding(paper.content)

                pub_score = self._mean_similarity(emb, ref_embeddings["publishable"])
                nonpub_score = self._mean_similarity(emb, ref_embeddings["nonpublishable"])

                if pub_score > nonpub_score:
                    conf_scores = {
                        conf: self._mean_similarity(emb, conf_embs)
                        for conf, conf_embs in ref_embeddings["conferences"].items()
                    }
                    best_conf = max(conf_scores.items(), key=lambda x: x[1])[0]
                    rationale = self._generate_rationale(paper.content, best_conf)

                    results.append({
                        "paper_id": paper.path.stem,
                        "publishable": 1,
                        "conference": best_conf,
                        "rationale": rationale
                    })
                else:
                    results.append({
                        "paper_id": paper.path.stem,
                        "publishable": 0,
                        "conference": "na",
                        "rationale": "na"
                    })

            except Exception as e:
                # Keep going: one bad paper must not abort the whole batch.
                logging.error(f"Error processing {paper.path}: {e}")
                results.append({
                    "paper_id": paper.path.stem,
                    "publishable": 0,
                    "conference": "error",
                    "rationale": str(e)
                })

        return results

def main(dataset_path: str = "/content/drive/MyDrive/KDSH_2025_Dataset",
         output_path: str = "results.json"):
    """Classify the papers in the dataset and write the predictions as JSON.

    Args:
        dataset_path: Root folder of the dataset passed to
            ``PaperClassifier.load_dataset``. Defaults to the Google Drive
            location this notebook was written against.
        output_path: File that receives the list of classification results.
    """
    classifier = PaperClassifier()
    reference_papers, papers = classifier.load_dataset(dataset_path)
    results = classifier.classify_papers(reference_papers, papers)

    with open(output_path, "w") as f:
        json.dump(results, f, indent=2)

if __name__ == "__main__":
    main()

UnicodeDecodeError: 'utf-8' codec can't decode byte 0xd0 in position 0: invalid continuation byte

Optimizations and improvements


In [None]:
# prompt: No module named 'faiss'

!pip install faiss-cpu

Collecting faiss-cpu
  Downloading faiss_cpu-1.9.0.post1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (4.4 kB)
Downloading faiss_cpu-1.9.0.post1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (27.5 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m27.5/27.5 MB[0m [31m52.0 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: faiss-cpu
Successfully installed faiss-cpu-1.9.0.post1


In [None]:
import torch
from transformers import AutoTokenizer, AutoModel, AutoModelForQuestionAnswering
from pathlib import Path
import fitz
import numpy as np
from dataclasses import dataclass
from typing import Dict, List, Tuple
import logging
import json
import time
import psutil
from sklearn.model_selection import KFold
from json import JSONEncoder

@dataclass
class Paper:
    """A paper from the dataset together with its extracted text and labels."""
    content: str  # text of the paper (load_dataset stores the PDF-extracted text here)
    path: Path  # location of the source PDF
    is_reference: bool  # True for labelled reference papers, False for papers to classify
    label: str = None  # "Publishable" or "NonPublishable" for reference papers; None when unlabelled
    conference: str = None  # conference name for publishable reference papers; None otherwise

@dataclass
class PerformanceMetrics:
    """Resource measurements recorded for one tracked method call."""
    processing_time: float  # wall-clock duration of the call, in seconds
    memory_used_mb: float  # resident set size of the process after the call, in MB
    gpu_memory_mb: float = 0  # peak CUDA memory allocated during the call, in MB (0 without CUDA)

class PaperClassifier:
    def __init__(self):
        self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
        self.conferences = ["TMLR", "CVPR", "EMNLP", "NeurIPS", "KDD"]
        self.reference_papers = []
        self._setup_models()
        self.performance_metrics = []

    def _setup_models(self):
        """Load the SciBERT encoder and the RoBERTa QA model with their tokenizers."""
        embedding_checkpoint = "allenai/scibert_scivocab_uncased"
        qa_checkpoint = "deepset/roberta-base-squad2"

        # SciBERT produces the paper embeddings used for similarity scoring.
        encoder = AutoModel.from_pretrained(embedding_checkpoint)
        self.scibert = encoder.to(self.device)
        self.tokenizer = AutoTokenizer.from_pretrained(embedding_checkpoint)

        # The RoBERTa question-answering model supplies answer spans for rationales.
        qa_network = AutoModelForQuestionAnswering.from_pretrained(qa_checkpoint)
        self.qa_model = qa_network.to(self.device)
        self.qa_tokenizer = AutoTokenizer.from_pretrained(qa_checkpoint)

    def _track_performance(func):
        """Decorator that records time and memory usage for every call of a method.

        After the wrapped method returns, a ``PerformanceMetrics`` entry holding
        the elapsed wall-clock time, the process resident set size and the peak
        CUDA memory allocated is appended to ``self.performance_metrics``.
        CUDA statistics are only touched when CUDA is available, so the
        decorator also works on CPU-only hosts.

        Args:
            func: The method to wrap; it is called as ``func(self, *args, **kwargs)``.

        Returns:
            The wrapping function, which returns whatever ``func`` returns.
        """
        import functools  # local import: only this decorator needs it

        @functools.wraps(func)  # keep the wrapped method's name and docstring
        def wrapper(self, *args, **kwargs):
            cuda_available = torch.cuda.is_available()
            start_time = time.time()
            if cuda_available:
                # Resetting peak stats needs an initialised CUDA runtime.
                torch.cuda.reset_peak_memory_stats()

            result = func(self, *args, **kwargs)

            processing_time = time.time() - start_time
            memory_used = psutil.Process().memory_info().rss / 1024 / 1024  # MB
            gpu_memory = torch.cuda.max_memory_allocated() / 1024 / 1024 if cuda_available else 0

            self.performance_metrics.append(PerformanceMetrics(
                processing_time=processing_time,
                memory_used_mb=memory_used,
                gpu_memory_mb=gpu_memory
            ))

            return result
        return wrapper

    def _generate_rationale(self, paper_content: str, conference: str) -> str:
        conference_contexts = {
            "TMLR": "TMLR is a machine learning research conference focusing on theoretical advances, algorithms, and methodological innovations in machine learning.",
            "CVPR": "CVPR is a premier computer vision conference focusing on visual processing, recognition, understanding, and generation.",
            "EMNLP": "EMNLP is a natural language processing conference focusing on computational linguistics, language understanding, and text processing.",
            "NeurIPS": "NeurIPS is a conference focusing on neural information processing systems, machine learning theory, and artificial intelligence.",
            "KDD": "KDD is a conference focusing on data mining, knowledge discovery, and large-scale data analytics."
        }

        content_preview = paper_content[:1000].replace('\n', ' ').strip()
        try:
            title = content_preview.split('.')[0]
            abstract = ' '.join(content_preview.split('.')[1:3])
        except:
            title = content_preview[:100]
            abstract = content_preview[100:500]

        context = (
            f"Paper Title: {title}\n"
            f"Abstract: {abstract}\n\n"
            f"Conference Information: {conference_contexts[conference]}\n\n"
            "A paper is relevant to a conference if its technical contributions and research focus align with the conference's main themes. "
            "The explanation should describe specific technical aspects of the paper that match the conference's focus areas."
        )

        questions = [
            f"What specific technical contributions make this paper relevant to {conference}?",
            f"How does this paper's methodology align with {conference}'s main themes?",
            f"What is the main innovation of this paper that fits {conference}'s focus?"
        ]

        answers = []
        for question in questions:
            inputs = self.qa_tokenizer(
                question,
                context,
                max_length=386,
                truncation=True,
                padding='max_length',
                return_tensors="pt"
            ).to(self.device)

            with torch.no_grad():
                outputs = self.qa_model(**inputs)
                start_logits = outputs.start_logits
                end_logits = outputs.end_logits

                start_idx = torch.argmax(start_logits)
                end_idx = torch.argmax(end_logits)

                tokens = self.qa_tokenizer.convert_ids_to_tokens(inputs["input_ids"][0])
                answer = self.qa_tokenizer.convert_tokens_to_string(tokens[start_idx:end_idx+1])

                if answer and len(answer.strip()) > 10:
                    answers.append(answer.strip())

        if answers:
            unique_answers = []
            for ans in answers:
                if not any(self._similar_strings(ans, existing) for existing in unique_answers):
                    unique_answers.append(ans)

            rationale = f"This paper is relevant to {conference} because " + ". ".join(unique_answers[:2])
            return rationale
        else:
            return f"This paper appears relevant to {conference} based on its focus on {title}"

    def _similar_strings(self, str1: str, str2: str, threshold: float = 0.7) -> bool:
        if len(str1) < 10 or len(str2) < 10:
            return False
        common = sum(1 for a, b in zip(str1.lower(), str2.lower()) if a == b)
        return common / max(len(str1), len(str2)) > threshold

    def load_dataset(self, base_path: str) -> Tuple[List[Paper], List[Paper]]:
        base_path = Path(base_path)
        reference_papers = []
        papers_to_classify = []

        ref_path = base_path / "Reference"
        pub_path = ref_path / "Publishable"

        for conf in self.conferences:
            conf_path = pub_path / conf
            for pdf_path in conf_path.glob("*.pdf"):
                content = self._extract_pdf_content(pdf_path)
                reference_papers.append(Paper(
                    content=content,
                    path=pdf_path,
                    is_reference=True,
                    label="Publishable",
                    conference=conf
                ))

        nonpub_path = ref_path / "Non-Publishable"
        for pdf_path in nonpub_path.glob("*.pdf"):
            content = self._extract_pdf_content(pdf_path)
            reference_papers.append(Paper(
                content=content,
                path=pdf_path,
                is_reference=True,
                label="NonPublishable"
            ))

        papers_path = base_path / "Papers"
        for pdf_path in papers_path.glob("*.pdf"):
            content = self._extract_pdf_content(pdf_path)
            papers_to_classify.append(Paper(
                content=content,
                path=pdf_path,
                is_reference=False
            ))

        self.reference_papers = reference_papers
        return reference_papers, papers_to_classify

    def _extract_pdf_content(self, pdf_path: Path) -> str:
        try:
            doc = fitz.open(pdf_path)
            text = ""
            for page in doc:
                text += page.get_text()
            return text.strip()
        except Exception as e:
            logging.error(f"Error extracting content from {pdf_path}: {e}")
            return ""

    @_track_performance
    def _get_embedding(self, text: str) -> torch.Tensor:
        """Embed text with SciBERT by averaging its last-layer token states.

        The tokenizer truncates the input to at most 512 tokens.

        Args:
            text: Text to embed.

        Returns:
            The mean of ``last_hidden_state`` over dimension 1.
        """
        tokenizer_options = {
            "max_length": 512,
            "truncation": True,
            "padding": True,
            "return_tensors": "pt",
        }
        encoded = self.tokenizer(text, **tokenizer_options).to(self.device)

        # Inference only, so gradient tracking is switched off.
        with torch.no_grad():
            model_output = self.scibert(**encoded)

        token_states = model_output.last_hidden_state
        return token_states.mean(dim=1)

    def _cosine_similarity(self, a: torch.Tensor, b: torch.Tensor) -> float:
        return torch.cosine_similarity(a, b, dim=1).cpu().numpy()[0]

    @_track_performance
    def classify_papers(self, reference_papers: List[Paper], papers: List[Paper]) -> List[Dict]:
        """Label each paper as publishable or not and pick a conference when publishable.

        All reference papers are embedded first. A paper counts as publishable
        when its mean cosine similarity to the publishable references exceeds
        its mean similarity to the non-publishable ones. For publishable papers
        the conference whose references have the highest mean similarity is
        chosen and a rationale is generated. If processing a paper raises, it
        is reported with conference "error" and the exception text as rationale.

        Args:
            reference_papers: Labelled papers used as the comparison set.
            papers: Papers to classify.

        Returns:
            One dict per paper, in input order, with the keys "paper_id",
            "publishable", "conference", "rationale" and "confidence_scores".
        """
        publishable_refs = []
        nonpublishable_refs = []
        refs_by_conference = {}
        for ref_paper in reference_papers:
            ref_vec = self._get_embedding(ref_paper.content)
            if ref_paper.label != "Publishable":
                nonpublishable_refs.append(ref_vec)
                continue
            publishable_refs.append(ref_vec)
            refs_by_conference.setdefault(ref_paper.conference, []).append(ref_vec)

        def mean_similarity(vec, group):
            # Average cosine similarity between one paper and a group of references.
            return np.mean([self._cosine_similarity(vec, member) for member in group])

        results = []
        for paper in papers:
            try:
                paper_vec = self._get_embedding(paper.content)
                pub_mean = mean_similarity(paper_vec, publishable_refs)
                nonpub_mean = mean_similarity(paper_vec, nonpublishable_refs)

                if pub_mean > nonpub_mean:
                    conf_scores = {
                        conf: mean_similarity(paper_vec, conf_vecs)
                        for conf, conf_vecs in refs_by_conference.items()
                    }
                    # Ties resolve to the conference seen first.
                    best_conf = max(conf_scores, key=conf_scores.get)
                    rationale = self._generate_rationale(paper.content, best_conf)
                    outcome = {
                        "paper_id": paper.path.stem,
                        "publishable": 1,
                        "conference": best_conf,
                        "rationale": rationale,
                        "confidence_scores": {
                            "publishability": pub_mean,
                            "conference": conf_scores[best_conf]
                        }
                    }
                else:
                    outcome = {
                        "paper_id": paper.path.stem,
                        "publishable": 0,
                        "conference": "na",
                        "rationale": "na",
                        "confidence_scores": {
                            "publishability": nonpub_mean
                        }
                    }
                results.append(outcome)

            except Exception as e:
                logging.error(f"Error processing {paper.path}: {e}")
                results.append({
                    "paper_id": paper.path.stem,
                    "publishable": 0,
                    "conference": "error",
                    "rationale": str(e),
                    "confidence_scores": {}
                })

        return results

    def calculate_metrics(self, validation_papers: List[Paper]) -> Dict:
        true_positives = 0
        false_positives = 0
        true_negatives = 0
        false_negatives = 0

        conference_correct = 0
        total_publishable = 0

        results = self.classify_papers(self.reference_papers, validation_papers)

        for paper, result in zip(validation_papers, results):
            is_actually_publishable = paper.label == "Publishable"
            is_predicted_publishable = result["publishable"] == 1

            if is_predicted_publishable and is_actually_publishable:
                true_positives += 1
                if paper.conference == result["conference"]:
                    conference_correct += 1
            elif is_predicted_publishable and not is_actually_publishable:
                false_positives += 1
            elif not is_predicted_publishable and not is_actually_publishable:
                true_negatives += 1
            else:
                false_negatives += 1

            if is_actually_publishable:
                total_publishable += 1

        accuracy = (true_positives + true_negatives) / len(validation_papers)

        precision = true_positives / (true_positives + false_positives) if (true_positives + false_positives) > 0 else 0
        recall = true_positives / (true_positives + false_negatives) if (true_positives + false_negatives) > 0 else 0
        f1 = 2 * (precision * recall) / (precision + recall) if (precision + recall) > 0 else 0

        conference_accuracy = conference_correct / total_publishable if total_publishable > 0 else 0

        return {
            "accuracy": accuracy,
            "precision": precision,
            "recall": recall,
            "f1_score": f1,
            "conference_accuracy": conference_accuracy,
            "metrics_detail": {
                "true_positives": true_positives,
                "false_positives": false_positives,
                "true_negatives": true_negatives,
                "false_negatives": false_negatives,
                "total_papers": len(validation_papers)
            }
        }

    def cross_validate(self, k_folds: int = 5) -> Dict:
        """
        Perform k-fold cross-validation on the reference papers
        """
        kf = KFold(n_splits=k_folds, shuffle=True, random_state=42)
        metrics_list = []

        reference_papers = np.array(self.reference_papers)

        for train_idx, val_idx in kf.split(reference_papers):
            train_papers = reference_papers[train_idx].tolist()
            val_papers = reference_papers[val_idx].tolist()

            self.reference_papers = train_papers
            metrics = self.calculate_metrics(val_papers)
            metrics_list.append(metrics)

        # Calculate average metrics
        avg_metrics = {
            "accuracy": np.mean([m["accuracy"] for m in metrics_list]),
            "f1_score": np.mean([m["f1_score"] for m in metrics_list]),
            "conference_accuracy": np.mean([m["conference_accuracy"] for m in metrics_list])
        }

        return {
            "fold_metrics": metrics_list,
            "average_metrics": avg_metrics
        }

    def get_performance_summary(self) -> Dict:
        """
        Get summary of performance metrics including processing time, memory usage, and GPU stats
        """
        total_time = sum(m.processing_time for m in self.performance_metrics)
        avg_memory = np.mean([m.memory_used_mb for m in self.performance_metrics])
        max_memory = max(m.memory_used_mb for m in self.performance_metrics)

        if torch.cuda.is_available():
            avg_gpu = np.mean([m.gpu_memory_mb for m in self.performance_metrics])
            max_gpu = max(m.gpu_memory_mb for m in self.performance_metrics)
        else:
            avg_gpu = max_gpu = 0

        return {
            "total_processing_time": total_time,
            "average_time_per_operation": total_time / len(self.performance_metrics),
            "memory_usage": {
                "average_mb": avg_memory,
                "peak_mb": max_memory
            },
            "gpu_usage": {
                "average_mb": avg_gpu,
                "peak_mb": max_gpu
            },
            "total_operations": len(self.performance_metrics),
            "device_used": str(self.device)
        }


class NumpyEncoder(JSONEncoder):
    """JSON encoder that turns NumPy scalars and arrays into native Python values."""

    def default(self, obj):
        """Convert NumPy values the standard encoder cannot serialise.

        NumPy booleans, integers and floating-point scalars become ``bool``,
        ``int`` and ``float``; arrays become nested lists. Anything else is
        passed to the base class, which raises ``TypeError``.
        """
        if isinstance(obj, np.bool_):
            return bool(obj)
        if isinstance(obj, np.integer):
            return int(obj)
        if isinstance(obj, np.floating):
            return float(obj)
        if isinstance(obj, np.ndarray):
            return obj.tolist()
        return super().default(obj)

def main(dataset_path: str = "/content/drive/MyDrive/KDSH_2025_Dataset",
         output_path: str = "results.json"):
    """Cross-validate, classify the papers and write all results to one JSON file.

    Args:
        dataset_path: Root folder of the dataset passed to
            ``PaperClassifier.load_dataset``. Defaults to the Google Drive
            location this notebook was written against.
        output_path: File that receives the classifications, the
            cross-validation results and the performance summary.
    """
    # Initialize classifier
    classifier = PaperClassifier()

    # Load datasets
    reference_papers, papers = classifier.load_dataset(dataset_path)

    # Perform cross-validation
    cv_results = classifier.cross_validate(k_folds=5)

    # Classify papers. The full reference list from load_dataset is passed
    # explicitly because cross_validate reassigns classifier.reference_papers
    # while it runs.
    classification_results = classifier.classify_papers(reference_papers, papers)

    # Get performance metrics
    performance_summary = classifier.get_performance_summary()

    # Bundle everything into one document; NumPy values are converted by
    # NumpyEncoder when the file is written.
    results = {
        "classifications": classification_results,
        "cross_validation": cv_results,
        "performance": performance_summary
    }

    # Use the custom encoder when writing to JSON
    with open(output_path, "w") as f:
        json.dump(results, f, indent=2, cls=NumpyEncoder)

    # Log summary statistics
    logging.info("Classification Complete!")
    logging.info(f"Total processing time: {float(performance_summary['total_processing_time']):.2f} seconds")
    logging.info(f"Average CV Accuracy: {float(cv_results['average_metrics']['accuracy']):.3f}")
    logging.info(f"Average CV F1 Score: {float(cv_results['average_metrics']['f1_score']):.3f}")
    logging.info(f"Average Conference Accuracy: {float(cv_results['average_metrics']['conference_accuracy']):.3f}")

if __name__ == "__main__":
    main()

config.json:   0%|          | 0.00/571 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/496M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/79.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/772 [00:00<?, ?B/s]

In [None]:
pip install torch


[0mCollecting torch
  Downloading torch-2.5.1-cp310-cp310-manylinux1_x86_64.whl.metadata (28 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch)
  Downloading nvidia_cublas_cu12-12.4.5.8-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cufft-cu12==11.2.1.3 (from torch)
  Downloading nvidia_cufft_cu12-11.2.1.3-py3-none-manylinux2014_x86_64.whl.metadata (1.5 

MAIN MODEL - WITHOUT PATHWAY CONNECTOR (GDRIVE)

In [None]:
import torch
from transformers import AutoTokenizer, AutoModel, AutoModelForQuestionAnswering
from pathlib import Path
import fitz
import numpy as np
from dataclasses import dataclass
from typing import Dict, List, Tuple
import logging
import json
import time
import psutil
from sklearn.model_selection import KFold
from json import JSONEncoder

@dataclass
class Paper:
    """A paper from the dataset together with its extracted text and labels."""
    content: str  # text of the paper (load_dataset stores the PDF-extracted text here)
    path: Path  # location of the source PDF
    is_reference: bool  # True for labelled reference papers, False for papers to classify
    label: str = None  # "Publishable" or "NonPublishable" for reference papers; None when unlabelled
    conference: str = None  # conference name for publishable reference papers; None otherwise

@dataclass
class PerformanceMetrics:
    """Resource measurements recorded for one tracked method call."""
    processing_time: float  # wall-clock duration of the call, in seconds
    memory_used_mb: float  # resident set size of the process after the call, in MB
    gpu_memory_mb: float = 0  # peak CUDA memory allocated during the call, in MB (0 without CUDA)

class PaperClassifier:
    def __init__(self):
        self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
        self.conferences = ["TMLR", "CVPR", "EMNLP", "NeurIPS", "KDD"]
        self.reference_papers = []
        self._setup_models()
        self.performance_metrics = []

    def _setup_models(self):
        """Load the SciBERT encoder and the RoBERTa QA model with their tokenizers."""
        embedding_checkpoint = "allenai/scibert_scivocab_uncased"
        qa_checkpoint = "deepset/roberta-base-squad2"

        # SciBERT produces the paper embeddings used for similarity scoring.
        encoder = AutoModel.from_pretrained(embedding_checkpoint)
        self.scibert = encoder.to(self.device)
        self.tokenizer = AutoTokenizer.from_pretrained(embedding_checkpoint)

        # The RoBERTa question-answering model supplies answer spans for rationales.
        qa_network = AutoModelForQuestionAnswering.from_pretrained(qa_checkpoint)
        self.qa_model = qa_network.to(self.device)
        self.qa_tokenizer = AutoTokenizer.from_pretrained(qa_checkpoint)

    def _track_performance(func):
        """Decorator that records time and memory usage for every call of a method.

        After the wrapped method returns, a ``PerformanceMetrics`` entry holding
        the elapsed wall-clock time, the process resident set size and the peak
        CUDA memory allocated is appended to ``self.performance_metrics``.
        CUDA statistics are only touched when CUDA is available, so the
        decorator also works on CPU-only hosts.

        Args:
            func: The method to wrap; it is called as ``func(self, *args, **kwargs)``.

        Returns:
            The wrapping function, which returns whatever ``func`` returns.
        """
        import functools  # local import: only this decorator needs it

        @functools.wraps(func)  # keep the wrapped method's name and docstring
        def wrapper(self, *args, **kwargs):
            cuda_available = torch.cuda.is_available()
            start_time = time.time()
            if cuda_available:
                # Resetting peak stats needs an initialised CUDA runtime.
                torch.cuda.reset_peak_memory_stats()

            result = func(self, *args, **kwargs)

            processing_time = time.time() - start_time
            memory_used = psutil.Process().memory_info().rss / 1024 / 1024  # MB
            gpu_memory = torch.cuda.max_memory_allocated() / 1024 / 1024 if cuda_available else 0

            self.performance_metrics.append(PerformanceMetrics(
                processing_time=processing_time,
                memory_used_mb=memory_used,
                gpu_memory_mb=gpu_memory
            ))

            return result
        return wrapper

    def _generate_rationale(self, paper_content: str, conference: str) -> str:
        conference_contexts = {
            "TMLR": "TMLR is a machine learning research conference focusing on theoretical advances, algorithms, and methodological innovations in machine learning.",
            "CVPR": "CVPR is a premier computer vision conference focusing on visual processing, recognition, understanding, and generation.",
            "EMNLP": "EMNLP is a natural language processing conference focusing on computational linguistics, language understanding, and text processing.",
            "NeurIPS": "NeurIPS is a conference focusing on neural information processing systems, machine learning theory, and artificial intelligence.",
            "KDD": "KDD is a conference focusing on data mining, knowledge discovery, and large-scale data analytics."
        }

        content_preview = paper_content[:1000].replace('\n', ' ').strip()
        try:
            title = content_preview.split('.')[0]
            abstract = ' '.join(content_preview.split('.')[1:3])
        except:
            title = content_preview[:100]
            abstract = content_preview[100:500]

        context = (
            f"Paper Title: {title}\n"
            f"Abstract: {abstract}\n\n"
            f"Conference Information: {conference_contexts[conference]}\n\n"
            "A paper is relevant to a conference if its technical contributions and research focus align with the conference's main themes. "
            "The explanation should describe specific technical aspects of the paper that match the conference's focus areas."
        )

        questions = [
            f"What specific technical contributions make this paper relevant to {conference}?",
            f"How does this paper's methodology align with {conference}'s main themes?",
            f"What is the main innovation of this paper that fits {conference}'s focus?"
        ]

        answers = []
        for question in questions:
            inputs = self.qa_tokenizer(
                question,
                context,
                max_length=386,
                truncation=True,
                padding='max_length',
                return_tensors="pt"
            ).to(self.device)

            with torch.no_grad():
                outputs = self.qa_model(**inputs)
                start_logits = outputs.start_logits
                end_logits = outputs.end_logits

                start_idx = torch.argmax(start_logits)
                end_idx = torch.argmax(end_logits)

                tokens = self.qa_tokenizer.convert_ids_to_tokens(inputs["input_ids"][0])
                answer = self.qa_tokenizer.convert_tokens_to_string(tokens[start_idx:end_idx+1])

                if answer and len(answer.strip()) > 10:
                    answers.append(answer.strip())

        if answers:
            unique_answers = []
            for ans in answers:
                if not any(self._similar_strings(ans, existing) for existing in unique_answers):
                    unique_answers.append(ans)

            rationale = f"This paper is relevant to {conference} because " + ". ".join(unique_answers[:2])
            return rationale
        else:
            return f"This paper appears relevant to {conference} based on its focus on {title}"

    def _similar_strings(self, str1: str, str2: str, threshold: float = 0.7) -> bool:
        if len(str1) < 10 or len(str2) < 10:
            return False
        common = sum(1 for a, b in zip(str1.lower(), str2.lower()) if a == b)
        return common / max(len(str1), len(str2)) > threshold

    def load_dataset(self, base_path: str) -> Tuple[List[Paper], List[Paper]]:
        base_path = Path(base_path)
        reference_papers = []
        papers_to_classify = []

        ref_path = base_path / "Reference"
        pub_path = ref_path / "Publishable"

        for conf in self.conferences:
            conf_path = pub_path / conf
            for pdf_path in conf_path.glob("*.pdf"):
                content = self._extract_pdf_content(pdf_path)
                reference_papers.append(Paper(
                    content=content,
                    path=pdf_path,
                    is_reference=True,
                    label="Publishable",
                    conference=conf
                ))

        nonpub_path = ref_path / "Non-Publishable"
        for pdf_path in nonpub_path.glob("*.pdf"):
            content = self._extract_pdf_content(pdf_path)
            reference_papers.append(Paper(
                content=content,
                path=pdf_path,
                is_reference=True,
                label="NonPublishable"
            ))

        papers_path = base_path / "Papers"
        for pdf_path in papers_path.glob("*.pdf"):
            content = self._extract_pdf_content(pdf_path)
            papers_to_classify.append(Paper(
                content=content,
                path=pdf_path,
                is_reference=False
            ))

        self.reference_papers = reference_papers
        return reference_papers, papers_to_classify

    def _extract_pdf_content(self, pdf_path: Path) -> str:
        try:
            doc = fitz.open(pdf_path)
            text = ""
            for page in doc:
                text += page.get_text()
            return text.strip()
        except Exception as e:
            logging.error(f"Error extracting content from {pdf_path}: {e}")
            return ""

    @_track_performance
    def _get_embedding(self, text: str) -> torch.Tensor:
        """Embed text with SciBERT by averaging its last-layer token states.

        The tokenizer truncates the input to at most 512 tokens.

        Args:
            text: Text to embed.

        Returns:
            The mean of ``last_hidden_state`` over dimension 1.
        """
        tokenizer_options = {
            "max_length": 512,
            "truncation": True,
            "padding": True,
            "return_tensors": "pt",
        }
        encoded = self.tokenizer(text, **tokenizer_options).to(self.device)

        # Inference only, so gradient tracking is switched off.
        with torch.no_grad():
            model_output = self.scibert(**encoded)

        token_states = model_output.last_hidden_state
        return token_states.mean(dim=1)

    def _cosine_similarity(self, a: torch.Tensor, b: torch.Tensor) -> float:
        return torch.cosine_similarity(a, b, dim=1).cpu().numpy()[0]

    @_track_performance
    def classify_papers(self, reference_papers: List[Paper], papers: List[Paper]) -> List[Dict]:
        """Label each paper as publishable or not and pick a conference when publishable.

        All reference papers are embedded first. A paper counts as publishable
        when its mean cosine similarity to the publishable references exceeds
        its mean similarity to the non-publishable ones. For publishable papers
        the conference whose references have the highest mean similarity is
        chosen and a rationale is generated. If processing a paper raises, it
        is reported with conference "error" and the exception text as rationale.

        Args:
            reference_papers: Labelled papers used as the comparison set.
            papers: Papers to classify.

        Returns:
            One dict per paper, in input order, with the keys "paper_id",
            "publishable", "conference", "rationale" and "confidence_scores".
        """
        publishable_refs = []
        nonpublishable_refs = []
        refs_by_conference = {}
        for ref_paper in reference_papers:
            ref_vec = self._get_embedding(ref_paper.content)
            if ref_paper.label != "Publishable":
                nonpublishable_refs.append(ref_vec)
                continue
            publishable_refs.append(ref_vec)
            refs_by_conference.setdefault(ref_paper.conference, []).append(ref_vec)

        def mean_similarity(vec, group):
            # Average cosine similarity between one paper and a group of references.
            return np.mean([self._cosine_similarity(vec, member) for member in group])

        results = []
        for paper in papers:
            try:
                paper_vec = self._get_embedding(paper.content)
                pub_mean = mean_similarity(paper_vec, publishable_refs)
                nonpub_mean = mean_similarity(paper_vec, nonpublishable_refs)

                if pub_mean > nonpub_mean:
                    conf_scores = {
                        conf: mean_similarity(paper_vec, conf_vecs)
                        for conf, conf_vecs in refs_by_conference.items()
                    }
                    # Ties resolve to the conference seen first.
                    best_conf = max(conf_scores, key=conf_scores.get)
                    rationale = self._generate_rationale(paper.content, best_conf)
                    outcome = {
                        "paper_id": paper.path.stem,
                        "publishable": 1,
                        "conference": best_conf,
                        "rationale": rationale,
                        "confidence_scores": {
                            "publishability": pub_mean,
                            "conference": conf_scores[best_conf]
                        }
                    }
                else:
                    outcome = {
                        "paper_id": paper.path.stem,
                        "publishable": 0,
                        "conference": "na",
                        "rationale": "na",
                        "confidence_scores": {
                            "publishability": nonpub_mean
                        }
                    }
                results.append(outcome)

            except Exception as e:
                logging.error(f"Error processing {paper.path}: {e}")
                results.append({
                    "paper_id": paper.path.stem,
                    "publishable": 0,
                    "conference": "error",
                    "rationale": str(e),
                    "confidence_scores": {}
                })

        return results

    def calculate_metrics(self, validation_papers: List[Paper]) -> Dict:
        true_positives = 0
        false_positives = 0
        true_negatives = 0
        false_negatives = 0

        conference_correct = 0
        total_publishable = 0

        results = self.classify_papers(self.reference_papers, validation_papers)

        for paper, result in zip(validation_papers, results):
            is_actually_publishable = paper.label == "Publishable"
            is_predicted_publishable = result["publishable"] == 1

            if is_predicted_publishable and is_actually_publishable:
                true_positives += 1
                if paper.conference == result["conference"]:
                    conference_correct += 1
            elif is_predicted_publishable and not is_actually_publishable:
                false_positives += 1
            elif not is_predicted_publishable and not is_actually_publishable:
                true_negatives += 1
            else:
                false_negatives += 1

            if is_actually_publishable:
                total_publishable += 1

        accuracy = (true_positives + true_negatives) / len(validation_papers)

        precision = true_positives / (true_positives + false_positives) if (true_positives + false_positives) > 0 else 0
        recall = true_positives / (true_positives + false_negatives) if (true_positives + false_negatives) > 0 else 0
        f1 = 2 * (precision * recall) / (precision + recall) if (precision + recall) > 0 else 0

        conference_accuracy = conference_correct / total_publishable if total_publishable > 0 else 0

        return {
            "accuracy": accuracy,
            "precision": precision,
            "recall": recall,
            "f1_score": f1,
            "conference_accuracy": conference_accuracy,
            "metrics_detail": {
                "true_positives": true_positives,
                "false_positives": false_positives,
                "true_negatives": true_negatives,
                "false_negatives": false_negatives,
                "total_papers": len(validation_papers)
            }
        }

    def cross_validate(self, k_folds: int = 5) -> Dict:
        """
        Perform k-fold cross-validation on the reference papers
        """
        kf = KFold(n_splits=k_folds, shuffle=True, random_state=42)
        metrics_list = []

        reference_papers = np.array(self.reference_papers)

        for train_idx, val_idx in kf.split(reference_papers):
            train_papers = reference_papers[train_idx].tolist()
            val_papers = reference_papers[val_idx].tolist()

            self.reference_papers = train_papers
            metrics = self.calculate_metrics(val_papers)
            metrics_list.append(metrics)

        # Calculate average metrics
        avg_metrics = {
            "accuracy": np.mean([m["accuracy"] for m in metrics_list]),
            "f1_score": np.mean([m["f1_score"] for m in metrics_list]),
            "conference_accuracy": np.mean([m["conference_accuracy"] for m in metrics_list])
        }

        return {
            "fold_metrics": metrics_list,
            "average_metrics": avg_metrics
        }

    def get_performance_summary(self) -> Dict:
        """
        Get summary of performance metrics including processing time, memory usage, and GPU stats
        """
        total_time = sum(m.processing_time for m in self.performance_metrics)
        avg_memory = np.mean([m.memory_used_mb for m in self.performance_metrics])
        max_memory = max(m.memory_used_mb for m in self.performance_metrics)

        if torch.cuda.is_available():
            avg_gpu = np.mean([m.gpu_memory_mb for m in self.performance_metrics])
            max_gpu = max(m.gpu_memory_mb for m in self.performance_metrics)
        else:
            avg_gpu = max_gpu = 0

        return {
            "total_processing_time": total_time,
            "average_time_per_operation": total_time / len(self.performance_metrics),
            "memory_usage": {
                "average_mb": avg_memory,
                "peak_mb": max_memory
            },
            "gpu_usage": {
                "average_mb": avg_gpu,
                "peak_mb": max_gpu
            },
            "total_operations": len(self.performance_metrics),
            "device_used": str(self.device)
        }


class NumpyEncoder(JSONEncoder):
    """JSON encoder that converts NumPy scalars and arrays to native Python values."""

    def default(self, obj):
        """
        Return a JSON-serialisable equivalent of ``obj``.

        NumPy booleans, integers and floats become ``bool``/``int``/``float``
        and arrays become (nested) lists. Anything else is delegated to
        ``JSONEncoder.default``, which raises ``TypeError``.
        """
        # np.bool_ is neither np.integer nor np.floating, so it needs its own branch.
        if isinstance(obj, np.bool_):
            return bool(obj)
        if isinstance(obj, np.integer):
            return int(obj)
        if isinstance(obj, np.floating):
            return float(obj)
        if isinstance(obj, np.ndarray):
            return obj.tolist()
        return super().default(obj)

def main(dataset_dir: str = "/content/drive/MyDrive/KDSH_2025_Dataset",
         results_path: str = "results.json",
         metrics_path: str = "metrics.json"):
    """
    Run the full pipeline: load data, cross-validate, classify and write reports.

    Args:
        dataset_dir: Dataset location handed to ``PaperClassifier.load_dataset``.
        results_path: Output file for the submission records (paper id,
            publishable flag, conference, rationale).
        metrics_path: Output file for confidence scores, cross-validation
            results and the performance summary.

    The defaults reproduce the previously hard-coded locations, so calling
    ``main()`` without arguments behaves as before.
    """
    # Initialize classifier
    classifier = PaperClassifier()

    # Load datasets
    reference_papers, papers = classifier.load_dataset(dataset_dir)

    # Perform cross-validation
    cv_results = classifier.cross_validate(k_folds=5)

    # Classify papers
    classification_results = classifier.classify_papers(reference_papers, papers)

    # Get performance metrics
    performance_summary = classifier.get_performance_summary()

    # Prepare submission results (without confidence scores)
    submission_keys = ("paper_id", "publishable", "conference", "rationale")
    submission_results = [
        {key: result[key] for key in submission_keys}
        for result in classification_results
    ]

    # Prepare metrics results
    metrics_results = {
        "paper_metrics": [{
            "paper_id": result["paper_id"],
            "confidence_scores": result["confidence_scores"]
        } for result in classification_results],
        "cross_validation": cv_results,
        "performance": performance_summary
    }

    # Write submission results
    with open(results_path, "w") as f:
        json.dump(submission_results, f, indent=2, cls=NumpyEncoder)

    # Write metrics
    with open(metrics_path, "w") as f:
        json.dump(metrics_results, f, indent=2, cls=NumpyEncoder)

    # Log summary statistics. These are INFO records, so they are only shown
    # when logging has been configured to display that level.
    average_metrics = cv_results['average_metrics']
    logging.info("Classification Complete!")
    logging.info(f"Total processing time: {float(performance_summary['total_processing_time']):.2f} seconds")
    logging.info(f"Average CV Accuracy: {float(average_metrics['accuracy']):.3f}")
    logging.info(f"Average CV F1 Score: {float(average_metrics['f1_score']):.3f}")
    logging.info(f"Average Conference Accuracy: {float(average_metrics['conference_accuracy']):.3f}")

# Entry point: run the whole pipeline when this code is executed directly
# rather than imported as a module.
if __name__ == "__main__":
    main()

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/385 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/442M [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/228k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/571 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/496M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/79.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/772 [00:00<?, ?B/s]

MAIN MODEL - WITH PATHWAY CONNECTOR (GDRIVE)

In [None]:
import pathway as pw
from transformers import AutoTokenizer, AutoModel, AutoModelForQuestionAnswering
from pathlib import Path
import numpy as np
from dataclasses import dataclass
from typing import Dict, List, Tuple
import logging
import json
import time
import psutil
import torch
import fitz
from sklearn.model_selection import KFold
from json import JSONEncoder

@dataclass
class Paper:
    """One paper: the text extracted from its PDF plus whatever labels are known for it."""
    content: str  # text extracted from the PDF ("" when extraction failed)
    path: str  # Changed from Path to str for GDrive compatibility (file name from the Drive metadata)
    is_reference: bool  # True for labelled reference papers, False for papers to classify
    label: str = None  # "Publishable" / "NonPublishable" for reference papers, otherwise None
    conference: str = None  # conference of a publishable reference paper, otherwise None

@dataclass
class PerformanceMetrics:
    """Resource measurements recorded for one call of a tracked method."""
    processing_time: float  # wall-clock duration of the call, in seconds
    memory_used_mb: float  # resident set size of the process after the call, in MB
    gpu_memory_mb: float = 0  # peak CUDA memory allocated during the call, in MB (0 without CUDA)

class PaperClassifier:
    """
    Embedding-based paper classifier whose PDFs are read from Google Drive.

    Papers are embedded with SciBERT and compared with labelled reference
    papers by cosine similarity to decide publishability and to pick a
    conference; a RoBERTa question-answering model produces a short rationale
    for the chosen conference. PDFs are fetched through the Pathway ``gdrive``
    connector.
    """

    def __init__(self, credentials_file: str):
        """
        Args:
            credentials_file: Service-account credentials file passed to
                ``pw.io.gdrive.read``.
        """
        self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
        self.conferences = ["TMLR", "CVPR", "EMNLP", "NeurIPS", "KDD"]
        self.reference_papers = []
        self.credentials_file = credentials_file
        # Filled by every method wrapped in @_track_performance.
        self.performance_metrics = []
        self._setup_models()

    def _setup_models(self):
        """Load the embedding and question-answering models onto ``self.device``."""
        # SciBERT for embeddings
        self.scibert = AutoModel.from_pretrained("allenai/scibert_scivocab_uncased").to(self.device)
        self.tokenizer = AutoTokenizer.from_pretrained("allenai/scibert_scivocab_uncased")

        # RoBERTa for rationale generation
        self.qa_model = AutoModelForQuestionAnswering.from_pretrained("deepset/roberta-base-squad2").to(self.device)
        self.qa_tokenizer = AutoTokenizer.from_pretrained("deepset/roberta-base-squad2")

    def _track_performance(func):
        """
        Decorator that records time and memory usage of a method call.

        After each call a ``PerformanceMetrics`` entry (wall-clock seconds,
        process RSS in MB, peak CUDA memory in MB) is appended to
        ``self.performance_metrics``.
        """
        import functools

        @functools.wraps(func)
        def wrapper(self, *args, **kwargs):
            cuda_available = torch.cuda.is_available()
            start_time = time.time()
            # The CUDA statistics API must not be touched on CPU-only hosts.
            if cuda_available:
                torch.cuda.reset_peak_memory_stats()

            result = func(self, *args, **kwargs)

            processing_time = time.time() - start_time
            memory_used = psutil.Process().memory_info().rss / 1024 / 1024  # MB
            gpu_memory = torch.cuda.max_memory_allocated() / 1024 / 1024 if cuda_available else 0

            self.performance_metrics.append(PerformanceMetrics(
                processing_time=processing_time,
                memory_used_mb=memory_used,
                gpu_memory_mb=gpu_memory
            ))

            return result
        return wrapper

    def _generate_rationale(self, paper_content: str, conference: str) -> str:
        """
        Build a short explanation of why a paper fits ``conference``.

        The first 1000 characters of the paper are split on full stops into a
        rough title and abstract, combined with a description of the
        conference, and queried with three questions through the extractive
        QA model. Up to two distinct answers are joined into the rationale; if
        no usable answer is found a generic sentence based on the title is
        returned.

        Args:
            paper_content: Full text of the paper.
            conference: Name of the recommended conference.

        Returns:
            The rationale sentence.
        """
        conference_contexts = {
            "TMLR": "TMLR is a machine learning research conference focusing on theoretical advances, algorithms, and methodological innovations in machine learning.",
            "CVPR": "CVPR is a premier computer vision conference focusing on visual processing, recognition, understanding, and generation.",
            "EMNLP": "EMNLP is a natural language processing conference focusing on computational linguistics, language understanding, and text processing.",
            "NeurIPS": "NeurIPS is a conference focusing on neural information processing systems, machine learning theory, and artificial intelligence.",
            "KDD": "KDD is a conference focusing on data mining, knowledge discovery, and large-scale data analytics."
        }
        # A conference without a description gets a neutral one instead of a KeyError.
        conference_info = conference_contexts.get(conference, f"{conference} is a research conference.")

        content_preview = paper_content[:1000].replace('\n', ' ').strip()
        # Heuristic: text before the first full stop is the title, the next
        # two sentences stand in for the abstract. str.split cannot fail here,
        # so no exception handling is needed.
        sentences = content_preview.split('.')
        title = sentences[0]
        abstract = ' '.join(sentences[1:3])

        context = (
            f"Paper Title: {title}\n"
            f"Abstract: {abstract}\n\n"
            f"Conference Information: {conference_info}\n\n"
            "A paper is relevant to a conference if its technical contributions and research focus align with the conference's main themes. "
            "The explanation should describe specific technical aspects of the paper that match the conference's focus areas."
        )

        questions = [
            f"What specific technical contributions make this paper relevant to {conference}?",
            f"How does this paper's methodology align with {conference}'s main themes?",
            f"What is the main innovation of this paper that fits {conference}'s focus?"
        ]

        answers = []
        for question in questions:
            inputs = self.qa_tokenizer(
                question,
                context,
                max_length=386,
                truncation=True,
                padding='max_length',
                return_tensors="pt"
            ).to(self.device)

            with torch.no_grad():
                outputs = self.qa_model(**inputs)

            start_idx = int(torch.argmax(outputs.start_logits))
            end_idx = int(torch.argmax(outputs.end_logits))
            if end_idx < start_idx:
                # Inverted span: the model found no usable answer.
                continue

            tokens = self.qa_tokenizer.convert_ids_to_tokens(inputs["input_ids"][0])
            answer = self.qa_tokenizer.convert_tokens_to_string(tokens[start_idx:end_idx + 1]).strip()

            # Very short spans are treated as noise.
            if len(answer) > 10:
                answers.append(answer)

        if answers:
            unique_answers = []
            for ans in answers:
                if not any(self._similar_strings(ans, existing) for existing in unique_answers):
                    unique_answers.append(ans)

            rationale = f"This paper is relevant to {conference} because " + ". ".join(unique_answers[:2])
            return rationale
        else:
            return f"This paper appears relevant to {conference} based on its focus on {title}"

    def _similar_strings(self, str1: str, str2: str, threshold: float = 0.7) -> bool:
        """
        Return True when two strings agree, position by position and ignoring
        case, on more than ``threshold`` of the longer string's length.
        Strings shorter than 10 characters are never considered similar.
        """
        if len(str1) < 10 or len(str2) < 10:
            return False
        common = sum(1 for a, b in zip(str1.lower(), str2.lower()) if a == b)
        return common / max(len(str1), len(str2)) > threshold

    def _extract_pdf_content(self, pdf_data: bytes) -> str:
        """
        Extract the text of all pages from an in-memory PDF.

        Args:
            pdf_data: Raw bytes of the PDF file.

        Returns:
            The concatenated page text with surrounding whitespace removed, or
            ``""`` when the PDF cannot be read (the error is logged).
        """
        try:
            from io import BytesIO

            # The context manager closes the document even if a page fails.
            with fitz.open(stream=BytesIO(pdf_data), filetype="pdf") as doc:
                text = "".join(page.get_text() for page in doc)
            return text.strip()
        except Exception as e:
            logging.error(f"Error extracting content from PDF: {e}")
            return ""

    def _read_gdrive_pdfs(self, object_id: str) -> List[Tuple[str, str]]:
        """
        Read every PDF below a Google Drive object.

        Args:
            object_id: Drive object id handed to ``pw.io.gdrive.read``.

        Returns:
            A list of ``(file name, extracted text)`` pairs.
        """
        table = pw.io.gdrive.read(
            object_id=object_id,
            service_user_credentials_file=self.credentials_file,
            mode="static",
            file_name_pattern="*.pdf",
            with_metadata=True
        )

        # Pathway tables are lazy and have no ``to_dict``; materialise the
        # static table into a pandas DataFrame instead.
        frame = pw.debug.table_to_pandas(table)

        documents = []
        for _, row in frame.iterrows():
            name = row['_metadata']['name']
            # The metadata may be wrapped in a Pathway JSON object; unwrap it
            # to the plain value when that is the case.
            name = getattr(name, "value", name)
            documents.append((str(name), self._extract_pdf_content(row['data'])))
        return documents

    def load_dataset(self, reference_folder_id: str, papers_folder_id: str) -> Tuple[List[Paper], List[Paper]]:
        """
        Load the labelled reference papers and the papers to classify.

        Args:
            reference_folder_id: Drive id of the folder holding the
                ``Publishable/<conference>`` and ``Non-Publishable`` papers.
            papers_folder_id: Drive id of the folder with unlabelled papers.

        Returns:
            ``(reference_papers, papers_to_classify)``. The reference papers
            are also stored in ``self.reference_papers``.

        NOTE(review): the sub-folder ids are built as
        ``"<reference_folder_id>/Publishable/<conf>"``. Confirm that the
        connector accepts such path-like ids; otherwise the real ids of the
        sub-folders have to be supplied.
        """
        reference_papers = []
        papers_to_classify = []

        # Load reference papers (publishable)
        for conf in self.conferences:
            conf_folder_id = f"{reference_folder_id}/Publishable/{conf}"  # Adjust based on folder structure
            for name, content in self._read_gdrive_pdfs(conf_folder_id):
                reference_papers.append(Paper(
                    content=content,
                    path=name,
                    is_reference=True,
                    label="Publishable",
                    conference=conf
                ))

        # Load reference papers (non-publishable)
        nonpub_folder_id = f"{reference_folder_id}/Non-Publishable"
        for name, content in self._read_gdrive_pdfs(nonpub_folder_id):
            reference_papers.append(Paper(
                content=content,
                path=name,
                is_reference=True,
                label="NonPublishable"
            ))

        # Load papers to classify
        for name, content in self._read_gdrive_pdfs(papers_folder_id):
            papers_to_classify.append(Paper(
                content=content,
                path=name,
                is_reference=False
            ))

        self.reference_papers = reference_papers
        return reference_papers, papers_to_classify

    def _get_embedding(self, text: str) -> torch.Tensor:
        """
        Embed ``text`` with SciBERT.

        The text is truncated to 512 tokens and the last hidden states are
        averaged over the token dimension. Not wrapped in
        ``_track_performance`` on purpose: it runs inside ``classify_papers``,
        which is already tracked.
        """
        inputs = self.tokenizer(
            text,
            max_length=512,
            truncation=True,
            padding=True,
            return_tensors="pt"
        ).to(self.device)

        with torch.no_grad():
            outputs = self.scibert(**inputs)
        return outputs.last_hidden_state.mean(dim=1)

    def _cosine_similarity(self, a: torch.Tensor, b: torch.Tensor) -> float:
        """Cosine similarity between the first rows of two embedding tensors, as a Python float."""
        return torch.cosine_similarity(a, b, dim=1)[0].item()

    @staticmethod
    def _paper_id(path) -> str:
        """File name without its extension; accepts both ``str`` and ``Path`` values."""
        return Path(str(path)).stem

    @_track_performance
    def classify_papers(self, reference_papers: List[Paper], papers: List[Paper]) -> List[Dict]:
        """
        Classify ``papers`` by similarity to ``reference_papers``.

        A paper is publishable when its mean cosine similarity to the
        publishable references exceeds its mean similarity to the
        non-publishable ones. Publishable papers are assigned the conference
        with the highest mean similarity and receive a generated rationale.

        Returns:
            One dict per paper with the keys ``paper_id``, ``publishable``
            (1/0), ``conference``, ``rationale`` and ``confidence_scores``.
            A paper whose processing fails is reported with conference
            ``"error"`` and the exception text as rationale.
        """
        results = []
        ref_embeddings = {
            "publishable": [],
            "conferences": {},
            "nonpublishable": []
        }

        for ref in reference_papers:
            emb = self._get_embedding(ref.content)
            if ref.label == "Publishable":
                ref_embeddings["publishable"].append(emb)
                ref_embeddings["conferences"].setdefault(ref.conference, []).append(emb)
            else:
                ref_embeddings["nonpublishable"].append(emb)

        for paper in papers:
            # Resolved outside the try block so that the error branch can
            # always report an id; ``paper.path`` is a plain string here.
            paper_id = self._paper_id(paper.path)
            try:
                emb = self._get_embedding(paper.content)

                pub_scores = [self._cosine_similarity(emb, ref) for ref in ref_embeddings["publishable"]]
                nonpub_scores = [self._cosine_similarity(emb, ref) for ref in ref_embeddings["nonpublishable"]]

                is_publishable = np.mean(pub_scores) > np.mean(nonpub_scores)

                if is_publishable:
                    conf_scores = {}
                    for conf, conf_embs in ref_embeddings["conferences"].items():
                        scores = [self._cosine_similarity(emb, ref) for ref in conf_embs]
                        conf_scores[conf] = np.mean(scores)

                    best_conf = max(conf_scores.items(), key=lambda x: x[1])[0]
                    rationale = self._generate_rationale(paper.content, best_conf)

                    results.append({
                        "paper_id": paper_id,
                        "publishable": 1,
                        "conference": best_conf,
                        "rationale": rationale,
                        "confidence_scores": {
                            "publishability": np.mean(pub_scores),
                            "conference": conf_scores[best_conf]
                        }
                    })
                else:
                    results.append({
                        "paper_id": paper_id,
                        "publishable": 0,
                        "conference": "na",
                        "rationale": "na",
                        "confidence_scores": {
                            "publishability": np.mean(nonpub_scores)
                        }
                    })

            except Exception as e:
                logging.error(f"Error processing {paper.path}: {e}")
                results.append({
                    "paper_id": paper_id,
                    "publishable": 0,
                    "conference": "error",
                    "rationale": str(e),
                    "confidence_scores": {}
                })

        return results

    def calculate_metrics(self, validation_papers: List[Paper]) -> Dict:
        """
        Evaluate the classifier on labelled papers, using
        ``self.reference_papers`` as the reference set.

        Returns:
            Dict with ``accuracy``, ``precision``, ``recall``, ``f1_score``,
            ``conference_accuracy`` (correct conferences divided by the number
            of truly publishable papers) and the raw confusion counts under
            ``metrics_detail``. Every ratio is 0 when its denominator is 0,
            including for an empty validation set.
        """
        true_positives = 0
        false_positives = 0
        true_negatives = 0
        false_negatives = 0

        conference_correct = 0
        total_publishable = 0

        results = self.classify_papers(self.reference_papers, validation_papers)

        for paper, result in zip(validation_papers, results):
            is_actually_publishable = paper.label == "Publishable"
            is_predicted_publishable = result["publishable"] == 1

            if is_predicted_publishable and is_actually_publishable:
                true_positives += 1
                if paper.conference == result["conference"]:
                    conference_correct += 1
            elif is_predicted_publishable and not is_actually_publishable:
                false_positives += 1
            elif not is_predicted_publishable and not is_actually_publishable:
                true_negatives += 1
            else:
                false_negatives += 1

            if is_actually_publishable:
                total_publishable += 1

        total_papers = len(validation_papers)
        accuracy = (true_positives + true_negatives) / total_papers if total_papers > 0 else 0

        precision = true_positives / (true_positives + false_positives) if (true_positives + false_positives) > 0 else 0
        recall = true_positives / (true_positives + false_negatives) if (true_positives + false_negatives) > 0 else 0
        f1 = 2 * (precision * recall) / (precision + recall) if (precision + recall) > 0 else 0

        conference_accuracy = conference_correct / total_publishable if total_publishable > 0 else 0

        return {
            "accuracy": accuracy,
            "precision": precision,
            "recall": recall,
            "f1_score": f1,
            "conference_accuracy": conference_accuracy,
            "metrics_detail": {
                "true_positives": true_positives,
                "false_positives": false_positives,
                "true_negatives": true_negatives,
                "false_negatives": false_negatives,
                "total_papers": total_papers
            }
        }

    def cross_validate(self, k_folds: int = 5) -> Dict:
        """
        Perform k-fold cross-validation on the reference papers.

        Each fold temporarily uses its training split as
        ``self.reference_papers``; the complete reference set is restored
        afterwards, even if a fold raises.

        Args:
            k_folds: Number of folds (shuffled with a fixed random state).

        Returns:
            Dict with the per-fold metrics (``fold_metrics``) and the mean
            accuracy, F1 score and conference accuracy (``average_metrics``).
        """
        kf = KFold(n_splits=k_folds, shuffle=True, random_state=42)
        metrics_list = []

        all_references = list(self.reference_papers)

        try:
            # Split on positions only; the Paper objects stay in a plain list.
            for train_idx, val_idx in kf.split(np.arange(len(all_references))):
                self.reference_papers = [all_references[i] for i in train_idx]
                val_papers = [all_references[i] for i in val_idx]
                metrics_list.append(self.calculate_metrics(val_papers))
        finally:
            # Do not leave the classifier with only the last training fold.
            self.reference_papers = all_references

        # Calculate average metrics
        avg_metrics = {
            "accuracy": np.mean([m["accuracy"] for m in metrics_list]),
            "f1_score": np.mean([m["f1_score"] for m in metrics_list]),
            "conference_accuracy": np.mean([m["conference_accuracy"] for m in metrics_list])
        }

        return {
            "fold_metrics": metrics_list,
            "average_metrics": avg_metrics
        }

    def get_performance_summary(self) -> Dict:
        """
        Summarise the measurements collected in ``self.performance_metrics``.

        Returns:
            Dict with the total and average processing time, average and peak
            process memory (MB), average and peak GPU memory (MB, zero when
            CUDA is unavailable), the number of recorded operations and the
            device name. All numeric fields are 0 when nothing was recorded.
        """
        records = self.performance_metrics
        operation_count = len(records)

        if operation_count == 0:
            # max() over an empty sequence and the per-operation division
            # would both fail, so report an all-zero summary.
            total_time = avg_time = 0.0
            avg_memory = max_memory = 0.0
            avg_gpu = max_gpu = 0
        else:
            total_time = sum(m.processing_time for m in records)
            avg_time = total_time / operation_count
            avg_memory = np.mean([m.memory_used_mb for m in records])
            max_memory = max(m.memory_used_mb for m in records)

            if torch.cuda.is_available():
                avg_gpu = np.mean([m.gpu_memory_mb for m in records])
                max_gpu = max(m.gpu_memory_mb for m in records)
            else:
                avg_gpu = max_gpu = 0

        return {
            "total_processing_time": total_time,
            "average_time_per_operation": avg_time,
            "memory_usage": {
                "average_mb": avg_memory,
                "peak_mb": max_memory
            },
            "gpu_usage": {
                "average_mb": avg_gpu,
                "peak_mb": max_gpu
            },
            "total_operations": operation_count,
            "device_used": str(self.device)
        }


class NumpyEncoder(JSONEncoder):
    """JSON encoder that converts NumPy scalars and arrays to native Python values."""

    def default(self, obj):
        """
        Return a JSON-serialisable equivalent of ``obj``.

        NumPy booleans, integers and floats become ``bool``/``int``/``float``
        and arrays become (nested) lists. Anything else is delegated to
        ``JSONEncoder.default``, which raises ``TypeError``.
        """
        # np.bool_ is neither np.integer nor np.floating, so it needs its own branch.
        if isinstance(obj, np.bool_):
            return bool(obj)
        if isinstance(obj, np.integer):
            return int(obj)
        if isinstance(obj, np.floating):
            return float(obj)
        if isinstance(obj, np.ndarray):
            return obj.tolist()
        return super().default(obj)

def main(credentials_file: str = "credentials.json",
         reference_folder_id: str = "1-658SR6wI7EBthpHFJDHmJ_0iyaubU-f",
         papers_folder_id: str = "1Y2Y0EsMalo26KcJiPYcAXh6UzgMNjh4u",
         results_path: str = "results.json",
         metrics_path: str = "metrics.json"):
    """
    Run the full pipeline: load data, cross-validate, classify and write reports.

    Args:
        credentials_file: Credentials file handed to ``PaperClassifier``.
        reference_folder_id: Google Drive folder id of the reference papers.
        papers_folder_id: Google Drive folder id of the papers to classify.
        results_path: Output file for the submission records (paper id,
            publishable flag, conference, rationale).
        metrics_path: Output file for confidence scores, cross-validation
            results and the performance summary.

    The defaults reproduce the previously hard-coded values, so calling
    ``main()`` without arguments behaves as before.
    """
    # Initialize classifier with credentials file
    classifier = PaperClassifier(credentials_file=credentials_file)

    # Load datasets using Google Drive folder IDs
    reference_papers, papers = classifier.load_dataset(
        reference_folder_id=reference_folder_id,
        papers_folder_id=papers_folder_id
    )

    # Perform cross-validation
    cv_results = classifier.cross_validate(k_folds=5)

    # Classify papers
    classification_results = classifier.classify_papers(reference_papers, papers)

    # Get performance metrics
    performance_summary = classifier.get_performance_summary()

    # Prepare submission results (without confidence scores)
    submission_keys = ("paper_id", "publishable", "conference", "rationale")
    submission_results = [
        {key: result[key] for key in submission_keys}
        for result in classification_results
    ]

    # Prepare metrics results
    metrics_results = {
        "paper_metrics": [{
            "paper_id": result["paper_id"],
            "confidence_scores": result["confidence_scores"]
        } for result in classification_results],
        "cross_validation": cv_results,
        "performance": performance_summary
    }

    # Write submission results
    with open(results_path, "w") as f:
        json.dump(submission_results, f, indent=2, cls=NumpyEncoder)

    # Write metrics
    with open(metrics_path, "w") as f:
        json.dump(metrics_results, f, indent=2, cls=NumpyEncoder)

    # Log summary statistics. These are INFO records, so they are only shown
    # when logging has been configured to display that level.
    average_metrics = cv_results['average_metrics']
    logging.info("Classification Complete!")
    logging.info(f"Total processing time: {float(performance_summary['total_processing_time']):.2f} seconds")
    logging.info(f"Average CV Accuracy: {float(average_metrics['accuracy']):.3f}")
    logging.info(f"Average CV F1 Score: {float(average_metrics['f1_score']):.3f}")
    logging.info(f"Average Conference Accuracy: {float(average_metrics['conference_accuracy']):.3f}")

# Entry point: run the whole pipeline when this code is executed directly
# rather than imported as a module.
if __name__ == "__main__":
    main()

AttributeError: Table has no column with name to_dict.

In [None]:
pip install pathway google-auth google-auth-oauthlib google-auth-httplib2 google-api-python-client

Collecting pathway
  Downloading pathway-0.16.4-cp310-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (40 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/40.8 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m40.8/40.8 kB[0m [31m2.9 MB/s[0m eta [36m0:00:00[0m
Collecting h3>=4 (from pathway)
  Downloading h3-4.1.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (18 kB)
Collecting sqlglot==10.6.1 (from pathway)
  Downloading sqlglot-10.6.1-py3-none-any.whl.metadata (14 kB)
Collecting python-sat>=0.1.8.dev0 (from pathway)
  Downloading python_sat-1.8.dev14-cp310-cp310-manylinux_2_12_x86_64.manylinux2010_x86_64.manylinux_2_28_x86_64.whl.metadata (1.5 kB)
Collecting beartype<0.16.0,>=0.14.0 (from pathway)
  Downloading beartype-0.15.0-py3-none-any.whl.metadata (28 kB)
Collecting diskcache>=5.2.1 (from pathway)
  Downloading diskcache-5.6.3-py3-none-any.whl.metadata (20 kB)
Co

In [None]:
import torch
from transformers import AutoTokenizer, AutoModel, AutoModelForQuestionAnswering
from pathlib import Path
import fitz
import numpy as np
from dataclasses import dataclass
from typing import Dict, List, Tuple
import logging
import json
import time
import psutil
from sklearn.model_selection import KFold
from json import JSONEncoder
import pathway as pw

@dataclass
class Paper:
    """One paper: the text extracted from its PDF plus whatever labels are known for it."""
    content: str  # text extracted from the PDF ("" when extraction failed)
    path: Path  # path built from the file name reported in the Drive metadata
    is_reference: bool  # True for labelled reference papers, False for papers to classify
    label: str = None  # "Publishable" / "NonPublishable" for reference papers, otherwise None
    conference: str = None  # conference of a publishable reference paper, otherwise None

@dataclass
class PerformanceMetrics:
    """Resource measurements recorded for one call of a tracked method."""
    processing_time: float  # wall-clock duration of the call, in seconds
    memory_used_mb: float  # resident set size of the process after the call, in MB
    gpu_memory_mb: float = 0  # peak CUDA memory allocated during the call, in MB (0 without CUDA)

class PaperClassifier:
    def __init__(self):
        self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
        self.conferences = ["TMLR", "CVPR", "EMNLP", "NeurIPS", "KDD"]
        self.reference_papers = []
        self._setup_models()
        self.performance_metrics = []

    def _setup_models(self):
        """Load the embedding and question-answering models and move them to ``self.device``."""
        embedding_checkpoint = "allenai/scibert_scivocab_uncased"
        qa_checkpoint = "deepset/roberta-base-squad2"

        # SciBERT produces the paper embeddings.
        scibert = AutoModel.from_pretrained(embedding_checkpoint)
        self.scibert = scibert.to(self.device)
        self.tokenizer = AutoTokenizer.from_pretrained(embedding_checkpoint)

        # RoBERTa (SQuAD2 checkpoint) extracts the rationale text.
        qa_model = AutoModelForQuestionAnswering.from_pretrained(qa_checkpoint)
        self.qa_model = qa_model.to(self.device)
        self.qa_tokenizer = AutoTokenizer.from_pretrained(qa_checkpoint)

    def _track_performance(func):
        """
        Decorator that records time and memory usage of a method call.

        After each call a ``PerformanceMetrics`` entry (wall-clock seconds,
        process RSS in MB, peak CUDA memory in MB) is appended to
        ``self.performance_metrics``. The wrapped method's return value is
        passed through unchanged.
        """
        import functools

        @functools.wraps(func)  # keep the wrapped method's name and docstring
        def wrapper(self, *args, **kwargs):
            cuda_available = torch.cuda.is_available()
            start_time = time.time()
            # The CUDA statistics API must not be touched on CPU-only hosts.
            if cuda_available:
                torch.cuda.reset_peak_memory_stats()

            result = func(self, *args, **kwargs)

            processing_time = time.time() - start_time
            memory_used = psutil.Process().memory_info().rss / 1024 / 1024  # MB
            gpu_memory = torch.cuda.max_memory_allocated() / 1024 / 1024 if cuda_available else 0

            self.performance_metrics.append(PerformanceMetrics(
                processing_time=processing_time,
                memory_used_mb=memory_used,
                gpu_memory_mb=gpu_memory
            ))

            return result
        return wrapper

    def _generate_rationale(self, paper_content: str, conference: str) -> str:
        conference_contexts = {
            "TMLR": "TMLR is a machine learning research conference focusing on theoretical advances, algorithms, and methodological innovations in machine learning.",
            "CVPR": "CVPR is a premier computer vision conference focusing on visual processing, recognition, understanding, and generation.",
            "EMNLP": "EMNLP is a natural language processing conference focusing on computational linguistics, language understanding, and text processing.",
            "NeurIPS": "NeurIPS is a conference focusing on neural information processing systems, machine learning theory, and artificial intelligence.",
            "KDD": "KDD is a conference focusing on data mining, knowledge discovery, and large-scale data analytics."
        }

        content_preview = paper_content[:1000].replace('\n', ' ').strip()
        try:
            title = content_preview.split('.')[0]
            abstract = ' '.join(content_preview.split('.')[1:3])
        except:
            title = content_preview[:100]
            abstract = content_preview[100:500]

        context = (
            f"Paper Title: {title}\n"
            f"Abstract: {abstract}\n\n"
            f"Conference Information: {conference_contexts[conference]}\n\n"
            "A paper is relevant to a conference if its technical contributions and research focus align with the conference's main themes. "
            "The explanation should describe specific technical aspects of the paper that match the conference's focus areas."
        )

        questions = [
            f"What specific technical contributions make this paper relevant to {conference}?",
            f"How does this paper's methodology align with {conference}'s main themes?",
            f"What is the main innovation of this paper that fits {conference}'s focus?"
        ]

        answers = []
        for question in questions:
            inputs = self.qa_tokenizer(
                question,
                context,
                max_length=386,
                truncation=True,
                padding='max_length',
                return_tensors="pt"
            ).to(self.device)

            with torch.no_grad():
                outputs = self.qa_model(**inputs)
                start_logits = outputs.start_logits
                end_logits = outputs.end_logits

                start_idx = torch.argmax(start_logits)
                end_idx = torch.argmax(end_logits)

                tokens = self.qa_tokenizer.convert_ids_to_tokens(inputs["input_ids"][0])
                answer = self.qa_tokenizer.convert_tokens_to_string(tokens[start_idx:end_idx+1])

                if answer and len(answer.strip()) > 10:
                    answers.append(answer.strip())

        if answers:
            unique_answers = []
            for ans in answers:
                if not any(self._similar_strings(ans, existing) for existing in unique_answers):
                    unique_answers.append(ans)

            rationale = f"This paper is relevant to {conference} because " + ". ".join(unique_answers[:2])
            return rationale
        else:
            return f"This paper appears relevant to {conference} based on its focus on {title}"

    def _similar_strings(self, str1: str, str2: str, threshold: float = 0.7) -> bool:
        if len(str1) < 10 or len(str2) < 10:
            return False
        common = sum(1 for a, b in zip(str1.lower(), str2.lower()) if a == b)
        return common / max(len(str1), len(str2)) > threshold

    def _load_pathway_dataset(self, object_id: str, credentials_file: str) -> Tuple[List[Paper], List[Paper]]:
        """
        Load all PDFs below a Google Drive object through Pathway.

        Args:
            object_id: Drive object id handed to ``pw.io.gdrive.read``.
            credentials_file: Service-account credentials file for the connector.

        Returns:
            ``(reference_papers, papers_to_classify)``. The reference papers
            are also stored in ``self.reference_papers``.

        NOTE(review): a file is labelled by looking for "Reference",
        "Publishable" and the conference names in the ``name`` metadata
        field. Confirm that this field carries the folder names and not only
        the bare file name; otherwise every file ends up in
        ``papers_to_classify``.
        """
        # Read data using Pathway
        table = pw.io.gdrive.read(
            object_id=object_id,
            service_user_credentials_file=credentials_file,
            file_name_pattern="*.pdf",
            with_metadata=True,
            mode="static"
        )

        # ``compute_and_print`` only prints and returns None, so the rows are
        # materialised into a pandas DataFrame instead.
        df = pw.debug.table_to_pandas(table)

        reference_papers = []
        papers_to_classify = []

        # Process each file
        for _, row in df.iterrows():
            pdf_content = row['data']
            file_name = row['_metadata']['name']
            # The metadata may be wrapped in a Pathway JSON object; unwrap it
            # to the plain value before building a Path.
            file_name = getattr(file_name, "value", file_name)
            file_path = Path(str(file_name))
            path_text = str(file_path)

            text_content = self._extract_pdf_content_from_binary(pdf_content)

            if "Reference" in path_text:
                if "Publishable" in path_text:
                    conf = next((c for c in self.conferences if c in path_text), None)
                    reference_papers.append(Paper(
                        content=text_content,
                        path=file_path,
                        is_reference=True,
                        label="Publishable",
                        conference=conf
                    ))
                else:
                    reference_papers.append(Paper(
                        content=text_content,
                        path=file_path,
                        is_reference=True,
                        label="NonPublishable"
                    ))
            else:
                papers_to_classify.append(Paper(
                    content=text_content,
                    path=file_path,
                    is_reference=False
                ))

        self.reference_papers = reference_papers
        return reference_papers, papers_to_classify

    def _extract_pdf_content_from_binary(self, pdf_binary: bytes) -> str:
        try:
            doc = fitz.open(stream=pdf_binary, filetype="pdf")
            text = ""
            for page in doc:
                text += page.get_text()
            return text.strip()
        except Exception as e:
            logging.error(f"Error extracting content from PDF binary: {e}")
            return ""

    @_track_performance
    def _get_embedding(self, text: str) -> torch.Tensor:
        """
        Embed ``text`` with SciBERT.

        The text is truncated to 512 tokens and the last hidden states are
        averaged over the token dimension.
        """
        tokenizer_options = dict(
            max_length=512,
            truncation=True,
            padding=True,
            return_tensors="pt",
        )
        encoded = self.tokenizer(text, **tokenizer_options).to(self.device)

        # Inference only: no gradients are needed.
        with torch.no_grad():
            model_output = self.scibert(**encoded)

        token_states = model_output.last_hidden_state
        return token_states.mean(dim=1)

    def _cosine_similarity(self, a: torch.Tensor, b: torch.Tensor) -> float:
        return torch.cosine_similarity(a, b, dim=1).cpu().numpy()[0]

    @_track_performance
    def classify_papers(self, reference_papers: List[Paper], papers: List[Paper]) -> List[Dict]:
        """Classify papers by embedding similarity to labelled reference papers.

        A paper is predicted publishable when its mean cosine similarity to
        the publishable references exceeds its mean similarity to the
        non-publishable ones. For publishable papers the conference whose
        references are most similar on average is chosen and a rationale is
        generated.

        An empty reference group contributes a mean similarity of 0.0 instead
        of NaN, and publishable references without a conference are used for
        the publishability decision only (they never become a candidate
        conference).

        Args:
            reference_papers: Labelled papers to compare against.
            papers: Papers to classify.

        Returns:
            One dict per input paper, in input order, with the keys
            ``paper_id``, ``publishable`` (1 or 0), ``conference``,
            ``rationale`` and ``confidence_scores``. If processing a paper
            raises, its entry has ``conference == "error"`` and the exception
            text as rationale.
        """
        results = []
        ref_embeddings = {
            "publishable": [],
            "conferences": {},
            "nonpublishable": []
        }

        for ref in reference_papers:
            emb = self._get_embedding(ref.content)
            if ref.label == "Publishable":
                ref_embeddings["publishable"].append(emb)
                # Skip the per-conference index when no conference is known,
                # otherwise `None` could be picked as the best conference.
                if ref.conference is not None:
                    ref_embeddings["conferences"].setdefault(ref.conference, []).append(emb)
            else:
                ref_embeddings["nonpublishable"].append(emb)

        def mean_similarity(paper_emb, group):
            # np.mean([]) would yield NaN (plus a RuntimeWarning) and NaN is
            # not valid JSON, so an empty group scores 0.0.
            if not group:
                return 0.0
            return float(np.mean([self._cosine_similarity(paper_emb, ref) for ref in group]))

        for paper in papers:
            try:
                emb = self._get_embedding(paper.content)

                pub_score = mean_similarity(emb, ref_embeddings["publishable"])
                nonpub_score = mean_similarity(emb, ref_embeddings["nonpublishable"])

                is_publishable = pub_score > nonpub_score

                if is_publishable:
                    conf_scores = {
                        conf: mean_similarity(emb, conf_embs)
                        for conf, conf_embs in ref_embeddings["conferences"].items()
                    }
                    if not conf_scores:
                        # Reported through the error record below.
                        raise ValueError("no conference-labelled reference papers to compare against")

                    best_conf = max(conf_scores, key=conf_scores.get)
                    rationale = self._generate_rationale(paper.content, best_conf)

                    results.append({
                        "paper_id": paper.path.stem,
                        "publishable": 1,
                        "conference": best_conf,
                        "rationale": rationale,
                        "confidence_scores": {
                            "publishability": pub_score,
                            "conference": conf_scores[best_conf]
                        }
                    })
                else:
                    results.append({
                        "paper_id": paper.path.stem,
                        "publishable": 0,
                        "conference": "na",
                        "rationale": "na",
                        "confidence_scores": {
                            "publishability": nonpub_score
                        }
                    })

            except Exception as e:
                logging.error(f"Error processing {paper.path}: {e}")
                results.append({
                    "paper_id": paper.path.stem,
                    "publishable": 0,
                    "conference": "error",
                    "rationale": str(e),
                    "confidence_scores": {}
                })

        return results

    def calculate_metrics(self, validation_papers: List[Paper]) -> Dict:
        """Score the classifier on labelled papers.

        The papers are classified against ``self.reference_papers`` and the
        predictions are compared with each paper's ``label`` and
        ``conference``.

        Args:
            validation_papers: Labelled papers to evaluate on. May be empty,
                in which case every ratio is reported as 0.

        Returns:
            Dict with ``accuracy``, ``precision``, ``recall``, ``f1_score``,
            ``conference_accuracy`` (correct conferences among true positives
            divided by all actually publishable papers) and a
            ``metrics_detail`` dict holding the confusion-matrix counts.
        """
        true_positives = 0
        false_positives = 0
        true_negatives = 0
        false_negatives = 0

        conference_correct = 0
        total_publishable = 0

        results = self.classify_papers(self.reference_papers, validation_papers)

        for paper, result in zip(validation_papers, results):
            is_actually_publishable = paper.label == "Publishable"
            is_predicted_publishable = result["publishable"] == 1

            if is_predicted_publishable and is_actually_publishable:
                true_positives += 1
                if paper.conference == result["conference"]:
                    conference_correct += 1
            elif is_predicted_publishable and not is_actually_publishable:
                false_positives += 1
            elif not is_predicted_publishable and not is_actually_publishable:
                true_negatives += 1
            else:
                false_negatives += 1

            if is_actually_publishable:
                total_publishable += 1

        total_papers = len(validation_papers)
        # Guard like the other ratios: an empty validation set used to raise
        # ZeroDivisionError here.
        accuracy = (true_positives + true_negatives) / total_papers if total_papers > 0 else 0

        precision = true_positives / (true_positives + false_positives) if (true_positives + false_positives) > 0 else 0
        recall = true_positives / (true_positives + false_negatives) if (true_positives + false_negatives) > 0 else 0
        f1 = 2 * (precision * recall) / (precision + recall) if (precision + recall) > 0 else 0

        conference_accuracy = conference_correct / total_publishable if total_publishable > 0 else 0

        return {
            "accuracy": accuracy,
            "precision": precision,
            "recall": recall,
            "f1_score": f1,
            "conference_accuracy": conference_accuracy,
            "metrics_detail": {
                "true_positives": true_positives,
                "false_positives": false_positives,
                "true_negatives": true_negatives,
                "false_negatives": false_negatives,
                "total_papers": total_papers
            }
        }

    def cross_validate(self, k_folds: int = 5) -> Dict:
        """Perform k-fold cross-validation on the reference papers.

        For each fold, ``self.reference_papers`` is temporarily replaced by
        the training split while the held-out split is scored with
        ``calculate_metrics``. The original reference list is restored
        afterwards, even if a fold raises.

        Args:
            k_folds: Number of folds passed to ``KFold`` (shuffled,
                ``random_state=42``).

        Returns:
            Dict with ``fold_metrics`` (one ``calculate_metrics`` result per
            fold) and ``average_metrics`` (mean accuracy, F1 score and
            conference accuracy over the folds).
        """
        kf = KFold(n_splits=k_folds, shuffle=True, random_state=42)
        metrics_list = []

        original_reference_papers = self.reference_papers
        all_papers = list(original_reference_papers)

        try:
            # The split depends only on the sample count, so indices suffice.
            for train_idx, val_idx in kf.split(np.arange(len(all_papers))):
                self.reference_papers = [all_papers[i] for i in train_idx]
                val_papers = [all_papers[i] for i in val_idx]
                metrics_list.append(self.calculate_metrics(val_papers))
        finally:
            # Previously the last fold's training subset was left behind as
            # the classifier's reference set.
            self.reference_papers = original_reference_papers

        # Calculate average metrics
        avg_metrics = {
            "accuracy": np.mean([m["accuracy"] for m in metrics_list]),
            "f1_score": np.mean([m["f1_score"] for m in metrics_list]),
            "conference_accuracy": np.mean([m["conference_accuracy"] for m in metrics_list])
        }

        return {
            "fold_metrics": metrics_list,
            "average_metrics": avg_metrics
        }

    def get_performance_summary(self) -> Dict:
        """Summarize the measurements collected in ``self.performance_metrics``.

        Returns:
            Dict with the total and average processing time, average and peak
            memory usage, average and peak GPU memory (0 when CUDA is not
            available), the number of tracked operations and the device name.
            When nothing has been tracked yet, all numeric values are 0.
        """
        records = self.performance_metrics
        operation_count = len(records)

        if operation_count == 0:
            # max() of an empty sequence and the division below would raise.
            total_time = 0.0
            average_time = 0.0
            avg_memory = max_memory = 0.0
            avg_gpu = max_gpu = 0
        else:
            total_time = sum(m.processing_time for m in records)
            average_time = total_time / operation_count
            avg_memory = np.mean([m.memory_used_mb for m in records])
            max_memory = max(m.memory_used_mb for m in records)

            if torch.cuda.is_available():
                avg_gpu = np.mean([m.gpu_memory_mb for m in records])
                max_gpu = max(m.gpu_memory_mb for m in records)
            else:
                avg_gpu = max_gpu = 0

        return {
            "total_processing_time": total_time,
            "average_time_per_operation": average_time,
            "memory_usage": {
                "average_mb": avg_memory,
                "peak_mb": max_memory
            },
            "gpu_usage": {
                "average_mb": avg_gpu,
                "peak_mb": max_gpu
            },
            "total_operations": operation_count,
            "device_used": str(self.device)
        }


class NumpyEncoder(JSONEncoder):
    """JSON encoder that converts NumPy integers, floats and arrays to built-in types."""

    def default(self, obj):
        """Return a JSON-serializable equivalent of ``obj``, deferring to the base class otherwise."""
        if isinstance(obj, np.ndarray):
            return obj.tolist()
        if isinstance(obj, np.integer):
            return int(obj)
        if isinstance(obj, np.floating):
            return float(obj)
        # Anything else is unsupported here; the base class raises TypeError.
        return super().default(obj)

def main():
    """Run the pipeline end to end and write the JSON reports.

    Loads the papers via ``_load_pathway_dataset``, cross-validates on the
    reference papers, classifies the remaining papers, then writes the
    predictions to ``results.json`` and the confidence scores,
    cross-validation results and performance summary to ``metrics.json`` in
    the current working directory. Summary statistics are logged at INFO
    level.
    """
    paper_classifier = PaperClassifier()

    refs, unlabeled = paper_classifier._load_pathway_dataset(
        object_id="1Z8z4craj36ighb8hzUzeM76OOgpUdsKr",
        credentials_file="credentials.json"
    )

    cv_report = paper_classifier.cross_validate(k_folds=5)
    predictions = paper_classifier.classify_papers(refs, unlabeled)
    perf = paper_classifier.get_performance_summary()

    # Submission rows deliberately leave out the confidence scores.
    submission_keys = ("paper_id", "publishable", "conference", "rationale")
    submission_rows = [
        {key: prediction[key] for key in submission_keys}
        for prediction in predictions
    ]

    metrics_report = {
        "paper_metrics": [
            {
                "paper_id": prediction["paper_id"],
                "confidence_scores": prediction["confidence_scores"]
            }
            for prediction in predictions
        ],
        "cross_validation": cv_report,
        "performance": perf
    }

    # NumpyEncoder converts NumPy scalars/arrays the stock encoder rejects.
    outputs = (("results.json", submission_rows), ("metrics.json", metrics_report))
    for filename, payload in outputs:
        with open(filename, "w") as f:
            json.dump(payload, f, indent=2, cls=NumpyEncoder)

    averages = cv_report['average_metrics']
    logging.info("Classification Complete!")
    logging.info(f"Total processing time: {float(perf['total_processing_time']):.2f} seconds")
    logging.info(f"Average CV Accuracy: {float(averages['accuracy']):.3f}")
    logging.info(f"Average CV F1 Score: {float(averages['f1_score']):.3f}")
    logging.info(f"Average Conference Accuracy: {float(averages['conference_accuracy']):.3f}")

# Run the pipeline only when this code is executed as the main module.
if __name__ == "__main__":
    main()



Buffered data was truncated after reaching the output size limit.

WITH PATHWAY VECTOR STORE AND WITHOUT CONNECTOR


In [None]:
import torch
from transformers import AutoModelForQuestionAnswering, AutoTokenizer
from pathlib import Path
import fitz
import numpy as np
from dataclasses import dataclass
from typing import Dict, List, Tuple
import logging
import json
import time
import psutil
from sklearn.model_selection import KFold
from json import JSONEncoder
import pathway as pw
from pathway.xpacks.llm.vector_store import VectorStoreServer, VectorStoreClient
from transformers import AutoTokenizer, AutoModel


@dataclass
class Paper:
    """Extracted text of one paper together with its location and optional labels."""
    content: str  # text extracted from the PDF
    path: Path  # location the paper was loaded from
    is_reference: bool  # True for labelled reference papers, False for papers to classify
    label: str = None  # "Publishable" or "NonPublishable" for reference papers, else None
    conference: str = None  # conference name for publishable reference papers, else None

@dataclass
class PerformanceMetrics:
    """Resource measurements recorded for one tracked method call."""
    processing_time: float  # wall-clock duration of the call in seconds
    memory_used_mb: float  # resident set size of the process in MB, sampled after the call
    gpu_memory_mb: float = 0  # peak CUDA memory allocated in MB; 0 when CUDA is unavailable

class PaperClassifier:
    """Classify research papers as publishable or not and suggest a conference.

    Reference papers are indexed in a Pathway vector store using SciBERT
    embeddings. Each paper to classify is scored against its nearest
    reference papers, and a RoBERTa question-answering model produces a short
    rationale for the suggested conference.
    """

    def __init__(self):
        """Pick the device, load the models and start the vector store."""
        self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
        self.conferences = ["TMLR", "CVPR", "EMNLP", "NeurIPS", "KDD"]
        self.reference_papers = []
        self._setup_models()
        self.performance_metrics = []
        self._setup_vector_store()

    def _setup_models(self):
        """Load the RoBERTa question-answering model used for rationales."""
        # Only keep RoBERTa for rationale generation
        self.qa_model = AutoModelForQuestionAnswering.from_pretrained("deepset/roberta-base-squad2").to(self.device)
        self.qa_tokenizer = AutoTokenizer.from_pretrained("deepset/roberta-base-squad2")

    def _setup_vector_store(self):
        """Initialize Pathway vector store server and client"""
        # Setup SciBERT embedder
        self.tokenizer = AutoTokenizer.from_pretrained("allenai/scibert_scivocab_uncased")
        self.scibert = AutoModel.from_pretrained("allenai/scibert_scivocab_uncased").to(self.device)

        def embed_text(text: str) -> list[float]:
            inputs = self.tokenizer(
                text,
                max_length=512,
                truncation=True,
                padding=True,
                return_tensors="pt"
            ).to(self.device)

            with torch.no_grad():
                outputs = self.scibert(**inputs)
            # Convert mean pooled output to list of floats
            embedding = outputs.last_hidden_state.mean(dim=1).cpu().numpy()[0].tolist()
            return embedding

        # Initialize server
        self.vector_server = VectorStoreServer(
            embedder=embed_text,
            parser=None  # We're handling parsing separately
        )

        # Start server in a thread
        self.vector_server.run_server(
            host="localhost",
            port=8000,
            threaded=True
        )

        # Initialize client
        self.vector_client = VectorStoreClient(
            host="localhost",
            port=8000
        )

    def _index_reference_papers(self, reference_papers: List[Paper]):
        """Index reference papers using vector store server"""
        documents = [{
            "content": paper.content,
            "metadata": {
                "label": paper.label,
                "conference": paper.conference,
                "path": str(paper.path)
            }
        } for paper in reference_papers]

        # NOTE(review): `pw.Table.from_list` and `VectorStoreServer.add_documents`
        # could not be confirmed against the public Pathway API - verify with
        # the installed version. Also, every call adds documents again, so
        # repeated calls (e.g. once per cross-validation fold) presumably
        # accumulate entries in the index - TODO confirm.
        table = pw.Table.from_list(documents)
        self.vector_server.add_documents(table)

    def _get_similar_papers(self, query_paper: Paper, top_k: int = 5) -> List[Dict]:
        """Query similar papers using vector store client"""
        results = self.vector_client.query(
            query=query_paper.content,
            k=top_k
        )
        return results

    def _track_performance(func):
        """Decorator recording time, process memory and peak GPU memory of a method call.

        Defined as a plain function in the class body so it can decorate the
        methods that follow. A ``PerformanceMetrics`` entry is appended to
        ``self.performance_metrics`` after each successful call.
        """
        from functools import wraps

        @wraps(func)
        def wrapper(self, *args, **kwargs):
            cuda_available = torch.cuda.is_available()
            start_time = time.time()
            # Resetting CUDA statistics without a CUDA device raises, so the
            # reset is guarded the same way as the read below.
            if cuda_available:
                torch.cuda.reset_peak_memory_stats()

            result = func(self, *args, **kwargs)

            processing_time = time.time() - start_time
            memory_used = psutil.Process().memory_info().rss / 1024 / 1024  # MB
            gpu_memory = torch.cuda.max_memory_allocated() / 1024 / 1024 if cuda_available else 0

            self.performance_metrics.append(PerformanceMetrics(
                processing_time=processing_time,
                memory_used_mb=memory_used,
                gpu_memory_mb=gpu_memory
            ))

            return result
        return wrapper

    def _generate_rationale(self, paper_content: str, conference: str) -> str:
        """Build a short explanation of why a paper fits ``conference``.

        The first 1000 characters of the paper are split on '.' into a title
        (first segment) and an abstract (next two segments). The QA model is
        asked three questions over that context; up to two distinct answers
        longer than 10 characters are joined into the rationale.

        Args:
            paper_content: Full text of the paper.
            conference: Name of the suggested conference.

        Returns:
            The rationale sentence, or a generic fallback mentioning the
            title when the QA model yields no usable answer.
        """
        conference_contexts = {
            "TMLR": "TMLR is a machine learning research conference focusing on theoretical advances, algorithms, and methodological innovations in machine learning.",
            "CVPR": "CVPR is a premier computer vision conference focusing on visual processing, recognition, understanding, and generation.",
            "EMNLP": "EMNLP is a natural language processing conference focusing on computational linguistics, language understanding, and text processing.",
            "NeurIPS": "NeurIPS is a conference focusing on neural information processing systems, machine learning theory, and artificial intelligence.",
            "KDD": "KDD is a conference focusing on data mining, knowledge discovery, and large-scale data analytics."
        }
        # Unknown conference names get a neutral description instead of a KeyError.
        conference_info = conference_contexts.get(
            conference, f"{conference} is an academic research conference."
        )

        content_preview = paper_content[:1000].replace('\n', ' ').strip()
        # str.split always returns at least one element, so no fallback
        # (previously a bare `except:`) is needed here.
        sentences = content_preview.split('.')
        title = sentences[0]
        abstract = ' '.join(sentences[1:3])

        context = (
            f"Paper Title: {title}\n"
            f"Abstract: {abstract}\n\n"
            f"Conference Information: {conference_info}\n\n"
            "A paper is relevant to a conference if its technical contributions and research focus align with the conference's main themes. "
            "The explanation should describe specific technical aspects of the paper that match the conference's focus areas."
        )

        questions = [
            f"What specific technical contributions make this paper relevant to {conference}?",
            f"How does this paper's methodology align with {conference}'s main themes?",
            f"What is the main innovation of this paper that fits {conference}'s focus?"
        ]

        answers = []
        for question in questions:
            inputs = self.qa_tokenizer(
                question,
                context,
                max_length=386,
                truncation=True,
                padding='max_length',
                return_tensors="pt"
            ).to(self.device)

            with torch.no_grad():
                outputs = self.qa_model(**inputs)

            # The answer span runs from the most likely start token to the
            # most likely end token; it is empty when end precedes start.
            start_idx = int(torch.argmax(outputs.start_logits))
            end_idx = int(torch.argmax(outputs.end_logits))

            tokens = self.qa_tokenizer.convert_ids_to_tokens(inputs["input_ids"][0])
            answer = self.qa_tokenizer.convert_tokens_to_string(tokens[start_idx:end_idx + 1])

            if answer and len(answer.strip()) > 10:
                answers.append(answer.strip())

        if answers:
            unique_answers = []
            for ans in answers:
                if not any(self._similar_strings(ans, existing) for existing in unique_answers):
                    unique_answers.append(ans)

            rationale = f"This paper is relevant to {conference} because " + ". ".join(unique_answers[:2])
            return rationale
        else:
            return f"This paper appears relevant to {conference} based on its focus on {title}"

    def _similar_strings(self, str1: str, str2: str, threshold: float = 0.7) -> bool:
        """Return True when the strings match position-by-position above ``threshold``.

        Characters are compared case-insensitively at equal positions and the
        match count is divided by the longer length. Strings shorter than 10
        characters are never considered similar.
        """
        if len(str1) < 10 or len(str2) < 10:
            return False
        common = sum(1 for a, b in zip(str1.lower(), str2.lower()) if a == b)
        return common / max(len(str1), len(str2)) > threshold

    def load_dataset(self, base_path: str) -> Tuple[List[Paper], List[Paper]]:
        """Load reference papers and papers to classify from a directory tree.

        Expected layout under ``base_path``:
        ``Reference/Publishable/<conference>/*.pdf``,
        ``Reference/Non-Publishable/*.pdf`` and ``Papers/*.pdf``.
        Files are visited in sorted order so that the seeded cross-validation
        folds are reproducible across runs.

        Args:
            base_path: Root directory of the dataset.

        Returns:
            ``(reference_papers, papers_to_classify)``. The reference list is
            also stored on ``self.reference_papers``.
        """
        base_path = Path(base_path)
        reference_papers = []
        papers_to_classify = []

        ref_path = base_path / "Reference"
        pub_path = ref_path / "Publishable"

        for conf in self.conferences:
            conf_path = pub_path / conf
            for pdf_path in sorted(conf_path.glob("*.pdf")):
                content = self._extract_pdf_content(pdf_path)
                reference_papers.append(Paper(
                    content=content,
                    path=pdf_path,
                    is_reference=True,
                    label="Publishable",
                    conference=conf
                ))

        nonpub_path = ref_path / "Non-Publishable"
        for pdf_path in sorted(nonpub_path.glob("*.pdf")):
            content = self._extract_pdf_content(pdf_path)
            reference_papers.append(Paper(
                content=content,
                path=pdf_path,
                is_reference=True,
                label="NonPublishable"
            ))

        papers_path = base_path / "Papers"
        for pdf_path in sorted(papers_path.glob("*.pdf")):
            content = self._extract_pdf_content(pdf_path)
            papers_to_classify.append(Paper(
                content=content,
                path=pdf_path,
                is_reference=False
            ))

        self.reference_papers = reference_papers
        return reference_papers, papers_to_classify

    def _extract_pdf_content(self, pdf_path: Path) -> str:
        """Return the stripped text of all pages of a PDF, or "" on failure (logged)."""
        try:
            # The context manager releases the file handle; it used to leak.
            with fitz.open(pdf_path) as doc:
                return "".join(page.get_text() for page in doc).strip()
        except Exception as e:
            logging.error(f"Error extracting content from {pdf_path}: {e}")
            return ""

    @_track_performance
    def classify_papers(self, reference_papers: List[Paper], papers: List[Paper],
                        publishable_threshold: float = 0.7) -> List[Dict]:
        """Classify papers from their nearest reference papers in the vector store.

        A paper is predicted publishable when the mean score of its
        publishable neighbours exceeds ``publishable_threshold``; the
        suggested conference is the one with the highest mean neighbour score.

        Args:
            reference_papers: Labelled papers to index before querying.
            papers: Papers to classify.
            publishable_threshold: Minimum mean score of publishable
                neighbours for a positive prediction.

        Returns:
            One dict per input paper, in input order, with the keys
            ``paper_id``, ``publishable`` (1 or 0), ``conference``,
            ``rationale`` and ``confidence_scores``. If processing a paper
            raises, its entry has ``conference == "error"`` and the exception
            text as rationale.
        """
        results = []

        # Index reference papers
        self._index_reference_papers(reference_papers)

        for paper in papers:
            try:
                # Get similar papers
                similar_papers = self._get_similar_papers(paper)

                # Analyze similarity scores and metadata
                publishable_scores = []
                conference_scores = {conf: [] for conf in self.conferences}

                for result in similar_papers:
                    # NOTE(review): assumes each hit exposes `.score` and
                    # `.metadata` attributes; the Pathway client may instead
                    # return plain dicts with a `dist` entry - confirm.
                    score = result.score
                    metadata = result.metadata

                    if metadata["label"] == "Publishable":
                        publishable_scores.append(score)
                        if metadata["conference"]:
                            # setdefault tolerates conferences outside self.conferences.
                            conference_scores.setdefault(metadata["conference"], []).append(score)

                # Determine publishability
                avg_publishable_score = np.mean(publishable_scores) if publishable_scores else 0
                is_publishable = avg_publishable_score > publishable_threshold

                if is_publishable:
                    # Find best conference
                    conf_avg_scores = {
                        conf: np.mean(scores) if scores else 0
                        for conf, scores in conference_scores.items()
                    }
                    best_conf = max(conf_avg_scores.items(), key=lambda x: x[1])[0]

                    # Generate rationale
                    rationale = self._generate_rationale(paper.content, best_conf)

                    results.append({
                        "paper_id": paper.path.stem,
                        "publishable": 1,
                        "conference": best_conf,
                        "rationale": rationale,
                        "confidence_scores": {
                            "publishability": avg_publishable_score,
                            "conference": conf_avg_scores[best_conf]
                        }
                    })
                else:
                    results.append({
                        "paper_id": paper.path.stem,
                        "publishable": 0,
                        "conference": "na",
                        "rationale": "na",
                        "confidence_scores": {
                            "publishability": avg_publishable_score
                        }
                    })

            except Exception as e:
                logging.error(f"Error processing {paper.path}: {e}")
                results.append({
                    "paper_id": paper.path.stem,
                    "publishable": 0,
                    "conference": "error",
                    "rationale": str(e),
                    "confidence_scores": {}
                })

        return results

    def calculate_metrics(self, validation_papers: List[Paper]) -> Dict:
        """Score the classifier on labelled papers.

        Args:
            validation_papers: Labelled papers to evaluate on. May be empty,
                in which case every ratio is reported as 0.

        Returns:
            Dict with ``accuracy``, ``precision``, ``recall``, ``f1_score``,
            ``conference_accuracy`` (correct conferences among true positives
            divided by all actually publishable papers) and a
            ``metrics_detail`` dict holding the confusion-matrix counts.
        """
        true_positives = 0
        false_positives = 0
        true_negatives = 0
        false_negatives = 0

        conference_correct = 0
        total_publishable = 0

        results = self.classify_papers(self.reference_papers, validation_papers)

        for paper, result in zip(validation_papers, results):
            is_actually_publishable = paper.label == "Publishable"
            is_predicted_publishable = result["publishable"] == 1

            if is_predicted_publishable and is_actually_publishable:
                true_positives += 1
                if paper.conference == result["conference"]:
                    conference_correct += 1
            elif is_predicted_publishable and not is_actually_publishable:
                false_positives += 1
            elif not is_predicted_publishable and not is_actually_publishable:
                true_negatives += 1
            else:
                false_negatives += 1

            if is_actually_publishable:
                total_publishable += 1

        total_papers = len(validation_papers)
        # An empty validation set used to raise ZeroDivisionError here.
        accuracy = (true_positives + true_negatives) / total_papers if total_papers > 0 else 0

        precision = true_positives / (true_positives + false_positives) if (true_positives + false_positives) > 0 else 0
        recall = true_positives / (true_positives + false_negatives) if (true_positives + false_negatives) > 0 else 0
        f1 = 2 * (precision * recall) / (precision + recall) if (precision + recall) > 0 else 0

        conference_accuracy = conference_correct / total_publishable if total_publishable > 0 else 0

        return {
            "accuracy": accuracy,
            "precision": precision,
            "recall": recall,
            "f1_score": f1,
            "conference_accuracy": conference_accuracy,
            "metrics_detail": {
                "true_positives": true_positives,
                "false_positives": false_positives,
                "true_negatives": true_negatives,
                "false_negatives": false_negatives,
                "total_papers": total_papers
            }
        }

    def cross_validate(self, k_folds: int = 5) -> Dict:
        """Perform k-fold cross-validation on the reference papers.

        For each fold, ``self.reference_papers`` is temporarily replaced by
        the training split while the held-out split is scored. The original
        reference list is restored afterwards, even if a fold raises.

        Args:
            k_folds: Number of folds passed to ``KFold`` (shuffled,
                ``random_state=42``).

        Returns:
            Dict with ``fold_metrics`` (one ``calculate_metrics`` result per
            fold) and ``average_metrics`` (mean accuracy, F1 score and
            conference accuracy over the folds).
        """
        kf = KFold(n_splits=k_folds, shuffle=True, random_state=42)
        metrics_list = []

        original_reference_papers = self.reference_papers
        all_papers = list(original_reference_papers)

        try:
            # The split depends only on the sample count, so indices suffice.
            for train_idx, val_idx in kf.split(np.arange(len(all_papers))):
                self.reference_papers = [all_papers[i] for i in train_idx]
                val_papers = [all_papers[i] for i in val_idx]
                metrics_list.append(self.calculate_metrics(val_papers))
        finally:
            # Previously the last fold's training subset was left behind as
            # the classifier's reference set.
            self.reference_papers = original_reference_papers

        avg_metrics = {
            "accuracy": np.mean([m["accuracy"] for m in metrics_list]),
            "f1_score": np.mean([m["f1_score"] for m in metrics_list]),
            "conference_accuracy": np.mean([m["conference_accuracy"] for m in metrics_list])
        }

        return {
            "fold_metrics": metrics_list,
            "average_metrics": avg_metrics
        }

    def _vector_store_stats(self) -> Dict:
        """Collect best-effort vector store statistics for the performance summary."""
        stats = {"indexed_documents": 0, "embedding_dimensions": 0}
        try:
            # NOTE(review): assumes the client offers get_vectorstore_statistics()
            # returning a dict with a "file_count" entry - confirm against the
            # installed Pathway version. Failures fall back to 0.
            store_stats = self.vector_client.get_vectorstore_statistics()
            stats["indexed_documents"] = store_stats.get("file_count", 0)
        except Exception as e:
            logging.warning(f"Could not fetch vector store statistics: {e}")

        # The embedder mean-pools SciBERT hidden states, so the embedding
        # width is the model's hidden size.
        config = getattr(getattr(self, "scibert", None), "config", None)
        stats["embedding_dimensions"] = getattr(config, "hidden_size", 0)
        return stats

    def get_performance_summary(self) -> Dict:
        """Summarize the measurements collected in ``self.performance_metrics``.

        Returns:
            Dict with total and average processing time, average and peak
            memory usage, average and peak GPU memory (0 when CUDA is not
            available), the number of tracked operations, the device name and
            ``vector_store_stats``. When nothing has been tracked yet, the
            timing and memory values are 0.
        """
        records = self.performance_metrics
        operation_count = len(records)

        if operation_count == 0:
            # max() of an empty sequence and the division below would raise.
            total_time = 0.0
            average_time = 0.0
            avg_memory = max_memory = 0.0
            avg_gpu = max_gpu = 0
        else:
            total_time = sum(m.processing_time for m in records)
            average_time = total_time / operation_count
            avg_memory = np.mean([m.memory_used_mb for m in records])
            max_memory = max(m.memory_used_mb for m in records)

            if torch.cuda.is_available():
                avg_gpu = np.mean([m.gpu_memory_mb for m in records])
                max_gpu = max(m.gpu_memory_mb for m in records)
            else:
                avg_gpu = max_gpu = 0

        return {
            "total_processing_time": total_time,
            "average_time_per_operation": average_time,
            "memory_usage": {
                "average_mb": avg_memory,
                "peak_mb": max_memory
            },
            "gpu_usage": {
                "average_mb": avg_gpu,
                "peak_mb": max_gpu
            },
            "total_operations": operation_count,
            "device_used": str(self.device),
            # The class has no `vector_store` attribute (only `vector_server`
            # and `vector_client`), so the stats come from a guarded helper.
            "vector_store_stats": self._vector_store_stats()
        }


# The definitions below were indented into the PaperClassifier body, which
# made main() run while the class was still being created. They belong at
# module level.
class NumpyEncoder(JSONEncoder):
    """JSON encoder that converts NumPy integers, floats and arrays to built-in types."""

    def default(self, obj):
        if isinstance(obj, np.integer):
            return int(obj)
        if isinstance(obj, np.floating):
            return float(obj)
        if isinstance(obj, np.ndarray):
            return obj.tolist()
        return super(NumpyEncoder, self).default(obj)


def main():
    """Run the pipeline end to end and write ``results.json`` and ``metrics.json``."""
    # Initialize classifier
    classifier = PaperClassifier()

    # Load datasets
    reference_papers, papers = classifier.load_dataset("/content/drive/MyDrive/KDSH_2025_Dataset")

    # Perform cross-validation
    cv_results = classifier.cross_validate(k_folds=5)

    # Classify papers
    classification_results = classifier.classify_papers(reference_papers, papers)

    # Get performance metrics
    performance_summary = classifier.get_performance_summary()

    # Prepare submission results (without confidence scores)
    submission_results = [{
        "paper_id": result["paper_id"],
        "publishable": result["publishable"],
        "conference": result["conference"],
        "rationale": result["rationale"]
    } for result in classification_results]

    # Prepare metrics results
    metrics_results = {
        "paper_metrics": [{
            "paper_id": result["paper_id"],
            "confidence_scores": result["confidence_scores"]
        } for result in classification_results],
        "cross_validation": cv_results,
        "performance": performance_summary
    }

    # Write submission results to results.json
    with open("results.json", "w") as f:
        json.dump(submission_results, f, indent=2, cls=NumpyEncoder)

    # Write metrics to metrics.json
    with open("metrics.json", "w") as f:
        json.dump(metrics_results, f, indent=2, cls=NumpyEncoder)

    # Log summary statistics
    logging.info("Classification Complete!")
    logging.info(f"Total processing time: {float(performance_summary['total_processing_time']):.2f} seconds")
    logging.info(f"Average CV Accuracy: {float(cv_results['average_metrics']['accuracy']):.3f}")
    logging.info(f"Average CV F1 Score: {float(cv_results['average_metrics']['f1_score']):.3f}")
    logging.info(f"Average Conference Accuracy: {float(cv_results['average_metrics']['conference_accuracy']):.3f}")


if __name__ == "__main__":
    main()