In [None]:
import os
import re
import tempfile

import nltk
import PyPDF2
import streamlit as st
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [None]:
# Fetch the NLTK resources used later in the notebook: the 'punkt_tab' tokenizer
# model (for word_tokenize) and the 'stopwords' corpus.
# Each resource is attempted independently so one failure does not skip the other.
for _nltk_resource in ('punkt_tab', 'stopwords'):
    try:
        # nltk.download normally reports failure through a falsy return value
        # instead of raising, so the result has to be inspected explicitly.
        _nltk_ok = nltk.download(_nltk_resource)
    except Exception as exc:  # not a bare except: Ctrl-C must still interrupt the cell
        _nltk_ok = False
        print(f"NLTK download of '{_nltk_resource}' raised an error: {exc}")
    if not _nltk_ok:
        print("Note: NLTK data download failed. If stopwords aren't available, install nltk and run the downloads manually.")

[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [None]:
def extract_text_from_pdf(pdf_path):
    """Read a PDF from disk and return the text of all of its pages.

    Args:
        pdf_path: Filesystem path of the document. It must exist and end in
            ``.pdf`` (case-insensitive).

    Returns:
        The text of every page that yielded text, each followed by a newline,
        concatenated in page order. An empty string is returned (after printing
        a diagnostic) when the path does not exist, is not a ``.pdf`` file, the
        document has no pages, or reading raises an exception. When pages exist
        but no text could be extracted, a notice is printed and the blank
        result is returned as-is.
    """
    # Cheap path checks first so PyPDF2 is only touched for plausible inputs.
    if not os.path.exists(pdf_path):
        print(f"PDF file not found: {pdf_path}")
        return ""
    if not pdf_path.lower().endswith('.pdf'):
        print(f"File is not a PDF: {pdf_path}")
        return ""

    try:
        page_chunks = []
        with open(pdf_path, 'rb') as handle:
            pdf = PyPDF2.PdfReader(handle)
            if len(pdf.pages) == 0:
                print(f"PDF contains no pages: {pdf_path}")
                return ""

            for page in pdf.pages:
                content = page.extract_text()
                # Pages without extractable text contribute nothing.
                if content:
                    page_chunks.append(content + "\n")

        text = "".join(page_chunks)
        if not text.strip():
            print(f"Extracted empty text from PDF: {pdf_path}")
        return text
    except PyPDF2.errors.PdfReadError as err:
        print(f"PDF read error: {err}")
        return ""
    except Exception as err:
        print(f"Error extracting text from PDF: {err}")
        return ""

In [None]:
def extract_abstract(full_text):
    """Heuristically pull an abstract out of the raw text of a paper.

    Strategies are tried in order and the first usable result wins:

    1. Text following an "abstract" label (case-insensitive): between 3 and 11
       period-separated segments, accepted only if longer than 100 characters.
    2. The first paragraph of 151-1999 characters among the five paragraphs
       that follow the leading title/author/affiliation paragraphs.
    3. The first paragraph anywhere that is longer than 200 characters.

    Args:
        full_text: Raw paper text; paragraphs are separated by blank lines.

    Returns:
        The extracted abstract, or an empty string when the input is falsy,
        shorter than 500 characters, or no strategy produced a candidate.
    """
    if not full_text or len(full_text) < 500:
        return ""

    # Strategy 1: labelled abstract, searched on whitespace-collapsed text.
    flattened = re.sub(r'\s+', ' ', full_text)
    labelled = re.search(r'(?:abstract|ABSTRACT)[:\s]+([^\.]+(?:\.[^\.]+){2,10})', flattened, re.IGNORECASE)
    if labelled:
        candidate = labelled.group(1).strip()
        if len(candidate) > 100:
            return candidate

    # Rebuild paragraphs: consecutive non-blank lines are joined with spaces,
    # blank lines act as separators.
    paragraphs = []
    pending = []
    for raw_line in full_text.split('\n'):
        stripped = raw_line.strip()
        if stripped:
            pending.append(stripped)
            continue
        if pending:
            paragraphs.append(' '.join(pending))
            pending = []
    if pending:
        paragraphs.append(' '.join(pending))

    metadata_markers = ['university', 'department', '@', 'email']

    def is_front_matter(paragraph):
        # Short paragraphs or ones mentioning affiliation/contact markers are
        # treated as title, author or affiliation blocks.
        lowered = paragraph.lower()
        return len(paragraph) < 100 or any(marker in lowered for marker in metadata_markers)

    # Count the leading run of front-matter paragraphs (at most four inspected).
    start_idx = 0
    for paragraph in paragraphs[:4]:
        if not is_front_matter(paragraph):
            break
        start_idx += 1

    # Strategy 2: first medium-sized paragraph right after the front matter.
    window = paragraphs[start_idx:start_idx + 5]
    chosen = next((p for p in window if 150 < len(p) < 2000), None)
    if chosen is not None:
        return chosen

    # Strategy 3: any sufficiently long paragraph, else nothing.
    return next((p for p in paragraphs if len(p) > 200), "")

In [None]:
def preprocess_text(text, remove_stopwords: bool = True):
    """Normalize raw text into a space-separated string of lowercase word tokens.

    The text is lowercased, whitespace is collapsed, hyphen/slash compounds are
    split into their parts, and the result is tokenized with NLTK. Only
    alphanumeric tokens longer than one character are kept.

    Args:
        text: Input string. Falsy or non-string values yield "".
        remove_stopwords: When true, English stop words (NLTK 'stopwords'
            corpus) are dropped as well.

    Returns:
        The kept tokens joined by single spaces. If tokenization or stop-word
        loading fails, a message is printed and the lowercased, normalized text
        is returned unfiltered as a best-effort fallback.
    """
    if not text or not isinstance(text, str):
        return ""

    try:
        # Normalize whitespace
        text = re.sub(r'\s+', ' ', text.strip().lower())

        # Split compounds such as "multi-objective" or "input/output" into
        # separate words. The tokenizer keeps them as one token, which the
        # isalnum() filter below would then discard entirely, losing both parts.
        text = re.sub(r'(?<=\w)[-\u2010-\u2015/](?=\w)', ' ', text)

        # Tokenize
        tokens = word_tokenize(text)

        # Filter tokens: alphanumeric, longer than one character, and
        # (optionally) not an English stop word.
        stop_words = set(stopwords.words('english')) if remove_stopwords else set()
        kept = [
            word for word in tokens
            if word.isalnum() and len(word) > 1 and word not in stop_words
        ]
        return " ".join(kept)
    except Exception as e:
        print(f"Text preprocessing failed: {e}")
        return text.lower()  # Return lowercase text as fallback

In [None]:
def _keyword_presence_score(query_terms, abstract, paper_text, sample_chars=100000):
    """Return the weighted fraction of query terms present in abstract and paper.

    Each term earns 2 points when it occurs in the lowercased abstract and
    1 point when it occurs in the first ``sample_chars`` characters of the
    lowercased paper text; the total is divided by the maximum of 3 points per
    term. Matching is plain substring containment, so a term also counts when
    it appears inside a longer word.
    """
    if not query_terms:
        return 0.0

    abstract_lower = abstract.lower()
    paper_lower = paper_text[:sample_chars].lower()

    hits = 0
    for term in query_terms:
        if term in abstract_lower:
            hits += 2  # Double weight for abstract hits
        if term in paper_lower:
            hits += 1
    return hits / (len(query_terms) * 3)


def calculate_relevance(query, abstract, paper_text):
    """Score how relevant a paper is to a search query.

    Three signals are computed and blended into ``combined_score``:
    TF-IDF cosine similarity between query and abstract (weight 0.35),
    TF-IDF cosine similarity between query and the first 50,000 characters of
    the paper (weight 0.45), and a keyword-presence score (weight 0.20).

    Args:
        query: Search query; must be a non-empty string.
        abstract: Abstract text; falsy values are treated as "".
        paper_text: Full paper text; falsy values are treated as "".

    Returns:
        A dict with keys 'combined_score', 'abstract_score', 'full_text_score',
        'keyword_score', 'status' and 'error'. 'status' is 'success' or 'error';
        on error, 'error' holds a message and scores not yet computed stay 0.0.
        The function does not raise: unexpected exceptions are printed and
        reported through 'status'/'error'.
    """
    results = {
        'combined_score': 0.0,
        'abstract_score': 0.0,
        'full_text_score': 0.0,
        'keyword_score': 0.0,
        'status': 'success',
        'error': None
    }
    if not query or not isinstance(query, str):
        results['status'] = 'error'
        results['error'] = 'Invalid query'
        return results

    abstract = abstract or ""
    paper_text = paper_text or ""

    if not abstract and not paper_text:
        results['status'] = 'error'
        results['error'] = 'No paper content provided'
        return results

    # Tunables for the blend and for how much of the paper is vectorized.
    tfidf_sample_chars = 50000  # Use first 50K chars to avoid memory issues
    abstract_weight = 0.35
    fulltext_weight = 0.45
    keyword_weight = 0.20

    try:
        # Preprocess the query once and validate it before doing the much more
        # expensive preprocessing of the paper text.
        query_processed = preprocess_text(query)
        if not query_processed:
            results['status'] = 'error'
            results['error'] = 'Query has no valid terms after preprocessing'
            return results
        query_terms = set(query_processed.split())

        abstract_processed = preprocess_text(abstract)
        paper_processed = preprocess_text(paper_text[:tfidf_sample_chars])

        # Empty documents are replaced by placeholders so all three rows exist
        # in the matrix; their similarity is reported as 0.0 below.
        documents = [
            query_processed,
            abstract_processed or "empty_abstract",
            paper_processed or "empty_paper",
        ]

        # Method 1: TF-IDF vectorization and cosine similarity
        try:
            vectorizer = TfidfVectorizer(min_df=1, ngram_range=(1, 2))
            tfidf_matrix = vectorizer.fit_transform(documents)
            query_vector = tfidf_matrix[0]

            if abstract_processed:
                results['abstract_score'] = float(
                    cosine_similarity(query_vector, tfidf_matrix[1])[0][0]
                )
            if paper_processed:
                results['full_text_score'] = float(
                    cosine_similarity(query_vector, tfidf_matrix[2])[0][0]
                )
        except Exception as e:
            # Best effort: the keyword signal can still contribute.
            print(f"TF-IDF calculation failed: {e}")
            results['abstract_score'] = 0.0
            results['full_text_score'] = 0.0

        # Method 2: Keyword presence
        results['keyword_score'] = _keyword_presence_score(query_terms, abstract, paper_text)

        # Calculate combined score
        results['combined_score'] = float(
            abstract_weight * results['abstract_score'] +
            fulltext_weight * results['full_text_score'] +
            keyword_weight * results['keyword_score']
        )
        return results

    except Exception as e:
        print(f"Error calculating relevance: {e}")
        results['status'] = 'error'
        results['error'] = str(e)
        return results

In [None]:
def process_paper(query, pdf_path):
    """Run the full pipeline for one paper: extract text, find the abstract, score it.

    Args:
        query: Search query forwarded to the relevance calculation.
        pdf_path: Path of the PDF forwarded to the text extractor.

    Returns:
        A dict with keys 'query', 'pdf_path', 'paper_text_length', 'abstract',
        'abstract_method', 'relevance', 'status' and 'error'.
        'status' is 'error' when no text could be extracted (in which case
        'relevance' stays None), 'warning' when the relevance calculation
        reported an error, and 'success' otherwise. 'abstract_method' is
        always left as an empty string by this function.
    """
    report = dict(
        query=query,
        pdf_path=pdf_path,
        paper_text_length=0,
        abstract='',
        abstract_method='',
        relevance=None,
        status='success',
        error=None,
    )

    # Step 1: raw text. Nothing else can run without it.
    paper_text = extract_text_from_pdf(pdf_path)
    if not paper_text:
        report.update(status='error', error='Failed to extract text from PDF')
        return report

    # Steps 2 and 3: abstract heuristics, then scoring against the query.
    abstract = extract_abstract(paper_text)
    relevance = calculate_relevance(query, abstract, paper_text)
    report.update(
        paper_text_length=len(paper_text),
        abstract=abstract,
        relevance=relevance,
    )

    # A scoring failure is downgraded to a warning: text and abstract are still usable.
    if relevance['status'] == 'error':
        report.update(
            status='warning',
            error=f"Relevance calculation issue: {relevance['error']}",
        )

    return report

In [None]:
def main():
    """Interactive command-line flow: prompt for a query and a PDF, print the scores.

    Reads the query and the PDF path from standard input, runs the paper
    through ``process_paper`` and prints the extracted abstract (truncated to
    300 characters), the individual relevance scores and a verbal
    interpretation of the combined score. Empty queries, missing files and
    processing errors are reported with a message; Ctrl-C and unexpected
    exceptions are caught and reported as well. Returns None in every case.
    """
    print("\n=== Research Paper Relevance Calculator ===\n")

    try:
        # Collect and validate both inputs before doing any work.
        query = input("Enter your search query: ").strip()
        if not query:
            print("Error: Query cannot be empty. Please provide a valid search query.")
            return

        pdf_path = input("Enter the path to the PDF file: ").strip()
        if not os.path.exists(pdf_path):
            print(f"Error: File not found: {pdf_path}")
            return

        print("\nProcessing paper...")
        results = process_paper(query, pdf_path)

        print("\n=== Results ===\n")

        if results['status'] == 'error':
            print(f"Error: {results['error']}")
            return

        # Summary of what was processed.
        print(f"Query: {results['query']}")
        print(f"PDF: {os.path.basename(results['pdf_path'])}")
        print(f"Extracted text: {results['paper_text_length']} characters")

        if not results['abstract']:
            print("\nNo abstract extracted")
        else:
            print(f"\nExtracted abstract ({len(results['abstract'])} chars):")
            # Long abstracts are previewed, short ones shown in full.
            if len(results['abstract']) > 300:
                print(f"{results['abstract'][:300]}...")
            else:
                print(results['abstract'])

        relevance = results['relevance']
        if relevance['status'] != 'success':
            print(f"\nWarning: {relevance['error']}")
            return

        print("\nRelevance Scores:")
        print(f"  Combined Score: {relevance['combined_score']:.4f}")
        print(f"  Abstract Relevance: {relevance['abstract_score']:.4f}")
        print(f"  Full Text Relevance: {relevance['full_text_score']:.4f}")
        print(f"  Keyword Match Score: {relevance['keyword_score']:.4f}")

        # Interpret the score: the first threshold strictly exceeded wins.
        verdicts = (
            (0.7, "\nInterpretation: Highly relevant paper"),
            (0.4, "\nInterpretation: Moderately relevant paper"),
            (0.2, "\nInterpretation: Somewhat relevant paper"),
        )
        message = "\nInterpretation: Likely not relevant to your query"
        for threshold, text in verdicts:
            if relevance['combined_score'] > threshold:
                message = text
                break
        print(message)

    except KeyboardInterrupt:
        print("\nOperation cancelled by user")
    except Exception as e:
        print(f"\nAn error occurred: {e}")

In [None]:
# Start the interactive command-line flow only when this code runs as the main
# program (e.g. executed as a script or as a notebook cell), not on import.
if __name__ == "__main__":
    main()


=== Research Paper Relevance Calculator ===

Enter your search query: multi-objective reinforcement learning 
Enter the path to the PDF file: /content/2005.07513.pdf

Processing paper...

=== Results ===

Query: multi-objective reinforcement learning
PDF: 2005.07513.pdf
Extracted text: 96902 characters

Extracted abstract (1242 chars):
Many real-world problems require trading off mul- tiple competing objectives. However, these ob- jectives are often in different units and/or scales, which can make it challenging for practitioners to express numerical preferences over objectives in their native units. In this paper we propose a nov...

Relevance Scores:
  Combined Score: 0.3219
  Abstract Relevance: 0.1381
  Full Text Relevance: 0.1634
  Keyword Match Score: 1.0000

Interpretation: Somewhat relevant paper


In [None]:
def run_relevance_checker(uploaded_file):
    """Render the Streamlit "Relevance Checker" page for an uploaded paper.

    Prompts for a query, extracts the text and abstract of the uploaded PDF,
    scores the paper against the query and displays the abstract, the four
    scores and a verbal interpretation of the combined score.

    Args:
        uploaded_file: The uploaded document, or a falsy value when nothing has
            been uploaded yet. It must provide ``.name`` and ``.getbuffer()``
            (presumably a Streamlit ``UploadedFile`` - confirm against caller).

    Returns:
        None. All results and errors are rendered through Streamlit.
    """
    st.header("Relevance Checker")
    if not uploaded_file:
        st.info("Please upload a PDF file in the sidebar to begin")
        return

    # Get user query
    query = st.text_input("Enter your research topic or keywords to check relevance against the uploaded paper:")
    if not query:
        # Nothing to score yet, so do not touch the filesystem.
        return

    # The upload is written to a private temporary file instead of a path built
    # from the client-supplied name: that name must not be able to overwrite
    # files in the working directory. Only its extension is kept, because the
    # extractor rejects paths that do not end in ".pdf".
    suffix = os.path.splitext(os.path.basename(uploaded_file.name))[1]
    temp_file_path = None
    try:
        with st.spinner("Processing PDF..."):
            with tempfile.NamedTemporaryFile(delete=False, suffix=suffix) as temp_file:
                temp_file_path = temp_file.name
                temp_file.write(uploaded_file.getbuffer())
            # The temp file is closed before it is reopened for reading.
            paper_text = extract_text_from_pdf(temp_file_path)
    finally:
        # Always remove the temporary copy, even if extraction raised.
        if temp_file_path and os.path.exists(temp_file_path):
            os.remove(temp_file_path)

    if not paper_text:
        st.error("Failed to extract text from PDF")
        return

    with st.spinner("Analysing text..."):
        abstract = extract_abstract(paper_text)
        results = calculate_relevance(query, abstract, paper_text)

    # Display results
    st.subheader("Results")

    if results['status'] == 'error':
        st.error(f"Error: {results['error']}")
        return

    if abstract:
        with st.expander("View Extracted Abstract"):
            st.write(abstract)
    else:
        st.warning("No abstract could be extracted from the paper")

    st.write("### Relevance Scores")
    col1, col2, col3, col4 = st.columns(4)
    with col1:
        st.metric("Combined Score", f"{results['combined_score']:.3f}")
    with col2:
        st.metric("Abstract Relevance", f"{results['abstract_score']:.3f}")
    with col3:
        st.metric("Full Text Relevance", f"{results['full_text_score']:.3f}")
    with col4:
        st.metric("Keyword Match", f"{results['keyword_score']:.3f}")

    # Interpretation: the first threshold strictly exceeded decides the verdict.
    st.write("### Interpretation")
    if results['combined_score'] > 0.7:
        st.success("Highly relevant paper")
    elif results['combined_score'] > 0.4:
        st.info("Moderately relevant paper")
    elif results['combined_score'] > 0.2:
        st.warning("Somewhat relevant paper")
    else:
        st.error("Likely not relevant to your query")