# arXiv Research Chatbot Model Training
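This notebook builds a research chatbot for arXiv papers: a pretrained Sentence Transformer model provides semantic search over paper titles and abstracts, and a pretrained BART model summarizes the abstracts of the retrieved papers.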

## Import Libraries

In [None]:

import os
import pandas as pd
import numpy as np
import torch
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
import json


## Load and Preprocess Data

In [None]:

# Load dataset
def load_data(file_path):
    """Load arXiv metadata from a JSON Lines file and keep Computer Science papers.

    Every non-blank line of the file is parsed as one JSON record. Lines that
    are not valid JSON are reported and skipped, so a single corrupt line does
    not discard the rest of the dataset.

    Args:
        file_path: Path to a UTF-8 encoded file with one JSON object per line.
            Records are expected to provide ``categories``, ``title`` and
            ``abstract`` fields.

    Returns:
        A DataFrame with stripped, lower-cased column names that contains only
        the records having at least one ``cs.*`` category, plus a
        ``combined_text`` column (title and abstract joined by a space).
        An empty DataFrame is returned when no record could be parsed.

    Raises:
        OSError: If the file cannot be opened.
        KeyError: If the parsed records lack one of the expected columns.
    """
    records = []
    with open(file_path, "r", encoding="utf-8") as f:
        for line in f:
            if not line.strip():
                # Blank lines (e.g. a trailing newline) carry no record.
                continue
            try:
                records.append(json.loads(line))
            except json.JSONDecodeError as e:
                print(f"Error decoding JSON: {e}")

    if not records:
        return pd.DataFrame()

    df = pd.DataFrame(records)
    df.columns = df.columns.str.strip().str.lower()

    # Filter for Computer Science papers. Categories are space-separated, so
    # anchor the match to the start of a category; a plain 'cs.' pattern would
    # also match e.g. 'physics.optics'.
    is_cs = df['categories'].str.contains(r'(?:^|\s)cs\.', regex=True, na=False)
    # Copy so the column assignment below does not write into a view of the
    # unfiltered frame.
    df = df[is_cs].copy()
    # Treat a missing title/abstract as empty text instead of propagating NaN.
    df['combined_text'] = df['title'].fillna('') + " " + df['abstract'].fillna('')
    return df

# Parse the local arXiv metadata snapshot into the module-level `df` that the
# embedding and search steps read.
# NOTE(review): the path is specific to one machine; adjust it before running elsewhere.
df = load_data('c://Users/swathiga/Downloads/archive (14)/arxiv-metadata-oai-snapshot.json')


## Load Sentence Transformer for Semantic Search

In [None]:

# Load the 'all-MiniLM-L6-v2' sentence-transformer once at module level; the
# same instance embeds both the paper texts and the search queries.
sentence_model = SentenceTransformer('all-MiniLM-L6-v2')

# Generate embeddings
def generate_embeddings(df, batch_size=32):
    """Embed the ``combined_text`` column of *df* with the sentence model.

    All texts are passed to ``sentence_model.encode`` in one call so the model
    can process them in batches, rather than one forward pass per paper.

    Args:
        df: DataFrame with a ``combined_text`` column.
        batch_size: Number of texts encoded per batch.

    Returns:
        A NumPy array with one embedding row per row of *df*, or an empty
        array when *df* is empty.
    """
    if df.empty:
        return np.array([])
    # Missing texts are embedded as empty strings; the tokenizer cannot handle NaN.
    texts = df['combined_text'].fillna('').astype(str).tolist()
    return np.asarray(sentence_model.encode(texts, batch_size=batch_size))

# Embed every loaded paper up front; `search_papers` compares each query
# against this module-level array.
embeddings = generate_embeddings(df)


## Load Summarization Model

In [None]:

# Load the "facebook/bart-large-cnn" tokenizer and sequence-to-sequence model
# once at module level; `summarize_text` uses both for every summary.
tokenizer = AutoTokenizer.from_pretrained("facebook/bart-large-cnn")
summarizer_model = AutoModelForSeq2SeqLM.from_pretrained("facebook/bart-large-cnn")

# Summarization function
def summarize_text(text, max_length=150, min_length=50):
    """Summarize *text* with the module-level BART tokenizer and model.

    Args:
        text: Text to summarize. Input longer than 1024 tokens is truncated.
        max_length: Upper bound on the length of the generated summary, in tokens.
        min_length: Lower bound on the length of the generated summary, in
            tokens; capped at *max_length* so the two bounds never conflict.

    Returns:
        The decoded summary string, or an empty string when *text* is not a
        non-blank string (for example a missing/NaN abstract).
    """
    if not isinstance(text, str) or not text.strip():
        # Nothing meaningful to summarize; avoid a tokenizer error or a
        # summary generated from empty input.
        return ""

    inputs = tokenizer(text, return_tensors="pt", max_length=1024, truncation=True)
    summary_ids = summarizer_model.generate(
        inputs["input_ids"],
        # Forward the mask explicitly so generation does not have to infer it.
        attention_mask=inputs["attention_mask"],
        max_length=max_length,
        min_length=min(min_length, max_length),
        num_beams=4,
        early_stopping=True,
    )
    return tokenizer.decode(summary_ids[0], skip_special_tokens=True)


## Search and Retrieve Relevant Papers

In [None]:

# Search for relevant papers
def search_papers(query, top_n=5):
    """Return the papers most similar to *query*, best match first.

    The query is embedded with the module-level sentence model and compared
    against the module-level paper ``embeddings`` by cosine similarity.

    Args:
        query: Free-text search query.
        top_n: Maximum number of papers to return. Values of zero or less
            yield an empty result.

    Returns:
        A tuple ``(papers, scores)``: the matching rows of the module-level
        ``df`` ordered by decreasing similarity, and their similarity scores
        in the same order. When there is nothing to return, the tuple is an
        empty DataFrame and an empty list.
    """
    if top_n <= 0:
        # Slicing with [-0:] would select every row instead of none.
        return pd.DataFrame(), []

    if df.empty or embeddings.size == 0:
        print("No data available to search. Please check the dataset.")
        return pd.DataFrame(), []

    query_embedding = sentence_model.encode(query).reshape(1, -1)
    similarities = cosine_similarity(query_embedding, embeddings)[0]
    # Sort ascending, reverse to descending, then keep the best top_n.
    top_indices = np.argsort(similarities)[::-1][:top_n]

    return df.iloc[top_indices], similarities[top_indices]

# Run an example query and print each hit's title with a generated summary
# of its abstract.
query = "deep learning in healthcare"
top_papers, similarities = search_papers(query)

# Display top results
if not top_papers.empty:
    for rank, (_index, paper_data) in enumerate(top_papers.iterrows(), start=1):
        print(f"{rank}. {paper_data['title']}")
        # The trailing "\n" leaves a blank line between results; it must stay an
        # escape sequence, because a literal line break inside the string is a
        # syntax error.
        print(f"Abstract Summary: {summarize_text(paper_data['abstract'])}\n")
else:
    print("No relevant papers found.")
