In [9]:
#!python -m spacy download en_core_web_sm


In [None]:
import arxiv
import tkinter as tk
from tkinter import scrolledtext, messagebox
from transformers import pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
import numpy as np
import spacy
from gensim import corpora
from gensim.models import LdaModel
from langchain.llms import HuggingFaceHub
from langchain.chains import LLMChain
from langchain.prompts import PromptTemplate
import os
import json
from pathlib import Path
import hashlib

# Load the small English spaCy pipeline once at import time. The resulting
# `nlp` object is shared by ResearchAssistant.named_entity_recognition and
# ResearchAssistant.topic_modeling. The `en_core_web_sm` model must already be
# installed (e.g. `python -m spacy download en_core_web_sm`).
nlp = spacy.load("en_core_web_sm")

# Set the HuggingFace Hub API token for this process.
# NOTE(review): this assignment unconditionally overwrites any token already
# present in the environment with the placeholder below; substitute a real
# token (ideally supplied from outside the source file) before running.
os.environ["HUGGINGFACEHUB_API_TOKEN"] = "***********"

# Initialize the hosted LLM that ResearchAssistant.generate_insights uses.

# NOTE(review): HuggingFaceHub is already imported from langchain.llms at the
# top of the file; this second import rebinds the same name.
from langchain import HuggingFaceHub

llm = HuggingFaceHub(
    repo_id='google/flan-t5-large',
    # Generation settings passed through to the hosted model.
    model_kwargs={"temperature": 0, "max_length": 512, 'max_new_tokens' : 250, 'top_k' : 10, 'top_p': 0.95, 'repetition_penalty':1.03}
)
class ResearchAssistant:
    """Search arXiv for a query and enrich each hit with NLP analyses.

    For every article the assistant produces a summary, keywords and named
    entities; across all articles it produces LDA topics and LLM-generated
    insights. Complete results are cached on disk as JSON, keyed by the MD5
    hash of the query string.
    """

    def __init__(self):
        """Create the cache directory and load the summarizer and vectorizer."""
        self.cache_dir = Path("cache")
        self.cache_dir.mkdir(exist_ok=True)
        self.summarizer = pipeline("summarization", model="facebook/bart-large-cnn")
        self.vectorizer = TfidfVectorizer(stop_words='english')

    def fetch_articles(self, query, max_results=5):
        """Return a list of arXiv results for *query*, sorted by relevance.

        Raises:
            Exception: if the search fails; the original error is chained.
        """
        try:
            search = arxiv.Search(
                query=query,
                max_results=max_results,
                sort_by=arxiv.SortCriterion.Relevance
            )
            # NOTE(review): newer arxiv releases reportedly prefer
            # Client.results(search) over Search.results(); confirm against
            # the installed version before switching.
            return list(search.results())
        except Exception as e:
            raise Exception(f"Error fetching articles: {str(e)}") from e

    def summarize_text(self, text, max_length=50):
        """Return a summary of *text* produced by the summarization pipeline.

        Args:
            text: The text to summarize.
            max_length: Upper bound passed to the summarizer.

        Raises:
            Exception: if summarization fails; the original error is chained.
        """
        try:
            summary = self.summarizer(
                text,
                max_length=max_length,
                # Keep min_length <= max_length when a caller asks for a
                # summary shorter than the usual 30-token floor.
                min_length=min(30, max_length),
                do_sample=False,
            )
            return summary[0]['summary_text']
        except Exception as e:
            raise Exception(f"Error summarizing text: {str(e)}") from e

    def extract_keywords(self, text, num_keywords=5):
        """Return up to *num_keywords* terms of *text* ranked by TF-IDF score.

        The vectorizer is re-fitted on *text* alone on every call.

        Raises:
            Exception: if vectorization fails; the original error is chained.
        """
        try:
            tfidf_matrix = self.vectorizer.fit_transform([text])
            feature_names = self.vectorizer.get_feature_names_out()
            tfidf_scores = tfidf_matrix.toarray()[0]
            # Highest score first.
            sorted_indexes = np.argsort(tfidf_scores)[::-1]
            return [str(feature_names[i]) for i in sorted_indexes[:num_keywords]]
        except Exception as e:
            raise Exception(f"Error extracting keywords: {str(e)}") from e

    def named_entity_recognition(self, text):
        """Return ``(entity_text, entity_label)`` pairs found by spaCy in *text*.

        Raises:
            Exception: if the spaCy pipeline fails; the original error is
                chained.
        """
        try:
            doc = nlp(text)
            return [(ent.text, ent.label_) for ent in doc.ents]
        except Exception as e:
            raise Exception(f"Error in named entity recognition: {str(e)}") from e

    def topic_modeling(self, texts, num_topics=3):
        """Fit an LDA model on *texts* and return its printed topics.

        Each text is reduced to the lemmas of its alphabetic, non-stop-word
        tokens before the model is trained.

        Returns:
            The value of ``LdaModel.print_topics()``, or an empty list when
            there is nothing to model (no texts, or no usable tokens).

        Raises:
            Exception: if modeling fails; the original error is chained.
        """
        # No documents at all: nothing to model, and LDA would fail.
        if not texts:
            return []
        try:
            processed_texts = [
                [token.lemma_ for token in nlp(text) if not token.is_stop and token.is_alpha]
                for text in texts
            ]
            # Every document was filtered down to nothing, so the vocabulary
            # would be empty and LDA cannot be trained on it.
            if not any(processed_texts):
                return []
            dictionary = corpora.Dictionary(processed_texts)
            corpus = [dictionary.doc2bow(text) for text in processed_texts]
            lda_model = LdaModel(corpus=corpus, id2word=dictionary, num_topics=num_topics)
            return lda_model.print_topics()
        except Exception as e:
            raise Exception(f"Error in topic modeling: {str(e)}") from e

    def generate_insights(self, summaries):
        """Ask the module-level LLM for insights based on *summaries*.

        Args:
            summaries: Iterable of summary strings; they are joined with blank
                lines and substituted into the prompt.

        Returns:
            Whatever ``LLMChain.run`` returns for the filled-in prompt.
        """
        prompt = PromptTemplate(
            input_variables=["summaries"],
            template="Based on the following article summaries, provide key insights and potential research directions:\n\n{summaries}\n\nInsights:"
        )
        chain = LLMChain(llm=llm, prompt=prompt)
        return chain.run(summaries="\n\n".join(summaries))

    def cache_key(self, query):
        """Return the hex MD5 digest of *query*, used as the cache file stem."""
        return hashlib.md5(query.encode()).hexdigest()

    def _cache_path(self, query):
        """Return the path of the JSON cache file that belongs to *query*."""
        return self.cache_dir / f"{self.cache_key(query)}.json"

    def get_cached_result(self, query):
        """Return the cached result for *query*, or ``None`` on a cache miss.

        A cache file that is missing, unreadable, or does not contain valid
        JSON counts as a miss, so a damaged file cannot block a query forever.
        """
        cache_file = self._cache_path(query)
        try:
            # json.dump writes ASCII-only output by default, so files written
            # by earlier versions of this class decode fine as UTF-8.
            with cache_file.open("r", encoding="utf-8") as f:
                return json.load(f)
        except (OSError, ValueError):
            # OSError covers a missing/unreadable file; ValueError covers
            # json.JSONDecodeError and UnicodeDecodeError.
            return None

    def cache_result(self, query, result):
        """Store *result* as JSON in the cache file for *query*.

        The data is written to a temporary sibling file and then renamed over
        the final path, so a failed serialization never leaves a truncated
        cache file behind.

        Raises:
            TypeError: if *result* is not JSON-serializable.
            OSError: if the file cannot be written.
        """
        cache_file = self._cache_path(query)
        tmp_file = cache_file.with_name(cache_file.name + ".tmp")
        try:
            with tmp_file.open("w", encoding="utf-8") as f:
                json.dump(result, f)
            tmp_file.replace(cache_file)
        finally:
            # Only still present if dumping or renaming failed.
            if tmp_file.exists():
                tmp_file.unlink()

    def research(self, query):
        """Run the full pipeline for *query* and return the combined result.

        Returns:
            A dict with keys ``"articles"`` (list of per-article dicts),
            ``"topics"`` and ``"insights"``. A previously cached result is
            returned as-is. When the search finds no articles, an empty result
            is returned and nothing is cached.

        Raises:
            Exception: propagated from any of the pipeline steps.
        """
        cached_result = self.get_cached_result(query)
        if cached_result:
            return cached_result

        articles = self.fetch_articles(query)
        if not articles:
            # Without documents LDA cannot be trained and the LLM prompt
            # would be empty. The result is deliberately not cached so that a
            # later attempt performs a fresh search.
            return {
                "articles": [],
                "topics": [],
                "insights": "No articles found for this query.",
            }

        result = []
        summaries = []

        for article in articles:
            summary = self.summarize_text(article.summary)
            keywords = self.extract_keywords(article.summary)
            entities = self.named_entity_recognition(article.summary)

            article_data = {
                "title": article.title,
                "authors": [author.name for author in article.authors],
                # Stored as an ISO string so the result is JSON-serializable.
                "published": article.published.isoformat(),
                "summary": summary,
                "keywords": keywords,
                "entities": entities,
                "url": article.pdf_url
            }
            result.append(article_data)
            summaries.append(summary)

        # Topics are modeled on the full abstracts, insights on the summaries.
        topics = self.topic_modeling([article.summary for article in articles])
        insights = self.generate_insights(summaries)

        final_result = {
            "articles": result,
            "topics": topics,
            "insights": insights
        }

        self.cache_result(query, final_result)
        return final_result

class ResearchAssistantGUI:
    """Tkinter window that runs a ResearchAssistant query and shows the result.

    The window holds a query entry, a "Search" button and a scrollable text
    area into which the formatted result is written.
    """

    def __init__(self, master):
        """Configure the window *master*, create the assistant and the widgets."""
        self.master = master
        self.master.title("AI Research Assistant")
        self.master.geometry("800x600")
        self.assistant = ResearchAssistant()
        self.create_widgets()

    def create_widgets(self):
        """Create and pack the label, entry, button and result text area."""
        self.query_label = tk.Label(self.master, text="Enter your research topic:")
        self.query_label.pack(pady=10)

        self.query_entry = tk.Entry(self.master, width=50)
        self.query_entry.pack(pady=10)

        self.search_button = tk.Button(self.master, text="Search", command=self.perform_search)
        self.search_button.pack(pady=10)

        self.result_text = scrolledtext.ScrolledText(self.master, wrap=tk.WORD, width=80, height=30)
        self.result_text.pack(pady=10, padx=10, expand=True, fill=tk.BOTH)

    def _set_result_text(self, text):
        """Replace the whole content of the result area with *text*."""
        self.result_text.delete("1.0", tk.END)
        if text:
            self.result_text.insert(tk.END, text)

    def perform_search(self):
        """Run the research pipeline for the entered query and show the result.

        An empty or whitespace-only query is rejected with an error dialog.
        Any exception raised by the pipeline is shown in an error dialog and
        the progress message is removed from the result area.
        """
        # Strip so a query made only of spaces is treated as empty.
        query = self.query_entry.get().strip()
        if not query:
            messagebox.showerror("Error", "Please enter a research topic")
            return

        self._set_result_text("Searching... Please wait.\n\n")
        # Force a redraw now: the call below blocks the Tk event loop until
        # the whole pipeline has finished.
        self.master.update()

        try:
            result = self.assistant.research(query)
            self.display_results(result)
        except Exception as e:
            # Do not leave a stale "Searching..." (or half-written) report.
            self._set_result_text("")
            messagebox.showerror("Error", str(e))

    def display_results(self, result):
        """Render *result* (as returned by ``ResearchAssistant.research``).

        Args:
            result: Dict with the keys ``"articles"``, ``"topics"`` and
                ``"insights"``.
        """
        parts = []

        for i, article in enumerate(result["articles"], 1):
            entities = ', '.join(f'{ent[0]} ({ent[1]})' for ent in article['entities'])
            parts.append(f"Article {i}:\n")
            parts.append(f"Title: {article['title']}\n")
            parts.append(f"Authors: {', '.join(article['authors'])}\n")
            parts.append(f"Published: {article['published']}\n")
            parts.append(f"Summary: {article['summary']}\n")
            parts.append(f"Keywords: {', '.join(article['keywords'])}\n")
            parts.append(f"Named Entities: {entities}\n")
            parts.append(f"URL: {article['url']}\n")
            parts.append("-" * 50 + "\n")

        parts.append("\nTopics:\n")
        parts.extend(f"{topic}\n" for topic in result["topics"])

        parts.append("\nInsights:\n")
        parts.append(f"{result['insights']}")

        # Build the report first and write it in one go, so a malformed
        # result raises before the text area has been partially filled.
        self._set_result_text("".join(parts))

if __name__ == "__main__":
    # Start the desktop application only when this file is executed directly.
    main_window = tk.Tk()
    # Keep a reference to the GUI object for as long as the event loop runs.
    gui = ResearchAssistantGUI(main_window)
    main_window.mainloop()

Device set to use mps:0
