In [1]:
import requests
from bs4 import BeautifulSoup
import os

In [2]:
# Root of the site whose app description pages are scraped below.
base_url = "https://apps.compu-flair.com"

# App identifiers; each one is substituted into the description URL and also
# becomes the stem of the text file written for that app.
apps = [
    'TCGA',
    'scRNAseq',
    'scRNAseq2',
    'scRNAseqDEG',
    'scRNAseqFilterRawCounts',
    'geneSetAnalys',
    'scRNAseqCellType',
]


In [3]:
# Scrape the description page of every app and save its visible text to
# all_apps/<app>.txt. A failure on one app is reported and the loop moves on,
# so a single bad page does not abort the whole run.
output_dir = 'all_apps'
# open() does not create missing directories; without this every write fails
# on a fresh checkout.
os.makedirs(output_dir, exist_ok=True)

failed_apps = []
for app in apps:
    try:
        app_url = f"{base_url}/apps/{app}/description#section-what-is-this"
        print(app_url)

        # A timeout keeps a stalled connection from hanging the notebook.
        app_response = requests.get(app_url, timeout=30)
        # Turn 4xx/5xx replies into exceptions so an error page is never
        # saved as if it were the app description.
        app_response.raise_for_status()
        app_soup = BeautifulSoup(app_response.content, 'html.parser')

        # Visible text of the page, one stripped fragment per line.
        page_text = app_soup.get_text(separator='\n', strip=True)
        # Prefix the app name so each saved document identifies its app.
        content_string = f'App name: {app} {page_text}'

        filename = app + '.txt'
        print(filename)
        filepath = os.path.join(output_dir, filename)

        # Writing the texts to a file
        with open(filepath, 'w', encoding='utf-8') as file:
            file.write(content_string)
        print(f"Text written to {filepath}")

    except Exception as e:
        failed_apps.append(app)
        print(f'An error has occurred: {e}')

# Only claim success when every app was actually written.
if failed_apps:
    print(f"Scraping finished with errors; failed apps: {', '.join(failed_apps)}")
else:
    print("All Apps Scraped Successfully")

https://apps.compu-flair.com/apps/TCGA/description#section-what-is-this
TCGA.txt
Text written to all_apps/TCGA.txt
https://apps.compu-flair.com/apps/scRNAseq/description#section-what-is-this
scRNAseq.txt
Text written to all_apps/scRNAseq.txt
https://apps.compu-flair.com/apps/scRNAseq2/description#section-what-is-this
scRNAseq2.txt
Text written to all_apps/scRNAseq2.txt
https://apps.compu-flair.com/apps/scRNAseqDEG/description#section-what-is-this
scRNAseqDEG.txt
Text written to all_apps/scRNAseqDEG.txt
https://apps.compu-flair.com/apps/scRNAseqFilterRawCounts/description#section-what-is-this
scRNAseqFilterRawCounts.txt
Text written to all_apps/scRNAseqFilterRawCounts.txt
https://apps.compu-flair.com/apps/geneSetAnalys/description#section-what-is-this
geneSetAnalys.txt
Text written to all_apps/geneSetAnalys.txt
https://apps.compu-flair.com/apps/scRNAseqCellType/description#section-what-is-this
scRNAseqCellType.txt
Text written to all_apps/scRNAseqCellType.txt
All Apps Scraped Successfully

In [4]:
from langchain.embeddings import OpenAIEmbeddings
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.llms import OpenAI
from langchain.document_loaders import TextLoader
from langchain.document_loaders import DirectoryLoader

In [5]:
# Load every scraped description from ./all_apps/ (one .txt file per app).
loader = DirectoryLoader(
    './all_apps/',
    glob="./*.txt",
    loader_cls=TextLoader,
    # The scraping cell wrote these files as UTF-8; read them back with the
    # same encoding instead of relying on the platform default.
    loader_kwargs={'encoding': 'utf-8'},
)

documents = loader.load()

In [6]:
# Break the loaded documents into smaller, overlapping chunks
# (chunk_size=1000, chunk_overlap=200) for embedding.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1000,
    chunk_overlap=200,
)
texts = text_splitter.split_documents(documents)

# Display the number of chunks produced.
len(texts)

91

In [7]:
# Peek at one chunk (index 2) to sanity-check the splitter output.
sample_chunk = texts[2]
print(sample_chunk.page_content)

P-value:
This column represents the probability of observing the given level of overlap between the input set and the gene set by random chance alone.
Adjusted P-value:
The "Adjusted P-value" column indicates the P-value that has been adjusted to control for multiple hypothesis testing. It accounts for the potential inflation of false positives.
Odds Ratio:
This column provides the ratio of the odds of finding the overlapping genes in the gene set compared to the odds of finding the overlapping genes outside the gene set.
Combined Score:
The "Combined Score" column represents the combined score for the gene set, calculated based on various statistical measures.
Genes:
This column lists the individual genes from the input set that were found to overlap with the gene set being analyzed.
barplot_(Database Name Here).png
: 
    The output file barplot_(Database Name Here).png is a visual representation of the Gene Set 
    Enrichment Analysis (GSEA) results related to the database.


In [8]:
# Compare how many chunks were produced with how many documents were loaded.
chunk_count = len(texts)
page_count = len(documents)
print(f"#chunks: {chunk_count} #pages: {page_count}")

#chunks: 91 #pages: 7


In [9]:
import os
from dotenv import load_dotenv

# Read settings from the local .env file into the process environment.
load_dotenv(".env")

# The key is looked up under the lower-case name 'openai_api_key';
# os.environ.get returns None when the variable is absent.
OPENAI_API_KEY = os.environ.get('openai_api_key')

# Configure the embedding model used later to vectorise the text chunks.
embedding_function = OpenAIEmbeddings(
    model='text-embedding-ada-002',
    openai_api_key=OPENAI_API_KEY,
)


  warn_deprecated(


In [10]:
# NOTE: this whole cell is commented out, so nothing here runs. It is an
# earlier Pinecone-based setup; the notebook builds a Chroma store instead.
# NOTE(review): in the disabled code `openapi_config` is created but never
# passed to pinecone.init — confirm before re-enabling.
# import pinecone
# from langchain.vectorstores import Pinecone
# from pinecone.core.client.configuration import Configuration as OpenApiConfiguration

# pinecone_api_key = os.getenv("pinecone_api_key")
# pinecone_environ = os.getenv("pinecone_environ")
# def pinecone_setup():
#     try:
#         # Initialize Pinecone from the database
#         openapi_config = OpenApiConfiguration.get_default_copy()
#         openapi_config.proxy = "http://proxy.server:3128"
#         pinecone.init(api_key=pinecone_api_key, environment=pinecone_environ)
#         print("success")
#     except Exception as e:
#         print(f"Failed to initialize Pinecone: {e}")
# pinecone_setup()

In [11]:

from langchain.vectorstores import Chroma
# The triple-quoted block below is a bare string literal, not executed code:
# it holds the earlier Pinecone-based loader and has no effect at runtime.
'''
index_name = 'compflair'  # Already created at Pinecone console

indexes_list = pinecone.list_indexes()
print(indexes_list)

def load_docs_to_vectorstore():
    if index_name in indexes_list:
        try:
            pinecone_setup()
            
            docsearch = Pinecone.from_documents(texts,embedding=embedding_function,index_name=index_name)
            return docsearch
        except Exception as e:
            print(f"An error occurred  {e} ")
 
docsearch = load_docs_to_vectorstore()
'''

# Embed the chunks and index them in a Chroma vector store.
db = Chroma.from_documents(
    documents=texts,
    embedding=embedding_function,
)



In [12]:
def retriever(query, k=10):
    """Return the chunks from the vector store that best match ``query``.

    Args:
        query: Question or search text to match against the indexed chunks.
        k: Number of results requested from the store via ``search_kwargs``.
            Defaults to 10, the value that used to be hard-coded.

    Returns:
        Whatever the store's retriever returns from
        ``get_relevant_documents(query)``.
    """
    # Named differently from the function so the local does not shadow it.
    doc_retriever = db.as_retriever(search_kwargs={"k": k})
    return doc_retriever.get_relevant_documents(query)


In [13]:
# Build the OpenAI chat client. The key is read from the environment
# (populated from .env by load_dotenv earlier); nothing is hard-coded here.
# NOTE: this import rebinds the name `OpenAI`, which was previously imported
# from langchain.llms near the top of the notebook.
from openai import OpenAI

OPENAI_API_KEY = os.getenv('openai_api_key')

client = OpenAI(api_key=OPENAI_API_KEY)

def summarize_query(query, documents, model="gpt-3.5-turbo-1106", temperature=0.7):
    """Answer ``query`` from the given documents using an OpenAI chat model.

    The conversation sent to the API is: one system message with the
    instructions, one user message per document, then the query as the final
    user message.

    Args:
        query: The question to answer.
        documents: Iterable of objects exposing a ``page_content`` attribute.
            ``None`` is treated as "no documents"; items that lack
            ``page_content`` or whose content is empty are skipped.
        model: Chat model name passed to the API. Defaults to the previously
            hard-coded ``"gpt-3.5-turbo-1106"``.
        temperature: Sampling temperature passed to the API. Defaults to the
            previously hard-coded ``0.7``.

    Returns:
        The message content of the first choice in the response, or ``None``
        if the API call raised (the error is printed, not re-raised).
    """
    # NOTE(review): the prompt asks for JSON output but nothing in the request
    # enforces it — confirm whether callers rely on parseable JSON.
    messages = [{"role": "system",
                 "content": """
                 You are a helpful research assistant. 
                 You will be given a series of documents and a question. Answer the question based on the given documents.

                 Your output format should be a json file.  
                 """
                }]

    # One user message per document; empty content carries no information
    # for the model, so it is not sent.
    for doc in documents or []:
        content = getattr(doc, 'page_content', '')
        if content:
            messages.append({"role": "user", "content": content})
    messages.append({"role": "user", "content": query})

    # Best-effort call: any failure is reported and signalled with None so the
    # notebook keeps running.
    try:
        response = client.chat.completions.create(
            model=model,
            messages=messages,
            temperature=temperature
        )
        return response.choices[0].message.content
    except Exception as e:
        print(f"An error occurred: {e}")
        return None



In [14]:
# Ask a question using the vector store's similarity_search directly (with its
# default number of results); the next cell repeats it through retriever().
# Another question worth trying: "what are possible analyses in this website?"
query = """
I have a bunch of fastq.gz files. What application shall I use?
"""
docs = db.similarity_search(query)
summary = summarize_query(query, documents=docs)
print(summary)

{
  "application_recommendation": "You should use the 10X pre-scRNAseq application for processing your fastq.gz files. This application is designed specifically for single-cell RNA sequencing (scRNA-seq) data and will provide outputs such as multiqc_report.html and genes.tsv, which are essential for downstream analysis."
}


In [15]:
# Same question as the previous cell, but the context now comes from
# retriever() rather than a direct similarity_search call.
query = """
I have a bunch of fastq.gz files. What application shall I use?
"""
docs = retriever(query)
summary = summarize_query(query, docs)
print(summary)

{
  "application_recommendation": "You can use the PreProcessing 10X scRNAseq application for processing your fastq.gz files for single-cell RNA sequencing (scRNAseq) analysis. This application is designed to handle 10X Genomics data and can preprocess the raw fastq files to generate the required genes.tsv, barcodes.tsv, and matrix.mtx files for downstream analysis."
}
