In [None]:
# Mount Google Drive at /content/drive so later cells can read the files this
# notebook references under /content/drive/MyDrive (model weights, PDFs, service-account key).
from google.colab import drive
drive.mount('/content/drive', force_remount=True)

Mounted at /content/drive


<h2><b> This notebook integrates the chatbot, tokenizer, LLM, visualization, and database-export subsystems into a single Streamlit chatbot served through localtunnel. The app can render 2D and 3D visualizations of SMILES strings, push and pull chat history and visualizations to and from a Firebase database, and run inference with the LLM. </b></h2>

---

# Install required libraries

In [None]:
!pip install torch pathlib stmol rdkit rdkit-pypi py3dmol pillow langchain torch accelerate pyrebase transformers sentence_transformers streamlit streamlit_chat faiss-cpu altair tiktoken huggingface-hub ctransformers pandas pypdf

Collecting rdkit
  Downloading rdkit-2023.9.6-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (34.9 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m34.9/34.9 MB[0m [31m7.7 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting rdkit-pypi
  Downloading rdkit_pypi-2022.9.5-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (29.4 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m29.4/29.4 MB[0m [31m13.7 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting py3dmol
  Downloading py3Dmol-2.1.0-py2.py3-none-any.whl (12 kB)
Collecting langchain
  Downloading langchain-0.1.16-py3-none-any.whl (817 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m817.7/817.7 kB[0m [31m31.0 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting accelerate
  Downloading accelerate-0.29.3-py3-none-any.whl (297 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m297.6/297.6 kB[0m [31m17.2 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting pyre

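Llama-2-7B-Chat GGML model files (source of the `llama-2-7b-chat.ggmlv3.q4_0.bin` weights loaded by `createModel` below): https://huggingface.co/TheBloke/Llama-2-7B-Chat-GGML/tree/main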

## Install localtunnel to serve the Streamlit app

In [None]:
# Install the localtunnel npm package; a later cell runs it with `npx localtunnel`
# to expose the Streamlit port.
!npm install localtunnel
# Look into githubio

[K[?25h[37;40mnpm[0m [0m[30;43mWARN[0m [0m[35msaveError[0m ENOENT: no such file or directory, open '/content/package.json'
[0m[37;40mnpm[0m [0m[34;40mnotice[0m[35m[0m created a lockfile as package-lock.json. You should commit this file.
[0m[37;40mnpm[0m [0m[30;43mWARN[0m [0m[35menoent[0m ENOENT: no such file or directory, open '/content/package.json'
[0m[37;40mnpm[0m [0m[30;43mWARN[0m[35m[0m content No description
[0m[37;40mnpm[0m [0m[30;43mWARN[0m[35m[0m content No repository field.
[0m[37;40mnpm[0m [0m[30;43mWARN[0m[35m[0m content No README data
[0m[37;40mnpm[0m [0m[30;43mWARN[0m[35m[0m content No license field.
[0m
+ localtunnel@2.0.2
added 22 packages from 22 contributors and audited 22 packages in 3.625s

3 packages are looking for funding
  run `npm fund` for details

found 1 [93mmoderate[0m severity vulnerability
  run `npm audit fix` to fix them, or `npm audit` for details
[K[?25h

In [None]:
# Execute the companion visualization notebook stored on Drive inside this kernel.
# NOTE(review): its contents are not visible here, and app.py below is written to a
# separate file, so confirm whether this cell is still required.
%run '/content/drive/MyDrive/Colab Notebooks/Data_Visualization_Molecular_Discovery_Chatbot.ipynb'



## Write ALL code to one file (app.py) for execution on localtunnel

In [None]:
%%writefile app.py

## Adapted Visualization Code from https://github.com/karthick1087/SMILES-to-PDB-Converter-and-3D-Visualizer/blob/main/app.py

# Import required libraries and modules
from langchain.chains import ConversationalRetrievalChain
from langchain.document_loaders import PyPDFLoader, DirectoryLoader
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.llms import CTransformers
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import FAISS
from langchain.memory import ConversationBufferMemory

### Utilfunctions.py
import pandas as pd
import secrets
import string

from rdkit import Chem
from rdkit.Chem import AllChem, Descriptors
import py3Dmol
import streamlit as st
import os
import tempfile
import base64

def smiles_to_pdb(smiles):
    """Convert a SMILES string into a 3D structure saved as a temporary PDB file.

    Args:
        smiles: SMILES notation of the molecule.

    Returns:
        A ``(pdb_path, mol)`` tuple. ``pdb_path`` is the path of a ``.pdb`` file
        created with ``delete=False`` (the caller is responsible for removing it)
        and ``mol`` is the hydrogen-added RDKit molecule with its 3D conformer.
        ``(None, None)`` is returned when the input is empty, cannot be parsed,
        or no 3D conformer could be generated.
    """
    # An empty string parses to an atom-less molecule, which cannot be embedded.
    if not smiles or not smiles.strip():
        return None, None

    mol = Chem.MolFromSmiles(smiles)
    if mol is None:
        return None, None
    mol = Chem.AddHs(mol)

    # EmbedMolecule returns -1 when it fails to produce a conformer; optimising
    # such a molecule would raise, so report the failure to the caller instead.
    if AllChem.EmbedMolecule(mol, AllChem.ETKDG()) == -1:
        return None, None
    AllChem.UFFOptimizeMolecule(mol)

    # Reserve a unique file name, then close the handle before RDKit writes to the
    # path so the file is not open twice at the same time.
    with tempfile.NamedTemporaryFile(delete=False, suffix=".pdb") as temp_pdb:
        pdb_path = temp_pdb.name
    Chem.MolToPDBFile(mol, pdb_path)
    return pdb_path, mol

# Function to generate 2D structure from SMILES notation
def generate_2d_structure(smiles):
    """Build an RDKit molecule with explicit hydrogens and 2D coordinates.

    Args:
        smiles: SMILES notation of the molecule.

    Returns:
        The RDKit molecule with 2D coordinates computed, or ``None`` when the
        SMILES string cannot be parsed.
    """
    # Convert SMILES to RDKit molecule object
    mol = Chem.MolFromSmiles(smiles)
    # MolFromSmiles signals a parse failure with None; passing that on to AddHs
    # would raise, so stop here and let the caller handle the missing molecule.
    if mol is None:
        return None
    # Add hydrogen atoms to the molecule
    mol = Chem.AddHs(mol)
    # Compute 2D coordinates for the molecule
    AllChem.Compute2DCoords(mol)
    return mol

from rdkit.Chem import Draw

# Function to render the 2D structure of a SMILES string to a PNG file using RDKit's Draw module
def visualize_2d_structure(smiles, output_path='/content/cdk2_mol1.png'):
    """Draw the 2D depiction of a molecule to an image file.

    Args:
        smiles: SMILES notation of the molecule.
        output_path: Destination image path. Defaults to the location the
            function has always written to, so existing callers are unaffected.

    Returns:
        ``output_path`` once the image has been written, or ``None`` when the
        SMILES string cannot be parsed (callers check for ``None`` before
        uploading the image).
    """
    # Validate first so an unparseable SMILES yields None instead of an exception
    # raised from inside the structure generation.
    if Chem.MolFromSmiles(smiles) is None:
        return None
    molecule_mol = generate_2d_structure(smiles)
    if molecule_mol is None:
        return None
    Draw.MolToFile(molecule_mol, output_path)
    return output_path

# Generate a PDB block describing the 3D structure of a SMILES string
def generate_3d_structure(smiles):
    """Embed a molecule in 3D and return it as PDB-formatted text.

    Args:
        smiles: SMILES notation of the molecule.

    Returns:
        The PDB block as a string, or ``None`` when the SMILES string cannot be
        parsed or no 3D conformer could be generated.
    """
    mol = Chem.MolFromSmiles(smiles)
    if not mol:
        return None

    mol = Chem.AddHs(mol)
    # A fixed seed keeps the generated coordinates reproducible. EmbedMolecule
    # returns -1 on failure, in which case there are no coordinates to export.
    if AllChem.EmbedMolecule(mol, randomSeed=42) == -1:
        return None
    return Chem.MolToPDBBlock(mol)

from stmol import *

# Visualize the 3D structure of a SMILES string inside the Streamlit page
def visualize_3d_structure(smiles):
    """Render an interactive 3D view of a molecule in the Streamlit app.

    Args:
        smiles: SMILES notation of the molecule.

    Returns:
        The ``py3Dmol`` view that was rendered, or ``None`` when no 3D structure
        is available (a message is written to the page in that case).
    """
    pdb_data = generate_3d_structure(smiles)
    if not pdb_data:
        st.write("No 3D structure available.")
        return None

    # Create py3Dmol view
    view = py3Dmol.view(width=800, height=400)
    view.addModel(pdb_data, 'pdb')
    # The cartoon style only draws biopolymer backbones, so a small molecule built
    # from SMILES would render as an empty canvas; draw bonds as sticks instead.
    view.setStyle({'stick': {}})
    view.zoomTo()
    # Previously the view was built but never displayed; stmol's showmol embeds
    # it in the page.
    showmol(view, height=400, width=800)
    return view


def randomString():
    """Return a 6-character string of ASCII letters chosen with the `secrets` module."""
    alphabet = string.ascii_letters
    picked = []
    for _ in range(6):
        picked.append(secrets.choice(alphabet))
    return ''.join(picked)


def print_stats(data):
    """Print the descriptive statistics of `data`.

    The values are wrapped in a pandas Series and the result of its
    ``describe()`` call is printed to standard output. Nothing is returned.
    """
    summary = pd.Series(data).describe()
    print(summary)

###

# Function to tokenize text from PDF documents and create embeddings
def textTokenizer(data_dir='/content/drive/MyDrive/LLama2HealthCareChatBot-master/data/'):
    """Build a FAISS vector store from the PDF files found in ``data_dir``.

    Args:
        data_dir: Directory scanned for ``*.pdf`` files. Defaults to the Drive
            folder this app has always used, so existing callers are unaffected.

    Returns:
        A FAISS vector store indexing the embedded text chunks.

    Raises:
        ValueError: If no document content could be loaded from ``data_dir``.
    """
    # Load PDF files from the specified path
    loader = DirectoryLoader(data_dir, glob="*.pdf", loader_cls=PyPDFLoader)
    documents = loader.load()
    # Fail with an actionable message instead of an opaque error from the index
    # builder when the folder is missing its PDFs (e.g. Drive not mounted).
    if not documents:
        raise ValueError(f"No PDF content found in {data_dir!r}; cannot build the vector store.")

    # Split text into chunks
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=10)
    text_chunks = text_splitter.split_documents(documents)

    # Create embeddings storing semantic information
    embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2",
                                       model_kwargs={'device': "cpu"})

    # Vectorstore for fast similarity search via indexing
    vector_store = FAISS.from_documents(text_chunks, embeddings)

    return vector_store

# Function to create a conversational retrieval chain model
def createModel(temperature, kval, maxTokens):
    """Create the conversational retrieval chain that answers chatbot questions.

    Args:
        temperature: Sampling temperature passed to the LLM.
        kval: Number of text chunks the retriever returns per question.
        maxTokens: Maximum number of new tokens the LLM may generate per answer.

    Returns:
        A ``ConversationalRetrievalChain`` wired to the LLM, the vector-store
        retriever and a fresh conversation memory.
    """
    vector_store = textTokenizer()

    # Load the Llama 2 model with the requested hyperparameters. These used to be
    # hard-coded (128 tokens, temperature 0.01), which silently ignored the
    # function's arguments.
    # NOTE(review): `n_ctx` is kept exactly as it was; confirm that this wrapper
    # honours it (ctransformers names its context-size option `context_length`).
    llm = CTransformers(model="/content/drive/MyDrive/LLama2HealthCareChatBot-master/llama-2-7b-chat.ggmlv3.q4_0.bin", model_type="llama",
                        config={'max_new_tokens': maxTokens, 'temperature': temperature}, n_ctx=4096)

    # Create memory object to store chat history
    memory = ConversationBufferMemory(memory_key="chat_history", return_messages=True)

    # Set up conversational chain that connects LLM, the indexed vectorized data, and the chatbot
    chain = ConversationalRetrievalChain.from_llm(
        llm=llm, chain_type='stuff',
        retriever=vector_store.as_retriever(search_kwargs={"k": kval}),
        memory=memory)

    return chain

# Import required libraries and modules
import streamlit as st
from streamlit_chat import message

# Send one user question through the chain and record the exchange
def conversation_chat(query, chain):
    """Ask `chain` the question in `query` and return its answer.

    The chain is called with the question and the history stored in
    ``st.session_state['history']``; the resulting ``(query, answer)`` pair is
    then appended to that history.
    """
    payload = {"question": query, "chat_history": st.session_state['history']}
    response = chain(payload)
    answer = response["answer"]
    st.session_state['history'].append((query, answer))
    return answer

# Seed the session state with an empty history and the opening greeting messages
def initialize_session_state():
    """Create the `history`, `generated` and `past` session entries if they are missing.

    Entries that already exist in ``st.session_state`` are left untouched.
    """
    defaults = (
        ('history', []),
        ('generated', ["Hello! Ask me anything about proteins"]),
        ('past', ["Hey! 👋"]),
    )
    for key, initial_value in defaults:
        if key not in st.session_state:
            st.session_state[key] = initial_value


import firebase_admin
from firebase_admin import credentials, firestore, storage

# Upload PNG file to Firebase Storage
def upload_to_storage(local_file_path, remote_file_name):
    """Upload a local file to the default Firebase Storage bucket.

    Args:
        local_file_path: Path of the file on disk.
        remote_file_name: Object name to store the file under in the bucket.

    Returns:
        A signed download URL for the uploaded object, valid for one hour.
    """
    import datetime  # local import: only needed to express the URL lifetime

    bucket = storage.bucket()
    blob = bucket.blob(remote_file_name)
    blob.upload_from_filename(local_file_path)

    # Get download URL. The lifetime must be a timedelta: with the default (v2)
    # signing scheme a bare integer is read as an absolute Unix timestamp, so
    # `3600` produced a link that had already expired.
    download_url = blob.generate_signed_url(expiration=datetime.timedelta(hours=1))
    return download_url

# Encode MolViewSpec text so it can be embedded in a URL
def encode_mvs_data(mvs_data):
    """Return `mvs_data` as URL-safe Base64 text.

    The string is encoded to bytes, Base64-encoded with the URL-safe alphabet
    (``-`` and ``_`` in place of ``+`` and ``/``) and decoded back to ``str``.
    """
    raw_bytes = mvs_data.encode()
    encoded_bytes = base64.urlsafe_b64encode(raw_bytes)
    return str(encoded_bytes, 'utf-8')

def _chat_history_collection():
    """Return the Firestore `chatHistory` collection, initialising the Firebase app on first use."""
    if not firebase_admin._apps:  # Check if Firebase Admin SDK has not been initialized
        cred_obj = credentials.Certificate('/content/drive/MyDrive/LLama2HealthCareChatBot-master/serviceAccountKey.json') # REPLACE WITH OWN SERVICE ACCOUNT KEY FROM FIREBASE
        firebase_admin.initialize_app(cred_obj, {
            'databaseURL': "https://molecular-discovery-chatbot-default-rtdb.firebaseio.com/",
            'storageBucket': "molecular-discovery-chatbot.appspot.com"
        })
    return firestore.client().collection('chatHistory')


def _is_valid_smiles(smiles):
    """Return True when `smiles` is non-blank and RDKit can parse it."""
    return bool(smiles and smiles.strip()) and Chem.MolFromSmiles(smiles) is not None


def _build_molstar_link(pdb_bytes):
    """Build the Mol* viewer link that carries `pdb_bytes` as MolViewSpec (MVSJ) data."""
    import json  # local import: only needed to serialise the MolViewSpec tree

    encoded_pdb_data = base64.b64encode(pdb_bytes).decode('utf-8')

    # Serialise with json.dumps: the previous hand-written template contained
    # `"..." + "%s"` inside the JSON text, which is not valid JSON.
    # NOTE(review): the tree still holds only a `download` node and the payload
    # is still Base64-encoded as before; confirm against the Mol* MolViewSpec
    # docs whether parse/structure/representation nodes and URL-encoded JSON are
    # required for the viewer to render the molecule.
    mvs_tree = {
        "metadata": {
            "title": "Molecule Visualization",
            "version": "1",
            "timestamp": "2024-04-25T12:00:00"
        },
        "root": {
            "kind": "root",
            "children": [
                {
                    "kind": "download",
                    "params": {
                        "url": "data:chemical/x-pdb;base64," + encoded_pdb_data
                    }
                }
            ]
        }
    }
    encoded_mvs_data = encode_mvs_data(json.dumps(mvs_tree))
    return f"https://molstar.org/viewer?mvs-format=mvsj&mvs-data={encoded_mvs_data}"


# Function to display chat history and handle user input
def display_chat_history(chain):
    """Render the chatbot page: question form, export, SMILES visualisation and chat bubbles.

    Args:
        chain: Conversational chain used by ``conversation_chat`` to answer the
            user's question.

    Side effects:
        Appends to ``st.session_state['past']`` / ``['generated']`` when a
        question is submitted, initialises Firebase on first use, writes the chat
        history to the Firestore ``chatHistory`` collection on "Export", and
        uploads the 2D image to Firebase Storage on "Export 2D".
    """
    reply_container = st.container()
    container = st.container()

    with container:

        # Request user for prompt
        with st.form(key='my_form', clear_on_submit=True):
            user_input = st.text_input("Question:", placeholder="Ask any protein question", key='input')

            # Create send button to submit prompt
            submit_button = st.form_submit_button(label='Send')

        # When a non-empty prompt was submitted, query the chain and update the session state
        if submit_button and user_input:
            with st.spinner("Fetching response..."):
                # Call conversational retrieval chain to produce response from LLM using previous chat history
                output = conversation_chat(user_input, chain)

            # Update input and output session states with current prompt and responses
            st.session_state['past'].append(user_input)
            st.session_state['generated'].append(output)

        # Create export button to export to database
        export_button = st.button(label='Export')

        # Firebase is initialised on every run because the storage upload below needs it too
        ref = _chat_history_collection()

        # When user wants to export to database
        if export_button:
            # `ref` already is the chatHistory collection; the previous
            # `ref.collection(...)` call does not exist on a collection reference.
            for user_text, bot_text in st.session_state['history']:
                ref.add({'user': user_text, 'bot': bot_text})

        smiles_input = st.text_input("Enter SMILES to visualize:")
        if st.button("2D"):
            file_path = visualize_2d_structure(smiles_input) if _is_valid_smiles(smiles_input) else None
            if file_path is not None:
                st.image(file_path)
            else:
                st.error("Please enter a valid SMILES string.")

        if st.button("Export 2D"):
            file_path = visualize_2d_structure(smiles_input) if _is_valid_smiles(smiles_input) else None
            if file_path is not None:
                # NOTE(review): the object name is a fixed literal, so every export
                # overwrites the previous image; confirm whether a per-molecule
                # name was intended.
                remote_file_name = "images/smiles_input.png"
                download_url = upload_to_storage(file_path, remote_file_name)
                st.write(download_url)
                st.write("Uploaded successfully")
            else:
                st.error("Please enter a valid SMILES string.")

        if st.button("3D"):
            # Blank input is rejected here because it cannot be embedded in 3D
            if smiles_input.strip():
                pdb_file, _mol = smiles_to_pdb(smiles_input)
            else:
                pdb_file, _mol = None, None

            if pdb_file is not None:
                st.success("Conversion successful! PDB file generated.")
                st.write("### 3D Visualization:")
                # Read the temporary file once and delete it; the bytes are reused
                # for both the viewer link and the download button.
                try:
                    with open(pdb_file, 'rb') as f:
                        pdb_bytes = f.read()
                finally:
                    os.remove(pdb_file)

                # Construct MolView URL
                link = _build_molstar_link(pdb_bytes)
                st.markdown(f'[View 3D Visualization]({link})')

                st.write("### Download PDB file:")
                st.download_button(
                    label="Download PDB file",
                    data=pdb_bytes,
                    file_name="molecule.pdb",
                    mime="chemical/x-pdb"
                )
                st.write("###")
            else:
                st.error("Could not generate a 3D structure for this SMILES string.")

        if st.button("Export 3D"):
            # TODO: exporting the 3D structure is not implemented yet
            pass

    # UI development to generate emojis and bubbles when new response generated
    if st.session_state['generated']:
        with reply_container:
            exchanges = zip(st.session_state["past"], st.session_state["generated"])
            for i, (user_text, bot_text) in enumerate(exchanges):
                message(user_text, is_user=True, key=str(i) + '_user', avatar_style="thumbs")
                message(bot_text, key=str(i), avatar_style="fun-emoji")

file_path = None  # module-level placeholder; main() below does not read it


def _ensure_firebase_app():
    """Initialise the default Firebase app if it has not been initialised yet."""
    if not firebase_admin._apps:
        cred_obj = credentials.Certificate('/content/drive/MyDrive/LLama2HealthCareChatBot-master/serviceAccountKey.json') # REPLACE WITH OWN SERVICE ACCOUNT KEY FROM FIREBASE
        firebase_admin.initialize_app(cred_obj, {
            'databaseURL': "https://molecular-discovery-chatbot-default-rtdb.firebaseio.com/",
            'storageBucket': "molecular-discovery-chatbot.appspot.com"
        })


# Main function to run the Streamlit app
def main():
    """Render the page selected by the `page` query parameter.

    Pages:
        ``Chatbot``  - chat UI backed by the conversational chain.
        ``Database`` - tables of data fetched from Firebase.
        anything else - landing page with navigation buttons.
    """
    # Define the current page based on query parameters
    current_page = st.experimental_get_query_params().get("page", ["Landing"])[0]

    # On chatbot page
    if current_page == "Chatbot":

        # UI work displaying titles
        st.title("Molecular Discovery Chatbot 🧑🏽‍⚕️")

        # Initialize Hey and Hello message
        initialize_session_state()

        # Create the model once per session and reuse it afterwards. Calling
        # createModel() on every execution of main() re-embedded the PDFs,
        # reloaded the LLM and discarded the chain's conversation memory.
        if 'chain' not in st.session_state:
            st.session_state['chain'] = createModel(0.5, 2, 256)

        # Display chat history
        display_chat_history(st.session_state['chain'])

        # Create navigation to database tab button
        st.button("Go to Database", on_click=lambda: st.experimental_set_query_params(page="Database"))

    elif current_page == "Database":

        # UI work displaying titles
        st.header("Firebase Data")
        st.write("Molecule Database in a table:")

        # Create Fetch button to retrieve data from firebase
        fetch_button = st.button("Fetch Database")

        # When clicked
        if fetch_button:
            # `ref` was never defined in this function, so fetching raised a
            # NameError. The sample data is read from the Realtime Database path
            # used before; the chat history is read from the Firestore
            # `chatHistory` collection, which is where the chatbot page exports it.
            # NOTE(review): confirm that '/bookData' lives in the Realtime Database.
            from firebase_admin import db as realtime_db  # local import: only this page needs it

            _ensure_firebase_app()
            firebase_sample_data = realtime_db.reference('/bookData').get()
            chat_history_data = [
                doc.to_dict() for doc in firestore.client().collection('chatHistory').stream()
            ]

            # Output respective data in tabular format by deconstructing the pairs
            if firebase_sample_data:
                if isinstance(firebase_sample_data, dict):
                    data_list = [{key: value} for key, value in firebase_sample_data.items()]
                else:
                    data_list = firebase_sample_data
                st.table(data_list)
            if chat_history_data:
                data_list = [{'User': entry.get('user'), 'Bot': entry.get('bot')} for entry in chat_history_data]
                st.table(data_list)
            # Only report missing data after a fetch actually returned nothing;
            # previously this message was shown whenever the button was not pressed.
            if not firebase_sample_data and not chat_history_data:
                st.write("No data available in firebase")

        # Create navigation to chatbot tab button
        st.button("Go to Chatbot", on_click=lambda: st.experimental_set_query_params(page="Chatbot"))
    else:
        # UI for landing page
        st.title("Welcome to Molecular Discovery Chatbot")
        st.write("Pick an option")

        # Create navigation to chatbot and database tab from landing page
        st.button("Go to Chatbot", on_click=lambda: st.experimental_set_query_params(page="Chatbot"))
        st.button("Go to Database", on_click=lambda: st.experimental_set_query_params(page="Database"))

# Run the main function to start the Streamlit app.
# The call sits at module level, so it executes whenever this file is run
# (the notebook launches it with `streamlit run app.py`).
main()

Overwriting app.py


## Run the Streamlit app in the background

In [None]:
# Launch the Streamlit app as a background process (trailing `&`); its output
# is redirected to /content/logs.txt.
!streamlit run app.py &>/content/logs.txt &

## Retrieve the public IPv4 address of this machine (used as the localtunnel password)

In [None]:
# Also found as external URL in /content/logs.txt
# `import urllib` alone does not load the `request` submodule, so import it explicitly.
import urllib.request

# Close the HTTP response deterministically and bound the wait with a timeout.
with urllib.request.urlopen('https://ipv4.icanhazip.com', timeout=10) as response:
    public_ip = response.read().decode('utf8').strip("\n")
print("Password/Enpoint IP for localtunnel is:", public_ip)

Password/Enpoint IP for localtunnel is: 34.86.245.151


## Expose the Streamlit app on port 8501

In [None]:
# https://theboroer.github.io/localtunnel-www/
# Start localtunnel for port 8501 in the background, then print this machine's
# public IPv4 address with curl.
!npx localtunnel --port 8501 & curl ipv4.icanhazip.com

# 5 min 52 seconds with T4 GPU
# 5 min 27 seconds with V100 GPU
# Improvement from 6 minute 30 seconds with CPU but workload is still highly unparallelized
# We barely use 0.7 / 16 GB GPU RAM

34.86.245.151
[K[?25hnpx: installed 22 in 4.442s
your url is: https://tender-buckets-cover.loca.lt


In [None]:
# 3D Visualization Credit: https://molstar.org/viewer-docs/extensions/mvs/

In [None]:
'''
What happened before/after fetching response (the delay)

https://python.langchain.com/docs/modules/data_connection/retrievers/vectorstore
https://api.python.langchain.com/en/latest/chains/langchain.chains.conversational_retrieval.base.ConversationalRetrievalChain.html

We could explore smaller LLMs.
Have a few benchmark questions.
Quantify that migration to Google Colab did not significantly affect chatbot response. Craft a hypothesis (communication overhead, etc.) and identify bottlenecks (at different stages)
Evaluate amount of time each component took.
Validate response of specific questions and assess how the input PDF data contributes to response (without aspirin / with aspirin)
Are there specific questions that the default model cannot answer but our model does?

Presentation
- What are the conclusions from the results?
- What are the actions going forward?

There are many sources of GPU computing power (idle time etc.) and what constraints we have for the project.
Could we run this on Grace cluster, considering queue time?
'''

'\nWhat happened before/after fetching response (the delay)\n\nhttps://python.langchain.com/docs/modules/data_connection/retrievers/vectorstore\nhttps://api.python.langchain.com/en/latest/chains/langchain.chains.conversational_retrieval.base.ConversationalRetrievalChain.html\n\nWe could explore smaller LLMs.\nHave a few benchmark questions.\nQuantify that migration to Google Colab did not significantly affect chatbot response. Craft a hypothesis (communication overhead, etc.) and identify bottlenecks (at different stages)\nEvaluate amount of time each component took.\nValidate response of specific questions and assess how the input PDF data contributes to response (without aspirin / with aspirin)\nAre there specific questions that the default model cannot answer but our model does?\n\nPresentation\n- What are the conclusions from the results?\n- What are the actions going forward?\n\nThere are many sources of GPU computing power (idle time etc.) and what constraints we have for the pro