In [89]:
# Load credentials and the relevant Python libraries

In [170]:
import sys, os

# Make sure the current notebook's folder is searched first.
# abspath("") resolves to the current working directory. Any existing entry
# for it is dropped before re-inserting at index 0, so the folder is promoted
# to the front even when it was already present further down sys.path.
notebook_dir = os.path.abspath("")
sys.path[:] = [entry for entry in sys.path if entry != notebook_dir]
sys.path.insert(0, notebook_dir)

In [172]:
from google.auth.transport.requests import Request
from google.oauth2.service_account import Credentials

In [174]:
from utils import authenticate
# authenticate() (helper from the local utils module) yields a
# credentials / project-id pair, unpacked here.
# NOTE(review): both names are reassigned by later cells in this notebook.
auth_pair = authenticate()
credentials, PROJECT_ID = auth_pair

In [176]:
# Location of the JSON key file downloaded from Google Cloud for your
# service account.
key_path = "chatbot-key.json"

In [178]:
# Build service-account credentials from the JSON key file, scoped for the
# Google Cloud Platform APIs.
credentials = Credentials.from_service_account_file(
    key_path,
    scopes=['https://www.googleapis.com/auth/cloud-platform'])

# Freshly loaded service-account credentials carry no token and no expiry, so
# `expired` reports False and a refresh guarded by it would be skipped.
# `valid` is only True once a non-expired token is present, so checking it
# fetches the first access token as well as renewing a stale one.
if not credentials.valid:
    credentials.refresh(Request())

In [180]:
# Google Cloud project id and region handed to vertexai.init().
PROJECT_ID, REGION = "chatbot-ai-463712", "us-central1"

In [182]:
import vertexai

# Point the Vertex AI SDK at the project, region and credentials defined above.
vertexai.init(
    project=PROJECT_ID,
    location=REGION,
    credentials=credentials,
)

In [184]:
## Load Stack Overflow questions and answers (originally from BigQuery, here read from a CSV file)
import pandas as pd

In [186]:
# Use the pre-made dataset: a CSV file of Stack Overflow questions and answers.
SO_DATABASE_CSV = 'so_database_app.csv'
so_database = pd.read_csv(SO_DATABASE_CSV)

In [188]:
# Report the frame's dimensions, then print its plain-text representation.
print(f"Shape: {so_database.shape}")
print(so_database)

Shape: (2000, 3)
                                             input_text  \
0     python's inspect.getfile returns "<string>"<p>...   
1     Passing parameter to function while multithrea...   
2     How do we test a specific method written in a ...   
3     how can i remove the black bg color of an imag...   
4     How to extract each sheet within an Excel file...   
...                                                 ...   
1995  Is it possible to made inline-block elements l...   
1996  Flip Clock code works on Codepen and doesn't w...   
1997  React Native How can I put one view in front o...   
1998  setting fixed width with 100% height of the pa...   
1999  How to make sidebar button not bring viewpoint...   

                                            output_text category  
0     <p><code>&lt;string&gt;</code> means that the ...   python  
1     <p>Try this and note the difference:</p>\n<pre...   python  
2     <p>Duplicate of <a href="https://stackoverflow...   python  
3     

In [190]:
#Loading the question embeddings

In [192]:
from vertexai.language_models import TextEmbeddingModel

In [194]:
# Load the pretrained Vertex AI text-embedding model by name.
EMBEDDING_MODEL_NAME = "text-embedding-005"
embedding_model = TextEmbeddingModel.from_pretrained(EMBEDDING_MODEL_NAME)

In [196]:
import numpy as np
from utils import encode_text_to_embedding_batched

In [119]:
#In this project we are given pre-computed embeddings to save on API costs. The commented-out code below shows how to compute them and can be reused in future projects.
# Encode the stack overflow data

#so_questions = so_database.input_text.tolist()
#question_embeddings = encode_text_to_embedding_batched(
#            sentences = so_questions,
#            api_calls_per_second = 20/60, 
#            batch_size = 5)

In [198]:
import pickle
# Read the pre-computed question embeddings back from disk.
# NOTE(review): pickle.load can execute arbitrary code embedded in the file,
# so only use it on a pickle that comes from a trusted source.
with open('question_embeddings_app.pkl', 'rb') as embeddings_file:
    question_embeddings = pickle.load(embeddings_file)

print(question_embeddings)

[[-0.03571156 -0.00240684  0.05860338 ... -0.03100227 -0.00855574
  -0.01997405]
 [-0.02024316 -0.0026255   0.01940405 ... -0.02158143 -0.05655403
  -0.01040497]
 [-0.05175979 -0.03712264  0.02699278 ... -0.07055898 -0.0402537
   0.00092099]
 ...
 [-0.00580394 -0.01621097  0.05829635 ... -0.03350992 -0.05343556
  -0.06016821]
 [-0.00436622 -0.02692963  0.03363771 ... -0.01686567 -0.03812337
  -0.02329491]
 [-0.04240424 -0.01633749  0.05516777 ... -0.02697376 -0.01751165
  -0.04558187]]


In [200]:
# Convert the loaded embeddings to plain Python lists and attach them to the
# frame as a new `embeddings` column (one entry per row).
embedding_rows = question_embeddings.tolist()
so_database['embeddings'] = embedding_rows

In [202]:
# Display the frame, which now carries the `embeddings` column.
so_database

Unnamed: 0,input_text,output_text,category,embeddings
0,"python's inspect.getfile returns ""<string>""<p>...",<p><code>&lt;string&gt;</code> means that the ...,python,"[-0.03571155667304993, -0.0024068362545222044,..."
1,Passing parameter to function while multithrea...,<p>Try this and note the difference:</p>\n<pre...,python,"[-0.020243162289261818, -0.002625499852001667,..."
2,How do we test a specific method written in a ...,"<p>Duplicate of <a href=""https://stackoverflow...",python,"[-0.05175979062914848, -0.03712264448404312, 0..."
3,how can i remove the black bg color of an imag...,<p>The alpha channel &quot;disappears&quot; be...,python,"[0.02206624671816826, -0.028208276256918907, 0..."
4,How to extract each sheet within an Excel file...,<p>You need to specify the <code>index</code> ...,python,"[-0.05498068407177925, -0.0032414537854492664,..."
...,...,...,...,...
1995,Is it possible to made inline-block elements l...,<p>If this is only for the visual purpose then...,css,"[-0.009190441109240055, -0.01732615754008293, ..."
1996,Flip Clock code works on Codepen and doesn't w...,<p>You forgot to attach the CSS file for the f...,css,"[-0.009033069014549255, -0.0009270847076550126..."
1997,React Native How can I put one view in front o...,<p>You can do it using zIndex for example:</p>...,css,"[-0.005803938489407301, -0.016210969537496567,..."
1998,setting fixed width with 100% height of the pa...,<p>You can use <code>width: calc(100% - 100px)...,css,"[-0.004366223234683275, -0.02692963369190693, ..."


In [127]:
## Semantic Search

#When a user asks a question, we can embed their query on the fly and search over all of the Stack Overflow question embeddings to find the most similar data point.

In [317]:
import numpy as np

from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics import pairwise_distances_argmin as distances_argmin

In [319]:
# The user's question, wrapped in a one-element list for the embedding call.
query = ["How to create App.js file"]

#query = ['To test an individual file we use the syntact pytest tests/file_name.py but I want to test a specific method in that file. Is there any way to test it like that?']

In [321]:
# Embed the query list and keep the vector of its first entry.
first_embedding = embedding_model.get_embeddings(query)[0]
query_embedding = first_embedding.values

In [322]:
# Cosine similarity between the query vector and every stored question vector.
database_embeddings = so_database["embeddings"].tolist()
cos_sim_array = cosine_similarity([query_embedding], database_embeddings)

In [324]:
# Inspect the result's dimensions: one row for the single query vector, one
# column per stored embedding.
cos_sim_array.shape

(1, 2000)

In [327]:
#Once we have both the query embedding and the database embeddings, we can use similarity search to retrieve the most similar entry in the database.

In [329]:
# Position of the highest similarity score (argmax over the flattened array).
index_doc_cosine = cos_sim_array.argmax()

In [331]:
# Alternative lookup: index of the stored embedding closest to the query by
# distance, using pairwise_distances_argmin with its default metric.
nearest_indices = distances_argmin(
    [query_embedding],
    so_database["embeddings"].tolist(),
)
index_doc_distances = nearest_indices[0]

In [333]:
# Show the stored question text at the best cosine-similarity match.
so_database["input_text"][index_doc_cosine]

"How to use virtual environment installed on Ubuntu which is installed on Windows?<p>Recently, I've installed  airflow on Windows 10 by installing it on Ubuntu from MS Store. And I can see Airflow web-server from a browser from Windows. However, now I want to create a DAG using Pycharm on Windows, and I have to select python interpreter with airflow installed in its venv.</p>\n<p>Where can I find this virtual environment from Windows?</p>"

In [335]:
# Show the stored answer text that belongs to the same matched row.
so_database["output_text"][index_doc_cosine]

'<p>I solved this problem myself. WSL is the feature of Pycharm professional edition, and there is no way to use it in the community edition.</p>\n<p><a href="https://www.jetbrains.com/help/pycharm/using-wsl-as-a-remote-interpreter.html#configure-wsl" rel="nofollow noreferrer">https://www.jetbrains.com/help/pycharm/using-wsl-as-a-remote-interpreter.html#configure-wsl</a></p>'

In [337]:
# Question answering with relevant context
#Now that we have found the most similar Stack Overflow question, we can take the corresponding answer and use an LLM to produce a more conversational response.

In [339]:
from vertexai.preview.generative_models import GenerativeModel
from vertexai.preview.generative_models import GenerationConfig

In [341]:
#generation_model = TextGenerationModel.from_pretrained(
#    "text-bison@002")
# Instantiate the Gemini generative model used to phrase the final answer.
GENERATIVE_MODEL_NAME = "gemini-2.0-flash-lite-001"
model = GenerativeModel(GENERATIVE_MODEL_NAME)

In [359]:
# Assemble the retrieved question/answer pair into the context given to the LLM.
retrieved_question = so_database.input_text[index_doc_cosine]
retrieved_answer = so_database.output_text[index_doc_cosine]
context = "Question: " + retrieved_question + "\n Answer: " + retrieved_answer

# NOTE(review): `query` was a one-element list in the retrieval cells and is
# rebound here to a plain string with different wording — confirm intended.
query = "Where can I find this virtual environment from Windows"

print("Context Used:\n", context)

Context Used:
 Question: How to use virtual environment installed on Ubuntu which is installed on Windows?<p>Recently, I've installed  airflow on Windows 10 by installing it on Ubuntu from MS Store. And I can see Airflow web-server from a browser from Windows. However, now I want to create a DAG using Pycharm on Windows, and I have to select python interpreter with airflow installed in its venv.</p>
<p>Where can I find this virtual environment from Windows?</p>
 Answer: <p>I solved this problem myself. WSL is the feature of Pycharm professional edition, and there is no way to use it in the community edition.</p>
<p><a href="https://www.jetbrains.com/help/pycharm/using-wsl-as-a-remote-interpreter.html#configure-wsl" rel="nofollow noreferrer">https://www.jetbrains.com/help/pycharm/using-wsl-as-a-remote-interpreter.html#configure-wsl</a></p>


In [361]:
# Prompt skeleton: instructions first, then the retrieved context, then the
# user's question. The two placeholders are filled in below.
PROMPT_TEMPLATE = """You are a helpful assistant that answers programming-related questions.
Use the relevant information from the provided context to answer the user's question.
If the context does not help, reply with:
"I couldn't find a good match in the document database for your query"

Context:
{context}

User's Question:
{query}

Answer:"""
prompt = PROMPT_TEMPLATE.format(context=context, query=query)

# Send the prompt to the model and print the text of its reply.
response = model.generate_content(prompt)
print(response.text)

I solved this problem myself. WSL is the feature of Pycharm professional edition, and there is no way to use it in the community edition.
https://www.jetbrains.com/help/pycharm/using-wsl-as-a-remote-interpreter.html#configure-wsl



In [355]:
# Generate the answer again, this time with explicit sampling settings
# (temperature 0.7, nucleus sampling at 0.8, at most 256 output tokens).
sampling_settings = dict(temperature=0.7, top_p=0.8, max_output_tokens=256)
config = GenerationConfig(**sampling_settings)
response = model.generate_content(prompt, generation_config=config)
answer = response.text
print(answer)

I couldn't find a good match in the document database for your query



In [None]:
#Now the chatbot is working as expected using gemini flash!