In [45]:
# Authenticate and initialise the Vertex AI SDK for the rest of the notebook.
# `authenticate` comes from the local `utils` module; presumably it returns a
# credentials object accepted by the Google Cloud clients — TODO confirm.
from utils import authenticate

# NOTE(review): the project id and region are hard-coded here; consider
# reading them from configuration so the notebook can run in other projects.
PROJECT_ID = "stately-command-416115"
credentials = authenticate()
REGION = "us-central1"
print(PROJECT_ID)

import vertexai
vertexai.init(project=PROJECT_ID, location=REGION, credentials=credentials)

ImportError: cannot import name 'clusters_2D' from 'utils' (c:\Users\RKeelan\Src\deep-learning-ai\utils.py)

In [6]:
from google.cloud import bigquery
import pandas as pd

def run_bq_query(sql):
    """Run ``sql`` on BigQuery and return the result as a pandas DataFrame.

    The client is built from the notebook-level ``PROJECT_ID`` and
    ``credentials``. The statement is first submitted as a dry run (query
    cache disabled) so that an invalid query surfaces an error before the
    real job is submitted; the real job is then run and its job id printed.
    """
    client = bigquery.Client(project=PROJECT_ID, credentials=credentials)

    # Validation pass: a dry run does not execute the query.
    dry_run_config = bigquery.QueryJobConfig(dry_run=True, use_query_cache=False)
    client.query(sql, job_config=dry_run_config)

    # Real execution with a default job configuration.
    query_job = client.query(sql, job_config=bigquery.QueryJobConfig())
    finished_job_id = query_job.job_id

    # result() waits for the job; the rows are converted Arrow -> pandas.
    result_df = query_job.result().to_arrow().to_pandas()
    print(f"Finished job_id: {finished_job_id}")
    return result_df

In [9]:
language_list = ["python", "html", "r", "css"]
so_df = pd.DataFrame()
for language in language_list:
    print(f"Generating {language} dataframe")
    # Two fixes compared with the earlier version of this query:
    #   * a comma now separates the two SELECT columns (the query was invalid SQL);
    #   * the tag filter matches a whole tag instead of a substring. `tags` is a
    #     '|'-separated string in this public dataset, and a substring/regex match
    #     on "r" accepted any tag containing the letter r.
    query = f"""
    SELECT
        CONCAT(q.title, q.body) as input_text,
        a.body as output_text
    FROM
        `bigquery-public-data.stackoverflow.posts_questions` q
    JOIN
        `bigquery-public-data.stackoverflow.posts_answers` a
    ON
        q.accepted_answer_id = a.id
    WHERE
        q.accepted_answer_id IS NOT NULL AND
        "{language}" IN UNNEST(SPLIT(q.tags, "|")) AND
        a.creation_date >= "2020-01-01"
    LIMIT
        5000
    """

    # The BigQuery round-trip is disabled; the cached CSV below is used instead.
    # Uncomment these three lines to rebuild the dataset with `run_bq_query`.
    #language_df = run_bq_query(query)
    #language_df["category"] = language
    #so_df = pd.concat([so_df, language_df], ignore_index=True)

# NOTE(review): the cached CSV was presumably produced by the earlier
# substring-matching query, so its `category` labels may be noisy — regenerate
# it with the query above if clean labels are needed.
so_df = pd.read_csv('so_database_app.csv')
so_df

Generating python dataframe
Generating html dataframe
Generating r dataframe
Generating css dataframe


Unnamed: 0,input_text,output_text,category
0,"python's inspect.getfile returns ""<string>""<p>...",<p><code>&lt;string&gt;</code> means that the ...,python
1,Passing parameter to function while multithrea...,<p>Try this and note the difference:</p>\n<pre...,python
2,How do we test a specific method written in a ...,"<p>Duplicate of <a href=""https://stackoverflow...",python
3,how can i remove the black bg color of an imag...,<p>The alpha channel &quot;disappears&quot; be...,python
4,How to extract each sheet within an Excel file...,<p>You need to specify the <code>index</code> ...,python
...,...,...,...
1995,Is it possible to made inline-block elements l...,<p>If this is only for the visual purpose then...,css
1996,Flip Clock code works on Codepen and doesn't w...,<p>You forgot to attach the CSS file for the f...,css
1997,React Native How can I put one view in front o...,<p>You can do it using zIndex for example:</p>...,css
1998,setting fixed width with 100% height of the pa...,<p>You can use <code>width: calc(100% - 100px)...,css


In [10]:
# Load the pretrained text-embedding model; `model` is used by the embedding
# calls in the cells below.
from vertexai.language_models import TextEmbeddingModel
model = TextEmbeddingModel.from_pretrained("textembedding-gecko@001")

In [11]:
import time
import numpy as np

def generate_batches(sentences, batch_size = 5):
    """Yield consecutive slices of ``sentences`` with at most ``batch_size`` items.

    Slices are produced in order; the last one is shorter when
    ``len(sentences)`` is not a multiple of ``batch_size``.
    """
    batch_starts = range(0, len(sentences), batch_size)
    yield from (sentences[start:start + batch_size] for start in batch_starts)

In [12]:
# Take the question text of the first 200 rows and split it lazily into batches.
so_questions = so_df["input_text"].iloc[:200].tolist()
batches = generate_batches(so_questions)

In [14]:
# Pull the next batch from the generator and show its size.
# Note: `batches` is stateful, so re-running this cell advances to the following batch.
batch = next(batches)
len(batch)

5

In [18]:
def encode_texts_to_embeddings(sentences):
    """Embed ``sentences`` with the notebook-level ``model``.

    Returns a list with one entry per input sentence. On success each entry
    is the ``values`` attribute of the corresponding embedding. If the
    embedding call raises, every entry is ``None`` (best-effort behaviour is
    kept), and the failure is reported through a ``RuntimeWarning`` instead
    of being discarded silently.
    """
    import warnings  # local import so the cell does not depend on another cell's imports

    try:
        embeddings = model.get_embeddings(sentences)
        return [embedding.values for embedding in embeddings]
    except Exception as exc:
        # Surface the reason for the failure; callers still receive the
        # same-length list of None placeholders as before.
        warnings.warn(
            f"Embedding request failed for {len(sentences)} sentence(s): {exc!r}",
            RuntimeWarning,
            stacklevel=2,
        )
        return [None] * len(sentences)
    

In [19]:
# Embed the current batch; the entries are None if the embedding request failed.
batch_embeddings = encode_texts_to_embeddings(batch)

In [20]:
# Report how many embeddings came back and the length of the first one.
# NOTE(review): this raises TypeError when the first entry is None (failed embedding call).
f"{len(batch_embeddings)} embeddings of size {len(batch_embeddings[0])}"

'5 embeddings of size 768'

In [21]:
# Read the cached Stack Overflow dataset and preview the first rows.
# NOTE(review): the same CSV is also read in other cells; this rebinds `so_df` to a fresh copy.
so_df = pd.read_csv('so_database_app.csv')
so_df.head()

Unnamed: 0,input_text,output_text,category
0,"python's inspect.getfile returns ""<string>""<p>...",<p><code>&lt;string&gt;</code> means that the ...,python
1,Passing parameter to function while multithrea...,<p>Try this and note the difference:</p>\n<pre...,python
2,How do we test a specific method written in a ...,"<p>Duplicate of <a href=""https://stackoverflow...",python
3,how can i remove the black bg color of an imag...,<p>The alpha channel &quot;disappears&quot; be...,python
4,How to extract each sheet within an Excel file...,<p>You need to specify the <code>index</code> ...,python


In [24]:
# Load the precomputed question embeddings from a local pickle file.
# NOTE(security): pickle.load can execute arbitrary code while unpickling;
# only load this file if it comes from a trusted source.
import pickle
with open('question_embeddings_app.pkl', 'rb') as file:
    question_embeddings = pickle.load(file)

# Show the array's shape followed by its (truncated) contents.
print("Shape: " + str(question_embeddings.shape))
print(question_embeddings)

Shape: (2000, 768)
[[-0.03571156 -0.00240684  0.05860338 ... -0.03100227 -0.00855574
  -0.01997405]
 [-0.02024316 -0.0026255   0.01940405 ... -0.02158143 -0.05655403
  -0.01040497]
 [-0.05175979 -0.03712264  0.02699278 ... -0.07055898 -0.0402537
   0.00092099]
 ...
 [-0.00580394 -0.01621097  0.05829635 ... -0.03350992 -0.05343556
  -0.06016821]
 [-0.00436622 -0.02692963  0.03363771 ... -0.01686567 -0.03812337
  -0.02329491]
 [-0.04240424 -0.01633749  0.05516777 ... -0.02697376 -0.01751165
  -0.04558187]]


In [28]:
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA

# Cluster the first 1000 question embeddings into two groups with k-means.
clustering_dataset = question_embeddings[:1000]
n_clusters = 2
# random_state is fixed so that repeated runs start from the same initialisation.
kmeans = KMeans(n_clusters=n_clusters, random_state=0, n_init = 'auto').fit(clustering_dataset)

In [29]:
# Cluster label assigned by k-means to each row of `clustering_dataset`.
kmeans_labels = kmeans.labels_
# Project the embeddings onto their first two principal components for plotting.
# random_state is set (as for the other estimators in this notebook) because
# PCA may pick a randomized SVD solver for data of this size, which would make
# the projection vary between runs.
PCA_model = PCA(n_components=2, random_state=0)
PCA_model.fit(clustering_dataset)
new_values = PCA_model.transform(clustering_dataset)

In [47]:
import matplotlib.pyplot as plt
import mplcursors
# The 2D cluster plot below is disabled: the local `utils` module does not
# provide `clusters_2D` (importing it raises ImportError), so the import and
# the call are left commented out.
# from utils import clusters_2D
#%matplotlib ipympl

# clusters_2D(x_values=new_values[:,0], y_values=new_values[:,1], labels=so_df[:1000], kmeans_labels=kmeans_labels)

In [48]:
from sklearn.ensemble import IsolationForest
import numpy as np
# A question about baking, used as an off-topic sample for the outlier detection further down.
input_text = """I am making cookies but don't 
                remember the correct ingredient proportions. 
                I have been unable to find 
                anything on the web."""

# Embed the text and append it as one extra row after the question embeddings.
emb = model.get_embeddings([input_text])[0].values
# Round-trip through a Python list so the new vector can be appended, then rebuild the array.
embeddings_1 = question_embeddings.tolist()
embeddings_1.append(emb)
embeddings_array = np.array(embeddings_1)
print("Shape: " + str(embeddings_array.shape))
print(embeddings_array)

Shape: (2001, 768)
[[-0.03571156 -0.00240684  0.05860338 ... -0.03100227 -0.00855574
  -0.01997405]
 [-0.02024316 -0.0026255   0.01940405 ... -0.02158143 -0.05655403
  -0.01040497]
 [-0.05175979 -0.03712264  0.02699278 ... -0.07055898 -0.0402537
   0.00092099]
 ...
 [-0.00436622 -0.02692963  0.03363771 ... -0.01686567 -0.03812337
  -0.02329491]
 [-0.04240424 -0.01633749  0.05516777 ... -0.02697376 -0.01751165
  -0.04558187]
 [-0.00302366 -0.02049104  0.02172194 ... -0.04479321 -0.05254056
  -0.00319716]]


In [49]:
# Add the outlier text to the end of the stackoverflow dataframe
so_df = pd.read_csv('so_database_app.csv')
# The new row has no answer text (None) and is labelled with the category "baking".
new_row = pd.Series([input_text, None, "baking"], index=so_df.columns)
# Using len(so_df) as the row label appends the row — assumes the default 0..n-1 index.
so_df.loc[len(so_df)] = new_row
so_df.tail()

Unnamed: 0,input_text,output_text,category
1996,Flip Clock code works on Codepen and doesn't w...,<p>You forgot to attach the CSS file for the f...,css
1997,React Native How can I put one view in front o...,<p>You can do it using zIndex for example:</p>...,css
1998,setting fixed width with 100% height of the pa...,<p>You can use <code>width: calc(100% - 100px)...,css
1999,How to make sidebar button not bring viewpoint...,"<p>It is quite simple, just remove that href=""...",css
2000,I am making cookies but don't \n ...,,baking


In [50]:
# Fit an isolation forest on all embeddings (including the appended baking text)
# and list the rows it flags as outliers (prediction -1).
clf = IsolationForest(contamination=0.005, random_state=2)
preds = clf.fit_predict(embeddings_array)
print(f"{len(preds)} predictions. Set of possible values: {set(preds)}")
# The boolean mask assumes row k of so_df corresponds to row k of embeddings_array — verify.
so_df.loc[preds == -1]

2001 predictions. Set of possible values: {1, -1}


Unnamed: 0,input_text,output_text,category
203,extract channel names from a multi-channel ima...,<p>PerkinElmer QPI metadata are stored as XML ...,python
1018,ASP .NET - JSON Serializer not working on clas...,"<p>Ok, I forgot to add default <code>{ get; se...",r
1138,parse year and month from a string SQL BigQuer...,<p>How about using string operations?</p>\n<pr...,r
1313,Array initialization with ternary operator in ...,"<p>To make your code work, do the following in...",r
1358,How to represent 2 Entity with 2 Relation in E...,"<p><a href=""https://i.stack.imgur.com/BJxBP.pn...",r
1403,"Apache ignite Partition Map Exchange , Baselin...","<p>Long story short, these topics are about da...",r
1427,Shortcut to reveal in Finder for currently ope...,<p>No. It is not present but we can add it. Go...,r
1493,How to change id of datatable?<p>I have some w...,<p>In short - you can't. But maybe you can:</p...,r
1498,What’s the difference between Next.js rewrites...,<p><code>rewrites</code> are a convenient way ...,r
2000,I am making cookies but don't \n ...,,baking


In [51]:
# Remove the outlier about baking
so_df = so_df.drop(so_df.index[-1])
so_df

Unnamed: 0,input_text,output_text,category
0,"python's inspect.getfile returns ""<string>""<p>...",<p><code>&lt;string&gt;</code> means that the ...,python
1,Passing parameter to function while multithrea...,<p>Try this and note the difference:</p>\n<pre...,python
2,How do we test a specific method written in a ...,"<p>Duplicate of <a href=""https://stackoverflow...",python
3,how can i remove the black bg color of an imag...,<p>The alpha channel &quot;disappears&quot; be...,python
4,How to extract each sheet within an Excel file...,<p>You need to specify the <code>index</code> ...,python
...,...,...,...
1995,Is it possible to made inline-block elements l...,<p>If this is only for the visual purpose then...,css
1996,Flip Clock code works on Codepen and doesn't w...,<p>You forgot to attach the CSS file for the f...,css
1997,React Native How can I put one view in front o...,<p>You can do it using zIndex for example:</p>...,css
1998,setting fixed width with 100% height of the pa...,<p>You can use <code>width: calc(100% - 100px)...,css


In [52]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split

# Reload the dataset and use the precomputed question embeddings as the feature matrix.
so_df = pd.read_csv('so_database_app.csv')
X = question_embeddings
X.shape

(2000, 768)

In [53]:
# Target labels: the `category` column of the dataset as an array.
y = so_df['category'].values
y.shape

(2000,)

In [55]:
# Hold out 20% of the samples for testing; random_state fixes the split across runs.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=2)

In [56]:
from sklearn.metrics import accuracy_score
# Train a random forest on the embedding features and score it on the held-out split.
# random_state is fixed (matching the seed used for the train/test split) so the
# reported accuracy is reproducible between runs.
clf = RandomForestClassifier(n_estimators=200, random_state=2)
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy: ", accuracy)

Accuracy:  0.685


In [57]:
# Pick the row to classify (any index from 0 to 1999).
i = 2
label = so_df.at[i, 'category']
question = so_df.at[i, 'input_text']

# Embed the question text and classify that single embedding.
question_embedding = model.get_embeddings([question])[0].values
pred = clf.predict([question_embedding])

# Print the prediction, the true label and the question, one item per line.
print(
    f"For question {i}, the prediction is `{pred[0]}`",
    f"The actual label is `{label}`",
    "The question text is:",
    "-"*50,
    question,
    sep="\n",
)

For question 2, the prediction is `python`
The actual label is `python`
The question text is:
--------------------------------------------------
How do we test a specific method written in a list of files for functional testing in python<p>The project has so many modules. There are functional test cases being written for almost every api written like for GET requests, POST requests and PUT requests. To test an individual file we use the syntact pytest tests/file_name.py
but I want to test a specific method in that file. Is there any way to test it like that??</p>
