In [1]:
from google.cloud import aiplatform
from google.auth import default

In [2]:
credentials, detected_project = default()

In [3]:
PROJECT_ID = "shaghayegh-vertex-ai-lab"   
REGION = "us-central1"

aiplatform.init(project=PROJECT_ID, location=REGION, credentials=credentials)
print("✅ Connected to:", PROJECT_ID, "| ADC project:", detected_project)

✅ Connected to: shaghayegh-vertex-ai-lab | ADC project: shaghayegh-vertex-ai-lab


In [4]:
import vertexai

In [5]:
vertexai.init(project = PROJECT_ID,
              location = REGION,
              credentials = credentials)

Import BigQuery to use as your data warehouse.

In [6]:
from google.cloud import bigquery

In [7]:
bq_client = bigquery.Client(project=PROJECT_ID,
                            credentials = credentials)

# using Stack Overflow Data on BigQuery Public Datasets.

In [9]:
QUERY_TABLES = """
SELECT
  table_name
FROM
  `bigquery-public-data.stackoverflow.INFORMATION_SCHEMA.TABLES`
"""

In [11]:
query_job = bq_client.query(QUERY_TABLES)
results = query_job.result()

for row in results:
    print(row.table_name)


posts_answers
users
posts_orphaned_tag_wiki
posts_tag_wiki
stackoverflow_posts
posts_questions
comments
posts_tag_wiki_excerpt
posts_wiki_placeholder
posts_privilege_wiki
post_history
badges
post_links
tags
votes
posts_moderator_nomination


# Data Retrieval

In [29]:
INSPECT_QUERY = """
SELECT
    *
FROM
    `bigquery-public-data.stackoverflow.posts_questions`
LIMIT 3
"""

In [14]:
import pandas as pd

In [30]:
query_job = bq_client.query(INSPECT_QUERY)  

In [31]:
stack_overflow_df = query_job\
    .result()\
    .to_arrow()\
    .to_pandas()
stack_overflow_df.head()




Unnamed: 0,id,title,body,accepted_answer_id,answer_count,comment_count,community_owned_date,creation_date,favorite_count,last_activity_date,last_edit_date,last_editor_display_name,last_editor_user_id,owner_display_name,owner_user_id,parent_id,post_type_id,score,tags,view_count
0,71102973,Product-card snippet not loading images - Shopify,<p>Hi there i'm making some changes at my prod...,,0,0,NaT,2022-02-13 17:00:30.403000+00:00,,2022-02-13 17:00:30.403000+00:00,NaT,,,,18184792,,1,0,shopify|liquid|code-snippets,256
1,71104461,Google Sign In One Tap & Laravel Socialite - B...,<p>I am trying to set up Google Sign In One Ta...,,0,0,NaT,2022-02-13 20:11:45.200000+00:00,,2022-02-13 20:17:01.387000+00:00,2022-02-13 20:17:01.387000+00:00,,3922429.0,,3922429,,1,0,laravel|oauth-2.0,256
2,71110679,Robot Framework - how to get names of all keyw...,<p>I'm currently working on reporting of Robot...,,0,0,NaT,2022-02-14 10:51:58.633000+00:00,,2022-02-14 10:51:58.633000+00:00,NaT,,,,14633035,,1,0,robotframework|reporting,256


In [38]:
INSPECT_QUERY = """
SELECT
    *
FROM
    `bigquery-public-data.stackoverflow.posts_answers`
LIMIT 3
"""

In [39]:
query_job = bq_client.query(INSPECT_QUERY) 

In [40]:
stack_overflow_df = query_job\
    .result()\
    .to_arrow()\
    .to_pandas()
stack_overflow_df.head()



Unnamed: 0,id,title,body,accepted_answer_id,answer_count,comment_count,community_owned_date,creation_date,favorite_count,last_activity_date,last_edit_date,last_editor_display_name,last_editor_user_id,owner_display_name,owner_user_id,parent_id,post_type_id,score,tags,view_count
0,61160630,,<p>Because you have multiple matching in your ...,,,0,NaT,2020-04-11 17:06:17.323000+00:00,,2020-04-11 17:17:40.823000+00:00,2020-04-11 17:17:40.823000+00:00,,3399356.0,,3399356,61159151,2,0,,
1,61160637,,<p><strong>1.</strong> If you try to use varia...,,,0,NaT,2020-04-11 17:06:38.623000+00:00,,2020-04-11 17:06:38.623000+00:00,NaT,,,,13113414,61160469,2,0,,
2,61160650,,<p>Here is how to do template matching with a ...,,,0,NaT,2020-04-11 17:07:22.420000+00:00,,2020-04-11 17:17:37.030000+00:00,2020-04-11 17:17:37.030000+00:00,,7355741.0,,7355741,61141508,2,0,,


In [32]:
QUERY_ALL = """"
SELECT
    *
FROM
    'bigquery-public-data.stackoverflow.posts_questions'
    """

In [33]:
query_job = bq_client.query(QUERY_ALL)

In [37]:
try:
    stack_overflow_df = query_job.result().to_arrow().to_pandas()
    stack_overflow_df.head()
except Exception as e:
     print('The DataFrame is too large to load into memory.', e)

The DataFrame is too large to load into memory. 400 Syntax error: Unclosed string literal at [1:1]; reason: invalidQuery, location: query, message: Syntax error: Unclosed string literal at [1:1]

Location: US
Job ID: f48b8660-d2db-4304-b881-6f1e3f39a085


Location: US
Job ID: f48b8660-d2db-4304-b881-6f1e3f39a085


Location: US
Job ID: f48b8660-d2db-4304-b881-6f1e3f39a085



# query optimizing

In [48]:
QUERY = """
SELECT
    CONCAT(q.title, q.body) as input_text,
    a.body AS output_text
FROM
    `bigquery-public-data.stackoverflow.posts_questions` q
JOIN
    `bigquery-public-data.stackoverflow.posts_answers` a
ON
    q.accepted_answer_id = a.id
WHERE
    q.accepted_answer_id IS NOT NULL AND
    REGEXP_CONTAINS(q.tags, "python") AND
    a.creation_date >= "2020-01-01"
LIMIT
    10000
"""

In [49]:
query_job = bq_client.query(QUERY)

In [50]:
stack_overflow_df = query_job.result().to_arrow().to_pandas()
stack_overflow_df.head()



Unnamed: 0,input_text,output_text
0,Simple way to measure cell execution time in i...,<p>That was only a problem in old versions.</p...
1,Plotting live pie-chart using matplotlib pyplo...,"<p>Fixed it.</p>\n<pre class=""lang-py prettypr..."
2,sklearn.compose.ColumnTransformer() handles si...,"<p>See the section 6.1.4 in the <a href=""https..."
3,How to import a local module into azure databr...,<p>I've solved this by using python's <code>eg...
4,R equivalent of python -m module for an R pack...,<p>As it turns out there was a really simple s...


# Adding Instructions

In [51]:
INSTRUCTION_TEMPLATE = f"""\
Please answer the following Stackoverflow question on Python. \
Answer it like you are a developer answering Stackoverflow questions.

Stackoverflow question:
"""

In [57]:
stack_overflow_df['input_text_instruct'] = INSTRUCTION_TEMPLATE + ' ' + stack_overflow_df['input_text']

In [64]:
stack_overflow_df.head()

Unnamed: 0,input_text,output_text,input_text_instruct
0,Simple way to measure cell execution time in i...,<p>That was only a problem in old versions.</p...,Please answer the following Stackoverflow ques...
1,Plotting live pie-chart using matplotlib pyplo...,"<p>Fixed it.</p>\n<pre class=""lang-py prettypr...",Please answer the following Stackoverflow ques...
2,sklearn.compose.ColumnTransformer() handles si...,"<p>See the section 6.1.4 in the <a href=""https...",Please answer the following Stackoverflow ques...
3,How to import a local module into azure databr...,<p>I've solved this by using python's <code>eg...,Please answer the following Stackoverflow ques...
4,R equivalent of python -m module for an R pack...,<p>As it turns out there was a really simple s...,Please answer the following Stackoverflow ques...


# Dataset for Tuning

In [58]:
from sklearn.model_selection import train_test_split

In [59]:
train, evaluation = train_test_split(
    stack_overflow_df,
    test_size=0.2,
    random_state=42
)

# Versioning data

In [60]:
import datetime

In [61]:
date = datetime.datetime.now().strftime("%H:%d:%m:%Y")

# Generate a jsonl file.

In [66]:
cols = ['input_text_instruct','output_text']
tune_jsonl = train[cols].to_json(orient="records", lines=True)

In [67]:
training_data_filename = f"tune_data_stack_overflow_python_qa-{date}.jsonl"

In [68]:
with open (training_data_filename,'w') as f:
    f.write(tune_jsonl)

In [71]:
with open(training_data_filename, "r") as f:
    lines = f.readlines()

for line in lines[:5]:
    print(line)


{"input_text_instruct":"Please answer the following Stackoverflow question on Python. Answer it like you are a developer answering Stackoverflow questions.\n\nStackoverflow question:\n deformat locale number string back to number<p>I can create a locale Number string via<\/p>\n\n<pre><code>import locale\nlocale.setlocale(locale.LC_ALL, 'de_DE.utf8')\nnumberstring = '{0:n}'.format(number)\n<\/code><\/pre>\n\n<p>(I found this on <a href=\"https:\/\/stackoverflow.com\/a\/31923117\">this stackoverflow answer<\/a>.)<\/p>\n\n<p>Now, I have an input from the user. I don't know whether it is a valid locale number string or not. If it is, I want to convert it to a number. If it is not, I want to send an error and let him retry.<\/p>\n\n<p>What is the best-practice solution i.e. the one that uses the functions from the <code>locale<\/code> lib that are meant for this? (I guess this does exist?)<\/p>","output_text":"<p>Doesn't <a href=\"https:\/\/docs.python.org\/3\/library\/locale.html#locale.de

In [72]:
tune_jsonl = evaluation[cols].to_json(orient="records", lines=True)
validation_data_filename = f"tune_eval_data_stack_overflow_python_qa-{date}.jsonl"
with open (validation_data_filename,'w') as f:
    f.write(tune_jsonl)

In [73]:
with open(validation_data_filename, "r") as f:
    lines = f.readlines()

for line in lines[:5]:
    print(line)


{"input_text_instruct":"Please answer the following Stackoverflow question on Python. Answer it like you are a developer answering Stackoverflow questions.\n\nStackoverflow question:\n How to test if var is not either of two strings?<p>I've stumbled upon a problem I can't explain.<\/p>\n<pre><code>chosen = input()\nif chosen == &quot;1&quot; or chosen == &quot;2&quot;:\n  print(&quot;Okay&quot;)\nelse:\n  print(&quot;Please choose between 1 or 2.&quot;)\n<\/code><\/pre>\n<p>If written like that it executes as intended, but the flow felt weird, so I want to continue with else, so I changed the statement to !=<\/p>\n<pre><code>chosen = input()\nif chosen != &quot;1&quot; or chosen != &quot;2&quot;:\n  print(&quot;Please choose between 1 or 2.&quot;)\nelse:\n  print(&quot;Okay&quot;)\n<\/code><\/pre>\n<p>That way (to me) it feels natural to continue the code, but now no input returns &quot;Okay&quot;.<\/p>","output_text":"<p>Ideally, you'd use <code>in<\/code> for this, which reads much c