In [None]:
!pip3 install thirdai --upgrade
!pip3 install "thirdai[neural_db]"  # You may have to unquote this

In [1]:
from thirdai import licensing, neural_db as ndb

import nltk
nltk.download("punkt")

import os
# Use the key from the THIRDAI_KEY environment variable when it is set;
# otherwise fall back to the literal default below.
licensing.activate(os.environ.get("THIRDAI_KEY", ""))  # Enter your ThirdAI key here (replace the "" default)
    
import pandas as pd
import textwrap
import os

[nltk_data] Downloading package punkt to /home/pratyush/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


### Download data

In [2]:
data_dir = "./stackoverflow_data/"
# makedirs is portable and a no-op when the directory already exists, so
# neither a shell `mkdir` nor a prior isdir check is needed.
os.makedirs(data_dir, exist_ok=True)

# Target file name -> download link for each dataset split.
dataset_urls = {
    "train.csv": "https://www.dropbox.com/scl/fi/e7ltns3teqmngai3fp71o/stackoverflow_train.csv?rlkey=f9pq0amo5swfrdi1hcvk52cuc&dl=0",
    "test.csv": "https://www.dropbox.com/scl/fi/9rqf0nf2cmti0ihn5x4uu/stackoverflow_test.csv?rlkey=cn3ttfenhgxrsaemidnat0kt2&dl=0",
}
for file_name, url in dataset_urls.items():
    status = os.system("wget -nv -O " + data_dir + file_name + " '" + url + "'")
    # os.system reports failure only through its return value; surface it here
    # rather than letting a later read_csv fail on a missing or partial file.
    if status != 0:
        raise RuntimeError(f"Download of {file_name} failed (wget returned status {status})")
     

2024-03-11 10:00:10 URL:https://uc2ff0f28034af7151cc590a892f.dl.dropboxusercontent.com/cd/0/inline/CO56XcCi4QgH5aqBCiA-LfqTmYqg6zXn-CS8G5D4dBmr2KWnKLMLJUcT-jfBGsADcSUtVnvLZfvUTLkiU5xT52vfSsibHIs5TkQh1l-bhluPRz4c-Kv-4Xo-A1Jof3SrlzSmzK_TLDOKDuN4qal6krg7/file [26625547/26625547] -> "./stackoverflow_data/train.csv" [1]
2024-03-11 10:00:11 URL:https://uc79d18f4a29378589b05ac80d77.dl.dropboxusercontent.com/cd/0/inline/CO6TqT9id6BmVIZEVcPX5UExmUosZ4crTh88iWPcX6c5UeB6PiKN0kqXMZclZ2NlCrz8Ch3NwdT0ESXbGAeo3KpECFHcuEuTDL2w-Nvhj0Qzuj6byf7JNFo-PA2N9sPU9UwuiW62912oSYWOfhs4qyar/file [136081/136081] -> "./stackoverflow_data/test.csv" [1]


0

In [3]:
# Paths of the downloaded train/test splits inside data_dir.
train_file = f"{data_dir}train.csv"
test_file = f"{data_dir}test.csv"

### Initialize NeuralDB

In [4]:
# NOTE(review): these hyperparameters are forwarded unchanged to NeuralDB; their
# exact meaning is defined by the thirdai library - confirm there before tuning.
db = ndb.NeuralDB(
    fhr=100_000,
    embedding_dimension=1024,
    extreme_output_dim=4000,
    extreme_num_hashes=4,
    use_inverted_index=False,
)

### Evaluate metrics

-   Relevance denotes that the model predicts a relevant answer to the query
-   Helpfulness denotes that the model predicts the highest-scoring answer

In [5]:
def test(db):
    def precision_at_k(predicted, original, k):
        total_precision = 0
        for pred_instance, orig_instance in zip(predicted, original):
            total_precision += len(set(pred_instance[:k]) & set(orig_instance)) / k
        return total_precision / len(predicted) 
    test_df = pd.read_csv(test_file)
    questions = test_df['query'].to_list()
    true_labels_all = list(map(lambda x: list(map(int, x.split(","))), test_df['ids'].to_list()))
    predicted_all, original_all, original_top  = [], [], []
    results = db.search_batch(questions, top_k=1)
    for result, true_labels in list(zip(results, true_labels_all)):
        predicted_labels = [res.metadata['id'] for res in result]
        predicted_all.append(predicted_labels)
        original_all.append(true_labels)
        original_top.append(true_labels[:1])
    print("Relevance Precision@1 =", precision_at_k(predicted_all, original_all, 1)) 
    print("Helpfulness Precision@1 =", precision_at_k(predicted_all, original_top, 1)) 

### Unsupervised training on answers and titles

In [6]:
# Wrap the training CSV as a document source ("title" and "answer" are passed
# as strong columns), then insert it into the database with training enabled.
csv_file = ndb.CSV(
    train_file,
    id_column="id",
    strong_columns=["title", "answer"],
)
source_ids = db.insert([csv_file], train=True, learning_rate=0.001)

loading data | source 'Documents:
train.csv'
loading data | source 'Documents:
train.csv' | vectors 20000 | batches 10 | time 2.195s | complete

train | epoch 0 | train_steps 10 | train_hash_precision@5=0.04441  | train_batches 10 | time 2.748s

loading data | source 'Documents:
train.csv'
loading data | source 'Documents:
train.csv' | vectors 20000 | batches 10 | time 2.103s | complete

train | epoch 1 | train_steps 20 | train_hash_precision@5=0.06808  | train_batches 10 | time 1.586s

loading data | source 'Documents:
train.csv'
loading data | source 'Documents:
train.csv' | vectors 20000 | batches 10 | time 2.164s | complete

train | epoch 2 | train_steps 30 | train_hash_precision@5=0.09506  | train_batches 10 | time 1.547s

loading data | source 'Documents:
train.csv'
loading data | source 'Documents:
train.csv' | vectors 20000 | batches 10 | time 2.059s | complete

train | epoch 3 | train_steps 40 | train_hash_precision@5=0.12612  | train_batches 10 | time 1.485s

loading data | s

In [7]:
# Evaluate the database as it stands after the insert/unsupervised training step.
test(db)

Relevance Precision@1 = 0.04522613065326633
Helpfulness Precision@1 = 0.04020100502512563


### Supervised training on questions

In [8]:
# Build supervised samples from train_file using its "query" and "id" columns,
# tied to the first source returned by the earlier insert.
sup_data = ndb.Sup(
    train_file,
    query_column="query",
    id_column="id",
    id_delimiter="",
    source_id=source_ids[0],
)
db.supervised_train([sup_data], learning_rate=0.001, epochs=10)

loading data | source 'Supervised training samples'
loading data | source 'Supervised training samples' | vectors 20000 | batches 10 | time 0.826s | complete

train | epoch 10 | train_steps 110 |  | train_batches 10 | time 2.980s  

train | epoch 11 | train_steps 120 |  | train_batches 10 | time 2.978s  

train | epoch 12 | train_steps 130 |  | train_batches 10 | time 2.972s  

train | epoch 13 | train_steps 140 |  | train_batches 10 | time 3.004s  

train | epoch 14 | train_steps 150 |  | train_batches 10 | time 3.005s  

train | epoch 15 | train_steps 160 |  | train_batches 10 | time 2.620s  

train | epoch 16 | train_steps 170 |  | train_batches 10 | time 2.921s  

train | epoch 17 | train_steps 180 |  | train_batches 10 | time 2.944s  

train | epoch 18 | train_steps 190 |  | train_batches 10 | time 3.111s  

train | epoch 19 | train_steps 200 |  | train_batches 10 | time 3.085s  



In [9]:
# Re-run the evaluation now that supervised training on the queries has finished.
test(db)

Relevance Precision@1 = 0.9547738693467337
Helpfulness Precision@1 = 0.7738693467336684


### Upvoting highest scoring answers

In [11]:
# Sort rows by score (highest first), then collapse to one row per query via
# GroupBy.first(), and pair each query with the id that survives.
train_df = (
    pd.read_csv(train_file)
    .sort_values('score', ascending=False)
    .groupby('query')
    .first()
    .reset_index()
)
batches_to_upvote = list(zip(train_df['query'], train_df['id']))

In [12]:
import random

# Shuffle with a fixed seed so that a fresh Restart & Run All presents the
# (query, id) pairs in the same order every time. The shuffle is in place.
UPVOTE_SHUFFLE_SEED = 42
random.Random(UPVOTE_SHUFFLE_SEED).shuffle(batches_to_upvote)

# Feed the shuffled (query, id) pairs to the database as upvote feedback.
db.text_to_result_batch(batches_to_upvote)

In [13]:
# Evaluate once more after the upvoting step above.
test(db)

Relevance Precision@1 = 0.9597989949748744
Helpfulness Precision@1 = 0.9195979899497487


### Let's ask

In [14]:
# Text wrapper that breaks long text into lines of at most 100 characters.
wrapper = textwrap.TextWrapper(width=100)

In [15]:
query = """I possess an algorithm that produces strings from a list of input words. 
How do I isolate only the strings that sound like English words? ie. reject 
RDLO while retaining LORD. EDIT: To clarify, they don't have to be real words 
in the dictionary. They just need to resemble English. For instance, KEAL would be 
acceptable."
"""

# Fetch the top result for the query and print its answer, wrapped by
# `wrapper`, followed by a blank line.
results = db.search(query, top_k=1)
for result in results:
    answer = result.metadata['answer']
    for element in wrapper.wrap(text=answer):
        print(element)
    print()

 You can build a markov-chain of a huge english text. Afterwards you can feed words into the markov
chain and check how high the probability is that the word is english. See here:
http://en.wikipedia.org/wiki/Markov_chain At the bottom of the page you can see the markov text
generator. What you want is exactly the reverse of it. In a nutshell: The markov-chain stores for
each character the probabilities of which next character will follow. You can extend this idea to
two or three characters if you have enough memory.



In [16]:
query = """What is the optimal way to unit test a method that invokes multiple methods, 
for instance: modify(string value) { if(value.Length &gt; 5) replaceit(value); else 
changeit(value); } This pseudo code has a modify method that (currently) calls either replaceit() 
or changeit() . I have already written tests for replaceit and changeit , so creating a new test for
modify will be 99% the same set of code. I need to test it though because it might change 
in the future. So do I duplicate the existing test code? Shift the test code to a common function? 
Any other suggestions? I'm uncertain of the best practice here.
"""

# Fetch the top result for the query and print its answer, wrapped by
# `wrapper`, followed by a blank line.
results = db.search(query, top_k=1)
for result in results:
    answer = result.metadata['answer']
    for element in wrapper.wrap(text=answer):
        print(element)
    print()

 You basically need 2 tests. 1) Pass in a string like "The Quick Brown Fox Jumps!" (length greater
than five) makes sure that the value is affected by replaceit(...) 2) Pass in a string like "Foo"
(length is less than five) and make sure that the value is affected by changeit(...) Your test (in
pseudo code) might look like this: testLongValue() { string testValue = "A value longer than 5
chars"; string expected = "Replaced!"; string actual = modify(testValue); assertEqual(expected,
actual); } testShortValue() { string testValue = "len4"; string expected = "Changed!"; string actual
= modify(testValue); assertEqual(expected, actual); } Obviously I could give you a more realistic
example if I knew what replacit() and changeit() were supposed to do, but this should give you the
idea. If it mutates the original value reference instead of returning it, you can just use testValue
as the actual value after the call occurs.



In [17]:
query = """I am seeking to develop a very lightweight GUI front end in Windows.
It's meant to perform a simple task - when a hot key combination is pressed it
opens up a text box. Any text can be pasted in and then saved with a simple text 
box. I am aiming to avoid any menu bar or toolbars completely. What would be the
perfect GUI library to create something like this?"""

# Fetch the top result for the query and print its answer, wrapped by
# `wrapper`, followed by a blank line.
results = db.search(query, top_k=1)
for result in results:
    answer = result.metadata['answer']
    for element in wrapper.wrap(text=answer):
        print(element)
    print()

 The question is pretty broad, but I'm partial to markup-based UIs. Here's a window with a text box
in WPF: &lt;Window xmlns="http://schemas.microsoft.com/winfx/2006/xaml/presentation"&gt;
&lt;Grid&gt; &lt;TextBox x:Name="InputBox"/&gt; &lt;/Grid&gt; &lt;/Window&gt; Now I won't even try
to claim that WPF has the shortest learning curve, but it is the most powerful on Windows and it's
pretty easy to pick up with the right tooling. (i.e. Expression Blend). Blend isn't cheap but some
folks already have it for free and don't know it (students, MSDN subscribers, some startups). Visual
Studio 2010 is much improved in this area too, so Blend may not be needed.

