In [None]:
!pip3 install thirdai --upgrade
!pip3 install "thirdai[neural_db]"  # You may have to unquote this

In [1]:
# All imports for the notebook live in this one block (the original imported `os` twice
# and mixed imports with side effects).
# Standard library
import os
import textwrap

# Third-party
from thirdai import licensing, neural_db as ndb
import nltk
import pandas as pd

# Fetch the NLTK "punkt" package; NLTK skips the download when it is already up to date.
# NOTE(review): presumably required by thirdai's neural_db text processing — confirm.
nltk.download("punkt")

# Activate the ThirdAI license. The THIRDAI_KEY environment variable is preferred so the
# key never has to be written into the notebook itself.
if "THIRDAI_KEY" in os.environ:
    licensing.activate(os.environ["THIRDAI_KEY"])
else:
    licensing.activate("")  # Enter your ThirdAI key here

[nltk_data] Downloading package punkt to /home/pratyush/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt to /home/pratyush/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


### Download data

In [2]:
import subprocess

data_dir = "./data/"
# os.makedirs is portable and a no-op when the directory already exists, so neither a
# shell `mkdir` nor a prior isdir() check is needed.
os.makedirs(data_dir, exist_ok=True)

# Local file name -> Dropbox share link for each dataset split.
dataset_urls = {
    "train.csv": "https://www.dropbox.com/scl/fi/e98v27l171otn75x8woe3/train_large.csv?rlkey=6irpgini3g8gwcn78zwu253pp&dl=0",
    "test.csv": "https://www.dropbox.com/scl/fi/56r5rrrlm2l7oz9gb7tm9/test_large.csv?rlkey=a0rqjdmhsizxlh0uuyft7vi65&dl=0",
}

for file_name, url in dataset_urls.items():
    # The argument-list form needs no shell quoting, and check=True raises
    # CalledProcessError when wget exits non-zero instead of silently continuing
    # with a missing or truncated file.
    subprocess.run(["wget", "-nv", "-O", data_dir + file_name, url], check=True)

2024-04-01 15:05:56 URL:https://ucf1e07618567c31b464743abe43.dl.dropboxusercontent.com/cd/0/inline/CQOIbjTqiSGmC1mtm6SZMtqeP_ImoAoKY91oosW0SQf8XeVxdDEW0qrMZMgwAjjk0FKgzkkjqozm0_xMThtpFfUZgTxIQgLjh73b3h-uVOawSnu1vZRudUyFGDuKHVbKP774uosFZUjnYje6sef1wVPX/file [216786183/216786183] -> "./data/train.csv" [1]
2024-04-01 15:05:59 URL:https://uc9eed4732a185b67d99c6369491.dl.dropboxusercontent.com/cd/0/inline/CQMqXtrs0B7NzVASOSa4wO80fVkK8CZokoA046Zw-LdRbIY3Nn3453KU0dRpVk-wM19NVeqXc30z9eAy62npLDvFVw8FZRNK8Hbaq2ubz3dSxohuK_tvLAH2LBjt1sEhlCTlUBSPpeVGx33BjkUYX6Ob/file [1510895/1510895] -> "./data/test.csv" [1]


0

In [3]:
# Paths of the two CSV files inside the data directory.
train_file = f"{data_dir}train.csv"
test_file = f"{data_dir}test.csv"

### Initialize NeuralDB

In [4]:
# Constructor settings for NeuralDB, gathered in one mapping so they are easy to tune.
# NOTE(review): the meaning of each option is defined by thirdai, not by this notebook —
# confirm against the thirdai documentation before changing any value.
neural_db_options = dict(
    fhr=160_000,
    embedding_dimension=4096,
    extreme_output_dim=10_000,
    num_models_per_shard=4,
    extreme_num_hashes=1,
    use_inverted_index=False,
)
db = ndb.NeuralDB(**neural_db_options)

### Evaluate metrics

-   Relevance denotes that the model predicts a relevant answer to the query
-   Helpfulness denotes that the model predicts the highest-scoring answer

In [5]:
def test(db):
    def precision_at_k(predicted, original, k):
        total_precision = 0
        for pred_instance, orig_instance in zip(predicted, original):
            total_precision += len(set(pred_instance[:k]) & set(orig_instance)) / k
        return total_precision / len(predicted) 
    test_df = pd.read_csv(test_file)
    questions = test_df['query'].to_list()
    true_labels_all = list(map(lambda x: list(map(int, x.split(","))), test_df['ids'].to_list()))
    predicted_all, original_all, original_top  = [], [], []
    results = db.search_batch(questions, top_k=1, label_probing=True)
    for result, true_labels in list(zip(results, true_labels_all)):
        predicted_labels = [res.metadata['id'] for res in result]
        predicted_all.append(predicted_labels)
        original_all.append(true_labels)
        original_top.append(true_labels[:1])
    print("Relevance Precision@1 =", precision_at_k(predicted_all, original_all, 1)) 
    print("Helpfulness Precision@1 =", precision_at_k(predicted_all, original_top, 1)) 

### Unsupervised training on answers and titles

In [None]:
# Describe the training CSV: rows are keyed by `id`, and `title` and `answer` are
# passed as the strong columns.
strong_text_columns = ["title", "answer"]
csv_file = ndb.CSV(
    train_file,
    id_column="id",
    strong_columns=strong_text_columns,
)

# Insert the document with training enabled; the returned source ids are kept because
# the supervised step later refers to the first one.
source_ids = db.insert(
    [csv_file],
    train=True,
    learning_rate=0.001,
)

In [7]:
# Evaluate on the test CSV after the unsupervised insert only.
test(db)

Relevance Precision@1 = 0.117
Helpfulness Precision@1 = 0.031


### Supervised training on questions

In [None]:
# Supervised pairs come from the same training CSV: the `query` column supplies the
# text and the `id` column supplies the target, tied to the source inserted earlier.
# NOTE(review): id_delimiter is an empty string here; presumably that means each row
# holds a single id — confirm against the thirdai documentation.
sup_options = dict(
    query_column="query",
    id_delimiter="",
    id_column="id",
    source_id=source_ids[0],
)
sup_data = ndb.Sup(train_file, **sup_options)

db.supervised_train(
    [sup_data],
    learning_rate=0.001,
    epochs=10,
)

In [9]:
# Re-evaluate on the test CSV after supervised training on the questions.
test(db)

Relevance Precision@1 = 0.982
Helpfulness Precision@1 = 0.241


### Upvoting highest scoring answers

In [10]:
# Reduce the training CSV to one row per query: sort by descending score, then take
# the first entry of each query group.
# NOTE(review): GroupBy.first() returns the first non-null value per column, so this is
# the top-scoring row only if that row has no missing values — confirm the data is complete.
train_df = (
    pd.read_csv(train_file)
    .sort_values('score', ascending=False)
    .groupby('query')
    .first()
    .reset_index()
)

# (query, id) pairs to upvote.
batches_to_upvote = list(zip(train_df['query'], train_df['id']))

In [None]:
import random

# Shuffle with a dedicated, seeded generator. The order is then identical on every
# run, the global `random` state is untouched, and re-running this cell does not
# reshuffle an already-shuffled list because a new list is produced.
UPVOTE_SHUFFLE_SEED = 42
shuffled_upvotes = random.Random(UPVOTE_SHUFFLE_SEED).sample(
    batches_to_upvote, k=len(batches_to_upvote)
)

# Upvote the (query, id) pairs.
db.text_to_result_batch(shuffled_upvotes, n_balancing_samples=1, n_upvote_samples=1, epochs=3)

In [12]:
# Final evaluation on the test CSV after upvoting the highest-scoring answers.
test(db)

Relevance Precision@1 = 0.997
Helpfulness Precision@1 = 0.9185


### Let's ask

In [13]:
# Shared text wrapper that breaks long answers into lines of at most 100 characters.
wrapper = textwrap.TextWrapper(width=100)

In [52]:
query = 'I require something similar, a collection of elements without any duplicates. Does Common Lisp, specifically SBCL, offer anything of this sort?'

# Ask for the single best match only.
results = db.search(query, top_k=1, label_probing=True)
for hit in results:
    # Print the stored answer wrapped to the shared line width, then a blank separator.
    for line in wrapper.wrap(text=hit.metadata['answer']):
        print(line)
    print()

 For a quick solution, just use hash tables, as has been mentioned before. However, if you prefer a
more principled approach, you can take a look at FSet , which is “a functional set-theoretic
collections library”. Among others, it contains classes and operations for sets and bags. (EDIT:)
The cleanest way would probably be to define your set-oriented operations as generic functions. A
set of generic functions is basically equivalent to a Java interface, after all. You can simply
implement methods on the standard HASH-TABLE class as a first prototype and allow other
implementations as well.



In [54]:
query = 'I am utilizing a particular command in my C# code, which functions properly. However, it is reported to malfunction in "unmanaged" code. What is managed or unmanaged code?'

# Ask for the single best match only.
results = db.search(query, top_k=1, label_probing=True)
for hit in results:
    # Print the stored answer wrapped to the shared line width, then a blank separator.
    for line in wrapper.wrap(text=hit.metadata['answer']):
        print(line)
    print()

 This is a good article about the subject. To summarize, Managed code is not compiled to machine
code but to an intermediate language which is interpreted and executed by some service on a machine
and is therefore operating within a (hopefully!) secure framework which handles dangerous things
like memory and threads for you. In modern usage this frequently means .NET but does not have to. An
application program that is executed within a runtime engine installed in the same machine. The
application cannot run without it. The runtime environment provides the general library of software
routines that the program uses and typically performs memory management. It may also provide just-
in-time (JIT) conversion from source code to executable code or from an intermediate language to
executable code. Java, Visual Basic and .NET's Common Language Runtime (CLR) are examples of runtime
engines. ( Read more ) Unmanaged code is compiled to machine code and therefore executed by the OS
directly. It 