## Clean the Data

In [1]:
# %pip install --upgrade langchain langchain-openai chromadb


In [2]:
import sqlite3
from pydantic import BaseModel

from langchain_core.documents import Document


In [3]:
# read data from "./data/db.sqlite3"
conn = sqlite3.connect("./data/db.sqlite3")
# create cursor
cur = conn.cursor()
# read data from strains_strain table
cur.execute("SELECT * FROM strains_strain")
# fetch all data
data = cur.fetchall()


In [4]:
data[0]


(1,
 'Green Crack',
 'green-crack',
 'Green Crack is a highly energetic and uplifting sativa-dominant strain known for its sharp focus and mental stimulation. The effects of Green Crack are invigorating, making it a popular choice for daytime use to boost creativity and productivity. This strain has a unique combination of sweet and tangy flavors with hints of citrus and tropical fruits. Green Crack is also valued for its ability to relieve stress and fatigue, providing a burst of energy without the typical crash associated with other strains.',
 'Sativa',
 '2024-03-14 13:40:17.463182',
 '2024-03-17 21:39:09.294111',
 'Green Crack is a legendary sativa strain known for its energizing and uplifting effects. Its bright green buds are covered in orange hairs and glistening trichomes, giving off a sweet and tangy aroma that instantly invigorates the senses. When smoked, Green Crack delivers a powerful burst of mental clarity and creativity, perfect for sparking inspiration and motivation. 

In [5]:
len(data[0])


15

In [6]:
len(data)


7233

In [7]:
class Strain(BaseModel):
    sid: int
    name: str
    wikileaf_slug: str
    description: str
    strain_type: str
    stoner_description: str
    poem: str
    cons: list[str]
    effects: list[str]
    flavors: list[str]
    medical_use: list[str]
    pros: list[str]
    one_liner: str


### Create a validation class

In [8]:
cleaned_data: list[Strain] = []
for row in data:
    strain = Strain(
        sid=row[0],
        name=row[1],
        wikileaf_slug=row[2],
        description=row[3],
        strain_type=row[4],
        stoner_description=row[7],
        poem=row[8],
        cons=eval(row[9]),
        effects=eval(row[10]),
        flavors=eval(row[11]),
        medical_use=eval(row[12]),
        pros=eval(row[13]),
        one_liner=row[14],
    )
    cleaned_data.append(strain)

del data


In [9]:
cleaned_data[:5]


[Strain(sid=1, name='Green Crack', wikileaf_slug='green-crack', description='Green Crack is a highly energetic and uplifting sativa-dominant strain known for its sharp focus and mental stimulation. The effects of Green Crack are invigorating, making it a popular choice for daytime use to boost creativity and productivity. This strain has a unique combination of sweet and tangy flavors with hints of citrus and tropical fruits. Green Crack is also valued for its ability to relieve stress and fatigue, providing a burst of energy without the typical crash associated with other strains.', strain_type='Sativa', stoner_description='Green Crack is a legendary sativa strain known for its energizing and uplifting effects. Its bright green buds are covered in orange hairs and glistening trichomes, giving off a sweet and tangy aroma that instantly invigorates the senses. When smoked, Green Crack delivers a powerful burst of mental clarity and creativity, perfect for sparking inspiration and motiva

### Save Validated Data in json

In [10]:
# write cleaned data to "./data/strains.json"
import json

with open("./data/strains.jsonl", "w") as f:
    json.dump(list(map(dict, cleaned_data)), f, indent=4)


## Injest Data

### Convert data to langchain Documents

In [11]:
docs = list(
    map(
        lambda doc: Document(str(dict(doc)), metadata={"name": doc.name}),
        cleaned_data,
    )
)


In [12]:
print(list(map(lambda doc: doc.name, cleaned_data[:5])))


['Green Crack', 'Blue Dream', 'Sour Diesel', 'Gorilla Glue #4', 'Skywalker OG']


### Save the data

In [13]:
from langchain.vectorstores.chroma import Chroma
from langchain_openai.embeddings import OpenAIEmbeddings
import chromadb

from typing import Literal
import time


def add_documents_to_chroma(
    documents: list[Document],
    store: Chroma,
    batch_size: int = 10,
    sleep_time: float | Literal["auto"] = "auto",
) -> list[str]:
    """
    add_documents adds documents to the store while preventing rate limits.

    Parameters
    ----------
    documents : list[Document]
        The documents to be added to the store
    store : Chroma
        The store to add the documents to
    batch_size : int, optional
        The number of documents to add at once, by default 10
    sleep_time : float | Literal["auto"], optional
        The time to sleep between requests, by default "auto".

    Returns
    -------
    list[str]
        The ids of the documents added to the store
    """
    ids: list[str] = []
    for i in range(0, len(documents), batch_size):
        if sleep_time == "auto":
            t1 = time.perf_counter()
            ids.extend(store.add_documents(documents[i : i + batch_size]))
            t2 = time.perf_counter()
            time.sleep(2 * (t2 - t1))
        else:
            ids.extend(store.add_documents(documents[i : i + batch_size]))
            time.sleep(sleep_time)
        print(f"Added documents {i} to {i + batch_size}")

    return ids


In [14]:
embeddings = OpenAIEmbeddings(model="text-embedding-3-large")
new_client = chromadb.PersistentClient()
chroma = Chroma(
    client=new_client,
    collection_name="strains",
    embedding_function=embeddings,
)

add_documents_to_chroma(docs, chroma, sleep_time=1)


Added documents 0 to 10
Added documents 10 to 20
Added documents 20 to 30
Added documents 30 to 40
Added documents 40 to 50
Added documents 50 to 60
Added documents 60 to 70
Added documents 70 to 80
Added documents 80 to 90
Added documents 90 to 100
Added documents 100 to 110
Added documents 110 to 120
Added documents 120 to 130
Added documents 130 to 140
Added documents 140 to 150
Added documents 150 to 160
Added documents 160 to 170
Added documents 170 to 180
Added documents 180 to 190
Added documents 190 to 200
Added documents 200 to 210
Added documents 210 to 220
Added documents 220 to 230
Added documents 230 to 240
Added documents 240 to 250
Added documents 250 to 260
Added documents 260 to 270
Added documents 270 to 280
Added documents 280 to 290
Added documents 290 to 300
Added documents 300 to 310
Added documents 310 to 320
Added documents 320 to 330
Added documents 330 to 340
Added documents 340 to 350
Added documents 350 to 360
Added documents 360 to 370
Added documents 370 t

['ba5f5a72-f26b-436c-96b9-e1910bea3511',
 '539c3b6b-f7d7-409b-a93d-5b145137c30c',
 'ecad809e-69dc-4a7c-9170-06d8c6dcb6c5',
 'd1d68f62-5571-4054-8afe-e1bc7349932e',
 'e3fb2cc7-decc-4b54-9f61-f3999debdd77',
 'd72c9d75-1d03-4a3a-b175-631b81aa7e92',
 '96fa38fd-6d8e-48ae-9501-0d821cc68c87',
 '98d92f53-99e6-4785-9eeb-b010f22f949c',
 '93645ea2-a14c-4772-846b-aa201f9f0be7',
 '3c0353d4-0fff-4331-bc84-1580aca39dd2',
 'dde643e0-1e58-41a2-af86-e1b2b953957b',
 '43290317-e3bf-40e8-9201-0ca7b66ca9e2',
 '34fd63cf-34e5-4580-96a2-9899781f8186',
 '042734bc-d350-43b8-8c8f-f3d33f150cc5',
 '87c0c2d2-3447-4987-92f9-fe1f6844737f',
 '79d19319-7eff-4230-b294-bef54fc9d4ca',
 '62364b54-6791-431a-ae97-099433f57df4',
 '27ceff4f-fd5d-43ff-81ea-cf575c490eaf',
 '99f74e8c-6dae-4277-8662-86e66b69e2c6',
 '8811f189-8cb9-46ec-8197-5d428b631da5',
 '92ff82c4-476f-48c3-927e-3637b63d7fe0',
 '5225a9a0-7901-4385-97df-bd576ae3ec19',
 'af64f33d-3fc6-4c54-b108-755335b9ea33',
 '6583d701-871a-4114-a91a-a392f106b260',
 '4c245941-09e7-