## Clean the Data

In [16]:
# Install or upgrade the third-party packages this notebook uses (versions are not pinned).
%pip install --upgrade langchain langchain-openai chromadb


Collecting langchain
  Using cached langchain-0.1.13-py3-none-any.whl (810 kB)
  Using cached langchain-0.1.12-py3-none-any.whl (809 kB)
Collecting langchain-openai
  Using cached langchain_openai-0.1.1-py3-none-any.whl (32 kB)
  Using cached langchain_openai-0.0.8-py3-none-any.whl (32 kB)
Collecting chromadb
  Using cached chromadb-0.4.24-py3-none-any.whl (525 kB)
  Using cached chromadb-0.4.23-py3-none-any.whl (521 kB)
Note: you may need to restart the kernel to use updated packages.


In [17]:
import ast
import sqlite3

from pydantic import BaseModel

from langchain_core.documents import Document


In [18]:
# Load every row of the strains_strain table from "./data/db.sqlite3".
conn = sqlite3.connect("./data/db.sqlite3")
try:
    cur = conn.cursor()
    # SELECT * keeps the table's own column order; rows come back as tuples.
    cur.execute("SELECT * FROM strains_strain")
    # Materialize all rows so nothing depends on the connection afterwards.
    data = cur.fetchall()
finally:
    # Always release the database file handle, even if the query fails.
    conn.close()


In [19]:
# Inspect the first raw row to see what the columns contain.
data[0]


(1,
 'Green Crack',
 'green-crack',
 'Green Crack is a highly energetic and uplifting sativa-dominant strain known for its sharp focus and mental stimulation. The effects of Green Crack are invigorating, making it a popular choice for daytime use to boost creativity and productivity. This strain has a unique combination of sweet and tangy flavors with hints of citrus and tropical fruits. Green Crack is also valued for its ability to relieve stress and fatigue, providing a burst of energy without the typical crash associated with other strains.',
 'Sativa',
 '2024-03-14 13:40:17.463182',
 '2024-03-17 21:39:09.294111',
 'Green Crack is a legendary sativa strain known for its energizing and uplifting effects. Its bright green buds are covered in orange hairs and glistening trichomes, giving off a sweet and tangy aroma that instantly invigorates the senses. When smoked, Green Crack delivers a powerful burst of mental clarity and creativity, perfect for sparking inspiration and motivation. 

In [20]:
# Number of columns in the first raw row.
print(len(data[0]))


15


In [21]:
class Strain(BaseModel):
    """Validated record for one strain.

    Pydantic checks every value against its annotated type when an instance
    is constructed, so a malformed value raises a validation error instead of
    passing through silently.
    """

    # Scalar fields.
    sid: int
    name: str
    wikileaf_slug: str
    description: str
    strain_type: str
    stoner_description: str
    poem: str
    # List-valued fields; each must be a list of strings.
    cons: list[str]
    effects: list[str]
    flavors: list[str]
    medical_use: list[str]
    pros: list[str]
    # Final scalar field.
    one_liner: str


### Validate each row with the Strain class

In [22]:
# Convert every raw database row into a validated Strain.
# Row positions 5 and 6 are not used. Positions 9-13 are expected to hold
# lists serialized as Python-literal strings; ast.literal_eval parses those
# literals without executing arbitrary code, which eval() would do if the
# database ever contained something other than a plain literal.
cleaned_data: list[Strain] = []
for row in data:
    strain = Strain(
        sid=row[0],
        name=row[1],
        wikileaf_slug=row[2],
        description=row[3],
        strain_type=row[4],
        stoner_description=row[7],
        poem=row[8],
        cons=ast.literal_eval(row[9]),
        effects=ast.literal_eval(row[10]),
        flavors=ast.literal_eval(row[11]),
        medical_use=ast.literal_eval(row[12]),
        pros=ast.literal_eval(row[13]),
        one_liner=row[14],
    )
    cleaned_data.append(strain)

# Drop the raw rows now that the validated copies exist.
del data


In [23]:
cleaned_data[:5]


[Strain(sid=1, name='Green Crack', wikileaf_slug='green-crack', description='Green Crack is a highly energetic and uplifting sativa-dominant strain known for its sharp focus and mental stimulation. The effects of Green Crack are invigorating, making it a popular choice for daytime use to boost creativity and productivity. This strain has a unique combination of sweet and tangy flavors with hints of citrus and tropical fruits. Green Crack is also valued for its ability to relieve stress and fatigue, providing a burst of energy without the typical crash associated with other strains.', strain_type='Sativa', stoner_description='Green Crack is a legendary sativa strain known for its energizing and uplifting effects. Its bright green buds are covered in orange hairs and glistening trichomes, giving off a sweet and tangy aroma that instantly invigorates the senses. When smoked, Green Crack delivers a powerful burst of mental clarity and creativity, perfect for sparking inspiration and motiva

### Save the validated data as JSON

In [24]:
# Write the cleaned data to "./data/strains.jsonl" as one indented JSON array.
# NOTE(review): the file has a .jsonl extension, but the payload is a single
# JSON array rather than one object per line — confirm which is intended.
import json

with open("./data/strains.jsonl", "w") as out_file:
    strain_records = [dict(strain) for strain in cleaned_data]
    json.dump(strain_records, out_file, indent=4)


## Ingest Data

### Convert data to langchain Documents

In [25]:
# Wrap each validated strain in a langchain Document: the page content is the
# string form of the strain's field dict, and the strain name goes in metadata.
docs = [
    Document(page_content=str(dict(strain)), metadata={"name": strain.name})
    for strain in cleaned_data
]


In [26]:
# Show the names of the first five validated strains.
print([strain.name for strain in cleaned_data[:5]])


['Green Crack', 'Blue Dream', 'Sour Diesel', 'Gorilla Glue #4', 'Skywalker OG']


### Embed the documents and save them to Chroma

In [32]:
from langchain.vectorstores.chroma import Chroma
from langchain.embeddings import CacheBackedEmbeddings
from langchain_openai.embeddings import OpenAIEmbeddings
from langchain.storage import LocalFileStore

# Embedding settings, named once so the cache namespace below stays in sync.
EMBEDDING_MODEL = "text-embedding-3-large"
EMBEDDING_DIMENSIONS = 256

# On-disk byte store used to cache embedding vectors between runs.
store = LocalFileStore("./cache/")
embeddings = OpenAIEmbeddings(
    model=EMBEDDING_MODEL,
    dimensions=EMBEDDING_DIMENSIONS,
    show_progress_bar=True,
    skip_empty=True,
    timeout=120,
    retry_min_seconds=60,
    retry_max_seconds=90,
)
# LocalFileStore holds raw bytes, so the cache is built with from_bytes_store,
# which adds the vector <-> bytes serialization layer that the plain
# constructor does not. The namespace keeps vectors cached under different
# model/dimension settings from colliding in the same directory.
cached_embedder = CacheBackedEmbeddings.from_bytes_store(
    embeddings,
    store,
    namespace=f"{EMBEDDING_MODEL}-{EMBEDDING_DIMENSIONS}",
)
# NOTE: Chroma requires sqlite3 >= 3.35.0 and raises a RuntimeError on systems
# with an older version; see https://docs.trychroma.com/troubleshooting#sqlite.
db = Chroma.from_documents(docs, cached_embedder)


RuntimeError: [91mYour system has an unsupported version of sqlite3. Chroma                     requires sqlite3 >= 3.35.0.[0m
[94mPlease visit                     https://docs.trychroma.com/troubleshooting#sqlite to learn how                     to upgrade.[0m