# Store data to watsonx.data

To provide context to the model automatically, the source information first has to be stored in a local database. In this lab we therefore collect data from the internet and store it in watsonx.data.

### Initialize configuration

In [None]:
# Add ../../utils to the module search path so that wxd_utils can be imported.
# The path is relative, so it is resolved against the kernel's working directory.
import sys
sys.path.append("../../utils")
import wxd_utils

# Load the lab configuration; it is handed to wxd_utils.connect_wxd() further down.
conf=wxd_utils.load_conf()
# NOTE(review): conf is what the watsonx.data connection is built from. If it holds
# credentials, printing it stores them in the saved notebook output — confirm this
# is acceptable before sharing the notebook.
print(conf)

### Load data from Wikipedia

We fetch data from Wikipedia about the winner of the Nobel Prize in Literature in 2023. This data is used as context for our question.

In [None]:
import wikipedia

# Articles to download, keyed by the title under which they are stored.
# The value is the Wikipedia page id; None means the page is looked up by title.
articles = {
    'Nobel price in literature': None,
    '2023 Nobel price in literature': 72508137,
    '2024 Nobel price in literature': 75098159,
}

# Replace each page id in place with the content of the downloaded page.
# Iterating over a snapshot of the keys keeps the loop independent of the dict
# being updated.
for article_title in list(articles):
    page_id = articles[article_title]
    page = wikipedia.page(pageid=page_id) if page_id else wikipedia.page(article_title)
    articles[article_title] = page.content
    print(f"Successfully fetched {article_title}")

print(f"Successfully fetched {len(articles)} articles ")

### Connect to watsonx.data

In [None]:
# Connect to watsonx.data using the loaded configuration. The returned engine is
# what the following cells pass to pd.read_sql and use to open connections for
# the inserts.
wxd_engine = wxd_utils.connect_wxd(conf)

### Create a schema in the watsonx.data Hive bucket to store the Wikipedia data

In [None]:
import pandas as pd
import sqlalchemy

# The DDL statement is sent through pd.read_sql, so whatever the engine returns
# for it ends up in a DataFrame.
# IF NOT EXISTS keeps the cell re-runnable: executing it a second time (e.g. after
# "Restart & Run All") no longer reports an error for a schema that is already there.
create_schema_sql = """
    CREATE SCHEMA IF NOT EXISTS hive_data.watsonxai
    WITH ( location = 's3a://hive-bucket/watsonx_ai' )
"""

try:
    create_schema_result = pd.read_sql(create_schema_sql, wxd_engine)
except sqlalchemy.exc.SQLAlchemyError as e:
    # Report the failure instead of raising so the remaining cells can still be run.
    print("Error creating schema:", str(e))

### Create a table to hold the Wikipedia data in the schema created above

In [None]:
# Table layout: one row per text chunk, with the chunk id, the chunk text and the
# title of the article it came from. All three columns are varchar; the data is
# stored as Parquet.
# IF NOT EXISTS keeps the cell re-runnable: executing it again no longer reports
# an error for a table that already exists.
create_table_sql = """
    CREATE TABLE IF NOT EXISTS hive_data.watsonxai.wikipedia
    (
        "id" varchar,
        "text" varchar,
        "title" varchar
    )
    WITH (
        format = 'PARQUET'
    )
"""

try:
    create_table_result = pd.read_sql(create_table_sql, wxd_engine)
except sqlalchemy.exc.SQLAlchemyError as e:
    # Report the failure instead of raising so the remaining cells can still be run.
    print("Error creating table:", str(e))

### Chunk and insert data

In [None]:
# Split the fetched articles into chunks with the wxd_utils helper.
# NOTE(review): 255 is presumably the maximum chunk size — confirm the unit
# (characters vs. words) against wxd_utils.chunk_articles.
chunks = wxd_utils.chunk_articles(articles, 255)
# Show the result. The insert cell reads the keys 'id', 'chunk' and 'title' from each item.
chunks

In [None]:
# Insert every chunk as one row (id, text, title) of the wikipedia table.
# NOTE(review): the values are interpolated into the SQL text as-is. This relies on
# wxd_utils.chunk_articles returning text that is already safe inside single quotes
# (e.g. apostrophes escaped) — confirm, otherwise a chunk containing ' breaks the insert.
for item in chunks:
    insert_stmt = f"insert into hive_data.watsonxai.wikipedia values ('{item['id']}', '{item['chunk']}', '{item['title']}')"

    with wxd_engine.connect() as connection:
        connection.execute(insert_stmt)

    # Progress message. The title comes from the item itself; the previous bare
    # `title` was never defined and raised NameError right after the first insert.
    print(f"{item['title']} {item['id']}/{len(chunks)} INSERTED")

In [None]:
# Read the whole table back to check that the inserts arrived.
verification_query = "select * from hive_data.watsonxai.wikipedia"
wiki_articles = pd.read_sql(verification_query, wxd_engine)

# Last expression of the cell: rendered as a table by the notebook.
wiki_articles