## Load Data for Demos ##

### Note - first download csv files from s3://td-demos-data/CSV/ ###

Run each cell to load data for Use Case demos:

1. demo.Accounts_Mapping_BFS - Dimension table for Data Engineering demo
2. demo.Customers_BFS - Dimension table for Data Engineering demo
3. demo.Amazon_Fine_Foods_Reviews - Reviews for NBTC and Sentiment Analyzer
    - demo.stop_words (loaded from nltk_stopwords.csv) for textparser
    - demo.bin_table for histogram function
4. demo_ofs.housing_prices_full - housing prices data for Numeric Regression
5. demo_ofs.Txn_History - copy of the txn history table for DS/OAF demos.
6. demo_ofs.UK_Retail_Data - Online retail sales data for KMeans Clustering.

<hr>

## Data Loads for Data Engineering Demos ##

- **demo.Accounts_Mapping_BFS**
- **demo.Customers_BFS**

In [None]:
# /* -------------------------------------------------------- */
# -- Run this cell as SYSDBA.
# -- Recreates demo.Accounts_Mapping_BFS and fills it from the
# -- local file customer_mapping.csv.
# /* -------------------------------------------------------- */

import json
import teradatasql

# SQL text used below, kept verbatim.
create_sql = '''
    CREATE MULTISET TABLE demo.Accounts_Mapping_BFS,
    STORAGE = TD_NDSSTORAGE
        (nameOrig VARCHAR(11),
         customer_identifier VARCHAR(100))
    PRIMARY INDEX(nameOrig) 
    '''
load_sql = '''{fn teradata_read_csv(customer_mapping.csv)} insert into demo.Accounts_Mapping_BFS (?, ?)'''

# Host and credentials are read from the shared vars file.
with open('../vars.json', 'r') as vars_file:
    session_vars = json.load(vars_file)

sysdba = session_vars['hierarchy']['SYSDBA']
name = sysdba['username']
pwd = sysdba['password']

with teradatasql.connect(host=session_vars['environment']['host'],
                         user=name,
                         password=pwd) as con:
    cur = con.cursor()

    # Start from a clean slate. Error 3807 (object does not exist) only
    # means there was nothing to drop; any other failure is re-raised.
    try:
        cur.execute('DROP TABLE Demo.Accounts_Mapping_BFS')
    except Exception as err:
        if "3807" not in str(err.args):
            raise

    cur.execute(create_sql)

    # The driver reads the CSV and binds one row per line to the two
    # parameter markers.
    cur.execute(load_sql)
        

In [2]:
# /* -------------------------------------------------------- */
# -- Run this cell as SYSDBA.
# -- Recreates demo.Customers_BFS and populates it straight
# -- from the customers.csv object in S3.
# /* -------------------------------------------------------- */

import json
import teradatasql

# SQL text used below, kept verbatim.
create_sql = '''
    CREATE TABLE demo.Customers_BFS,
    STORAGE = TD_NDSSTORAGE
         (
         id BIGINT,
         customer_identifier VARCHAR(26),
         firstname VARCHAR(50),
         lastname VARCHAR(90),
         email VARCHAR(90),
         phone VARCHAR(50),
         birthday VARCHAR(10),
         streetaddress VARCHAR(200),
         city VARCHAR(90),
         state VARCHAR(50),
         zipcode VARCHAR(10),
         latitude FLOAT,
         longitude FLOAT,
         num_accounts INTEGER)
    PRIMARY INDEX(id);
    '''

# The CSV is read without a header row, so the fields arrive as
# Col1..Col14 and are cast to the target column types in order.
insert_sql = '''
    INSERT INTO demo.Customers_BFS

    SELECT
         CAST(Col1 AS BIGINT) id,
         CAST(Col2 AS VARCHAR(26)) customer_identifier,
         CAST(Col3 AS VARCHAR(50)) firstname,
         CAST(Col4 AS VARCHAR(90)) lastname,
         CAST(Col5 AS VARCHAR(90)) email,
         CAST(Col6 AS VARCHAR(50)) phone,
         CAST(Col7 AS VARCHAR(10)) birthday,
         CAST(Col8 AS VARCHAR(200)) streetaddress,
         CAST(Col9 AS VARCHAR(90)) city,
         CAST(Col10 AS VARCHAR(50)) state,
         CAST(Col11 AS VARCHAR(10)) zipcode,
         CAST(Col12 AS FLOAT) latitude,
         CAST(Col13 AS FLOAT) longitude,
         CAST(Col14 AS INTEGER) numaccounts
    FROM
    (
         LOCATION = '/s3/s3.amazonaws.com/td-usecases-data-store/retail_sample_data/FSCustomerJourney/customers.csv'
         ROWFORMAT = '{"field_delimiter":",","record_delimiter":"\\n","character_set":"LATIN"}'
         STRIP_ENCLOSING_CHAR = '"'
        HEADER = 'FALSE'
        AUTHORIZATION = retail_sample_data.DEMO_AUTH_NOS
    ) as dt
    '''

# Host and credentials are read from the shared vars file.
with open('../vars.json', 'r') as vars_file:
    session_vars = json.load(vars_file)

sysdba = session_vars['hierarchy']['SYSDBA']
name = sysdba['username']
pwd = sysdba['password']

with teradatasql.connect(host=session_vars['environment']['host'],
                         user=name,
                         password=pwd) as con:
    cur = con.cursor()

    # Error 3807 (object does not exist) only means there was nothing
    # to drop; any other failure is re-raised.
    try:
        cur.execute('DROP TABLE Demo.Customers_BFS')
    except Exception as err:
        if "3807" not in str(err.args):
            raise

    cur.execute(create_sql)
    cur.execute(insert_sql)
        

<hr>

## Data Loads for Text Demos ##
### NBTC and Sentiment Analyzer ###

- **demo.Amazon_Fine_Foods_Reviews**
- **demo.stop_words** (loaded from nltk_stopwords.csv)
- **demo.bin_table**

In [3]:
# /* -------------------------------------------------------- */
# -- Run this cell as SYSDBA.
# -- Recreates Demo.Amazon_Fine_Foods_Reviews and fills it from
# -- the local file amazon_reviews.csv.
# /* -------------------------------------------------------- */

import json
import teradatasql

# SQL text used below, kept verbatim.
create_sql = '''
        CREATE TABLE Demo.Amazon_Fine_Foods_Reviews, 
        STORAGE = TD_OFSSTORAGE
        (
        doc_id int, 
        rating int,
        review varchar(21500) CHARACTER SET LATIN
        
        );
    '''
load_sql = '''{fn teradata_read_csv(amazon_reviews.csv)} insert into Demo.Amazon_Fine_Foods_Reviews (?, ?, ?)'''

# Host and credentials are read from the shared vars file.
with open('../vars.json', 'r') as vars_file:
    session_vars = json.load(vars_file)

sysdba = session_vars['hierarchy']['SYSDBA']
name = sysdba['username']
pwd = sysdba['password']

with teradatasql.connect(host=session_vars['environment']['host'],
                         user=name,
                         password=pwd) as con:
    cur = con.cursor()

    # Error 3807 (object does not exist) only means there was nothing
    # to drop; any other failure is re-raised.
    try:
        cur.execute('DROP TABLE Demo.Amazon_Fine_Foods_Reviews')
    except Exception as err:
        if "3807" not in str(err.args):
            raise

    cur.execute(create_sql)

    # One parameter marker per CSV field: doc_id, rating, review.
    cur.execute(load_sql)

In [4]:
# /* -------------------------------------------------------- */
# -- Perform this as SYSDBA.
# -- Load the NLTK stop-word list (nltk_stopwords.csv) into
# -- demo.stop_words
# /* -------------------------------------------------------- */

import teradatasql, json

# load vars json
with open('../vars.json', 'r') as f:
    session_vars = json.load(f)

name = session_vars['hierarchy']['SYSDBA']['username']
pwd = session_vars['hierarchy']['SYSDBA']['password']

with teradatasql.connect(host = session_vars['environment']['host'], 
                     user = name, 
                     password = pwd) as con:
    cur = con.cursor()
    
    # Drop the table this cell actually creates. The previous version
    # dropped demo.nltk_stopwords while creating demo.stop_words, so a
    # second run failed on CREATE because the table was still there.
    try:
        cur.execute('DROP TABLE demo.stop_words')
    except Exception as e:
        # 3807 = object does not exist: nothing to drop, carry on.
        if "3807" not in str(e.args):
            raise
            
    
    qry = '''
        CREATE TABLE demo.stop_words, 
        STORAGE = TD_NDSSTORAGE(
         word varchar(100) CHARACTER SET LATIN
        );
    '''
    cur.execute(qry)
    
    # Single-column CSV: one parameter marker for the word.
    qry = '''{fn teradata_read_csv(nltk_stopwords.csv)} insert into Demo.stop_words (?)'''
    cur.execute(qry)

In [5]:
# /* -------------------------------------------------------- */
# -- Run this cell as SYSDBA.
# -- Recreates demo.bin_table and fills it from the local file
# -- bin_table.csv.
# /* -------------------------------------------------------- */

import json
import teradatasql

# SQL text used below, kept verbatim.
create_sql = '''
        CREATE TABLE demo.bin_table, 
        STORAGE = TD_NDSSTORAGE(
         ColumnName varchar(100) CHARACTER SET LATIN,
         MinValue FLOAT,
         MaxValue FLOAT,
         Label VARCHAR(20)
         
        );
    '''
load_sql = '''{fn teradata_read_csv(bin_table.csv)} insert into Demo.bin_table (?,?,?,?)'''

# Host and credentials are read from the shared vars file.
with open('../vars.json', 'r') as vars_file:
    session_vars = json.load(vars_file)

sysdba = session_vars['hierarchy']['SYSDBA']
name = sysdba['username']
pwd = sysdba['password']

with teradatasql.connect(host=session_vars['environment']['host'],
                         user=name,
                         password=pwd) as con:
    cur = con.cursor()

    # Error 3807 (object does not exist) only means there was nothing
    # to drop; any other failure is re-raised.
    try:
        cur.execute('DROP TABLE demo.bin_table')
    except Exception as err:
        if "3807" not in str(err.args):
            raise

    cur.execute(create_sql)

    # One parameter marker per CSV field:
    # ColumnName, MinValue, MaxValue, Label.
    cur.execute(load_sql)

<hr>

## Data Loads for Numeric Regression ##
- **demo_ofs.housing_prices_full**

In [6]:
# /* -------------------------------------------------------- */
# -- Perform this as SYSDBA.
# -- Load demo_ofs.housing_prices_full from the local file
# -- housing_full_partition.csv
# /* -------------------------------------------------------- */

import teradatasql, json

# load vars json
with open('../vars.json', 'r') as f:
    session_vars = json.load(f)

name = session_vars['hierarchy']['SYSDBA']['username']
pwd = session_vars['hierarchy']['SYSDBA']['password']

with teradatasql.connect(host = session_vars['environment']['host'], 
                     user = name, 
                     password = pwd) as con:
    cur = con.cursor()
    
    try:
        cur.execute('DROP TABLE demo_ofs.housing_prices_full')
    except Exception as e:
        # 3807 = object does not exist: nothing to drop, carry on.
        if "3807" not in str(e.args):
            raise

    qry = '''
    CREATE TABLE demo_ofs.housing_prices_full,

    STORAGE=TD_OFSSTORAGE

         (
          id BIGINT,
          mssubclass BIGINT,
          mszoning VARCHAR(1024) CHARACTER SET UNICODE NOT CASESPECIFIC,
          lotfrontage FLOAT,
          lotarea BIGINT,
          street VARCHAR(1024) CHARACTER SET UNICODE NOT CASESPECIFIC,
          alley VARCHAR(1024) CHARACTER SET UNICODE NOT CASESPECIFIC,
          lotshape VARCHAR(1024) CHARACTER SET UNICODE NOT CASESPECIFIC,
          landcontour VARCHAR(1024) CHARACTER SET UNICODE NOT CASESPECIFIC,
          utilities VARCHAR(1024) CHARACTER SET UNICODE NOT CASESPECIFIC,
          lotconfig VARCHAR(1024) CHARACTER SET UNICODE NOT CASESPECIFIC,
          landslope VARCHAR(1024) CHARACTER SET UNICODE NOT CASESPECIFIC,
          neighborhood VARCHAR(1024) CHARACTER SET UNICODE NOT CASESPECIFIC,
          condition1 VARCHAR(1024) CHARACTER SET UNICODE NOT CASESPECIFIC,
          condition2 VARCHAR(1024) CHARACTER SET UNICODE NOT CASESPECIFIC,
          bldgtype VARCHAR(1024) CHARACTER SET UNICODE NOT CASESPECIFIC,
          housestyle VARCHAR(1024) CHARACTER SET UNICODE NOT CASESPECIFIC,
          overallqual BIGINT,
          overallcond BIGINT,
          yearbuilt BIGINT,
          yearremodadd BIGINT,
          roofstyle VARCHAR(1024) CHARACTER SET UNICODE NOT CASESPECIFIC,
          roofmatl VARCHAR(1024) CHARACTER SET UNICODE NOT CASESPECIFIC,
          exterior1st VARCHAR(1024) CHARACTER SET UNICODE NOT CASESPECIFIC,
          exterior2nd VARCHAR(1024) CHARACTER SET UNICODE NOT CASESPECIFIC,
          masvnrtype VARCHAR(1024) CHARACTER SET UNICODE NOT CASESPECIFIC,
          masvnrarea FLOAT,
          exterqual VARCHAR(1024) CHARACTER SET UNICODE NOT CASESPECIFIC,
          extercond VARCHAR(1024) CHARACTER SET UNICODE NOT CASESPECIFIC,
          foundation VARCHAR(1024) CHARACTER SET UNICODE NOT CASESPECIFIC,
          bsmtqual VARCHAR(1024) CHARACTER SET UNICODE NOT CASESPECIFIC,
          bsmtcond VARCHAR(1024) CHARACTER SET UNICODE NOT CASESPECIFIC,
          bsmtexposure VARCHAR(1024) CHARACTER SET UNICODE NOT CASESPECIFIC,
          bsmtfintype1 VARCHAR(1024) CHARACTER SET UNICODE NOT CASESPECIFIC,
          bsmtfinsf1 BIGINT,
          bsmtfintype2 VARCHAR(1024) CHARACTER SET UNICODE NOT CASESPECIFIC,
          bsmtfinsf2 BIGINT,
          bsmtunfsf BIGINT,
          totalbsmtsf BIGINT,
          heating VARCHAR(1024) CHARACTER SET UNICODE NOT CASESPECIFIC,
          heatingqc VARCHAR(1024) CHARACTER SET UNICODE NOT CASESPECIFIC,
          centralair VARCHAR(1024) CHARACTER SET UNICODE NOT CASESPECIFIC,
          electrical VARCHAR(1024) CHARACTER SET UNICODE NOT CASESPECIFIC,
          _1stflrsf BIGINT,
          _2ndflrsf BIGINT,
          lowqualfinsf BIGINT,
          grlivarea BIGINT,
          bsmtfullbath BIGINT,
          bsmthalfbath BIGINT,
          fullbath BIGINT,
          halfbath BIGINT,
          bedroomabvgr BIGINT,
          kitchenabvgr BIGINT,
          kitchenqual VARCHAR(1024) CHARACTER SET UNICODE NOT CASESPECIFIC,
          totrmsabvgrd BIGINT,
          functional VARCHAR(1024) CHARACTER SET UNICODE NOT CASESPECIFIC,
          fireplaces BIGINT,
          fireplacequ VARCHAR(1024) CHARACTER SET UNICODE NOT CASESPECIFIC,
          garagetype VARCHAR(1024) CHARACTER SET UNICODE NOT CASESPECIFIC,
          garageyrblt BIGINT,
          garagefinish VARCHAR(1024) CHARACTER SET UNICODE NOT CASESPECIFIC,
          garagecars BIGINT,
          garagearea BIGINT,
          garagequal VARCHAR(1024) CHARACTER SET UNICODE NOT CASESPECIFIC,
          garagecond VARCHAR(1024) CHARACTER SET UNICODE NOT CASESPECIFIC,
          paveddrive VARCHAR(1024) CHARACTER SET UNICODE NOT CASESPECIFIC,
          wooddecksf BIGINT,
          openporchsf BIGINT,
          enclosedporch BIGINT,
          _3ssnporch BIGINT,
          screenporch BIGINT,
          poolarea BIGINT,
          poolqc VARCHAR(1024) CHARACTER SET UNICODE NOT CASESPECIFIC,
          fence VARCHAR(1024) CHARACTER SET UNICODE NOT CASESPECIFIC,
          miscfeature VARCHAR(1024) CHARACTER SET UNICODE NOT CASESPECIFIC,
          miscval BIGINT,
          mosold BIGINT,
          yrsold BIGINT,
          saletype VARCHAR(1024) CHARACTER SET UNICODE NOT CASESPECIFIC,
          salecondition VARCHAR(1024) CHARACTER SET UNICODE NOT CASESPECIFIC,
          saleprice BIGINT,
          part BIGINT)
    ;
    '''

    cur.execute(qry)
    
    # The INSERT has no column list, so it needs exactly one parameter
    # marker per column of the DDL above. Build the "?,?,...,?" run from
    # the column count instead of hand-typing it, so the two cannot
    # silently drift apart.
    column_count = 82
    markers = ','.join('?' * column_count)
    qry = ('{fn teradata_read_csv(housing_full_partition.csv)} '
           'insert into Demo_ofs.housing_prices_full (' + markers + ')')
    cur.execute(qry)

<hr>

## Data Loads for OAF/Data Science Demo ##

- **demo_ofs.Txn_History**

In [7]:
# /* -------------------------------------------------------- */
# -- Perform this as SYSDBA.
# -- Load demo_ofs.Txn_History directly from S3, then append
# -- an extra 10,000 rows from the same location.
# /* -------------------------------------------------------- */

import teradatasql, json

# load vars json
with open('../vars.json', 'r') as f:
    session_vars = json.load(f)

name = session_vars['hierarchy']['SYSDBA']['username']
pwd = session_vars['hierarchy']['SYSDBA']['password']

with teradatasql.connect(host = session_vars['environment']['host'], 
                     user = name, 
                     password = pwd) as con:
    cur = con.cursor()
    
    try:
        cur.execute('DROP TABLE demo_ofs.Txn_History')
    except Exception as e:
        # Two drop failures are tolerated:
        #   3807 - object does not exist (first run on a fresh system;
        #          previously this aborted the cell, unlike every other
        #          load cell in this notebook)
        #   4880 - per the original note, the table can't be dropped
        #          due to BackupCount; the CREATE below then hits 3803
        #          and the existing table is reused.
        message = str(e.args)
        if not any(code in message for code in ("3807", "4880")):
            raise

    qry = '''
    /* Load the data into OFS directly from S3
    */

    --EXPLAIN
    CREATE TABLE demo_ofs.Txn_History,

        /*Note the use of STORAGE here to specify OFS
        This can also be set to a user and database default */

        STORAGE=TD_OFSSTORAGE

        /* We can use a SELECT statement to populate the table
        using WITH DATA */
        AS (

            /*Provide json path information to identify the proper fields */
            SELECT Payload.txn_id txn_id,
                Payload.step step,
                Payload."type" "txn_type",
                CAST(Payload.amount AS FLOAT) amount,
                Payload.nameOrig nameOrig,
                CAST(Payload.oldbalanceOrig AS FLOAT) oldbalanceOrig,
                CAST(Payload.newbalanceOrig AS FLOAT) newbalanceOrig,
                Payload.nameDest nameDest,
                CAST(Payload.oldbalanceDest AS FLOAT) oldbalanceDest,
                CAST(Payload.newbalanceDest AS FLOAT) newbalanceDest,
                CAST(Payload.isFraud AS INTEGER) isFraud,
                CAST(Payload.isFlaggedFraud AS INTEGER) isFlaggedFraud

            FROM (
                LOCATION = '/s3/s3.amazonaws.com/trial-datasets/FraudReduction/'
                AUTHORIZATION = Repositories.PubAuth
    ) AS D
    ) WITH DATA
    '''
    try:
        cur.execute(qry)
    except Exception as e:
        # 3803 = table already exists (the DROP above was refused);
        # keep the existing table and just append to it below.
        if "3803" not in str(e.args):
            raise
    
    # NOTE: this append runs on every execution, so when the table
    # could not be dropped each rerun adds another 10,000 rows.
    qry = '''
    /* Insert some additional rows into our OFS Table */
    INSERT INTO demo_ofs.Txn_History

            /*Provide json path information to identify the proper fields */
            SELECT TOP 10000 Payload.txn_id txn_id,
                Payload.step step,
                Payload."type" "txn_type",
                CAST(Payload.amount AS FLOAT) amount,
                Payload.nameOrig nameOrig,
                CAST(Payload.oldbalanceOrig AS FLOAT) oldbalanceOrig,
                CAST(Payload.newbalanceOrig AS FLOAT) newbalanceOrig,
                Payload.nameDest nameDest,
                CAST(Payload.oldbalanceDest AS FLOAT) oldbalanceDest,
                CAST(Payload.newbalanceDest AS FLOAT) newbalanceDest,
                CAST(Payload.isFraud AS INTEGER) isFraud,
                CAST(Payload.isFlaggedFraud AS INTEGER) isFlaggedFraud

            FROM (
                LOCATION = '/s3/s3.amazonaws.com/trial-datasets/FraudReduction/'
                AUTHORIZATION = Repositories.PubAuth
    ) D
    '''
    cur.execute(qry)

<hr>

## Data Loads for KMeans Clustering Demo ##
- **demo_ofs.UK_Retail_Data**

**As of JAN-2023 drop, using Teradataml due to errors with teradatasql batch loading**

In [8]:
# # /* -------------------------------------------------------- */
# # -- Perform this as SYSDBA.
# # -- Load demo_ofs.UK_Retail_Data  data
# # /* -------------------------------------------------------- */

# import teradatasql, json

# # load vars json
# with open('../vars.json', 'r') as f:
#     session_vars = json.load(f)

# name = session_vars['hierarchy']['SYSDBA']['username']
# pwd = session_vars['hierarchy']['SYSDBA']['password']

# with teradatasql.connect(host = session_vars['environment']['host'], 
#                      user = name, 
#                      password = pwd) as con:
#     cur = con.cursor()
    
#     try:
#         cur.execute('DROP TABLE Demo.UK_Retail_Data')
#     except Exception as e:
#         # Table already exists
#         if str(e.args).find("3807") >= 1:
#             pass
#         else:
#             raise
            
    
#     qry = '''
# CREATE MULTISET TABLE demo.UK_Retail_Data, 
#     STORAGE = TD_OFSSTORAGE
#      (
#       InvoiceNo VARCHAR(10) CHARACTER SET UNICODE NOT CASESPECIFIC,
#       StockCode VARCHAR(10) CHARACTER SET UNICODE NOT CASESPECIFIC,
#       Description VARCHAR(1024) CHARACTER SET UNICODE NOT CASESPECIFIC,
#       Quantity BIGINT,
#       InvoiceDate TIMESTAMP(6),
#       UnitPrice FLOAT,
#       CustomerID FLOAT,
#       Country VARCHAR(1024) CHARACTER SET UNICODE NOT CASESPECIFIC);
#     '''
#     cur.execute(qry)
    
#     qry = '''{fn teradata_read_csv(UK_Retail_Data.csv)} insert into demo.UK_Retail_Data (?, ?, ?, ?, ?, ?, ?, ?)'''
#     cur.execute(qry)

In [9]:
# /* -------------------------------------------------------- */
# -- Perform this as SYSDBA.
# -- Load demo_ofs.UK_Retail_Data from the local file
# -- UK_Retail_Data.csv using teradataml.
# /* -------------------------------------------------------- */

import json
import pandas as pd  # explicit, rather than relying on the star import below
from teradataml import *

# load vars json
with open('../vars.json', 'r') as f:
    session_vars = json.load(f)


name = session_vars['hierarchy']['SYSDBA']['username']
pwd = session_vars['hierarchy']['SYSDBA']['password']

qry = '''
CREATE MULTISET TABLE demo_ofs.UK_Retail_Data
    --STORAGE = TD_OFSSTORAGE
     (
      InvoiceNo VARCHAR(10) CHARACTER SET UNICODE NOT CASESPECIFIC,
      StockCode VARCHAR(10) CHARACTER SET UNICODE NOT CASESPECIFIC,
      Description VARCHAR(1024) CHARACTER SET UNICODE NOT CASESPECIFIC,
      Quantity BIGINT,
      InvoiceDate TIMESTAMP(6),
      UnitPrice FLOAT,
      CustomerID FLOAT,
      Country VARCHAR(1024) CHARACTER SET UNICODE NOT CASESPECIFIC);
    '''

eng = create_context(host = session_vars['environment']['host'], username = name, password = pwd)

# Everything after create_context() runs under try/finally so the
# context is always removed, even when a step fails part-way through.
try:
    try:
        eng.execute('DROP TABLE demo_ofs.UK_Retail_Data')
    except Exception as e:
        # 3807 = object does not exist: nothing to drop, carry on.
        if "3807" not in str(e.args):
            raise

    eng.execute(qry)

    # The CSV has no header row, so column names are supplied here in
    # the same order as the table definition.
    df = pd.read_csv('UK_Retail_Data.csv', header = None, names = ['InvoiceNo', 
                                                                   'StockCode', 
                                                                   'Description', 
                                                                   'Quantity', 
                                                                   'InvoiceDate', 
                                                                   'UnitPrice', 
                                                                   'CustomerID', 
                                                                   'Country'])

    # Cast underlying datatype as string since NULLs inferred as FLOAT
    # NOTE(review): astype(str) turns missing descriptions into the text
    # 'nan', so dropna() below no longer removes those rows - confirm
    # that is intended.
    df['Description'] = df['Description'].astype(str)

    copy_to_sql(df.dropna(), table_name = 'UK_Retail_Data', schema_name = 'demo_ofs', if_exists = 'append')
finally:
    remove_context()

  from pandas.core.computation.check import NUMEXPR_INSTALLED


True