In [1]:
import pandas as pd
import teradataml
import time
import gc

# check if data already exists

In [1]:
# Upload flags: both start out False and are flipped to True by the probe
# cells below once the corresponding table is found with the expected shape.
complaints_uploaded = embeddings_uploaded = False

In [None]:
# Probe for an existing `consumer_complaints` table. The upload cell further
# down is skipped only when the table can be opened AND has the expected
# (rows, columns) shape.
try:
    DF_complaints = DataFrame("consumer_complaints")
    print("consumer_complaints already exists")
    # Read the shape once and reuse it for both the check and the message.
    complaints_shape = DF_complaints.shape
    # Assign the flag unconditionally so that re-running this cell cannot
    # leave a stale True behind from an earlier execution.
    complaints_uploaded = complaints_shape == (80771, 19)
    if complaints_uploaded:
        print("consumer_complaints has got the right shape")
    else:
        print("consumer_complaints has an unexpected shape:", complaints_shape)
except Exception as exc:
    # Best-effort probe: any failure (most commonly: the table does not exist
    # yet) means "not uploaded". Report the reason instead of hiding it, and
    # do not swallow KeyboardInterrupt/SystemExit as a bare `except:` would.
    complaints_uploaded = False
    print("consumer_complaints not available:", type(exc).__name__, exc)
    

In [None]:
# Probe for an existing `consumer_embeddings` table. The upload cell further
# down is skipped only when the table can be opened AND has the expected
# (rows, columns) shape.
try:
    DF_embeddings = DataFrame("consumer_embeddings")
    print("consumer_embeddings already exists")
    # Read the shape once and reuse it for both the check and the message.
    embeddings_shape = DF_embeddings.shape
    # Assign the flag unconditionally so that re-running this cell cannot
    # leave a stale True behind from an earlier execution.
    embeddings_uploaded = embeddings_shape == (80765, 769)
    if embeddings_uploaded:
        print("consumer_embeddings has got the right shape")
    else:
        print("consumer_embeddings has an unexpected shape:", embeddings_shape)
except Exception as exc:
    # Best-effort probe: any failure (most commonly: the table does not exist
    # yet) means "not uploaded". Report the reason instead of hiding it, and
    # do not swallow KeyboardInterrupt/SystemExit as a bare `except:` would.
    embeddings_uploaded = False
    print("consumer_embeddings not available:", type(exc).__name__, exc)
    

# read and upload complaints

In [None]:
if not complaints_uploaded:
    # Single source of truth for the target schema: column name -> Teradata
    # type. The keys (in insertion order) are also used to rename the parquet
    # columns positionally, so names and types cannot drift apart.
    complaints_types = {
        'row_id': INTEGER(),
        'date_received': DATE(),
        'product': VARCHAR(300),
        'subproduct': VARCHAR(300),
        'issue': VARCHAR(300),
        'subissue': VARCHAR(300),
        'consumer_complaint_narrative': VARCHAR(32000),
        'company_public_response': VARCHAR(300),
        'company': VARCHAR(300),
        'state': VARCHAR(300),
        'zip_code': VARCHAR(300),
        'tags': VARCHAR(300),
        'consumer_consent_provided': VARCHAR(300),
        'submitted_via': VARCHAR(300),
        'date_sent_to_company': DATE(),
        'company_response_to_consumer': VARCHAR(300),
        'timely_response': VARCHAR(300),
        'consumer_disputed': VARCHAR(300),
        'complaint_id': INTEGER(),
    }

    df_complaints = pd.read_parquet("data_raw/consumer-complaints.parquet")
    # Positional rename: pandas raises ValueError here if the parquet file
    # does not have exactly as many columns as the schema above.
    df_complaints.columns = list(complaints_types)

    # Parse both date columns (each exactly once).
    for date_col in ("date_received", "date_sent_to_company"):
        df_complaints[date_col] = pd.to_datetime(df_complaints[date_col])

    # Truncate narratives so they fit the VARCHAR(32000) target column.
    df_complaints["consumer_complaint_narrative"] = (
        df_complaints["consumer_complaint_narrative"].str.slice(0, 32000)
    )
    # NOTE(review): astype(str) also turns missing values into the literal
    # strings "nan"/"None" -- confirm that this is intended for the upload.
    df_complaints["consumer_disputed"] = df_complaints["consumer_disputed"].astype(str)

    start_time = time.time()
    print("Start time:", time.strftime("%Y-%m-%d %H:%M:%S", time.localtime(start_time)))

    print("start upload: consumer_complaints")
    # NOTE(review): if_exists="fail" makes this call raise when the table is
    # already present (e.g. it exists but with an unexpected shape); in that
    # case the table has to be dropped before this cell is re-run.
    copy_to_sql(
        df_complaints,
        "consumer_complaints",
        types=complaints_types,
        primary_index="row_id",
        if_exists="fail",
    )
    # Release the local pandas copy once the upload has gone through.
    del df_complaints
    gc.collect()
    print("finished upload: consumer_complaints")
    end_time = time.time()
    print("End time:", time.strftime("%Y-%m-%d %H:%M:%S", time.localtime(end_time)))
    print("Elapsed time (seconds):", end_time - start_time)

# read and upload embeddings 

In [None]:
if not embeddings_uploaded:
    # Number of parquet chunks: embeddings_1.parquet .. embeddings_9.parquet.
    n_chunks = 9

    start_time = time.time()
    print("Start time:", time.strftime("%Y-%m-%d %H:%M:%S", time.localtime(start_time)))
    print("start upload: consumer_embeddings")
    for i in range(1, n_chunks + 1):
        df_emb_i = pd.read_parquet(f"data_embeddings/embeddings_{i}.parquet")
        # This branch runs only when the table is missing or does not have the
        # expected shape. Appending every chunk to a leftover table (e.g. from
        # an interrupted earlier run) would duplicate rows, so the first chunk
        # recreates the table and only the following chunks append to it.
        load_result = fastload(
            df_emb_i,
            "consumer_embeddings",
            primary_index="row_id",
            if_exists="replace" if i == 1 else "append",
        )
        # Free each chunk before the next one is read, not just after the loop.
        del df_emb_i
        gc.collect()

        # fastload is documented to return a dict whose "errors_dataframe"
        # entry lists rejected rows; surface those instead of ignoring them.
        # The lookup is defensive so an unexpected return type is harmless.
        load_errors = load_result.get("errors_dataframe") if isinstance(load_result, dict) else None
        if load_errors is not None and len(load_errors) > 0:
            print(f"WARNING: fastload reported {len(load_errors)} error(s) in step {i}")

        print(f"Step {i} of {n_chunks} completed")
    print("finished upload: consumer_embeddings")
    end_time = time.time()
    print("End time:", time.strftime("%Y-%m-%d %H:%M:%S", time.localtime(end_time)))
    print("Elapsed time (seconds):", end_time - start_time)