In [2]:
!pip install --upgrade --user -q google-cloud-aiplatform langchain faiss-cpu==1.7.4

In [3]:
import google.cloud.bigquery as bq
import langchain
from google.cloud import aiplatform
from langchain.llms import VertexAI
from langchain_community.document_loaders.csv_loader import CSVLoader
from langchain.prompts import PromptTemplate
from langchain.schema import format_document

# Print LangChain and Vertex AI versions
print(f"LangChain version: {langchain.__version__}")
print(f"Vertex AI SDK version: {aiplatform.__version__}")

LangChain version: 0.1.20
Vertex AI SDK version: 1.51.0


In [4]:
llm = VertexAI(model_name="gemini-pro", temperature=0)

llm("Hello Gemini!")

  warn_deprecated(
  warn_deprecated(


'Hello! How can I assist you today?'

In [5]:
# Load the data
loader = CSVLoader(file_path="./data/test_faulty_data.csv",csv_args={
                'delimiter': ','})

data = loader.load()

In [10]:
print(data[7].page_content)

path: /data_exchange__erp_prod/raw/01_SAP_P10/0101/v1/2024/0101__product_master_main_prdha_other__mm__202403__part0002.csv
product_id: 000000000000011871
drop_id: D1
launch_type_mp: REGLAUNCH
season_type: SS
price_point: 060
shop_live_date_actual: 1985-01-29
material_name: SQUIRREL
category_segment_id: D
category_segment_name: HOME
product_group: 34
product_group_name: HOME
product_segment: DD
product_segment_name: DECORATIVE
item_group: 10
item_group_name: DECORATIVE
item_subgroup: 
item_subgroup_name: 
prdha_level4: 
prdha_level4_name: 
product_hierarchy: 3410
length: 
width: 
height: 
material_status: 80
base_unit_of_measure: STST
net_weight: 28.36
gross_weight: 66.0
weight_unit: G
article_number: 7662
size: NR042000
global_retirement_date: 2019-12-31
designer: 0035
theme: 0170
collection_code: VG
launch_date_ex_sfl: 1985-01-01
saison: 1985SS
product_default_variant: 11871
product_description_en: The lively squirrel runs through the woods on its search for food. This one in faceted 

In [None]:
#NB we probably need to chunck the input since it is too large

In [24]:
#set up a chain to pass the document to the LLM 
# Use code generation model
llm = VertexAI(model_name="gemini-pro",temperature=1, max_output_tokens=4096)

# Define the chain
chain = (
    {
        "content": lambda docs: "\n\n".join(
            format_document(doc, PromptTemplate.from_template("{page_content}"))
            for doc in docs
        )
    }
    | PromptTemplate.from_template(
        """
        In the following document:
        \n\n{content}
        I want to know if there are columns that contain entries that have the following format NNN NN.
        If so, tell me the row and the columsn where these are located and add an explanation about what you found
        """
    )
    | llm
)

# Invoke the chain with the documents, and remove code backticks
result = chain.invoke(data[6:8]).strip("```")
print(result)

## Analysis of Columns for Entries in the Format "NNN NN"

I have analyzed the provided data and identified several columns that contain entries in the format "NNN NN". Here's a breakdown:

**Column** | **Row(s)** | **Explanation**
---|---|---| 
`product_ean_code` | 1, 2 | This column contains product European Article Numbers (EANs), which are 13-digit codes in the format "NNN NNNNNNNNNNN". The first three digits (NNN) represent the GS1 prefix, which identifies the country of origin.
`product_upc_code` | 1, 2 | This column contains product Universal Product Codes (UPCs), which are 12-digit codes in the format "NNNNNNNNNNNN". The first digit (N) represents the number system, and the next five digits (NNNNN) represent the manufacturer code.
`ca_controlcode_brazil` | 2 | This column contains the Brazilian control code for the product, which is in the format "NNN.NN.NN.NN". The first three digits (NNN) represent the tax code, and the remaining digits (NN.NN.NN.NN) represent the product cod

In [25]:
#load data into pandas df 
import pandas as pd

df = pd.read_csv('./data/test_faulty_data.csv',delimiter = ',')

In [27]:
pd.set_option('display.max_columns', None)
df.head(11)

Unnamed: 0,path,product_id,drop_id,launch_type_mp,season_type,price_point,shop_live_date_actual,material_name,category_segment_id,category_segment_name,product_group,product_group_name,product_segment,product_segment_name,item_group,item_group_name,item_subgroup,item_subgroup_name,prdha_level4,prdha_level4_name,product_hierarchy,length,width,height,material_status,base_unit_of_measure,net_weight,gross_weight,weight_unit,article_number,size,global_retirement_date,designer,theme,collection_code,launch_date_ex_sfl,saison,product_default_variant,product_description_en,product_dimension_imperial_en,product_dimension_metric_en,product_ean_code,product_master_id,product_name_en,product_phone_model_en,product_upc_code,product_variant_color_en,product_variant_size_en,catalog_id,Z020_STONECOL,Z020_COLLECTION_TYPE,Z020_COUNTRY_EXCL,Z020_LABEL_COLOR,Z020_LADAT_PLAN,Z020_PROD_FAM,Z020_PL,Z020_PST,Z020_PT,Z020_SIZE,Z020_ADDITIONAL_STONECOL,Z020_MANDATORY_PRODUCT,Z020_GENDER,Z020_EXCLUSIVITY,Z020_PYRAMID,Z020_PILLAR,ca_gift_occasion,ca_inner_color,ca_inner_material,ca_kcp,ca_country_of_origin_iso,ca_material_composition,ca_metal_color,ca_metal_type,ca_outer_color,ca_outer_material,ca_shape,ca_general_shape,ca_consumer_color,ca_product_set,ca_product_sub_type,ca_product_type,ca_plating_type,ca_primary_stone_shape,ca_default_category,ca_facets_number,ca_fragile,ca_gross_weight_carton_gr,ca_net_weight_carton_gr,ca_product_weight_kg,ca_jewelry_height,ca_jewelry_width,ca_model_year,ca_number_boxes,ca_number_of_stones,ca_number_pieces_sku,ca_season_of_the_product,ca_sustainability_label,ca_motif,ca_stone_color,ca_stone_treatment_method,ca_stone_type,ca_are_batteries_included,ca_azo_pcp,ca_band_type,ca_band_width,ca_batteries_required,ca_battery_type,ca_battery_weight_in_grams,ca_book_pages,ca_calendar_type,ca_carton_height_mm,ca_carton_length_mm,ca_case_diameter_mm,ca_case_length_mm,ca_case_thickness,ca_case_width_mm,ca_cgb_b2b_dontshowin_aut,ca_chain_length,ca_chain_length_mm,ca_controlcode_brazil,ca_controlcode_india,ca_country_of_origin,ca_dangerous_good,ca_depth_carton_mm,ca_designer,ca_dial_length_mm,ca_dial_width_mm,ca_dimension_code_hide,ca_drop_length,ca_etui,ca_how_is_the_lithium_battery_packaged,ca_material_strap,ca_maximum_lenght_mm,ca_maximum_water_pressure_resistance,ca_number_of_batteries_included,ca_opening_distance_mm,ca_pearl_dimensions_mm,ca_pen_line_width_mm,ca_pendant_diameter_mm,ca_pendant_length_mm,ca_pendant_width_mm,ca_product_account_limit,ca_product_depth_mm,ca_product_diameter_mm,ca_product_effective_length_mm,ca_product_height_mm,ca_product_inside_depth_mm,ca_product_inside_length_mm,ca_product_inside_width_mm,ca_product_width_mm,ca_products_inner_pack,ca_resizable,ca_straps_length,ca_straps_width,ca_usb_capacity,ca_watch_strap_length_mm,ca_watch_strap_width_mm,ca_watch_waterproof,ZZNFMAT,ZZPLMID,extraction_date
0,/data_exchange__erp_prod/raw/01_SAP_P10/0101/v...,11859,,,,60,1984-01-01,QUEEN CRYSTAL,D,HOME,34,HOME,DA,FUNCTIONAL,20,FUNCTIONAL,,,,,3420,82.0,82.0,62.0,80,STST,23.78,71.3,G,7469,NR052000,2010-12-31,35,129,VG,1984-01-01,,11859.0,,,,9003140118599,11859.0,,,768549000000.0,,,swaCatalog,,,,,,,CELEBRATIONS,HOME DECOR,CELEBRATIONS,,,,,,,,,,,,in->at,,,,,,,,,,,,,,,,,71.3,23.78,,,,,,,,,,,,,,,,,,,,,,,62.0,82.0,,,,,,,,,70139100.0,,,82.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,15-03-2024
1,/data_exchange__erp_prod/raw/01_SAP_P10/0101/v...,11860,,,,70,1984-01-01,CHESS SET-QUEEN JET,D,HOME,34,HOME,DA,FUNCTIONAL,20,FUNCTIONAL,,,,,3420,,,,80,STST,25.258,71.618,G,7469,NR052010,2019-12-31,35,129,VG,1984-01-01,,11860.0,,,,9003140118605,11860.0,,,768549000000.0,,,swaCatalog,,,,,,,CELEBRATIONS,HOME DECOR,CELEBRATIONS,,,,,,,,,,,,in->at,,,,,,,,,,,,,,,,,71.618,25.258,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,70139100.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,15-03-2024
2,/data_exchange__erp_prod/raw/01_SAP_P10/0101/v...,11861,,,,60,1984-01-01,KING CRYSTAL,D,HOME,34,HOME,DA,FUNCTIONAL,20,FUNCTIONAL,,,,,3420,,,,80,STST,29.26,75.5,G,7469,NR062000,2010-12-31,35,129,VG,1984-01-01,,11861.0,,,,9003140118612,11861.0,,,768549000000.0,,,swaCatalog,,,,,,,CELEBRATIONS,HOME DECOR,CELEBRATIONS,,,,,,,,,,,,in->at,,,,,,,,,,,,,,,,,75.5,29.26,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,15-03-2024
3,/data_exchange__erp_prod/raw/01_SAP_P10/0101/v...,11862,,,,70,1984-01-01,CHESS SET-KING JET,D,HOME,34,HOME,DA,FUNCTIONAL,20,FUNCTIONAL,,,,,3420,,,,80,STST,28.757,75.862,G,7469,NR062010,2019-12-31,35,129,VG,1984-01-01,,11862.0,,,,9003140118629,11862.0,,,768549000000.0,,,swaCatalog,,,,,,,CELEBRATIONS,HOME DECOR,CELEBRATIONS,,,,,,,,,,,,in->at,,,,,,,,,,,,,,,,,75.862,28.757,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,70139100.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,15-03-2024
4,/data_exchange__erp_prod/raw/01_SAP_P10/0101/v...,11864,D1,REGLAUNCH,SS,220,1985-01-29,BUNCH OF GRAPES,D,HOME,34,HOME,DD,DECORATIVE,10,DECORATIVE,,,,,3410,,,,80,STST,337.0,603.0,G,7509,NR150070,2019-12-31,997,191,VG,1985-01-01,1985SS,11864.0,Faceted clear crystal Bunch of Grapes with gol...,Size: 5 7/8 x 2 3/4 inches,Size: 15 x 7 cm,9003140118643,11864.0,Bunch of Grapes,,768549000000.0,,,swaCatalog,,,,,,,NATURE COLLECTIONS,CRYSTAL NATURE,NATURE COLLECTIONS,,,,,,,,,,,,in->at,,,,,,,,,,,,,,,,,603.0,337.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,15-03-2024
5,/data_exchange__erp_prod/raw/01_SAP_P10/0101/v...,11866,D1,PRELAUNCH,SS,0,,WATERLILY CANDLEHOLDER N,D,HOME,34,HOME,DA,FUNCTIONAL,20,FUNCTIONAL,,,,,3420,,,,71,STST,161.0,161.0,G,7600,NR124000,2020-08-31,35,191,,2013-11-22,2014SS,,,,,9003140118667,,,,,,,swaCatalog,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,15-03-2024
6,/data_exchange__erp_prod/raw/01_SAP_P10/0101/v...,11867,D1,REGLAUNCH,SS,150,1985-01-29,WATERLILY CANDLEHOLDER,D,HOME,34,HOME,DA,FUNCTIONAL,20,FUNCTIONAL,,,,,3420,125.0,120.0,81.0,80,STST,132.0,272.0,G,7600,NR124000,2020-08-31,35,191,VG,1985-01-01,1985SS,11867.0,Small clear crystal candleholder in the shape ...,Size: 1 7/8 x 3 x 2 5/8 inches,Size: 5 x 7.8 x 6.8 cm,9003140118674,11867.0,Waterlily Candleholder,,768549000000.0,,,swaCatalog,CRYSTAL,GC,,CRY,01.01.1985,WATERLILY,CELEBRATIONS,HOME DECOR,CELEBRATIONS,,,NO,,,VOLUME DRIVER,SCIENCE OF FACETS,,,,,in->xs|ru->at,,,,,,,,White,,,Home Decor,,,405.0,,,272.0,132.0,,5.0,78.0,2020.0,,,,2014FW,,flower,,,,,,,,,,,,,81.0,125.0,,,,,,68.0,,7013.99.00,70139100.0,Serbia,,120.0,Max Schreck,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,P_470,15-03-2024
7,/data_exchange__erp_prod/raw/01_SAP_P10/0101/v...,11871,D1,REGLAUNCH,SS,60,1985-01-29,SQUIRREL,D,HOME,34,HOME,DD,DECORATIVE,10,DECORATIVE,,,,,3410,,,,80,STST,28.36,66.0,G,7662,NR042000,2019-12-31,35,170,VG,1985-01-01,1985SS,11871.0,The lively squirrel runs through the woods on ...,Size: 7/8 x 1 5/8 inches,Size: 2.2 x 4.2 cm,9003140118711,11871.0,Squirrel,,768549000000.0,,,swaCatalog,,,,,,,NATURE COLLECTIONS,CRYSTAL NATURE,NATURE COLLECTIONS,,,,,,,,,,,,in->at|ru->at,,,,,,,,,Hot picks Louise Pentland Something blue,,,,,,,,66.0,28.36,,,,,2.0,,,,,,,,,,,,,,,,,,,,,,,,,,,7013.91.10,,,,,,,,,,,,,,,,126 80,,,,,,,,,,,,,,,,,,,,,,,,,15-03-2024
8,/data_exchange__erp_prod/raw/01_SAP_P10/0101/v...,11872,D1,PRELAUNCH,SS,0,2000-01-01,11872,D,HOME,20,CSM,DD,DECORATIVE,84,PACKAGING MATERIAL HOME,10.0,OBJECTS,,,208410,,,,80,STST,0.0,0.0,G,7700,NR060060,2004-12-31,0,0,,2000-01-01,2000SS,,,,,9003140118728,,,,,,,swaCatalog,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,15-03-2024
9,/data_exchange__erp_prod/raw/01_SAP_P10/0101/v...,11888,D2,REGLAUNCH,,10,,A 9400 NR101001 CRY BBZ M201,C,ACCESSORIES,59,BJB,CF,OTHER ACCESSORIES,10,PERSONALIZATION,10.0,KEY RINGS,,,591010,,,,80,STST,12.326,19.946,G,9400,NR101001,2004-04-30,0,0,,,,11888.0,,,,9003140118889,11888.0,,,768549500000.0,,,swaCatalog,,,,,,,,,,,,,,,,,,,,,in->cn|ru->cn,,,,,,,,,,,,,,,,,19.946,12.326,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,7326.20.00,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,15-03-2024
