In [1]:
import io
from pathlib import Path

import pandas as pd

### Load the Excel files

In [7]:
# Raw data sheets.
buyer_order_df = pd.read_excel(io="../Dataset/BuyerOrder.xlsx")
current_stock_df = pd.read_excel(io="../Dataset/CurrentStock.xlsx")

# Companion metadata sheets (one row per data column: its name and SQL type).
metadata_buyer_order_df = pd.read_excel(io="../Dataset/metadata-BuyerOrder.xlsx")
metadata_current_stock_df = pd.read_excel(io="../Dataset/metadata-CurrentStock.xlsx")

In [9]:
# Preview the first five BuyerOrder rows to check the column layout and values.
buyer_order_head = buyer_order_df.head(n=5)
buyer_order_head

Unnamed: 0,BuyerName,BuyerOrderNo,BuyerOrderStatus,StyleName,StyleCode,ProductGroup,Category,SubCategory,BuyerOrderQty,BuyerOrderDate,BuyerOrderValue,Currency,BuyerDeliveryDate,BuyerShippedQty,BuyerShippedValue,BuyerShippedInvoiceNo
0,shikha buyer,WFXCompany/OC217.1,Cancelled,WFX SAMPLE THREAD_2200005_2200005,SEW-THREAD-WFXSAMPLETHREAD,SEWING TRIMS,Trims,THREAD,500.0,2022-01-04,1000000.0,INR,2022-01-30,,,
1,Ashhar Buyer,116180.1,Confirmed,WFX SAMPLE STYLE_2200006,JAYDEE10001,WOVEN,Apparel,JEANS,100.0,2021-11-19,10000.0,INR,2021-11-19,,,
2,Ashhar Buyer,116181.1,Confirmed,WFX SAMPLE STYLE_2200006,JAYDEE10001,WOVEN,Apparel,JEANS,100.0,2021-11-19,1000.0,INR,2021-11-19,,,
3,Ashhar Buyer,116184.1,Cancelled,WFX SAMPLE STYLE_2200006,JAYDEE10001,WOVEN,Apparel,JEANS,100.0,2021-11-19,10000.0,INR,2021-11-19,,,
4,Ashhar Buyer,116186.1,Confirmed,WFX SAMPLE STYLE_2200006,JAYDEE10001,WOVEN,Apparel,JEANS,100.0,2021-11-19,10000.0,INR,2021-11-19,,,


In [10]:
# Preview the first five CurrentStock rows.
current_stock_head = current_stock_df.head(n=5)
current_stock_head

Unnamed: 0,SiteName,Category,ProductGroup,ProductSubCatCode,ArticleName,ArticleCode,ColorName,ColorCode,SizeName,SizeCode,...,GRNNo,GRNCreatedBy,GRNDate,Ageing,SupplierPONo,UOM,Quantity,PendingtoDispatch_UnderQC,Rate,Value
0,RM FABRIC STORE,Textiles/Fabric,FABRIC,COTTON,WFX SAMPLE FABRIC_2200001,FAB-COTTON-WFXSAMPLEFABRIC,black,20,18LINE,18LINE,...,R-1001-115,tanpreet,2019-02-21 17:33:26.253,2353,F/18.19/549,MTRS,200.0,0.0,2.5,500.0
1,RM FABRIC STORE,Textiles/Fabric,FABRIC,COTTON,WFX SAMPLE FABRIC_2200001,FAB-COTTON-WFXSAMPLEFABRIC,black,20,0-3,0-3,...,R-1001-115,tanpreet,2019-02-21 17:33:26.237,2353,F/18.19/549,MTRS,200.0,0.0,3.0,600.0
2,RM FABRIC STORE,Textiles/Fabric,FABRIC,COTTON,WFX SAMPLE FABRIC_2200001,FAB-COTTON-WFXSAMPLEFABRIC,,,,,...,,,NaT,480,,MTRS,900.0,0.0,4.0,3600.0
3,ABC FABRIC STORE,Textiles/Fabric,FABRIC,COTTON,WFX SAMPLE FABRIC_2200001,FAB-COTTON-WFXSAMPLEFABRIC,,,,,...,R-FABSTORE-443,Apoorva,2018-08-16 16:15:11.113,5365,,MTRS,200.0,0.0,1.0,200.0
4,ACHIEVER CUT STORE,Textiles/Fabric,FABRIC,COTTON,WFX SAMPLE FABRIC_2200001,FAB-COTTON-WFXSAMPLEFABRIC,BLACK,BLACK01,,,...,,,NaT,4187,,MTRS,1.0,0.0,0.0,0.0


In [11]:
# Preview the BuyerOrder metadata sheet (column names and their SQL types).
metadata_buyer_order_head = metadata_buyer_order_df.head(n=5)
metadata_buyer_order_head

Unnamed: 0,Column_name,Type
0,BuyerName,varchar
1,BuyerOrderNo,varchar
2,BuyerOrderStatus,varchar
3,StyleName,varchar
4,StyleCode,varchar


In [12]:
# Preview the CurrentStock metadata sheet (column names and their SQL types).
metadata_current_stock_head = metadata_current_stock_df.head(n=5)
metadata_current_stock_head

Unnamed: 0,Column Name,Type
0,SiteName,varchar
1,Category,varchar
2,ProductGroup,varchar
3,ProductSubCatCode,varchar
4,ArticleName,varchar


### Preprocessing Plan before SQLite conversion
We need to clean both datasets according to their metadata so that the LLM agent can query a consistent database.

#### Step 1 — Clean BuyerOrder
* Normalize column names: trim whitespace, replace spaces and hyphens with underscores, and lowercase (e.g. `BuyerName` → `buyername`).
* Enforce data types from metadata.
* Handle missing values (NaN → NULL).
* Normalize dates to YYYY-MM-DD.

In [13]:
# Work on an independent deep copy so the raw BuyerOrder frame stays untouched.
buyer_order_clean = buyer_order_df.copy(deep=True)

In [14]:
# --- 1. Normalize column names ---
# Trim surrounding whitespace, turn spaces and hyphens into underscores, then
# lowercase. CamelCase words are NOT split, so "BuyerName" becomes "buyername".
# regex=False pins plain-text replacement, so the result does not depend on the
# pandas version's default for the `regex` argument.
buyer_order_clean.columns = (
    buyer_order_clean.columns.str.strip()
    .str.replace(" ", "_", regex=False)
    .str.replace("-", "_", regex=False)
    .str.lower()
)

In [15]:
# --- 2. Enforce data types based on metadata ---
# Map each column name listed in the BuyerOrder metadata sheet to its declared type.
metadata_dict_buyer = {
    column_name: declared_type
    for column_name, declared_type in zip(
        metadata_buyer_order_df["Column_name"],
        metadata_buyer_order_df["Type"],
    )
}

In [16]:
# Lookup from the SQL type names used in the metadata to pandas dtype strings.
# Both "date" and "datetime" map to the same nanosecond datetime dtype.
sql_to_pd_type = dict(
    [
        ("varchar", "string"),
        ("int", "Int64"),
        ("float", "float64"),
        ("date", "datetime64[ns]"),
        ("datetime", "datetime64[ns]"),
    ]
)

In [17]:
# Cast each BuyerOrder column to the pandas dtype implied by its metadata type.
# Metadata column names are normalized the same way as the DataFrame headers so
# the two can be matched. Metadata rows with no matching column, or with a SQL
# type that has no entry in sql_to_pd_type, are skipped.
for col, dtype in metadata_dict_buyer.items():
    # str() guards against blank metadata cells, which would otherwise make
    # .strip()/.lower() raise on a non-string value.
    col_snake = str(col).strip().replace(" ", "_").replace("-", "_").lower()
    if col_snake not in buyer_order_clean.columns:
        continue

    sql_type = str(dtype).strip().lower()
    pd_dtype = sql_to_pd_type.get(sql_type)
    if pd_dtype is None:
        continue

    try:
        buyer_order_clean[col_snake] = buyer_order_clean[col_snake].astype(pd_dtype)
    except (TypeError, ValueError, OverflowError):
        # The strict cast failed: fall back to a lenient parse that turns
        # unparseable values into NaT/NaN instead of aborting.
        if pd_dtype.startswith("datetime"):
            buyer_order_clean[col_snake] = pd.to_datetime(
                buyer_order_clean[col_snake], errors="coerce"
            )
        elif pd_dtype in ("Int64", "float64"):
            buyer_order_clean[col_snake] = pd.to_numeric(
                buyer_order_clean[col_snake], errors="coerce"
            )
        # Text columns are deliberately left as they are: numeric coercion
        # would replace every non-numeric string with NaN.

    # Columns declared as plain "date" carry no time of day: reset the time part
    # to midnight so they serialize as YYYY-MM-DD, as the preprocessing plan requires.
    if sql_type == "date" and pd.api.types.is_datetime64_any_dtype(
        buyer_order_clean[col_snake]
    ):
        buyer_order_clean[col_snake] = buyer_order_clean[col_snake].dt.normalize()

In [18]:
# --- 3. Handle missing values ---
# Keep every non-missing cell and substitute None where a value is missing.
# NOTE(review): the .info() output further down still reports typed dtypes
# (string / float64 / datetime64[ns]) after this step, so the missing markers
# may remain NaN/NaT/<NA> rather than Python None -- confirm before relying on
# this for SQLite.
buyer_order_clean = buyer_order_clean.where(buyer_order_clean.notna(), None)

In [19]:
# Show the first five rows of the cleaned BuyerOrder frame.
buyer_order_clean.head(n=5)

Unnamed: 0,buyername,buyerorderno,buyerorderstatus,stylename,stylecode,productgroup,category,subcategory,buyerorderqty,buyerorderdate,buyerordervalue,currency,buyerdeliverydate,buyershippedqty,buyershippedvalue,buyershippedinvoiceno
0,shikha buyer,WFXCompany/OC217.1,Cancelled,WFX SAMPLE THREAD_2200005_2200005,SEW-THREAD-WFXSAMPLETHREAD,SEWING TRIMS,Trims,THREAD,500.0,2022-01-04,1000000.0,INR,2022-01-30,,,
1,Ashhar Buyer,116180.1,Confirmed,WFX SAMPLE STYLE_2200006,JAYDEE10001,WOVEN,Apparel,JEANS,100.0,2021-11-19,10000.0,INR,2021-11-19,,,
2,Ashhar Buyer,116181.1,Confirmed,WFX SAMPLE STYLE_2200006,JAYDEE10001,WOVEN,Apparel,JEANS,100.0,2021-11-19,1000.0,INR,2021-11-19,,,
3,Ashhar Buyer,116184.1,Cancelled,WFX SAMPLE STYLE_2200006,JAYDEE10001,WOVEN,Apparel,JEANS,100.0,2021-11-19,10000.0,INR,2021-11-19,,,
4,Ashhar Buyer,116186.1,Confirmed,WFX SAMPLE STYLE_2200006,JAYDEE10001,WOVEN,Apparel,JEANS,100.0,2021-11-19,10000.0,INR,2021-11-19,,,


In [21]:
# Save cleaned BuyerOrder to CSV (the DataFrame index is not written).
buyer_order_csv_path = "../data/csv_xlsx/cleaned_BuyerOrder.csv"
# to_csv does not create missing folders, so make sure the target directory
# exists before writing.
Path(buyer_order_csv_path).parent.mkdir(parents=True, exist_ok=True)
buyer_order_clean.to_csv(buyer_order_csv_path, index=False)

#### Step 2 — Clean CurrentStock
* Same as above (column names, dtypes).

* Fix inconsistencies (e.g., "BLACK" vs "black", "NaN" in Color).

* Normalize numerical columns.

In [22]:
# Step 1: Clean CurrentStock DataFrame

# Work on an independent deep copy so the raw CurrentStock frame stays untouched.
current_stock_clean = current_stock_df.copy(deep=True)

In [23]:
# --- 1. Normalize column names ---
# Trim surrounding whitespace, turn spaces and hyphens into underscores, then
# lowercase. CamelCase words are NOT split, so "SiteName" becomes "sitename".
# regex=False pins plain-text replacement, so the result does not depend on the
# pandas version's default for the `regex` argument.
current_stock_clean.columns = (
    current_stock_clean.columns.str.strip()
    .str.replace(" ", "_", regex=False)
    .str.replace("-", "_", regex=False)
    .str.lower()
)

In [24]:
# --- 2. Enforce data types based on metadata ---
# Map each column name listed in the CurrentStock metadata sheet to its declared
# type. This sheet's header is "Column Name" (with a space).
metadata_dict_stock = {
    column_name: declared_type
    for column_name, declared_type in zip(
        metadata_current_stock_df["Column Name"],
        metadata_current_stock_df["Type"],
    )
}

In [25]:
# SQL type name -> pandas dtype string for the CurrentStock conversion.
# The entries are the same as in the BuyerOrder step, redefined here under a
# separate name.
sql_to_pd_type_stock = dict(
    [
        ("varchar", "string"),
        ("int", "Int64"),
        ("float", "float64"),
        ("date", "datetime64[ns]"),
        ("datetime", "datetime64[ns]"),
    ]
)

In [26]:
# Cast each CurrentStock column to the pandas dtype implied by its metadata type.
# Metadata column names are normalized the same way as the DataFrame headers so
# the two can be matched. Metadata rows with no matching column, or with a SQL
# type that has no entry in sql_to_pd_type_stock, are skipped.
for col, dtype in metadata_dict_stock.items():
    # str() guards against blank metadata cells, which would otherwise make
    # .strip()/.lower() raise on a non-string value.
    col_snake = str(col).strip().replace(" ", "_").replace("-", "_").lower()
    if col_snake not in current_stock_clean.columns:
        continue

    pd_dtype = sql_to_pd_type_stock.get(str(dtype).strip().lower())
    if pd_dtype is None:
        continue

    try:
        current_stock_clean[col_snake] = current_stock_clean[col_snake].astype(pd_dtype)
    except (TypeError, ValueError, OverflowError):
        # The strict cast failed: fall back to a lenient parse that turns
        # unparseable values into NaT/NaN instead of aborting.
        if pd_dtype.startswith("datetime"):
            current_stock_clean[col_snake] = pd.to_datetime(
                current_stock_clean[col_snake], errors="coerce"
            )
        elif pd_dtype in ("Int64", "float64"):
            current_stock_clean[col_snake] = pd.to_numeric(
                current_stock_clean[col_snake], errors="coerce"
            )
        # Text columns are deliberately left as they are: numeric coercion
        # would replace every non-numeric string with NaN.

In [27]:
# --- 3. Handle missing values ---
# Keep every non-missing cell and substitute None where a value is missing.
# NOTE(review): the .info() output further down still reports typed dtypes
# (e.g. string) after this step, so the missing markers may remain NaN/NaT/<NA>
# rather than Python None -- confirm before relying on this for SQLite.
current_stock_clean = current_stock_clean.where(current_stock_clean.notna(), None)

In [28]:
# Show the first five rows of the cleaned CurrentStock frame.
current_stock_clean.head(n=5)

Unnamed: 0,sitename,category,productgroup,productsubcatcode,articlename,articlecode,colorname,colorcode,sizename,sizecode,...,grnno,grncreatedby,grndate,ageing,supplierpono,uom,quantity,pendingtodispatch_underqc,rate,value
0,RM FABRIC STORE,Textiles/Fabric,FABRIC,COTTON,WFX SAMPLE FABRIC_2200001,FAB-COTTON-WFXSAMPLEFABRIC,black,20,18LINE,18LINE,...,R-1001-115,tanpreet,2019-02-21 17:33:26.253,2353,F/18.19/549,MTRS,200.0,0.0,2.5,500.0
1,RM FABRIC STORE,Textiles/Fabric,FABRIC,COTTON,WFX SAMPLE FABRIC_2200001,FAB-COTTON-WFXSAMPLEFABRIC,black,20,0-3,0-3,...,R-1001-115,tanpreet,2019-02-21 17:33:26.237,2353,F/18.19/549,MTRS,200.0,0.0,3.0,600.0
2,RM FABRIC STORE,Textiles/Fabric,FABRIC,COTTON,WFX SAMPLE FABRIC_2200001,FAB-COTTON-WFXSAMPLEFABRIC,,,,,...,,,NaT,480,,MTRS,900.0,0.0,4.0,3600.0
3,ABC FABRIC STORE,Textiles/Fabric,FABRIC,COTTON,WFX SAMPLE FABRIC_2200001,FAB-COTTON-WFXSAMPLEFABRIC,,,,,...,R-FABSTORE-443,Apoorva,2018-08-16 16:15:11.113,5365,,MTRS,200.0,0.0,1.0,200.0
4,ACHIEVER CUT STORE,Textiles/Fabric,FABRIC,COTTON,WFX SAMPLE FABRIC_2200001,FAB-COTTON-WFXSAMPLEFABRIC,BLACK,BLACK01,,,...,,,NaT,4187,,MTRS,1.0,0.0,0.0,0.0


In [29]:
# Save cleaned CurrentStock to CSV (the DataFrame index is not written).
current_stock_csv_path = "../data/csv_xlsx/cleaned_CurrentStock.csv"
# to_csv does not create missing folders, so make sure the target directory
# exists before writing.
Path(current_stock_csv_path).parent.mkdir(parents=True, exist_ok=True)
current_stock_clean.to_csv(current_stock_csv_path, index=False)

### Get information about the cleaned BuyerOrder DataFrame

In [30]:
# DataFrame.info() prints its report and returns None, so write the report into
# a text buffer to keep it in `buyer_order_info`, then print it as the cell output.
_info_buffer = io.StringIO()
buyer_order_clean.info(buf=_info_buffer)
buyer_order_info = _info_buffer.getvalue()
print(buyer_order_info)

# Summary statistics across all columns, numeric and non-numeric alike.
buyer_order_describe = buyer_order_clean.describe(include='all')

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 34426 entries, 0 to 34425
Data columns (total 16 columns):
 #   Column                 Non-Null Count  Dtype         
---  ------                 --------------  -----         
 0   buyername              34426 non-null  string        
 1   buyerorderno           34425 non-null  string        
 2   buyerorderstatus       34424 non-null  string        
 3   stylename              34424 non-null  string        
 4   stylecode              34426 non-null  string        
 5   productgroup           34426 non-null  string        
 6   category               34426 non-null  string        
 7   subcategory            34426 non-null  string        
 8   buyerorderqty          34422 non-null  float64       
 9   buyerorderdate         34250 non-null  datetime64[ns]
 10  buyerordervalue        34245 non-null  float64       
 11  currency               34227 non-null  string        
 12  buyerdeliverydate      34184 non-null  datetime64[ns]
 13  b

In [31]:
# Get information about the cleaned CurrentStock DataFrame.
# DataFrame.info() prints its report and returns None, so write the report into
# a text buffer to keep it in `current_stock_info`, then print it as the cell output.
_info_buffer = io.StringIO()
current_stock_clean.info(buf=_info_buffer)
current_stock_info = _info_buffer.getvalue()
print(current_stock_info)

# Summary statistics across all columns, numeric and non-numeric alike.
current_stock_describe = current_stock_clean.describe(include='all')

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 72089 entries, 0 to 72088
Data columns (total 45 columns):
 #   Column                     Non-Null Count  Dtype         
---  ------                     --------------  -----         
 0   sitename                   72089 non-null  string        
 1   category                   72089 non-null  string        
 2   productgroup               71522 non-null  string        
 3   productsubcatcode          71535 non-null  string        
 4   articlename                71511 non-null  string        
 5   articlecode                72087 non-null  string        
 6   colorname                  62087 non-null  string        
 7   colorcode                  62090 non-null  string        
 8   sizename                   60515 non-null  string        
 9   sizecode                   60652 non-null  string        
 10  shade                      26245 non-null  string        
 11  count                      8410 non-null   string        
 12  cont

In [32]:
# Compact description of the cleaned BuyerOrder frame: (rows, columns) and column names.
buyer_order_summary = dict(
    shape=buyer_order_clean.shape,
    columns=buyer_order_clean.columns.tolist(),
)

In [33]:
# Compact description of the cleaned CurrentStock frame: (rows, columns) and column names.
current_stock_summary = dict(
    shape=current_stock_clean.shape,
    columns=current_stock_clean.columns.tolist(),
)

In [36]:
# Print a label, then show the first ten cleaned BuyerOrder rows as the cell result.
print("BuyerOrder Summary:")
buyer_order_clean.head(n=10)

BuyerOrder Summary:


Unnamed: 0,buyername,buyerorderno,buyerorderstatus,stylename,stylecode,productgroup,category,subcategory,buyerorderqty,buyerorderdate,buyerordervalue,currency,buyerdeliverydate,buyershippedqty,buyershippedvalue,buyershippedinvoiceno
0,shikha buyer,WFXCompany/OC217.1,Cancelled,WFX SAMPLE THREAD_2200005_2200005,SEW-THREAD-WFXSAMPLETHREAD,SEWING TRIMS,Trims,THREAD,500.0,2022-01-04 00:00:00,1000000.0,INR,2022-01-30,,,
1,Ashhar Buyer,116180.1,Confirmed,WFX SAMPLE STYLE_2200006,JAYDEE10001,WOVEN,Apparel,JEANS,100.0,2021-11-19 00:00:00,10000.0,INR,2021-11-19,,,
2,Ashhar Buyer,116181.1,Confirmed,WFX SAMPLE STYLE_2200006,JAYDEE10001,WOVEN,Apparel,JEANS,100.0,2021-11-19 00:00:00,1000.0,INR,2021-11-19,,,
3,Ashhar Buyer,116184.1,Cancelled,WFX SAMPLE STYLE_2200006,JAYDEE10001,WOVEN,Apparel,JEANS,100.0,2021-11-19 00:00:00,10000.0,INR,2021-11-19,,,
4,Ashhar Buyer,116186.1,Confirmed,WFX SAMPLE STYLE_2200006,JAYDEE10001,WOVEN,Apparel,JEANS,100.0,2021-11-19 00:00:00,10000.0,INR,2021-11-19,,,
5,Ashhar Buyer,116190.1,Confirmed,WFX SAMPLE STYLE_2200006,JAYDEE10001,WOVEN,Apparel,JEANS,100.0,2021-11-19 00:00:00,10000.0,INR,2021-11-19,,,
6,Ashhar Buyer,116191.1,Closed,WFX SAMPLE STYLE_2200006,JAYDEE10001,WOVEN,Apparel,JEANS,100.0,2021-11-19 00:00:00,10000.0,INR,2021-11-19,,,
7,WFXUSD1,112349.1,Confirmed,WFX SAMPLE STYLE_2200006,JAYDEE10001,WOVEN,Apparel,JEANS,90.0,2020-02-12 11:12:00,945.0,INR,2020-02-12,,,
8,PB,AoneBuyer/NEWOC/30.1,Confirmed,WFX SAMPLE STYLE_2200006,JAYDEE10001,WOVEN,Apparel,JEANS,100.0,2023-09-11 00:00:00,109.0,USD,2023-09-11,,,
9,PB,AoneBuyer/NEWOC/31.1,Confirmed,WFX SAMPLE STYLE_2200006,JAYDEE10001,WOVEN,Apparel,JEANS,100.0,2023-09-11 00:00:00,120.86,USD,2023-09-11,,,


In [37]:
# Display the BuyerOrder summary dict (its "shape" and "columns" entries).
buyer_order_summary

{'shape': (34426, 16),
 'columns': ['buyername',
  'buyerorderno',
  'buyerorderstatus',
  'stylename',
  'stylecode',
  'productgroup',
  'category',
  'subcategory',
  'buyerorderqty',
  'buyerorderdate',
  'buyerordervalue',
  'currency',
  'buyerdeliverydate',
  'buyershippedqty',
  'buyershippedvalue',
  'buyershippedinvoiceno']}

In [38]:
# Display the CurrentStock summary dict (its "shape" and "columns" entries).
current_stock_summary

{'shape': (72089, 45),
 'columns': ['sitename',
  'category',
  'productgroup',
  'productsubcatcode',
  'articlename',
  'articlecode',
  'colorname',
  'colorcode',
  'sizename',
  'sizecode',
  'shade',
  'count',
  'content',
  'construction',
  'stocktype',
  'quality',
  'posupplierref',
  'locationcode',
  'indentno',
  'stylename',
  'stylecode',
  'buyerstyleref',
  'merchandiser',
  'manager',
  'buyer',
  'supplier',
  'ocnum',
  'ocstatus',
  'contractno',
  'contractdate',
  'contractamount',
  'sourcebuyer',
  'pcddate',
  'garmentdeliverydate',
  'grndetails',
  'grnno',
  'grncreatedby',
  'grndate',
  'ageing',
  'supplierpono',
  'uom',
  'quantity',
  'pendingtodispatch_underqc',
  'rate',
  'value']}