In [2]:
from pathlib import Path

import numpy as np
import pandas as pd

### Loading the Supplier Data Files

In [3]:
# Read both supplier workbooks with every cell kept as text (dtype=str);
# numeric parsing is done explicitly by the cleaning helpers further down.
s1, s2 = (
    pd.read_excel(workbook, dtype=str)
    for workbook in (
        "../data/supplier_data1.xlsx",
        "../data/supplier_data2.xlsx",
    )
)

In [4]:
# Report (rows, columns) of each raw supplier frame.
shape_report = (
    ("Supplier 1 shape:", s1),
    ("Supplier 2 shape:", s2),
)
for label, frame in shape_report:
    print(label, frame.shape)

Supplier 1 shape: (50, 12)
Supplier 2 shape: (50, 6)


### Displaying first 5 rows

In [5]:
# Show the first five rows of each raw supplier frame, supplier 1 first.
for raw_frame in (s1, s2):
    display(raw_frame.head())

Unnamed: 0,Quality/Choice,Grade,Finish,Thickness (mm),Width (mm),Description,Gross weight (kg),RP02,RM,Quantity,AG,AI
0,3rd,C200S,gebeizt und geglüht,2.77,1100,Längs- oder Querisse,13983,333.6,606.2,0.0,16.11,0.0054
1,3rd,C300S,ungebeizt,2.65,1075,Längs- oder Querisse,13047,717.7,0.0,0.0,16.11,0.0046
2,3rd,C100S,gebeizt und geglüht,2.2,1100,Kantenfehler - FS-Kantenrisse,14155,368.9,0.0,10.84,0.0,0.0061
3,2nd,C100S,gebeizt,2.86,1100,Längs- oder Querisse,11381,368.9,601.7,22.87,0.0,0.0062
4,1st,C300S,ungebeizt,2.88,1050,Sollmasse (Gewicht) unterschritten,10072,0.0,1213.0,22.87,0.0,0.0041


Unnamed: 0,Material,Description,Article ID,Weight (kg),Quantity,Reserved
0,HDC,Material is Oiled,23048203,24469,52,NOT RESERVED
1,S235JR,Material is Oiled,23040547,16984,41,NOT RESERVED
2,S235JR,Material is Painted,23046057,9162,28,NOT RESERVED
3,DX51D +AZ150,Material is Oiled,23041966,12119,66,VANILLA
4,HDC,Material is Painted,23043884,17260,26,NOT RESERVED


### Cleaning helpers: parsing numbers and ranges

In [6]:
import re

def to_float(x):
    """Convert a messy numeric value (usually a string) into a float.

    Handles:
      * plain and scientific-notation numbers: '2.77', '-3', '1e-05'
      * decimal commas: '2,5' -> 2.5
      * thousands separators in either convention, decided by whichever
        separator appears last: '1.234,56' and '1,234.56' -> 1234.56
      * stray non-numeric characters (units, comparison signs): '≤5' -> 5.0

    Parameters
    ----------
    x : any scalar
        Value to convert; missing values (None / NaN) are accepted.

    Returns
    -------
    float
        The parsed number, or NaN when `x` is missing or cannot be parsed.
    """
    if pd.isna(x):
        return np.nan
    s = str(x).strip().replace(' ', '')

    # Well-formed numbers parse directly. This must happen before the
    # character filter below, which would strip the 'e' from '1e-05'.
    if re.fullmatch(r'[+-]?(?:\d+\.?\d*|\.\d+)(?:[eE][+-]?\d+)?', s):
        return float(s)

    if '.' in s and ',' in s:
        # Both separators present: the one appearing last is the decimal mark.
        if s.rfind(',') > s.rfind('.'):
            s = s.replace('.', '').replace(',', '.')   # 1.234,56
        else:
            s = s.replace(',', '')                     # 1,234.56
    else:
        # A lone comma is treated as a decimal comma.
        s = s.replace(',', '.')

    s = re.sub(r'[^0-9\.\-]', '', s)  # keep only digits, dot, minus
    try:
        return float(s)
    except ValueError:
        # Nothing numeric left (e.g. 'abc' or an empty string).
        return np.nan

def parse_range(s):
    """Convert a dimension specification into a ``(min, max)`` tuple of floats.

    Supported forms:
      * single number ('2.77', '-5', '1e-05')  -> (v, v)
      * range with hyphen or en dash ('2.5-3.0') -> (low, high), ordered
      * upper bound ('≤5' or '<=5')             -> (NaN, 5.0)
      * lower bound ('≥5' or '>=5')             -> (5.0, NaN)
      * missing value                           -> (NaN, NaN)

    In a range, a side that cannot be parsed is returned as NaN in its own
    position instead of being replaced by the other side's value.
    """
    if pd.isna(s):
        return (np.nan, np.nan)
    s = str(s).strip()

    # A lone number may itself contain '-' (leading sign, or the exponent of
    # '1e-05'); recognise it first so it is not mistaken for a range.
    if re.fullmatch(r'[+-]?(?:\d+\.?\d*|\.\d+)(?:[eE][+-]?\d+)?', s):
        v = float(s)
        return (v, v)

    if '-' in s or '–' in s:
        # The membership test above guarantees at least two parts.
        parts = re.split(r'[-–]', s)
        a = to_float(parts[0])
        b = to_float(parts[1])
        # Comparisons with NaN are False, so an unparsable side stays put.
        if a > b:
            a, b = b, a
        return (a, b)
    if '≤' in s or '<=' in s:
        return (np.nan, to_float(s))
    if '≥' in s or '>=' in s:
        return (to_float(s), np.nan)
    # fallback: single value in a messy format (e.g. decimal comma, units)
    v = to_float(s)
    return (v, v)

### Normalizing

In [21]:
def _first_present(df, names):
    """Return the first column of ``df`` whose name is in ``names``, else None."""
    for name in names:
        if name in df.columns:
            return df[name]
    return None


def _text_column(df, names, upper=False):
    """Whitespace-stripped (optionally upper-cased) text column; missing stays NaN."""
    col = _first_present(df, names)
    if col is None:
        return pd.Series(np.nan, index=df.index, dtype=object)
    text = col.astype(str).str.strip()
    if upper:
        text = text.str.upper()
    # astype(str) turns missing cells into the literal 'nan'/'None';
    # mask them back to real NaN so isna() checks downstream keep working.
    return text.where(col.notna())


def _numeric_column(df, names):
    """Float column parsed with ``to_float``; all-NaN float column when absent."""
    col = _first_present(df, names)
    if col is None:
        return pd.Series(np.nan, index=df.index, dtype=float)
    return col.map(to_float).astype(float)


def _range_columns(df, name, prefix):
    """Two-column float frame ``<prefix>_min`` / ``<prefix>_max`` via ``parse_range``."""
    labels = [f"{prefix}_min", f"{prefix}_max"]
    if name not in df.columns:
        return pd.DataFrame(np.nan, index=df.index, columns=labels)
    pairs = [parse_range(value) for value in df[name]]
    return pd.DataFrame(pairs, index=df.index, columns=labels, dtype=float)


def process_supplier_df(df, source_name):
    """Map one raw supplier sheet onto the common inventory schema.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw supplier data. Recognised input columns: 'Article ID',
        'Grade' (or 'Material'), 'Quality/Choice', 'Description', 'Finish',
        'Thickness (mm)', 'Width (mm)', 'Quantity',
        'Gross weight (kg)' (or 'Weight (kg)'), 'Reserved',
        'RP02', 'RM', 'AG', 'AI'. Any of them may be absent.
    source_name : str
        Label written to the 'source' column; also used as the prefix of
        generated article ids when the sheet has no 'Article ID' column.

    Returns
    -------
    pandas.DataFrame
        One row per input row with columns article_id, grade, quality_choice,
        description, finish, thickness_min, thickness_max, width_min,
        width_max, quantity, weight_kg, reserved, rp02, rm, ag, ai, source.
        Text columns keep missing values as NaN; numeric columns are float,
        including when the input column is absent (all NaN). Keeping absent
        numeric columns as float rather than object-dtype None is intended to
        avoid the dtype warning pandas.concat can raise for all-None columns.
    """
    out = pd.DataFrame(index=df.index)

    # Article ID: keep the supplier's own id, otherwise generate '<source>_<n>'.
    if 'Article ID' in df.columns:
        out['article_id'] = df['Article ID']
    else:
        out['article_id'] = [f"{source_name}_{i+1}" for i in range(len(df))]

    # Descriptive text fields ('Material' is the fallback source for grade).
    out['grade'] = _text_column(df, ['Grade', 'Material'], upper=True)
    out['quality_choice'] = _text_column(df, ['Quality/Choice'])
    out['description'] = _text_column(df, ['Description'])
    out['finish'] = _text_column(df, ['Finish'])

    # Dimensions are stored as (min, max) pairs.
    for name, prefix in (('Thickness (mm)', 'thickness'), ('Width (mm)', 'width')):
        bounds = _range_columns(df, name, prefix)
        for label in bounds.columns:
            out[label] = bounds[label]

    out['quantity'] = _numeric_column(df, ['Quantity'])
    out['weight_kg'] = _numeric_column(df, ['Gross weight (kg)', 'Weight (kg)'])
    out['reserved'] = _text_column(df, ['Reserved'])

    # Mechanical properties.
    for name in ('RP02', 'RM', 'AG', 'AI'):
        out[name.lower()] = _numeric_column(df, [name])

    out['source'] = source_name

    return out

### Joining both suppliers into a single table (`inventory`), saved as `inventory_dataset.csv`

In [22]:
df1 = process_supplier_df(s1, "supplier1")
df2 = process_supplier_df(s2, "supplier2")

# Final schema (column order of the exported file).
final_cols = [
    "article_id","grade","finish","quality_choice","description",
    "thickness_min","thickness_max","width_min","width_max",
    "quantity","weight_kg","reserved","rp02","rm","ag","ai","source"
]

# Columns that must be float in both frames. A column a supplier does not
# provide may arrive as object-dtype None; casting it to float (NaN) keeps the
# dtypes of both frames aligned, which should avoid the dtype warning that
# pandas.concat can raise for all-NA object columns.
numeric_cols = [
    "thickness_min","thickness_max","width_min","width_max",
    "quantity","weight_kg","rp02","rm","ag","ai"
]
float_schema = {col: float for col in numeric_cols}

# Give both frames the same columns, order and numeric dtypes.
df1 = df1.reindex(columns=final_cols).astype(float_schema)
df2 = df2.reindex(columns=final_cols).astype(float_schema)

# Concatenate
inventory = pd.concat([df1, df2], ignore_index=True)
assert len(inventory) == len(df1) + len(df2), "row count changed during concat"

print("Inventory combined. Rows:", len(inventory))

# Create the output folder if needed so the export works on a fresh checkout.
output_path = Path("../outputs/inventory_dataset.csv")
output_path.parent.mkdir(parents=True, exist_ok=True)
inventory.to_csv(output_path, index=False, encoding="utf-8-sig")
print("Saved cleaned file at outputs/inventory_dataset.csv")


Saved cleaned file at outputs/inventory_dataset.csv


  inventory = pd.concat([df1, df2], ignore_index=True)


In [23]:
# Quick validation of the combined inventory table.
print("Number of rows:", len(inventory))

# Count of missing values in the key columns.
null_checks = (
    ("Missing grades:", 'grade'),
    ("Missing quantities:", 'quantity'),
)
for label, column in null_checks:
    print(label, inventory[column].isna().sum())

# Summary statistics for the parsed dimension bounds.
stat_sections = (
    ("\nThickness stats:", ['thickness_min','thickness_max']),
    ("\nWidth stats:", ['width_min','width_max']),
)
for heading, columns in stat_sections:
    print(heading)
    print(inventory[columns].describe())

# Row count contributed by each supplier.
print("\nRows per supplier:")
supplier_counts = inventory['source'].value_counts()
print(supplier_counts)

# Preview of the final table.
preview = inventory.head(10)
display(preview)

Number of rows: 100
Missing grades: 0
Missing quantities: 0

Thickness stats:
       thickness_min  thickness_max
count      50.000000      50.000000
mean        2.508800       2.508800
std         0.273112       0.273112
min         2.010000       2.010000
25%         2.292500       2.292500
50%         2.510000       2.510000
75%         2.750000       2.750000
max         2.980000       2.980000

Width stats:
         width_min    width_max
count    50.000000    50.000000
mean   1075.000000  1075.000000
std      48.968961    48.968961
min    1000.000000  1000.000000
25%    1050.000000  1050.000000
50%    1075.000000  1075.000000
75%    1100.000000  1100.000000
max    1150.000000  1150.000000

Rows per supplier:
source
supplier1    50
supplier2    50
Name: count, dtype: int64


Unnamed: 0,article_id,grade,finish,quality_choice,description,thickness_min,thickness_max,width_min,width_max,quantity,weight_kg,reserved,rp02,rm,ag,ai,source
0,supplier1_1,C200S,gebeizt und geglüht,3rd,Längs- oder Querisse,2.77,2.77,1100.0,1100.0,0.0,13983.0,,333.6,606.2,16.11,0.0054,supplier1
1,supplier1_2,C300S,ungebeizt,3rd,Längs- oder Querisse,2.65,2.65,1075.0,1075.0,0.0,13047.0,,717.7,0.0,16.11,0.0046,supplier1
2,supplier1_3,C100S,gebeizt und geglüht,3rd,Kantenfehler - FS-Kantenrisse,2.2,2.2,1100.0,1100.0,10.84,14155.0,,368.9,0.0,0.0,0.0061,supplier1
3,supplier1_4,C100S,gebeizt,2nd,Längs- oder Querisse,2.86,2.86,1100.0,1100.0,22.87,11381.0,,368.9,601.7,0.0,0.0062,supplier1
4,supplier1_5,C300S,ungebeizt,1st,Sollmasse (Gewicht) unterschritten,2.88,2.88,1050.0,1050.0,22.87,10072.0,,0.0,1213.0,0.0,0.0041,supplier1
5,supplier1_6,C200S,gebeizt,2nd,Längs- oder Querisse,2.51,2.51,1000.0,1000.0,0.0,12863.0,,0.0,606.2,0.0,0.0059,supplier1
6,supplier1_7,C100S,gebeizt,1st,Sollmasse (Gewicht) unterschritten,2.69,2.69,1150.0,1150.0,10.84,8936.0,,0.0,606.2,15.05,0.0066,supplier1
7,supplier1_8,C300S,gebeizt und geglüht,2nd,Sollmasse (Gewicht) unterschritten,2.98,2.98,1000.0,1000.0,22.87,9810.0,,0.0,601.7,15.05,0.0042,supplier1
8,supplier1_9,C200S,gebeizt,3rd,Längs- oder Querisse,2.03,2.03,1050.0,1050.0,22.87,5491.0,,368.9,601.7,0.0,0.0064,supplier1
9,supplier1_10,C300S,gebeizt,3rd,Längs- oder Querisse,2.54,2.54,1050.0,1050.0,0.0,11415.0,,368.9,606.2,15.05,0.0067,supplier1
