In [6]:
# Import necessary libraries
import pandas as pd
import numpy as np
from google.colab import files

#  STEP 1: Upload Files Manually (Run this cell and select files)
print("Please upload 'supplier_data_1.xlsx' and 'supplier_data_2.xlsx'")
uploaded = files.upload()

#  STEP 2: Load Supplier Datasets
supplier_1 = pd.read_excel("supplier_data_1.xlsx")
supplier_2 = pd.read_excel("supplier_data_2.xlsx")

#  STEP 3: Standardizing Column Names (for consistency)
def clean_column_names(df):
    df.columns = df.columns.str.strip().str.lower().str.replace(" ", "_").str.replace(r"[^a-z0-9_]", "", regex=True)
    return df

supplier_1 = clean_column_names(supplier_1)
supplier_2 = clean_column_names(supplier_2)

#  STEP 4: Handle Missing Values
# - Fill missing numerical values with the median
# - Fill missing categorical values with "Unknown"
for col in supplier_1.columns:
    if supplier_1[col].dtype == "object":
        supplier_1[col].fillna("Unknown", inplace=True)
    else:
        supplier_1[col].fillna(supplier_1[col].median(), inplace=True)

for col in supplier_2.columns:
    if supplier_2[col].dtype == "object":
        supplier_2[col].fillna("Unknown", inplace=True)
    else:
        supplier_2[col].fillna(supplier_2[col].median(), inplace=True)

#  STEP 5: Improved Safe Numeric Conversion (Prevents NaN Data Loss)
def safe_numeric_conversion(df, column):
    """
    Converts a column to numeric while preserving original values in case of failed conversion.
    """
    df[column] = df[column].astype(str).str.replace(r'[^0-9.]', '', regex=True)  # Remove non-numeric characters
    converted = pd.to_numeric(df[column], errors='coerce')  # Attempt conversion

    # Restore original values if conversion fails (i.e., all NaN)
    df[column] = converted.fillna(df[column])
    return df[column]

# Define numeric columns
numeric_columns = ['thickness_mm', 'width_mm', 'gross_weight_kg', 'quantity']

# Apply safe conversion to supplier datasets
for col in numeric_columns:
    if col in supplier_1.columns:
        supplier_1[col] = safe_numeric_conversion(supplier_1, col)
    if col in supplier_2.columns:
        supplier_2[col] = safe_numeric_conversion(supplier_2, col)

print(" Data type conversion improved: No unintentional NaN values!")

#  STEP 6: Merge Supplier Data into a Single Inventory Dataset
inventory_dataset = pd.concat([supplier_1, supplier_2], ignore_index=True)

#  STEP 7: Save the Cleaned Dataset
inventory_dataset.to_csv("inventory_dataset.csv", index=False)
print("Task 1 Completed: Cleaned dataset saved as 'inventory_dataset.csv'")

#  STEP 8: Download the Cleaned Dataset (Google Colab Only)
from google.colab import files
files.download("inventory_dataset.csv")

#  STEP 9: Display Data Preview
inventory_dataset.head()


Please upload 'supplier_data_1.xlsx' and 'supplier_data_2.xlsx'


Saving supplier_data_1.xlsx to supplier_data_1 (5).xlsx
Saving supplier_data_2.xlsx to supplier_data_2 (5).xlsx
 Data type conversion improved: No unintentional NaN values!
Task 1 Completed: Cleaned dataset saved as 'inventory_dataset.csv'


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  supplier_1[col].fillna("Unknown", inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  supplier_1[col].fillna(supplier_1[col].median(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we a

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

Unnamed: 0,werksgte,bestellgtentext,nenndicke_nnnnn_mm_mit_dezimalpunkt,breite,lnge,gewicht_kg,cluster,sigehalt,mngehalt,pgehalt,...,height_mm,mass_min_kg,number_of_coils,delivery_earliest,delivery_latest,inco_term,buy_now_eur_per_ton,minmax_bid_eur_per_ton,co2_per_ton_max_kg,valid_until
0,G2UB5,SZBS800,320.0,856.0,787.0,16.49,WB-G,Unknown,Unknown,Unknown,...,,,,,,,,,,
1,G2UJ5,SZBS800,339.0,918.0,707.0,17.16,WB-G,Unknown,Unknown,Unknown,...,,,,,,,,,,
2,C3318,LICRO 500,452.0,1839.0,300.0,18.7,WB-U,0.2540,1.2780,0.0080,...,,,,,,,,,,
3,C3U15,S380MC mod. 4,532.0,1160.0,461.0,22.011,WB-U,0.2250,1.0630,0.0100,...,,,,,,,,,,
4,G3UB5,SZBE800,451.0,727.2,557.0,14.02,Spaltband,Unknown,Unknown,Unknown,...,,,,,,,,,,
