In [1]:
import pandas as pd
import numpy as np
from datetime import datetime
from sqlalchemy import create_engine

# 1. Data Extraction
excel_file = 'SAP-DataSet.xlsx'

# Read every required sheet in a single call: passing a list to `sheet_name`
# returns a dict of DataFrames keyed by sheet name, so the workbook is opened
# and parsed once instead of once per sheet.
sap_sheets = pd.read_excel(
    excel_file,
    sheet_name=['VBAK', 'VBAP', 'KNA1', 'LIKP', 'LIPS', 'VTTK', 'VTTP', 'LFA1'],
)

vbak_df = sap_sheets['VBAK']   # Sales Order Header
vbap_df = sap_sheets['VBAP']   # Sales Order Items
kna1_df = sap_sheets['KNA1']   # Customer Master
likp_df = sap_sheets['LIKP']   # Delivery Header
lips_df = sap_sheets['LIPS']   # Delivery Items
vttk_df = sap_sheets['VTTK']   # Shipment Header
vttp_df = sap_sheets['VTTP']   # Shipment Items
lfa1_df = sap_sheets['LFA1']   # Carrier Data

In [2]:
# 2. Data Validation and Transformation

# Rename the raw sheet headers to the snake_case names used by the rest of
# the notebook. Each *_rename dict maps "header in the workbook" to
# "standardized column name"; headers not listed in a mapping are left as-is.

# Address/contact headers that the KNA1 and LFA1 sheets have in common.
_contact_rename = {
    'Country': 'country',
    'Region': 'region',
    'City': 'city',
    'Postal Code': 'postal_code',
    'Street Address': 'street_address',
    'Phone Number': 'phone_number',
    'Email Address': 'email_address',
    'Language': 'language',
    'Tax Number': 'tax_number',
}

# KNA1 - customer master
kna1_rename = {
    'Customer ID': 'customer_id',
    'Customer Name': 'customer_name',
    **_contact_rename,
    'Customer Group': 'customer_group',
    'Sales Organization': 'sales_organization',
    'Distribution Channel': 'distribution_channel',
    'Division': 'division',
}
kna1_df = kna1_df.rename(columns=kna1_rename)

# LFA1 - carrier (vendor) data
lfa1_rename = {
    'Vendor Number': 'vendor_number',
    'Vendor Name': 'vendor_name',
    **_contact_rename,
    'Payment Terms': 'payment_terms',
}
lfa1_df = lfa1_df.rename(columns=lfa1_rename)

# VBAK - sales order header ('Sales Document' becomes order_id everywhere)
vbak_rename = {
    'Sales Document': 'order_id',
    'Order Date': 'order_date',
    'Customer ID': 'customer_id',
    'Order Type': 'order_type',
    'Sales Organization': 'sales_organization',
    'Distribution Channel': 'distribution_channel',
    'Division': 'division',
    'Order Status': 'order_status',
}
vbak_df = vbak_df.rename(columns=vbak_rename)

# VBAP - sales order items
vbap_rename = {
    'Sales Document': 'order_id',
    'Item Number': 'item_number',
    'Material Number': 'material_number',
    'Quantity': 'quantity',
    'Net Price': 'net_price',
    'Item Status': 'item_status',
    'Delivery Date': 'delivery_date',
}
vbap_df = vbap_df.rename(columns=vbap_rename)

# LIKP - delivery header
likp_rename = {
    'Delivery Number': 'delivery_number',
    'Delivery Date': 'delivery_date',
    'Sales Document': 'order_id',
    'Shipping Point': 'shipping_point',
    'Shipping Type': 'shipping_type',
    'Delivery Status': 'delivery_status',
    'Shipping Status': 'shipping_status',
    'Route': 'route',
    'Delivery Priority': 'delivery_priority',
    'Customer ID': 'customer_id',
}
likp_df = likp_df.rename(columns=likp_rename)

# LIPS - delivery items
lips_rename = {
    'Delivery Number': 'delivery_number',
    'Item Number': 'item_number',
    'Material Number': 'material_number',
    'Delivered Quantity': 'delivered_quantity',
    'Net Price': 'net_price',
    'Delivery Status': 'delivery_status',
    'Customer ID': 'customer_id',
    'Sales Document': 'order_id',
    'Sales Item': 'sales_item',
    'Delivery Date': 'delivery_date',
}
lips_df = lips_df.rename(columns=lips_rename)

# VTTK - shipment header
vttk_rename = {
    'Shipment Number': 'shipment_number',
    'Shipment Date': 'shipment_date',
    'Sales Document': 'order_id',
    'Delivery Number': 'delivery_number',
    'Shipping Point': 'shipping_point',
    'Carrier': 'carrier',
    'Shipment Status': 'shipment_status',
    'Route': 'route',
    'Shipping Type': 'shipping_type',
    'Customer ID': 'customer_id',
}
vttk_df = vttk_df.rename(columns=vttk_rename)

# VTTP - shipment items
vttp_rename = {
    'Shipment Number': 'shipment_number',
    'Item Number': 'item_number',
    'Material Number': 'material_number',
    'Shipped Quantity': 'shipped_quantity',
    'Item Status': 'item_status',
    'Delivery Number': 'delivery_number',
    'Customer ID': 'customer_id',
    'Sales Document': 'order_id',
    'Sales Item': 'sales_item',
    'Shipment Date': 'shipment_date',
}
vttp_df = vttp_df.rename(columns=vttp_rename)

In [3]:
# Columns that must be fully populated, keyed by the source sheet name.
# The values stay plain lists because they are used to select columns.
essential_fields = dict(
    KNA1=['customer_id', 'customer_name', 'email_address'],
    LFA1=['vendor_number', 'vendor_name'],
    VBAK=['order_id', 'customer_id', 'order_date'],
    VBAP=['order_id', 'item_number', 'material_number'],
    LIKP=['delivery_number', 'delivery_date', 'order_id', 'customer_id'],
    LIPS=['delivery_number', 'item_number', 'material_number'],
    VTTK=['shipment_number', 'shipment_date', 'order_id', 'carrier'],
    VTTP=['shipment_number', 'item_number', 'material_number'],
)

In [4]:
def validate_missing(df, columns, df_name):
    """Print per-column null counts and fail if any essential value is missing.

    Args:
        df: DataFrame to inspect.
        columns: Column labels of ``df`` that must not contain nulls.
        df_name: Label for ``df`` used in the printed report and the error.

    Raises:
        ValueError: If at least one of ``columns`` contains a null value.
    """
    null_counts = df[columns].isna().sum()
    print(f"Missing values in {df_name}:")
    print(null_counts)
    # Counts are never negative, so "any non-zero count" means "total > 0".
    if null_counts.any():
        raise ValueError(f"{df_name} has missing values in essential fields.")

In [5]:
# Check the essential fields of each table in turn; validate_missing raises
# ValueError on the first table that has a null in one of them.
for table_name, table_df in (
    ('KNA1', kna1_df),
    ('LFA1', lfa1_df),
    ('VBAK', vbak_df),
    ('VBAP', vbap_df),
    ('LIKP', likp_df),
    ('LIPS', lips_df),
    ('VTTK', vttk_df),
    ('VTTP', vttp_df),
):
    validate_missing(table_df, essential_fields[table_name], table_name)


Missing values in KNA1:
customer_id      0
customer_name    0
email_address    0
dtype: int64
Missing values in LFA1:
vendor_number    0
vendor_name      0
dtype: int64
Missing values in VBAK:
order_id       0
customer_id    0
order_date     0
dtype: int64
Missing values in VBAP:
order_id           0
item_number        0
material_number    0
dtype: int64
Missing values in LIKP:
delivery_number    0
delivery_date      0
order_id           0
customer_id        0
dtype: int64
Missing values in LIPS:
delivery_number    0
item_number        0
material_number    0
dtype: int64
Missing values in VTTK:
shipment_number    0
shipment_date      0
order_id           0
carrier            0
dtype: int64
Missing values in VTTP:
shipment_number    0
item_number        0
material_number    0
dtype: int64


In [6]:
# Referential Integrity: every customer_id referenced by an order header
# (VBAK) must also be present in the customer master (KNA1).
invalid_customers = set(vbak_df['customer_id']).difference(kna1_df['customer_id'])
if not invalid_customers:
    print("All orders have valid customers.")
else:
    raise ValueError(f"Invalid customer IDs in orders: {invalid_customers}")

# Convert date columns to datetime
def convert_dates(df, date_cols):
    """Parse the given columns of ``df`` as datetimes, in place.

    Args:
        df: DataFrame whose columns are converted; it is modified directly.
        date_cols: Iterable of column labels to parse.

    Returns:
        The same ``df`` object, so calls can be written as reassignments.

    Values that cannot be parsed become NaT (``errors='coerce'``) rather
    than raising.
    """
    for name in date_cols:
        parsed = pd.to_datetime(df[name], errors='coerce')
        df[name] = parsed
    return df

# Normalise every date column that feeds the downstream tables.
# VBAP's delivery_date is converted as well: it is merged into orders_df next
# to the LIKP delivery date, so it needs to be a datetime column like the rest.
vbak_df = convert_dates(vbak_df, ['order_date'])
vbap_df = convert_dates(vbap_df, ['delivery_date'])
likp_df = convert_dates(likp_df, ['delivery_date'])
lips_df = convert_dates(lips_df, ['delivery_date'])
vttk_df = convert_dates(vttk_df, ['shipment_date'])
vttp_df = convert_dates(vttp_df, ['shipment_date'])

All orders have valid customers.


In [7]:
# Strip extra whitespace from string columns
def strip_strings(df):
    """Trim leading/trailing whitespace from string values, in place.

    Only columns with ``object`` dtype are visited, and within them only
    values that are actual ``str`` instances are stripped. Anything else
    (numbers, dates, None/NaN) is kept exactly as it is; the ``.str``
    accessor would replace such values with NaN, or raise when the column
    holds no strings at all.

    Args:
        df: DataFrame to clean; it is modified directly.

    Returns:
        The same ``df`` object.
    """
    for col in df.select_dtypes(include=['object']).columns:
        cleaned = [
            value.strip() if isinstance(value, str) else value
            for value in df[col]
        ]
        # Rebuild with the original index and object dtype so the column is
        # assigned back positionally and non-string values are not coerced.
        df[col] = pd.Series(cleaned, index=df.index, dtype=object)
    return df

# strip_strings edits each frame in place, so its return value is not needed.
for table in (kna1_df, lfa1_df, vbak_df, vbap_df, likp_df, lips_df, vttk_df, vttp_df):
    strip_strings(table)

# Check for duplicate records in key columns
def check_duplicates(df, key, df_name):
    """Fail if ``df`` contains more than one row with the same key.

    Args:
        df: DataFrame to inspect.
        key: A single column label, or a list of column labels for a
            composite key (e.g. ``['order_id', 'item_number']``).
        df_name: Label for ``df`` used in the printed/raised message.

    Raises:
        ValueError: If any key value (or key combination) occurs more than
            once.
    """
    # A list is taken as a composite key; anything else is one column label.
    subset = key if isinstance(key, list) else [key]
    # Only existence matters, so test the boolean mask directly instead of
    # materialising a frame of the duplicated rows.
    if df.duplicated(subset=subset).any():
        raise ValueError(f"Duplicates found in {df_name} for key {key}.")
    print(f"No duplicates in {df_name} for key {key}.")

# Header-level tables must be unique on their identifying column;
# check_duplicates raises ValueError on the first table that is not.
for table_df, key_col, table_name in (
    (vbak_df, 'order_id', 'VBAK'),
    (kna1_df, 'customer_id', 'KNA1'),
    (likp_df, 'delivery_number', 'LIKP'),
    (lfa1_df, 'vendor_number', 'LFA1'),
    (vttk_df, 'shipment_number', 'VTTK'),
):
    check_duplicates(table_df, key_col, table_name)

No duplicates in VBAK for key order_id.
No duplicates in KNA1 for key customer_id.
No duplicates in LIKP for key delivery_number.
No duplicates in LFA1 for key vendor_number.
No duplicates in VTTK for key shipment_number.


In [8]:
# 3. Data Transformation

# Build Orders table by merging VBAK (header) and VBAP (items) on order_id
orders_df = pd.merge(vbak_df, vbap_df, on='order_id', how='left', suffixes=('_header', '_item'))

# Merge LIKP to get the delivery date used to compute processing time.
# VBAP already contributes a 'delivery_date' column (see vbap_rename), so this
# merge yields two date columns:
#   delivery_date_x -> item-level date that came from VBAP
#   delivery_date_y -> delivery-header date that came from LIKP
# The suffixes are spelled out (they equal the pandas defaults) so that the
# column picked below does not depend on an implicit naming rule.
# NOTE: LIKP is only checked for unique delivery_number, not order_id, so an
# order with several deliveries produces one row per delivery here.
orders_df = pd.merge(
    orders_df,
    likp_df[['order_id', 'delivery_date']],
    on='order_id',
    how='left',
    suffixes=('_x', '_y'),
)

# Compute processing time (in days) from the order date to the LIKP delivery
# date. Rows without a matching delivery get a missing processing_time.
orders_df['processing_time'] = (orders_df['delivery_date_y'] - orders_df['order_date']).dt.days

# Customers table: an independent copy of KNA1 with every column kept.
customers_df = kna1_df.copy()

# Shipments table: delivery headers (LIKP) enriched with selected shipment
# header fields from VTTK, matched on delivery_number. The left join keeps
# deliveries that have no matching shipment.
shipment_header_cols = ['delivery_number', 'shipment_number', 'shipment_date', 'carrier', 'shipment_status']
shipments_df = likp_df.merge(vttk_df[shipment_header_cols], on='delivery_number', how='left')

# Shipment_Items table: an independent copy of LIPS.
shipment_items_df = lips_df.copy()

# Carriers table: an independent copy of LFA1 with every column kept.
carriers_df = lfa1_df.copy()

# Delivery_Status table: Merge VTTK (shipment header) with VTTP (shipment items) on shipment_number.
# Only the VTTP columns that VTTK does not already have are brought in (plus
# the join key). For columns present in both sheets the header value is the
# one kept, and because nothing overlaps no _x/_y suffixes are generated.
# This avoids cleaning up suffixes by text matching, which would also rewrite
# any column that merely contains '_x' or ends in '_y'.
vttp_extra_cols = [col for col in vttp_df.columns if col not in vttk_df.columns]
delivery_status_df = pd.merge(
    vttk_df,
    vttp_df[['shipment_number'] + vttp_extra_cols],
    on='shipment_number',
    how='left',
)

# Optionally, attach shipping status from LIKP if desired
if 'shipping_status' in likp_df.columns:
    delivery_status_df = pd.merge(
        delivery_status_df,
        likp_df[['delivery_number', 'shipping_status']],
        on='delivery_number',
        how='left',
    )

In [9]:
import getpass
import os
from urllib.parse import quote_plus

from sqlalchemy import create_engine

# Database connection details.
# Each value can be overridden through an environment variable of the same
# name. The password is never stored in the notebook: it is taken from
# DB_PASSWORD, or prompted for when that variable is unset or empty.
DB_USER = os.environ.get("DB_USER", "root")
DB_PASSWORD = os.environ.get("DB_PASSWORD") or getpass.getpass("MySQL password: ")
DB_HOST = os.environ.get("DB_HOST", "localhost")
DB_PORT = os.environ.get("DB_PORT", "3306")
DB_NAME = os.environ.get("DB_NAME", "case11")

# Create the engine using pymysql. User and password are URL-escaped so that
# characters such as '@', ':' or '/' do not break the connection string.
engine = create_engine(
    f"mysql+pymysql://{quote_plus(DB_USER)}:{quote_plus(DB_PASSWORD)}@{DB_HOST}:{DB_PORT}/{DB_NAME}"
)

# create_engine() does not connect by itself; open (and immediately release)
# a connection so the message below is only printed when the server is
# actually reachable with these credentials.
with engine.connect():
    print("Successfully connected to the MySQL database!")

# Example of pushing data to MySQL
tables_to_push = {
    "delivery_status": delivery_status_df,
    "orders": orders_df,
    "carriers": carriers_df,
}

# Directly replace existing tables (if_exists='replace' drops and recreates
# each table, so previous contents are discarded).
for table_name, df in tables_to_push.items():
    df.to_sql(table_name, engine, if_exists='replace', index=False)

print("Data loaded to MySQL successfully.")


Successfully connected to the MySQL database!
Data loaded to MySQL successfully.
