In [1]:
import sys
sys.path.append('/Users/markbills/Library/CloudStorage/OneDrive-Transformativ,LLC/Clients/Ovation Holdings/src')

# Azure Data Lake libraries
import azure_data_lake_interface as adl

# Data analysis libraries
import pandas as pd

# Helper function libraries
from helper_functions import load_config, convert_json_strings_to_python_types

In [2]:
def clean_and_filter_vendor_data(vendors: pd.DataFrame) -> pd.DataFrame:
    """
    Cleans and filters vendor data from a pandas DataFrame.

    The input frame is not modified; every change is applied to a new DataFrame,
    so the function can safely be re-run (including on already-cleaned data).

    Args:
        vendors (pd.DataFrame): DataFrame containing vendor information. A
            'category' column is required; a 'links' column is dropped when present.

    Returns:
        pd.DataFrame: Cleaned DataFrame with the following modifications:
            - Removed 'links' column (if it was present)
            - 'null' categories replaced with 'Not Assigned'
            - Result passed through convert_json_strings_to_python_types
              (presumably this is the step that converts 'datecreated' to datetime
              and 'balance' / 'unbilled_orders' to numbers -- confirm in helper_functions)

    Raises:
        KeyError: If the 'category' column is missing.
    """

    cleaned = (
        vendors
        # errors='ignore': a frame that has already lost 'links' must not raise on re-run
        .drop(columns='links', errors='ignore')
        # assign() works on a copy, so the caller's DataFrame is never written to
        .assign(category=lambda df: df['category'].mask(df['category'] == 'null', 'Not Assigned'))
    )

    return convert_json_strings_to_python_types(cleaned)

In [None]:
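# NOTE: one-off exploration code, kept commented out for reference.
# It was used to find fuzzy duplicates in the manufacturer field; the findings were used to build the
# "manufacturer_map" in config/manufacturer_name_map.json, which the clean_and_resolve_manufacturers function loads.
# It expects a `manufacturers` sequence of names to already be defined (presumably the unique manufacturer values).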

# from thefuzz import fuzz
# from collections import defaultdict
# import pandas as pd
#
# # Set a threshold for similarity
# similarity_threshold = 80
#
# # Create a dictionary where each key is a manufacturer name and
# # the value is a list of tuples (other_name, similarity_ratio)
# manufacturer_clusters = defaultdict(list)
#
# # Compare each manufacturer against the others
# for i, name1 in enumerate(manufacturers):
#     if name1 is None or pd.isna(name1):
#         continue
#     for name2 in manufacturers[i + 1:]:
#         if name2 is None or pd.isna(name2):
#             continue
#         ratio = fuzz.ratio(name1, name2)
#         if ratio >= similarity_threshold:
#             manufacturer_clusters[name1].append((name2, ratio))
#
# clusters = []
# for primary, similar_list in manufacturer_clusters.items():
#     # Only include clusters where there is at least one similar name.
#     if similar_list:
#         # Cluster includes the primary manufacturer name first.
#         cluster = [primary] + [name for name, score in similar_list]
#         clusters.append(cluster)
#
# # Optionally, print out the clusters for verification.
# print("Generated manufacturer clusters:")
# for main_name, similar in manufacturer_clusters.items():
#     if similar:
#         print(f"\nCluster for '{main_name}':")
#         for name, score in similar:
#             print(f"  - {name} (similarity: {score}%)")

In [3]:
def clean_and_resolve_manufacturers(items: pd.DataFrame) -> pd.DataFrame:
    """
    Resolves the three manufacturer columns into one cleaned 'manufacturer' column.

    Resolution order is 'manufacturer', then 'custom_manufacturer', then 'vsi_mfr':
    the first of these that holds a real value wins. The winning value is then
    normalized (punctuation removed, whitespace collapsed, title-cased) and known
    misspellings are replaced using the "manufacturer_map" entry of
    config/manufacturer_name_map.json.

    The input frame is not modified; a new DataFrame is returned.

    Args:
        items (pd.DataFrame): Item data with 'manufacturer', 'custom_manufacturer'
            and 'vsi_mfr' columns.

    Returns:
        pd.DataFrame: Copy of ``items`` where
            - empty markers in all three columns are replaced with 'Not Specified'
              ('null' or missing everywhere; additionally 'Unknown' for 'vsi_mfr')
            - 'manufacturer' holds the resolved, normalized manufacturer name

    Raises:
        KeyError: If one of the three manufacturer columns is missing.
    """
    not_specified = "Not Specified"

    def _blank_to_not_specified(values: pd.Series, markers: list) -> pd.Series:
        # Missing values count as empty too; otherwise a NaN manufacturer would never
        # fall through to the next source column.
        return values.mask(values.isna() | values.isin(markers), not_specified)

    # replace "empty" values with something more human-readable
    manufacturer = _blank_to_not_specified(items["manufacturer"], ["null"])
    custom = _blank_to_not_specified(items["custom_manufacturer"], ["null"])
    vsi = _blank_to_not_specified(items["vsi_mfr"], ["null", "Unknown"])

    # resolve multiple manufacturer columns
    # -- use custom_manufacturer where manufacturer is "Not Specified"
    manufacturer = manufacturer.mask(manufacturer == not_specified, custom)
    # -- use vsi_mfr where it is still "Not Specified" (custom_manufacturer was empty as well)
    manufacturer = manufacturer.mask(manufacturer == not_specified, vsi)

    # Clean up manufacturer column
    manufacturer = (
        manufacturer
        .str.replace(r'[,.\/-]', ' ', regex=True)   # punctuation -> space
        .str.strip()                                # no leading/trailing spaces
        .str.replace(r'\s+', ' ', regex=True)       # collapse runs of whitespace
        .str.title()                                # removes ALL CAPS spellings
    )

    # remove all misspellings; entries are applied in file order, exactly one pass each
    mfg_name_map = load_config("config/manufacturer_name_map.json", flush_cache=True)["manufacturer_map"]
    for correct_name, misspellings in mfg_name_map.items():
        manufacturer = manufacturer.mask(manufacturer.isin(misspellings), correct_name)

    # assign() copies the frame, so the caller's DataFrame stays as it was
    return items.assign(manufacturer=manufacturer, custom_manufacturer=custom, vsi_mfr=vsi)

In [4]:
def clean_and_filter_item_data(items: pd.DataFrame) -> pd.DataFrame:
    """
    Cleans and filters item data from a pandas DataFrame.

    Args:
        items (pd.DataFrame): DataFrame containing item information with columns for
            links, costs, manufacturers, descriptions, dates, quantities etc.
            'item_name', 'vsi_item_category' and the text columns listed in the
            body are required; 'links' is dropped when present.

    Returns:
        pd.DataFrame: An independent, cleaned DataFrame (original index labels kept)
        with the following modifications:
            - Values passed through convert_json_strings_to_python_types
              (presumably the date / numeric conversion -- confirm in helper_functions)
            - Removed 'links' column (if it was present)
            - Missing and 'null' values replaced with 'Not Specified' in the text
              columns; remaining values in those columns converted to str
            - 'null' / 'Unknown' / missing 'vsi_item_category' set to 'Not Specified'
            - Manufacturer columns resolved by clean_and_resolve_manufacturers
            - Rows removed whose item_name starts with "Inactivated" or contains the
              whole word "custom" (case-insensitive); rows with a missing item_name
              are kept
    """
    not_specified = 'Not Specified'

    items = convert_json_strings_to_python_types(items)

    # drop 'links' column -- not in place, and tolerant of it already being gone (re-runs)
    items = items.drop(columns='links', errors='ignore')

    # fill in values for columns of interest that are missing or contain 'null'
    text_columns = [
        'description', 'display_name', 'level_1_category', 'level_2_category', 'level_3_category',
        'parent_item', 'preferred_vendor', 'valve_spec_size'
    ]
    for col in text_columns:
        as_text = items[col].map(lambda value: not_specified if pd.isna(value) else str(value))
        items[col] = as_text.mask(as_text == 'null', not_specified)

    # fill in "empty" vsi_item_category values
    vsi_category = items['vsi_item_category']
    items['vsi_item_category'] = vsi_category.mask(
        vsi_category.isna() | vsi_category.isin(['null', 'Unknown']), not_specified
    )

    # move any valid manufacturer into the manufacturer field from custom or vsi fields
    items = clean_and_resolve_manufacturers(items)

    # na=False: a missing item_name is simply "no match"; without it the result holds NaN
    # and negating it with ~ raises TypeError on object-dtype columns
    item_names = items["item_name"]
    inactivated = item_names.str.startswith("Inactivated", na=False)
    custom = item_names.str.contains(r'\bcustom\b', case=False, regex=True, na=False)

    # .copy() hands back a standalone frame so later column assignments by callers
    # do not trigger SettingWithCopyWarning
    return items[~(inactivated | custom)].copy()

In [5]:
def add_new_item_levels(items: pd.DataFrame, new_item_info_path: str, create_new_columns: bool = True) -> pd.DataFrame:
    """Adds new item level categories from an Excel file to the items DataFrame.

    The Excel sheet must contain a 'Name' column plus 'Level 1' ... 'Level 6'.
    Every item whose 'item_name' equals a 'Name' gets all six level categories
    overwritten from the sheet (blank cells become 'Not Specified'). If a name
    occurs more than once in the sheet, its last row wins.

    The input frame is not modified; a new DataFrame is returned.

    Args:
        items (pd.DataFrame): DataFrame containing item information with existing level
            categories ('item_name', 'level_1_category' ... 'level_3_category').
        new_item_info_path (str): Path to Excel file containing new level information.
        create_new_columns (bool, optional): Whether to create new level columns 4-6
            (initialised to 'Not Specified' and placed directly after
            'level_3_category'). Defaults to True.

    Returns:
        pd.DataFrame: Items DataFrame with updated level categories; the literal
        string "nan" in any level column is replaced by 'Not Specified' and the
        level 1 value "Valve" is renamed to "Valves".
    """
    not_specified = 'Not Specified'

    if create_new_columns:
        # add new level columns (assign() returns a new frame, the caller's is untouched)
        new_level_columns = [f"level_{i}_category" for i in (4, 5, 6)]
        items = items.assign(**dict.fromkeys(new_level_columns, not_specified))

        # rearrange the columns so that the levels are contiguous
        cut = items.columns.get_loc("level_3_category") + 1
        columns_before = items.columns[:cut].tolist()
        already_placed = set(columns_before) | set(new_level_columns)
        remaining_columns = [col for col in items.columns if col not in already_placed]
        # .copy() so the .loc writes below target a standalone frame, not a slice
        items = items[columns_before + new_level_columns + remaining_columns].copy()
    else:
        items = items.copy()

    # get new level information in Excel
    level_info = pd.read_excel(new_item_info_path)

    # the set of matching items is the same for every level, so compute it once
    matched = items['item_name'].isin(level_info['Name'])
    matched_names = items.loc[matched, 'item_name']

    for i in range(1, 7):
        column = f'level_{i}_category'

        # convert all level values to string (Excel treats some as numbers); blank cells
        # are mapped explicitly instead of relying on astype(str) yielding "nan"
        level_values = level_info[f'Level {i}'].map(
            lambda value: not_specified if pd.isna(value) else str(value)
        )

        # Update level categories for matching items
        lookup = dict(zip(level_info['Name'], level_values))
        items.loc[matched, column] = matched_names.map(lookup)

        # replace nan strings (e.g. already present in the item data) with "Not Specified"
        items[column] = items[column].replace("nan", not_specified)

    # replace old category with the updated one
    items["level_1_category"] = items["level_1_category"].replace("Valve", "Valves")

    return items

In [6]:
# attach to the data lake
# Connection settings are read from config/datalake_config.json (path relative to the
# working directory); its "blob_url" entry is used to create the service client.
# NOTE(review): flush_cache=True presumably forces load_config to re-read the file
# instead of using a cached copy -- confirm in helper_functions.
config = load_config("config/datalake_config.json", flush_cache=True)
service_client = adl.get_azure_service_client(config["blob_url"])

# file-system client for the "consolidated" container
container_name = "consolidated"
file_system_client = adl.get_azure_file_system_client(service_client, container_name)

In [7]:
# get vendor and item data
# Both raw extracts are read from the "raw/netsuite" folder via file_system_client.
# NOTE(review): the cleaning functions treat the results as pandas DataFrames, so
# get_parquet_file_from_data_lake presumably returns one -- confirm in azure_data_lake_interface.
source_folder = "raw/netsuite"
vendors = adl.get_parquet_file_from_data_lake(file_system_client, source_folder, "vendor_raw.parquet")
items = adl.get_parquet_file_from_data_lake(file_system_client, source_folder, "item_raw.parquet")

In [8]:
# clean the raw extracts
# NOTE: `vendors` and `items` are rebound to the cleaned frames here, so after this cell
# the raw data is only available by re-running the load cell.
vendors = clean_and_filter_vendor_data(vendors)
items = clean_and_filter_item_data(items)

# save in the data lake (folder "cleaned/netsuite")
adl.save_df_as_parquet_in_data_lake(vendors, file_system_client, "cleaned/netsuite", "vendor_cleaned.parquet")
adl.save_df_as_parquet_in_data_lake(items, file_system_client, "cleaned/netsuite", "item_cleaned.parquet")

In [9]:
# add new item category levels and save as enhanced
# NOTE(review): the Excel path is relative to the current working directory and climbs
# several directory levels into a local Dropbox folder, so this cell only works on a
# machine with that exact layout -- consider moving the path into configuration.
items = add_new_item_levels(items,
                            "../../../../../../../../Dropbox/Transformativ/ClientSystems/ResearchAndDevelopment/AzureDataLakeAccess/client_data/NewItemLevels.xlsx")
# the enhanced items are written to the "enhanced/netsuite" folder
adl.save_df_as_parquet_in_data_lake(items, file_system_client, "enhanced/netsuite", "item_enhanced.parquet")