In [70]:
import pandas as pd
import re
import numpy as np

# Read the Customer Regions workbook into a DataFrame.
customer_regions_df = pd.read_excel(io='Data/Customer Regions.xlsx')

In [71]:
import os

# Output folders for the exported files, both resolved against the current
# working directory.
csv_folder, json_folder = (
    os.path.join(os.getcwd(), subdir)
    for subdir in ("csv_outputs", "json_outputs")
)

In [72]:
# List the column labels of the loaded frame.
print("Columns:", list(customer_regions_df.columns))

Columns: ['REGION_CODE', 'REGION_DESC']


In [73]:
# Preview the first five rows of the loaded frame.
print("Sample data:\n", customer_regions_df.head(n=5))

Sample data:
   REGION_CODE           REGION_DESC
0         000               Unknown
1         10a                Durban
2         10b              Midlands
3         11a  Free State / Lesotho
4         11b  Ermelo / Piet Retief


In [74]:
# Count missing values per column.
print("Null values:\n", customer_regions_df.isna().sum())

Null values:
 REGION_CODE    0
REGION_DESC    0
dtype: int64


In [75]:
# Drop every row that has at least one missing value; the loaded frame itself
# is left untouched and the result is stored under a new name.
customer_regions_df_clean = customer_regions_df.dropna(axis=0, how="any")

In [76]:
# Report (row count, column count) of the frame after dropping null rows.
print(
    "Cleaned data shape:",
    (len(customer_regions_df_clean.index), len(customer_regions_df_clean.columns)),
)

Cleaned data shape: (34, 2)


In [77]:
# Manually identified REGION_DESC values that do not make sense in the context
# of the Customer Regions table.
invalid_labels = [
    "Unknown",
    "Samples",
    "ADAPRO",
    "Stock - Defectives",
    "Closed - Bad Debts",
    "Closed",
    "CONS",
    "Stores",
    "House Accounts",
    "Reps Trip - Botswana",
]

# The "~" negates the isin() mask, so only rows whose REGION_DESC is NOT one of
# the invalid labels (together with their region codes) are kept.
customer_regions_df_clean = customer_regions_df_clean.loc[
    lambda regions: ~regions['REGION_DESC'].isin(invalid_labels)
]

In [78]:
# Show the filtered frame; as the cell's bare last expression it is rendered
# as the cell output.
customer_regions_df_clean

Unnamed: 0,REGION_CODE,REGION_DESC
1,10a,Durban
2,10b,Midlands
3,11a,Free State / Lesotho
4,11b,Ermelo / Piet Retief
6,1a,Pretoria Central
7,1b,Pretoria North/South/West
8,1c,Pretoria East
9,1d,Pretoria
10,1e,Witbank/Middelburg
17,2a,Vaal Triangle


In [79]:
# Make sure both output directories exist before writing: pandas does not
# create missing parent folders and raises OSError when the target directory
# is absent (e.g. on a fresh checkout). exist_ok=True keeps re-runs harmless.
os.makedirs(csv_folder, exist_ok=True)
os.makedirs(json_folder, exist_ok=True)

# Save CSV (index=False: the DataFrame index is not written as a column).
csv_path = os.path.join(csv_folder, "customer_regions_df_clean.csv")
customer_regions_df_clean.to_csv(csv_path, index=False)

# Save JSON. NOTE: orient="records" with lines=True writes one JSON object per
# line (JSON Lines), not a single JSON array, even though the file extension
# is ".json".
json_path = os.path.join(json_folder, "customer_regions_df_clean.json")
customer_regions_df_clean.to_json(json_path, orient="records", lines=True)