In [20]:
import pandas as pd
import re
import numpy as np

# Read the customer-region lookup workbook into a DataFrame.
regions_workbook_path = 'Data/Customer Regions.xlsx'
customer_regions_df = pd.read_excel(regions_workbook_path)

In [21]:
import os

# Output directories for the exported files, both placed under the current
# working directory. Only the path strings are built here; nothing is created.
working_dir = os.getcwd()
csv_folder, json_folder = (
    os.path.join(working_dir, subdir) for subdir in ("csv_outputs", "json_outputs")
)

In [22]:
# Show the column names of the raw regions table as a plain Python list.
print("Columns:", list(customer_regions_df.columns))

Columns: ['REGION_CODE', 'REGION_DESC']


In [23]:
# Preview the first five rows of the raw regions table.
sample_rows = customer_regions_df.head(n=5)
print("Sample data:\n", sample_rows)

Sample data:
   REGION_CODE           REGION_DESC
0         000               Unknown
1         10a                Durban
2         10b              Midlands
3         11a  Free State / Lesotho
4         11b  Ermelo / Piet Retief


In [24]:
# Count the missing values in each column of the raw table.
missing_per_column = customer_regions_df.isna().sum()
print("Null values:\n", missing_per_column)

Null values:
 REGION_CODE    0
REGION_DESC    0
dtype: int64


In [25]:
# Keep only the rows in which every column is populated.
customer_regions_df_clean = customer_regions_df.dropna(axis=0, how='any')

In [26]:
# Report (rows, columns) of the frame left after dropping incomplete rows.
n_rows, n_cols = customer_regions_df_clean.shape
print("Cleaned data shape:", (n_rows, n_cols))

Cleaned data shape: (34, 2)


In [27]:
# Manually identified region descriptions (REGION_DESC) that do not make sense
# in the context of the Customer Regions table.
invalid_labels = [
    "Unknown", "Samples", "ADAPRO", "Stock - Defectives", "Closed - Bad Debts",
    "Closed", "CONS", "Stores", "House Accounts", "Reps Trip - Botswana"
]

# `~` negates the membership mask, so rows whose REGION_DESC is listed in
# invalid_labels (and their region codes) are excluded from the frame.
is_invalid_region = customer_regions_df_clean['REGION_DESC'].isin(invalid_labels)

# .copy() makes the filtered result an independent frame, so adding columns to
# it later does not trigger pandas' SettingWithCopyWarning.
customer_regions_df_clean = customer_regions_df_clean[~is_invalid_region].copy()

In [28]:
# Display the filtered regions table to inspect the rows that remain.
customer_regions_df_clean

Unnamed: 0,REGION_CODE,REGION_DESC
1,10a,Durban
2,10b,Midlands
3,11a,Free State / Lesotho
4,11b,Ermelo / Piet Retief
6,1a,Pretoria Central
7,1b,Pretoria North/South/West
8,1c,Pretoria East
9,1d,Pretoria
10,1e,Witbank/Middelburg
17,2a,Vaal Triangle


In [29]:
# Collect the distinct region descriptions left after filtering and display them.
region_descriptions = customer_regions_df_clean.loc[:, 'REGION_DESC']
unique_values = region_descriptions.unique()
unique_values

array(['Durban', 'Midlands', 'Free State / Lesotho',
       'Ermelo / Piet Retief', 'Pretoria Central',
       'Pretoria North/South/West', 'Pretoria East', 'Pretoria',
       'Witbank/Middelburg', 'Vaal Triangle', 'Krugersdorp / Sun City',
       'Potch / Krugersdorp / Kuruman', 'Rustenburg', 'Soweto/Lenasia',
       'Johannesburg CBD', 'Pietersburg / Potgietersrus',
       'Nelspruit / Tzaneen', 'Botswana and Exports', 'East Cape',
       'West Cape', 'Gauteng N - Sandton', 'Gauteng N - Randburg/Rivonia',
       'Gauteng N - Eastgate/Norwood/Midrand',
       'Gauteng E - Alberton/Benoni/Kempton'], dtype=object)

In [30]:
# Region descriptions grouped under the province label each one is assigned to.
# The grouping is flattened below into the REGION_DESC -> province lookup.
regions_by_province = {
    'KwaZulu-Natal': ['Durban', 'Midlands'],
    'Free State': ['Free State / Lesotho'],
    # Not a province: bucket used for the non-SA region.
    'Export': ['Botswana and Exports'],
    # NOTE(review): Tzaneen is usually placed in Limpopo — confirm this grouping.
    'Mpumalanga': [
        'Ermelo / Piet Retief',
        'Witbank/Middelburg',
        'Nelspruit / Tzaneen',
    ],
    'Limpopo': ['Pietersburg / Potgietersrus'],
    'Eastern Cape': ['East Cape'],
    'Western Cape': ['West Cape'],
    # All Johannesburg / Pretoria / Vaal areas are consolidated under Gauteng.
    'Gauteng': [
        'Pretoria Central',
        'Pretoria North/South/West',
        'Pretoria East',
        'Pretoria',
        'Vaal Triangle',
        'Soweto/Lenasia',
        'Johannesburg CBD',
        'Gauteng N - Sandton',
        'Gauteng N - Randburg/Rivonia',
        'Gauteng N - Eastgate/Norwood/Midrand',
        'Gauteng E - Alberton/Benoni/Kempton',
    ],
    # Sun City is in NW; Krugersdorp is GP, but this grouping might be customer-specific.
    # NOTE(review): Kuruman is usually placed in the Northern Cape — confirm that
    # 'Potch / Krugersdorp / Kuruman' is meant to fall under North West.
    'North West': [
        'Rustenburg',
        'Krugersdorp / Sun City',
        'Potch / Krugersdorp / Kuruman',
    ],
}

# One entry per region description, pointing at its province label.
province_mapping = {
    region: province
    for province, regions in regions_by_province.items()
    for region in regions
}

In [31]:
# Derive PROVINCE by looking up each REGION_DESC in province_mapping; a
# description that is missing from the mapping becomes NaN.
# .assign() returns a new frame instead of writing a column into a filtered
# slice, which avoids pandas' SettingWithCopyWarning.
customer_regions_df_clean = customer_regions_df_clean.assign(
    PROVINCE=customer_regions_df_clean['REGION_DESC'].map(province_mapping)
)

In [32]:
# Preview the region -> province assignment for the first rows.
province_preview = customer_regions_df_clean.loc[:, ['REGION_DESC', 'PROVINCE']]
print(province_preview.head())

# List the distinct PROVINCE values; an unmapped description would appear as nan.
province_values = customer_regions_df_clean['PROVINCE'].unique()
print("\nUnique values in the new 'PROVINCE' column:")
print(province_values)

            REGION_DESC       PROVINCE
1                Durban  KwaZulu-Natal
2              Midlands  KwaZulu-Natal
3  Free State / Lesotho     Free State
4  Ermelo / Piet Retief     Mpumalanga
6      Pretoria Central        Gauteng

Unique values in the new 'PROVINCE' column:
['KwaZulu-Natal' 'Free State' 'Mpumalanga' 'Gauteng' 'North West'
 'Limpopo' 'Export' 'Eastern Cape' 'Western Cape']


In [33]:
# Display the regions table again, now including the PROVINCE column.
customer_regions_df_clean

Unnamed: 0,REGION_CODE,REGION_DESC,PROVINCE
1,10a,Durban,KwaZulu-Natal
2,10b,Midlands,KwaZulu-Natal
3,11a,Free State / Lesotho,Free State
4,11b,Ermelo / Piet Retief,Mpumalanga
6,1a,Pretoria Central,Gauteng
7,1b,Pretoria North/South/West,Gauteng
8,1c,Pretoria East,Gauteng
9,1d,Pretoria,Gauteng
10,1e,Witbank/Middelburg,Mpumalanga
17,2a,Vaal Triangle,Gauteng


In [34]:
# Make sure both output directories exist before writing: to_csv / to_json do
# not create missing parent directories, so the export would otherwise fail
# when the folders have not been created yet.
os.makedirs(csv_folder, exist_ok=True)
os.makedirs(json_folder, exist_ok=True)

# Save CSV (index=False leaves the DataFrame index out of the file)
csv_path = os.path.join(csv_folder, "customer_regions_df_clean.csv")
customer_regions_df_clean.to_csv(csv_path, index=False)

# Save JSON (orient="records" with lines=True writes one JSON object per line)
json_path = os.path.join(json_folder, "customer_regions_df_clean.json")
customer_regions_df_clean.to_json(json_path, orient="records", lines=True)