In [1]:
import pandas as pd
import re

# Load the data
customer_df = pd.read_excel('Data/Customer.xlsx')

In [2]:
print("Columns:", customer_df.columns.tolist())

Columns: ['CUSTOMER_NUMBER', 'CCAT_CODE', 'REGION_CODE', 'REP_CODE', 'SETTLE_TERMS', 'NORMAL_PAYTERMS', 'DISCOUNT', 'CREDIT_LIMIT']


In [3]:
print("Sample data:\n", customer_df.head())

Sample data:
   CUSTOMER_NUMBER  CCAT_CODE REGION_CODE REP_CODE  SETTLE_TERMS  \
0          599000         30          7a       05           0.0   
1          999999          0          9a        0           0.0   
2          AACJ01         21         25b     ZZZ5           0.0   
3          AACJC1         21         25b     ZZZ5           0.0   
4          AACJC2          5         20a    CONS4           0.0   

   NORMAL_PAYTERMS  DISCOUNT  CREDIT_LIMIT  
0               60         0          6000  
1                0         0             0  
2               90         0        999999  
3              120         0        999999  
4              120         0        999999  


In [4]:
print("Null values:\n", customer_df.isnull().sum())

Null values:
 CUSTOMER_NUMBER    0
CCAT_CODE          0
REGION_CODE        0
REP_CODE           0
SETTLE_TERMS       1
NORMAL_PAYTERMS    0
DISCOUNT           0
CREDIT_LIMIT       0
dtype: int64


In [5]:
# Remove rows with any null values
customer_df_clean = customer_df.dropna()

# Clean CUSTOMER_NUMBER: ensure it's a string and strip whitespace
customer_df_clean['CUSTOMER_NUMBER'] = customer_df_clean['CUSTOMER_NUMBER'].astype(str).str.strip()

# Optional: drop rows where CUSTOMER_NUMBER is missing entirely
customer_df_clean = customer_df_clean[customer_df_clean['CUSTOMER_NUMBER'].notna()]

# Clean CCAT_CODE: keep only positive integers
customer_df_clean = customer_df_clean[customer_df_clean['CCAT_CODE'] > 0]

# Clean REGION_CODE: keep only rows where REGION_CODE matches pattern like '7a', '25b', '20a', etc. (1-2 digits + 1 letter)
region_code_pattern = r'^\d{1,2}[A-Za-z]$'
customer_df_clean = customer_df_clean[customer_df_clean['REGION_CODE'].astype(str).str.match(region_code_pattern)]

# Clean REP_CODE: keep only alphanumeric codes (no special characters)
customer_df_clean = customer_df_clean[customer_df_clean['REP_CODE'].astype(str).str.isalnum()]

# Clean SETTLE_TERMS: keep only non-negative floats
customer_df_clean = customer_df_clean[customer_df_clean['SETTLE_TERMS'] >= 0]

# Clean NORMAL_PAYTERMS: keep only positive integers
customer_df_clean = customer_df_clean[customer_df_clean['NORMAL_PAYTERMS'] > 0]

# Clean DISCOUNT: keep only non-negative integers (assuming discount can't be negative)
customer_df_clean = customer_df_clean[customer_df_clean['DISCOUNT'] >= 0]

# Clean CREDIT_LIMIT: keep only non-negative integers
customer_df_clean = customer_df_clean[customer_df_clean['CREDIT_LIMIT'] >= 0]

# Reset index after cleaning
customer_df_clean = customer_df_clean.reset_index(drop=True)

# Check for possible spelling mistakes in column names by comparing to a reference list
expected_columns = [
    'CUSTOMER_NUMBER', 'CCAT_CODE', 'REGION_CODE', 'REP_CODE',
    'SETTLE_TERMS', 'NORMAL_PAYTERMS', 'DISCOUNT', 'CREDIT_LIMIT'
]

# Find unexpected columns
unexpected_columns = [col for col in customer_df_clean.columns if col not in expected_columns]
if unexpected_columns:
    print("Possible spelling mistakes or unexpected columns:", unexpected_columns)
else:
    print("All column names match expected names.")

print("Current columns:", customer_df_clean.columns.tolist())

# Example: Correct known spelling mistakes in all string/object columns
# Define a dictionary of known spelling corrections
spelling_corrections = {
    'ALLSION': 'ALLISON',
    # Add more known corrections here as needed
}

# Apply corrections to all object (string) columns
for col in customer_df_clean.select_dtypes(include='object').columns:
    customer_df_clean[col] = customer_df_clean[col].replace(spelling_corrections, regex=False)

All column names match expected names.
Current columns: ['CUSTOMER_NUMBER', 'CCAT_CODE', 'REGION_CODE', 'REP_CODE', 'SETTLE_TERMS', 'NORMAL_PAYTERMS', 'DISCOUNT', 'CREDIT_LIMIT']


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  customer_df_clean['CUSTOMER_NUMBER'] = customer_df_clean['CUSTOMER_NUMBER'].astype(str).str.strip()


In [6]:
# Show cleaned data info
print("Cleaned data shape:", customer_df_clean.shape)

Cleaned data shape: (2757, 8)


In [7]:
print("Cleaned sample data:\n", customer_df_clean.head())

Cleaned sample data:
   CUSTOMER_NUMBER  CCAT_CODE REGION_CODE REP_CODE  SETTLE_TERMS  \
0          599000         30          7a       05           0.0   
1          AACJ01         21         25b     ZZZ5           0.0   
2          AACJC1         21         25b     ZZZ5           0.0   
3          AACJC2          5         20a    CONS4           0.0   
4          AADPRG          6         21a       XX           0.0   

   NORMAL_PAYTERMS  DISCOUNT  CREDIT_LIMIT  
0               60         0          6000  
1               90         0        999999  
2              120         0        999999  
3              120         0        999999  
4              120         0        999999  


In [8]:
customer_df_clean

Unnamed: 0,CUSTOMER_NUMBER,CCAT_CODE,REGION_CODE,REP_CODE,SETTLE_TERMS,NORMAL_PAYTERMS,DISCOUNT,CREDIT_LIMIT
0,599000,30,7a,05,0.0,60,0,6000
1,AACJ01,21,25b,ZZZ5,0.0,90,0,999999
2,AACJC1,21,25b,ZZZ5,0.0,120,0,999999
3,AACJC2,5,20a,CONS4,0.0,120,0,999999
4,AADPRG,6,21a,XX,0.0,120,0,999999
...,...,...,...,...,...,...,...,...
2752,ZHAY02,19,4b,03,0.0,120,0,2000
2753,ZMAU01,37,11a,03,0.0,120,0,0
2754,ZNAE01,46,2b,05,0.0,120,0,30000
2755,ZNAEOC,5,20a,STAND,0.0,120,0,999999


In [9]:
import os

# Define folder paths
csv_folder = os.path.join(os.getcwd(), "csv_outputs")
json_folder = os.path.join(os.getcwd(), "json_outputs")

# Create folders if they don't exist
os.makedirs(csv_folder, exist_ok=True)
os.makedirs(json_folder, exist_ok=True)

In [10]:
# Save CSV
csv_path = os.path.join(csv_folder, "customer_df_clean.csv") #Customer.xlsx
customer_df_clean.to_csv(csv_path, index=False)

# Save JSON
json_path = os.path.join(json_folder, "customer_df_clean.json")
customer_df_clean.to_json(json_path, orient="records", lines=True)