In [20]:
import pandas as pd

In [21]:
# Load CSV data (UTF-8 encoding)

df_customer = pd.read_csv(r"C:\Users\qinha\Desktop\SQL Data\CustomerTable.csv", encoding= 'UTF-8')

In [22]:
# Preview the data

df_customer.head()

Unnamed: 0,customerNumber,customerName,contactLastName,contactFirstName,phone,addressLine1,addressLine2,city,state,postalCode,country,salesRepEmployeeNumber,creditLimit
0,103,Atelier graphique,Schmitt,Carine,40.32.2555,"54, rue Royale",,Nantes,,44000,France,1370.0,21000.0
1,112,Signal Gift Stores,King,Jean,7025551838,8489 Strong St.,,Las Vegas,NV,83030,USA,1166.0,71800.0
2,114,"Australian Collectors, Co.",Ferguson,Peter,03 9520 4555,636 St Kilda Road,Level 3,Melbourne,Victoria,3004,Australia,1611.0,117300.0
3,119,La Rochelle Gifts,Labrune,Janine,40.67.8555,"67, rue des Cinquante Otages",,Nantes,,44000,France,1370.0,118200.0
4,121,Baane Mini Imports,Bergulfsen,Jonas,07-98 9555,Erling Skakkes gate 78,,Stavern,,4110,Norway,1504.0,81700.0


In [23]:
# CITY: find cities in the city column with shortcuts (length < 4)

city_shortcut = df_customer['city'].loc[df_customer['city'].str.len() < 4]
print(city_shortcut)

9      NYC
15     NYC
27     NYC
98     NYC
105    NYC
Name: city, dtype: object


In [24]:
# CITY: expand abbreviations

replacement_city = {
    'NYC' : 'New York City'
}

df_customer['city'] = df_customer['city'].replace(replacement_city, regex=False)

In [25]:
# State: find states in the state column with shortcuts (length < 4)

state_shortcut = df_customer['state'].loc[df_customer['state'].str.len() < 4].unique()
print(state_shortcut)

['NV' 'CA' 'NY' 'PA' 'CT' 'MA' 'BC' 'NSW' 'NJ' 'NH']


In [26]:
# State: expand abbreviations

replacement_state = {
    'CA': 'California',
    'PA': 'Pennsylvania',
    'BC': 'British Columbia',
    'MA': 'Massachusetts',
    'NY': 'New York',
    'CT': 'Connecticut',
    'NSW': 'New South Wales',
    'NV': 'Nevada',
    'NJ': 'New Jersey',
    'Co. Cork': 'County Cork',
    'NH': 'New Hampshire',
    'Qu├®bec': 'Quebec'
}

df_customer['state'] = df_customer['state'].replace(replacement_state, regex=False)


In [27]:
# State: find countries in the country column with shortcuts (length < 4)

country_shortcut = df_customer['country'].loc[df_customer['country'].str.len() < 4].unique()
print(country_shortcut)

['USA' 'UK']


In [28]:
# Country: expand abbreviations

replacement_country = {
    'USA' : 'United States of America',
    'UK' : 'United Kingdom'
}

df_customer['country'] = df_customer['country'].replace(replacement_country, regex=False)

In [29]:
# PHONE: normalize phone numbers
#  - remove symbols: . + ( ) -
#  - remove spaces
#  - add leading "+"

chars_to_remove = r"[.+()-]"

df_customer['phone'] = (
                    "+" + df_customer['phone'].astype(str)
                        .str.replace(chars_to_remove, "", regex=True)
                        .str.replace(" ", '', regex= True)
                        )

In [30]:
# CREDIT LIMIT: convert to integer

df_customer['creditLimit'] = df_customer['creditLimit'].astype('int') 

In [31]:
# SALES REP: remove NaN and convert to integer

df_customer.dropna(subset= ['salesRepEmployeeNumber'], inplace=True)

df_customer['salesRepEmployeeNumber'] = df_customer['salesRepEmployeeNumber'].astype('int')

print(df_customer['salesRepEmployeeNumber'])


0      1370
1      1166
2      1611
3      1370
4      1504
       ... 
117    1323
118    1165
119    1501
120    1188
121    1612
Name: salesRepEmployeeNumber, Length: 100, dtype: int64


In [32]:
# FIND INVALID CHARACTERS
#  - letters and spaces only allowed

cols = [ 'contactLastName', 'contactFirstName', 'city', 'country', 'state']

for col in cols:
    bad_rows = df_customer[col][df_customer[col].str.contains(r'[^A-Za-z\s]', na=False)]
    print(f"\n--- Rows with invalid characters in {col} ---")
    print(bad_rows)




--- Rows with invalid characters in contactLastName ---
22        Ranc├®
43    Fresni├¿re
55        O'Hara
Name: contactLastName, dtype: object

--- Rows with invalid characters in contactFirstName ---
37     Fr├®d├®rique 
106         Mart├¡n 
116     Jos├® Pedro 
Name: contactFirstName, dtype: object

--- Rows with invalid characters in city ---
11        Lule├Ñ
26       Kita-ku
42        ├àrhus
43     Montr├®al
52         K├Âln
60       Gen├¿ve
93     Minato-ku
101      Br├ñcke
Name: city, dtype: object

--- Rows with invalid characters in country ---
Series([], Name: country, dtype: object)

--- Rows with invalid characters in state ---
Series([], Name: state, dtype: object)


In [33]:
# CITY: fix encoding-related corruption

city_replacements = {
    'Lule├Ñ' : "Luleå",
    '├àrhus'  : 'Århus',
    'Montr├®al' : 'Montréal',
    "K├Âln": "Köln",
    "M├╝nchen": "München",
    "Gen├¿ve": "Genève",
    'M├╝nster' : 'Münster',
    'Br├ñcke' : 'Brück',
    'NYC' : 'New York City'
}

df_customer['city'] = df_customer['city'].replace(city_replacements, regex=False)



In [34]:
# CONTACT LAST NAME: fix encoding issues

replacement_contactLastName = {
    'Ranc├®' : 'Rancé',
    'Fresni├¿re' : 'Frédérique',
    'M├╝ller' : 'Müller'
}

df_customer['contactLastName'] = df_customer['contactLastName'].replace(replacement_contactLastName, regex=False)

In [35]:
# CONTACT FIRST NAME: fix encoding issues

replacement_contactFirstName = {
    'Fr├®d├®rique': 'Frédérique', 
    'Mart├¡n' : 'Martine',
    'Jos├® Pedro' : 'José Pedro'
}

df_customer['contactFirstName'] = df_customer['contactFirstName'].replace(replacement_contactFirstName, regex=False)

In [36]:
# ADDRESS LINE 1: standardize formatting

# 1. Strip + collapse spaces
df_customer['addressLine1'] = df_customer['addressLine1'].str.strip()
df_customer['addressLine1'] = df_customer['addressLine1'].str.replace(r'\s+', ' ', regex=True)

# 2. Remove unwanted characters but keep useful address symbols
df_customer['addressLine1'] = df_customer['addressLine1'].str.replace(r"[^A-Za-z0-9\s\-,.'#]", "",regex=True)

# 3. Normalize case
df_customer['addressLine1'] = df_customer['addressLine1'].str.title()

# 4. Standardize abbreviations
df_customer['addressLine1'] = (
    df_customer['addressLine1']
        .str.replace(r"\bSt\.?\b", "St", regex=True)
        .str.replace(r"\bRd\.?\b", "Rd", regex=True)
        .str.replace(r"\bAve\.?\b", "Ave", regex=True)
        .str.replace(r"\bBlvd\.?\b", "Blvd", regex=True)
)


In [None]:
# NAMES: clean customer and contact names uniformly

cols = ['customerName', 'contactLastName', 'contactFirstName']

for col in cols:
    df_customer[col] = (df_customer[col]
    .str.strip()  # trim spaces 
    .str.replace(r'\s+', ' ', regex=True) # collapse spaces
    .str.replace(r"[^A-Za-z0-9\s\-,.'#]", "",regex=True)
    .str.title() # normalize casing
    .str.replace(r"\bCorporation\b", "Corp", regex=True)
    .str.replace(r"\bCorp\.?\b", "Corp", regex=True)
    .str.replace(r"\bIncorporated\b", "Inc", regex=True)
    .str.replace(r"\bInc\.?\b", "Inc", regex=True)
    .str.replace(r"\bLimited\b", "Ltd", regex=True)
    .str.replace(r"\bLtd\.?\b", "Ltd", regex=True)
    .str.replace(r"\s*&\s*", " & ", regex=True)
)
    

In [None]:
# final check 
df_customer.head()

Unnamed: 0,customerNumber,customerName,contactLastName,contactFirstName,phone,addressLine1,addressLine2,city,state,postalCode,country,salesRepEmployeeNumber,creditLimit
0,103,Atelier Graphique,Schmitt,Carine,40322555,"54, Rue Royale",,Nantes,,44000,France,1370,21000
1,112,Signal Gift Stores,King,Jean,7025551838,8489 Strong St.,,Las Vegas,Nevada,83030,United States of America,1166,71800
2,114,"Australian Collectors, Co.",Ferguson,Peter,395204555,636 St Kilda Road,Level 3,Melbourne,Victoria,3004,Australia,1611,117300
3,119,La Rochelle Gifts,Labrune,Janine,40678555,"67, Rue Des Cinquante Otages",,Nantes,,44000,France,1370,118200
4,121,Baane Mini Imports,Bergulfsen,Jonas,7989555,Erling Skakkes Gate 78,,Stavern,,4110,Norway,1504,81700
