In [9]:
import pandas as pd
import numpy as np

# ...existing code...
import unicodedata

def is_valid_utf8(x) -> bool:
    """Report whether ``x`` can be represented as valid UTF-8.

    Args:
        x: ``bytes`` or ``bytearray`` values are strictly decoded as UTF-8.
            Any other value is converted with ``str()`` and then encoded
            as UTF-8.

    Returns:
        ``True`` when the decode/encode succeeds; ``False`` when it raises
        ``UnicodeDecodeError`` (invalid byte sequence) or
        ``UnicodeEncodeError`` (e.g. a lone surrogate inside a ``str``).
    """
    try:
        if isinstance(x, (bytes, bytearray)):
            # bytearray must be decoded too: str(bytearray(...)) is only its
            # ASCII repr and would always look "valid".
            x.decode('utf-8')  # raises UnicodeDecodeError if invalid
        else:
            str(x).encode('utf-8')  # raises UnicodeEncodeError on surrogates
        return True
    except (UnicodeDecodeError, UnicodeEncodeError):
        return False


def filter_non_utf(strings):
    """Return the valid-UTF-8 members of ``strings`` as text.

    Invalid values (per ``is_valid_utf8``) are dropped. Surviving values are
    returned as ``str`` in their original order:

    * ``str`` values are passed through unchanged;
    * ``bytes``/``bytearray`` values are decoded as UTF-8 (not ``str()``-ed,
      which would yield the ``b'...'`` repr instead of the text);
    * anything else is converted with ``str()``.

    Args:
        strings: Iterable of values to check.

    Returns:
        A new list of ``str``.
    """
    kept = []
    for s in strings:
        if not is_valid_utf8(s):
            continue
        if isinstance(s, str):
            kept.append(s)
        elif isinstance(s, (bytes, bytearray)):
            kept.append(s.decode('utf-8'))
        else:
            kept.append(str(s))
    return kept

# Optional: sanitize instead of dropping
def sanitize_to_utf8(x):
    """Best-effort cleanup of ``x`` into a valid UTF-8 string.

    Steps:
      1. ``bytes``/``bytearray`` are decoded as UTF-8 with undecodable bytes
         silently dropped; any other value is converted with ``str()``.
      2. The text is normalized to NFC.
      3. Surrogate code points (U+D800..U+DFFF) and control characters
         (category ``Cc``) are removed, except newline, carriage return and
         tab, which are kept.

    Args:
        x: Value to clean.

    Returns:
        The cleaned ``str``, or ``None`` if the result still cannot be
        encoded as UTF-8.
    """
    if isinstance(x, (bytes, bytearray)):
        # bytearray is decoded like bytes; str(bytearray(...)) would return
        # its repr ("bytearray(b'...')") rather than the contained text.
        s = x.decode('utf-8', errors='ignore')
    else:
        s = str(x)
    s = unicodedata.normalize('NFC', s)
    # drop surrogate code points and unwanted controls (keep \n, \r, \t)
    s = ''.join(
        ch for ch in s
        if not (0xD800 <= ord(ch) <= 0xDFFF)
        and (unicodedata.category(ch) != 'Cc' or ch in ('\n', '\r', '\t'))
    )
    # Final validation. With surrogates already stripped this is not expected
    # to fail; it is kept as a backstop so callers can rely on the None signal.
    try:
        s.encode('utf-8')
        return s
    except UnicodeEncodeError:
        return None

# Example:
# raw = ["OK", b"bad\xff", "surrogate\uDC00", "näå", b"\xf0\x9f\x98\x80"]  # 😀
# only_valid = filter_non_utf(raw)
# cleaned = [v for v in (sanitize_to_utf8(x) for x in raw) if v is not None]

# Ensure psycopg2 client encoding is UTF-8
# conn = psycopg2.connect(**pg_config)
# conn.set_client_encoding('UTF8')
# ...existing code...

In [12]:
# Read the customer CSV into the working frame, then preview the first rows.
path = "../data/customers-final.csv"
df = pd.read_csv(filepath_or_buffer=path)
df.head(n=5)

Unnamed: 0,customer_id,name,location,email
0,1,Selena Venegas,Bengaluru,selena.venegas0@gmail.com
1,2,Tessy Woźniak,Doha,tessy.woźniak1@gmail.com
2,3,Forbes Ko,Noida,forbes.ko2@gmail.com
3,4,Jo Ann Dahan,Bengaluru,joann.dahan3@gmail.com
4,5,Jacinta Díaz,Vadodara,jacinta.díaz4@gmail.com


In [13]:
import random

# ASCII-only first and last names, sampled uniformly with random.choice.
# Every entry is listed exactly once: a repeated name would be drawn
# proportionally more often than the others.
first_names = [
    "James","John","Robert","Michael","William","David","Richard","Joseph","Thomas","Charles",
    "Christopher","Daniel","Matthew","Anthony","Mark","Donald","Steven","Paul","Andrew","Joshua",
    "Kenneth","Kevin","Brian","George","Timothy","Ronald","Edward","Jason","Jeffrey","Ryan",
    "Jacob","Gary","Nicholas","Eric","Jonathan","Stephen","Larry","Justin","Scott","Brandon",
    "Benjamin","Samuel","Gregory","Alexander","Frank","Patrick","Tyler","Raymond","Jack",
    "Dennis","Jerry","Peter","Adam","Nathan","Zachary","Kyle","Dylan","Noah"
]

last_names = [
    "Smith","Johnson","Williams","Brown","Jones","Miller","Davis","Garcia","Rodriguez","Wilson",
    "Martinez","Anderson","Taylor","Thomas","Hernandez","Moore","Martin","Jackson","Thompson","White",
    "Lopez","Lee","Gonzalez","Harris","Clark","Lewis","Robinson","Walker","Perez","Hall",
    "Young","Allen","Sanchez","Wright","King","Scott","Green","Baker","Adams","Nelson",
    "Hill","Ramirez","Campbell","Mitchell","Roberts","Carter","Phillips","Evans","Turner","Collins"
]

# Placeholder domains used for the generated e-mail addresses.
domains = ["example.com","mail.com","test.com","sample.org","demo.net"]

# Generate unique ASCII-only emails for the names

def gen_email(first: str, last: str) -> str:
    """Build a lowercase ``first.last<NN>@<domain>`` address.

    ``NN`` is a random integer from 10 to 99 (inclusive) and the domain is
    picked at random from the module-level ``domains`` list.

    Args:
        first: First name; lowercased for the local part.
        last: Last name; lowercased for the local part.

    Returns:
        The generated e-mail address.
    """
    # Draw order (suffix first, then domain) matches the RNG call sequence.
    suffix = random.randint(10, 99)
    host = random.choice(domains)
    return "{}.{}{}@{}".format(first.lower(), last.lower(), suffix, host)

# Seed the global RNG so a fresh Restart & Run All regenerates the same data.
RANDOM_SEED = 42
random.seed(RANDOM_SEED)

# One replacement record is needed per dataframe row. A fixed count would
# leave any further rows with their original values, or add rows to a
# shorter frame.
n_rows = len(df)

# Safety valve: duplicates are retried, so cap the number of draws to avoid
# spinning forever if the frame has more rows than there are unique e-mails.
max_attempts = 50 * n_rows + 1000
attempts = 0

results = []
seen = set()  # e-mail addresses already handed out

while len(results) < n_rows:
    attempts += 1
    if attempts > max_attempts:
        raise RuntimeError(
            f"could not generate {n_rows} unique emails in {max_attempts} attempts"
        )
    first = random.choice(first_names)
    last = random.choice(last_names)
    email = gen_email(first, last)
    if email in seen:
        continue
    seen.add(email)
    results.append({"name": f"{first} {last}", "email": email})

# replace name and email columns in the dataframe
# (list assignment is positional, so it works for any index, not just 0..n-1)
df['name'] = [record['name'] for record in results]
df['email'] = [record['email'] for record in results]

In [14]:
# Write the updated frame to a new CSV, omitting the pandas row index.
df.to_csv(path_or_buf="customers-final-fixed.csv", index=False)