In [None]:
import pandas as pd

In [None]:
# Deliberately messy sample records, stored column-wise (one list per column).
messy_data = dict(
    # Contains a repeated id (2) and a missing id (None).
    customer_id=[1, 2, 2, 3, 4, None],
    # Inconsistent capitalisation and one empty string.
    name=['Alice', 'bob', 'bob', 'Charlie', '', 'David'],
    # Mixes ISO dates, a day-first slash date and an unparseable value.
    signup_date=[
        '2024-01-15',
        '15/01/2024',
        'invalid',
        '2024-01-17',
        '2024-01-18',
        '2024-01-19',
    ],
    # Inconsistent capitalisation and one missing value.
    country=['USA', 'usa', 'USA', 'UK', None, 'Canada'],
    # Contains a negative amount.
    lifetime_value=[1000, 2000, 2000, -100, 5000, 150],
)

In [None]:
# Load the column-wise sample records into a DataFrame and show it.
df = pd.DataFrame(data=messy_data)
df

In [None]:
# Count missing values per column.
print("checking null values ->")
print(df.isnull().sum())
# Count empty strings per column; isnull() does not report these.
print("checking empty string ->")
checking_empty_values = (df == '').sum()
print(checking_empty_values)

# Turn empty strings into NaN so the null-aware methods treat them as missing.
# An explicit NaN is used instead of `None` because `replace('', None)` is
# version-dependent: before pandas 1.4 an explicit `value=None` was ignored
# and the call forward-filled from the previous row instead.
df = df.replace('', float('nan'))

# Drop rows that have no customer_id.
df_cleaned = df.dropna(subset=['customer_id'])

print("\nAfter handling nulls:")
print(df_cleaned)

In [None]:
# Check for repeated customer_id values.
print(f"duplicated data: {df_cleaned.duplicated(subset=['customer_id']).sum()} ")
# Show the repeated rows. This must be printed explicitly: only the last
# expression of a cell is displayed, so a bare expression here is discarded.
print(df_cleaned[df_cleaned.duplicated(subset=['customer_id'])])

# Keep one row per customer_id: the one with the highest lifetime_value
# (missing values sort last). A stable sort is requested so that rows tied on
# lifetime_value keep their original relative order, which makes the surviving
# row deterministic instead of depending on the sort implementation.
df_cleaned = (
    df_cleaned
    .sort_values(
        by='lifetime_value',
        ascending=False,
        na_position='last',
        kind='mergesort',
    )
    .drop_duplicates(subset=['customer_id'], keep='first')
)

df_cleaned

In [None]:
# Normalise the individual columns.

# signup_date mixes ISO ('2024-01-15') and day-first ('15/01/2024') strings.
# Each layout is parsed with an explicit format and the results are combined;
# anything matching neither becomes NaT. A single format-less to_datetime call
# is avoided because pandas >= 2.0 infers one format from the first value and
# coerces every other layout to NaT.
signup_raw = df_cleaned['signup_date']
signup_parsed = pd.to_datetime(signup_raw, format='%Y-%m-%d', errors='coerce')
signup_parsed = signup_parsed.fillna(
    pd.to_datetime(signup_raw, format='%d/%m/%Y', errors='coerce')
)

# Build a new frame with .assign rather than assigning columns in place, so
# no column is written into a frame derived from another one.
df_cleaned = (
    df_cleaned
    .assign(signup_date=signup_parsed)
    # Rows whose date could not be parsed are dropped.
    .dropna(subset=['signup_date'])
    .assign(
        # Each column is normalised from its own values in the cleaned frame:
        # name from name, country from country.
        name=lambda frame: frame['name'].str.title(),
        country=lambda frame: frame['country'].str.upper(),
    )
)

print(df_cleaned.dtypes)
print(df_cleaned)

In [None]:
# Floor lifetime_value at zero: negative amounts become 0, everything else is
# left as it is.
ltv_floored = df_cleaned['lifetime_value'].clip(lower=0)
df_cleaned['lifetime_value'] = ltv_floored

In [None]:
# Display the current state of the cleaned frame.
df_cleaned

In [None]:
def validate_customer_data(df):
    """Check a customer frame against three data-quality rules.

    The rules are:
      1. ``customer_id`` and ``lifetime_value`` contain no missing values.
      2. ``lifetime_value`` is never negative.
      3. ``customer_id`` values are unique.

    All rules are evaluated before reporting, so one error lists every
    rule that failed.

    Args:
        df: DataFrame with ``customer_id`` and ``lifetime_value`` columns.

    Returns:
        None when every rule passes.

    Raises:
        ValueError: if at least one rule fails; the message has one bullet
            line per failed rule.
    """
    critical_columns = ['customer_id', 'lifetime_value']

    # Each entry pairs "did this rule fail?" with the message to report.
    rule_results = [
        (
            df[critical_columns].isna().any().any(),
            "Null values found in customer_id or lifetime_value",
        ),
        (
            df['lifetime_value'].lt(0).any(),
            "Negative lifetime_value detected",
        ),
        (
            not df['customer_id'].is_unique,
            "Duplicate customer_id detected",
        ),
    ]

    errors = [message for failed, message in rule_results if failed]
    if not errors:
        return

    raise ValueError("Validation failed:\n- " + "\n- ".join(errors))


In [None]:
# Run the data-quality checks on the cleaned frame; a ValueError is raised if
# any check fails.
validate_customer_data(df_cleaned)