# Data Cleaning
**Group 4** \
Adam Zakaria Ababneh \
Matt Levitsky \
Kate Paparsenou \
Chris Umbel

In [77]:
import pandas as pd
import numpy as np
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.preprocessing import OneHotEncoder

# Load data

In [78]:
df = pd.read_csv('data/ecommerce_customer_data.csv')
df['Age'] = df['Age'].astype('Int64')
df['TotalPurchases'] = df['TotalPurchases'].astype('Int64')
df['CustomerServiceInteractions'] = df['CustomerServiceInteractions'].astype('Int64')
df

Unnamed: 0,CustomerID,RegistrationDate,Age,Gender,IncomeLevel,Country,City,TotalPurchases,AverageOrderValue,CustomerLifetimeValue,...,SocialMediaEngagementRate,MobileAppUsage,CustomerServiceInteractions,AverageSatisfactionScore,EmailConversionRate,SocialMediaConversionRate,SearchEngineConversionRate,RepeatCustomer,PremiumMember,HasReturnedItems
0,,2020-05-10,25,,,Other,Tokyo,4,15.886509,,...,0.379694,High,0,6.240881,0.112955,0.239948,0.156776,Yes,Yes,No
1,CUST00002,2021-07-18,,Prefer not to say,High,Germany,London,6,27.638853,181.725056,...,0.140988,Low,0,7.721917,0.300979,0.230821,0.290735,Yes,No,No
2,CUST00003,2021-02-04,43,,High,France,,10,161.739425,1810.555150,...,0.323660,Low,0,7.899042,,0.132239,0.050505,Yes,No,Yes
3,CUST00004,2020-12-31,49,Female,High,Australia,Los Angeles,5,14.194263,86.219740,...,0.268428,Never,1,7.829459,0.074097,0.206644,0.281067,Yes,Yes,No
4,CUST00005,2022-06-27,29,Female,Very High,Australia,Tokyo,7,31147.427206,2112.575945,...,0.160427,High,,7.777353,0.163438,0.232410,0.326645,Yes,No,No
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9995,CUST09996,2018-09-29,39,Female,High,Japan,New York,4,60.986389,212.992614,...,0.128575,High,1,6.094181,0.240422,0.293145,,Yes,Yes,Yes
9996,CUST09997,2020-10-25,58,Prefer not to say,Low,France,Sydney,6,75.208556,549.352094,...,0.298521,Never,,8.610345,0.056814,0.128591,0.070305,Yes,No,Yes
9997,CUST09998,2023-04-07,14,Other,Very High,USA,New York,1,37.196899,117.203509,...,0.126018,,0,9.864245,0.355694,0.101445,0.161469,No,No,Yes
9998,CUST09999,2021-05-27,29,Other,High,Canada,Berlin,3,13.191952,-6.655247,...,0.600112,,0,6.163032,0.222856,0.160716,0.431023,Yes,No,No


In [79]:
int_columns = ['TotalPurchases', 'CustomerServiceInteractions', 'Age']
categorical_columns = list(df.select_dtypes(include = 'object').columns)
categorical_columns.remove('CustomerID')
categorical_columns.remove('RegistrationDate')
quantitative_columns = list(df.select_dtypes(include = ['float64', 'Int64']).columns)

# Cleaning

In [80]:
def column_replace(df, column_name, from_value, to_value):
    # replace all instances of from_value with to_value for a given column
    df.loc[df[column_name] == from_value, column_name] = to_value

## Gender

#### Replace shortened text anomalies

In [81]:
column_replace(df, 'Gender', 'F', 'Female')
column_replace(df, 'Gender', 'M', 'Male')

### Replace N/As

In [82]:
df['Gender'] = df['Gender'].fillna('Prefer not to say')

#### Check

In [83]:
int(df[df['Gender'] == 'F'].shape[0] + df[df['Gender'] == 'M'].shape[0] + df['Gender'].isna().sum())

0

## IncomeLevel 

#### Replace "H" with "High", "L" with "Low"

In [84]:
column_replace(df, 'IncomeLevel', 'H', 'High')
column_replace(df, 'IncomeLevel', 'L', 'Low')

#### Classify for all N/A

In [85]:
def impute_classification(df, column_name, from_columns = None):
    if not from_columns:
        from_columns = quantitative_columns + categorical_columns
        from_columns.remove(column_name)
    
    mask_unknown = df[column_name].isna()
    mask_known = ~mask_unknown

    X = df[from_columns]
    y = df[column_name]    
    X_known = X[mask_known]
    y_known = y[mask_known]
    
    num_cols = X.select_dtypes(include = [np.number]).columns
    cat_cols = X.select_dtypes(exclude = [np.number]).columns

    # pre-processing pipeline    
    preprocessor = ColumnTransformer(
        transformers=[
            # NOTE, this is for handling N/A in INPUT FEATURES, not for our final imputation 
            ('num', SimpleImputer(strategy = 'median'), num_cols),
            ('cat', Pipeline([
                ('imputer', SimpleImputer(strategy = 'most_frequent')),
                ('encoder', OneHotEncoder(handle_unknown = 'ignore'))
            ]), cat_cols)
        ]
    )

    # data flow pipeline    
    pipe = Pipeline([
        ('preprocess', preprocessor),
        ('model', DecisionTreeClassifier())
    ])

    # train the model
    pipe.fit(X_known, y_known)

    # inference
    if mask_unknown.any():
        X_unknown = X[mask_unknown]
        y_pred = pipe.predict(X_unknown)
        df.loc[mask_unknown, column_name] = pd.Series(y_pred, index=df.index[mask_unknown])
    
    return df

In [86]:
df['IncomeLevel'].value_counts(dropna = False)

IncomeLevel
NaN          2503
Very High    1964
Medium       1918
Low          1822
High         1793
Name: count, dtype: int64

In [87]:
df = impute_classification(df, 'IncomeLevel')

In [88]:
df['IncomeLevel'].value_counts(dropna = False)

IncomeLevel
Very High    2630
Medium       2572
Low          2434
High         2364
Name: count, dtype: int64

## RepeatCustomer

In [89]:
df['RepeatCustomer'].value_counts(dropna = False)

RepeatCustomer
Yes    8479
No     1046
NaN     475
Name: count, dtype: int64

In [90]:
df = impute_classification(df, 'RepeatCustomer')

In [91]:
df['RepeatCustomer'].value_counts(dropna = False)

RepeatCustomer
Yes    8900
No     1100
Name: count, dtype: int64

## MobileAppUsage

In [118]:
df['MobileAppUsage'].value_counts(dropna = False)

MobileAppUsage
NaN       2457
Never     1914
Medium    1894
Low       1868
High      1867
Name: count, dtype: int64

In [119]:
df = impute_classification(df, 'MobileAppUsage')

In [120]:
df['MobileAppUsage'].value_counts(dropna = False)

MobileAppUsage
Never     2525
Medium    2523
High      2489
Low       2463
Name: count, dtype: int64

### PCA Regression

Extracts dimensionally-reduced features with PCA, and then uses a regressor for prediction.

Based on https://scikit-learn.org/stable/auto_examples/compose/plot_digits_pipe.html

In [92]:
def impute_regression(df, column_name, from_columns = None):
    if not from_columns:
        from_columns = quantitative_columns + categorical_columns
        from_columns.remove(column_name)
    
    mask_unknown = df[column_name].isna()
    mask_known = ~mask_unknown

    X = df[from_columns]
    y = df[column_name]    
    X_known = X[mask_known]
    y_known = y[mask_known]

    num_cols = X.select_dtypes(include=[np.number]).columns.tolist()
    cat_cols = [c for c in from_columns if c not in num_cols]
    
    # pre-processing pipelines
    numeric_pipeline = Pipeline([
        # NOTE, this is for handling N/A in INPUT FEATURES, not for our final imputation 
        ('imputer', SimpleImputer(strategy = 'median')), 
        ('scaler', StandardScaler()),
        # reduce to n - 1 components, this might require tweaking, but is a good start
        ('pca', PCA(n_components = 0.95))
    ])

    cat_pipeline = None
    
    if len(cat_cols) > 0:
        cat_pipeline = Pipeline([
            ('imputer', SimpleImputer(strategy = 'most_frequent')),
            ('encoder', OneHotEncoder(handle_unknown = 'ignore', sparse_output = False))
        ])

    # combine pipelines into single preprocessor
    transformers = []

    if len(num_cols) > 0:
        transformers.append(('num', numeric_pipeline, num_cols))
        
    if cat_pipeline is not None:
        transformers.append(('cat', cat_pipeline, cat_cols))

    preprocessor = ColumnTransformer(transformers, remainder = 'drop')
    
    # regressor for final imputation
    model = RandomForestRegressor(
        n_estimators = 300,
        random_state = 42,
        n_jobs = -1
    )

    # data flow pipeline
    pipe = Pipeline([
        ('preprocess', preprocessor),
        ('regressor', model)
    ])

    # train the model
    pipe.fit(X_known, y_known)

    # inference
    if mask_unknown.any():
        X_missing = X[mask_unknown]
        # inference: do the imputing
        y_pred = pipe.predict(X_missing)
        # write back imputed data to dataframe
        df.loc[mask_unknown, column_name] = y_pred

    return df

In [93]:
df['AverageSatisfactionScore'].isna().sum()

np.int64(496)

In [94]:
df = impute_regression(df, 'AverageSatisfactionScore')

In [95]:
df['AverageSatisfactionScore'].isna().sum()

np.int64(0)

## EmailEngagementRate

In [98]:
df['EmailEngagementRate'].isna().sum()

np.int64(476)

In [99]:
df = impute_regression(df, 'EmailEngagementRate')

In [100]:
df['EmailEngagementRate'].isna().sum()

np.int64(0)

## SocialMediaEngagementRate

In [101]:
df['SocialMediaEngagementRate'].isna().sum()

np.int64(528)

In [102]:
df = impute_regression(df, 'SocialMediaEngagementRate')

In [103]:
df['SocialMediaEngagementRate'].isna().sum()

np.int64(0)

## EmailConversionRate

In [107]:
df['EmailConversionRate'].isna().sum()

np.int64(523)

In [108]:
df = impute_regression(df, 'EmailConversionRate')

In [109]:
df['EmailConversionRate'].isna().sum()

np.int64(0)

In [110]:
## SocialMediaConversionRate

In [111]:
df['SocialMediaConversionRate'].isna().sum()

np.int64(494)

In [112]:
df = impute_regression(df, 'SocialMediaConversionRate')

In [113]:
df['SocialMediaConversionRate'].isna().sum()

np.int64(0)

## SearchEngineConversionRate

In [115]:
df['SearchEngineConversionRate'].isna().sum()

np.int64(505)

In [116]:
df = impute_regression(df, 'SearchEngineConversionRate')

In [117]:
df['SearchEngineConversionRate'].isna().sum()

np.int64(0)

## Favorites

In [None]:
df_fav = df[['FavoriteCategory', 'SecondFavoriteCategory']]
df_first_fav = pd.get_dummies(df_fav, columns = ['FavoriteCategory'], prefix = 'FavoriteCategory', dtype = float)
df_second_fav = pd.get_dummies(df_fav, columns = ['SecondFavoriteCategory'], prefix = 'FavoriteCategory', dtype = float)
df_second_fav = df_second_fav.replace(1.0, 0.5)
df_fav_dummies = pd.concat([df_first_fav, df_second_fav], axis = 1).groupby(level = 0, axis = 1).max()
df_fav_dummies

  df_fav_dummies = pd.concat([df_first_fav, df_second_fav], axis = 1).groupby(level = 0, axis = 1).max()


Unnamed: 0,FavoriteCategory,FavoriteCategory_Beauty,FavoriteCategory_Books,FavoriteCategory_Clothing,FavoriteCategory_Electronics,FavoriteCategory_Food,FavoriteCategory_Home Goods,FavoriteCategory_Sports,FavoriteCategory_Toys,SecondFavoriteCategory
0,Clothing,0.0,0.5,1.0,0.0,0.0,0.0,0.0,0.0,Books
1,Electronics,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.5,Toys
2,,0.0,0.5,0.0,0.0,0.0,0.0,0.0,0.0,Books
3,Sports,0.0,0.0,0.5,0.0,0.0,0.0,1.0,0.0,Clothing
4,Clothing,0.5,0.0,1.0,0.0,0.0,0.0,0.0,0.0,Beauty
...,...,...,...,...,...,...,...,...,...,...
9995,Toys,0.0,0.0,0.0,0.0,0.5,0.0,0.0,1.0,Food
9996,Food,0.0,0.0,0.0,0.0,1.0,0.5,0.0,0.0,Home Goods
9997,Toys,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,
9998,Toys,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,Toys


In [None]:
df = pd.concat([df, df_fav_dummies], axis = 1).drop(['FavoriteCategory', 'SecondFavoriteCategory'], axis = 1)
df

Unnamed: 0,CustomerID,RegistrationDate,Age,Gender,IncomeLevel,Country,City,TotalPurchases,AverageOrderValue,CustomerLifetimeValue,...,PremiumMember,HasReturnedItems,FavoriteCategory_Beauty,FavoriteCategory_Books,FavoriteCategory_Clothing,FavoriteCategory_Electronics,FavoriteCategory_Food,FavoriteCategory_Home Goods,FavoriteCategory_Sports,FavoriteCategory_Toys
0,,2020-05-10,25,Prefer not to say,High,Other,Tokyo,4,15.886509,,...,Yes,No,0.0,0.5,1.0,0.0,0.0,0.0,0.0,0.0
1,CUST00002,2021-07-18,,Prefer not to say,High,Germany,London,6,27.638853,181.725056,...,No,No,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.5
2,CUST00003,2021-02-04,43,Prefer not to say,High,France,,10,161.739425,1810.555150,...,No,Yes,0.0,0.5,0.0,0.0,0.0,0.0,0.0,0.0
3,CUST00004,2020-12-31,49,Female,High,Australia,Los Angeles,5,14.194263,86.219740,...,Yes,No,0.0,0.0,0.5,0.0,0.0,0.0,1.0,0.0
4,CUST00005,2022-06-27,29,Female,Very High,Australia,Tokyo,7,31147.427206,2112.575945,...,No,No,0.5,0.0,1.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9995,CUST09996,2018-09-29,39,Female,High,Japan,New York,4,60.986389,212.992614,...,Yes,Yes,0.0,0.0,0.0,0.0,0.5,0.0,0.0,1.0
9996,CUST09997,2020-10-25,58,Prefer not to say,Low,France,Sydney,6,75.208556,549.352094,...,No,Yes,0.0,0.0,0.0,0.0,1.0,0.5,0.0,0.0
9997,CUST09998,2023-04-07,14,Other,Very High,USA,New York,1,37.196899,117.203509,...,No,Yes,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
9998,CUST09999,2021-05-27,29,Other,High,Canada,Berlin,3,13.191952,-6.655247,...,No,No,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0


## Location

In [None]:
df_locations = pd.read_csv('./data/worldcities.csv')
df_locations

Unnamed: 0,city,city_ascii,lat,lng,country,iso2,iso3,admin_name,capital,population,id
0,Tokyo,Tokyo,35.6870,139.7495,Japan,JP,JPN,Tōkyō,primary,37785000.0,1392685764
1,Jakarta,Jakarta,-6.1750,106.8275,Indonesia,ID,IDN,Jakarta,primary,33756000.0,1360771077
2,Delhi,Delhi,28.6100,77.2300,India,IN,IND,Delhi,admin,32226000.0,1356872604
3,Guangzhou,Guangzhou,23.1300,113.2600,China,CN,CHN,Guangdong,admin,26940000.0,1156237133
4,Mumbai,Mumbai,19.0761,72.8775,India,IN,IND,Mahārāshtra,admin,24973000.0,1356226629
...,...,...,...,...,...,...,...,...,...,...,...
48054,Al Jabīn,Al Jabin,14.7040,43.5990,Yemen,YE,YEM,Raymah,admin,,1887910100
48055,Nelspruit,Nelspruit,-25.4745,30.9703,South Africa,ZA,ZAF,Mpumalanga,admin,,1710114438
48056,Gqeberha,Gqeberha,-33.9681,25.5981,South Africa,ZA,ZAF,Eastern Cape,,,1710000082
48057,Lupane,Lupane,-18.9315,27.8070,Zimbabwe,ZW,ZWE,Matabeleland North,admin,,1716206606


In [None]:
df_capitals = df_locations[df_locations['capital'] == 'primary']
df_capitals

Unnamed: 0,city,city_ascii,lat,lng,country,iso2,iso3,admin_name,capital,population,id
0,Tokyo,Tokyo,35.6870,139.7495,Japan,JP,JPN,Tōkyō,primary,37785000.0,1392685764
1,Jakarta,Jakarta,-6.1750,106.8275,Indonesia,ID,IDN,Jakarta,primary,33756000.0,1360771077
5,Manila,Manila,14.5958,120.9772,Philippines,PH,PHL,Manila,primary,24922000.0,1608618140
8,Seoul,Seoul,37.5667,126.9833,"Korea, South",KR,KOR,Seoul,primary,23016000.0,1410836482
9,Mexico City,Mexico City,19.4333,-99.1333,Mexico,MX,MEX,Ciudad de México,primary,21804000.0,1484247881
...,...,...,...,...,...,...,...,...,...,...,...
47679,Ngerulmud,Ngerulmud,7.5006,134.6242,Palau,PW,PLW,Melekeok,primary,271.0,1585525081
47750,Adamstown,Adamstown,-25.0667,-130.0833,Pitcairn Islands,PN,PCN,,primary,47.0,1612000000
47876,King Edward Point,King Edward Point,-54.2833,-36.5000,South Georgia And South Sandwich Islands,GS,SGS,,primary,,1239048838
47916,Plymouth,Plymouth,16.7064,-62.2158,Montserrat,MS,MSR,Saint Anthony,primary,,1500711935


#### If city is unknown, but country is known, fill in with the capital.

In [None]:
int(df['City'].isna().sum())

483

In [None]:
idx_country_no_city = df['City'].isna() & df['Country'].notna()
df.loc[idx_country_no_city, 'City'] = pd.merge(df, df_capitals, left_on = 'Country', right_on = 'country', how = 'left')['city']
df

Unnamed: 0,CustomerID,RegistrationDate,Age,Gender,IncomeLevel,Country,City,TotalPurchases,AverageOrderValue,CustomerLifetimeValue,...,PremiumMember,HasReturnedItems,FavoriteCategory_Beauty,FavoriteCategory_Books,FavoriteCategory_Clothing,FavoriteCategory_Electronics,FavoriteCategory_Food,FavoriteCategory_Home Goods,FavoriteCategory_Sports,FavoriteCategory_Toys
0,,2020-05-10,25,Prefer not to say,High,Other,Tokyo,4,15.886509,,...,Yes,No,0.0,0.5,1.0,0.0,0.0,0.0,0.0,0.0
1,CUST00002,2021-07-18,,Prefer not to say,High,Germany,London,6,27.638853,181.725056,...,No,No,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.5
2,CUST00003,2021-02-04,43,Prefer not to say,High,France,Paris,10,161.739425,1810.555150,...,No,Yes,0.0,0.5,0.0,0.0,0.0,0.0,0.0,0.0
3,CUST00004,2020-12-31,49,Female,High,Australia,Los Angeles,5,14.194263,86.219740,...,Yes,No,0.0,0.0,0.5,0.0,0.0,0.0,1.0,0.0
4,CUST00005,2022-06-27,29,Female,Very High,Australia,Tokyo,7,31147.427206,2112.575945,...,No,No,0.5,0.0,1.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9995,CUST09996,2018-09-29,39,Female,High,Japan,New York,4,60.986389,212.992614,...,Yes,Yes,0.0,0.0,0.0,0.0,0.5,0.0,0.0,1.0
9996,CUST09997,2020-10-25,58,Prefer not to say,Low,France,Sydney,6,75.208556,549.352094,...,No,Yes,0.0,0.0,0.0,0.0,1.0,0.5,0.0,0.0
9997,CUST09998,2023-04-07,14,Other,Very High,USA,New York,1,37.196899,117.203509,...,No,Yes,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
9998,CUST09999,2021-05-27,29,Other,High,Canada,Berlin,3,13.191952,-6.655247,...,No,No,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0


In [None]:
int(df['City'].isna().sum())

197

#### If the country and city combination are nonsensical, wipe out country and fill it in later 

In [None]:
df_bad_locations = pd.merge(df, df_locations, left_on = ['Country', 'City'], right_on = ['country', 'city'], how = 'left')
bad_idx = df_bad_locations['country'].isna() & df_bad_locations['city'].isna() 
df.loc[bad_idx, 'Country'] = np.nan

#### If country is unknown, but city is known, lookup country

In [None]:
int(df['Country'].isna().sum())

8848

In [None]:
idx_city_no_country = df['Country'].isna() & df['City'].notna()
idx_city_no_country

0        True
1        True
2       False
3        True
4        True
        ...  
9995     True
9996     True
9997     True
9998     True
9999    False
Length: 10000, dtype: bool

In [None]:
df_canonical_cities = pd.merge(df, df_locations, left_on = 'City', right_on = 'city', how = 'left').drop_duplicates(subset = 'city', keep = 'first')[['country', 'city']]
df.loc[idx_city_no_country, 'Country'] = pd.merge(df, df_canonical_cities, left_on = 'City', right_on = 'city', how = 'left')['country']

In [None]:
int(df['City'].isna().sum())

197

#### If both are unknown, fill with "Other"

In [None]:
df[df['Country'].isna() & df['City'].isna()].shape[0]

197

In [None]:
df.loc[(df['Country'].isna() & df['City'].isna()), ['Country', 'City']] = 'Other'

In [None]:
df[df['Country'].isna() & df['City'].isna()].shape[0]

0

In [None]:
df

Unnamed: 0,CustomerID,RegistrationDate,Age,Gender,IncomeLevel,Country,City,TotalPurchases,AverageOrderValue,CustomerLifetimeValue,...,PremiumMember,HasReturnedItems,FavoriteCategory_Beauty,FavoriteCategory_Books,FavoriteCategory_Clothing,FavoriteCategory_Electronics,FavoriteCategory_Food,FavoriteCategory_Home Goods,FavoriteCategory_Sports,FavoriteCategory_Toys
0,,2020-05-10,25,Prefer not to say,High,Japan,Tokyo,4,15.886509,,...,Yes,No,0.0,0.5,1.0,0.0,0.0,0.0,0.0,0.0
1,CUST00002,2021-07-18,,Prefer not to say,High,United Kingdom,London,6,27.638853,181.725056,...,No,No,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.5
2,CUST00003,2021-02-04,43,Prefer not to say,High,France,Paris,10,161.739425,1810.555150,...,No,Yes,0.0,0.5,0.0,0.0,0.0,0.0,0.0,0.0
3,CUST00004,2020-12-31,49,Female,High,United States,Los Angeles,5,14.194263,86.219740,...,Yes,No,0.0,0.0,0.5,0.0,0.0,0.0,1.0,0.0
4,CUST00005,2022-06-27,29,Female,Very High,Japan,Tokyo,7,31147.427206,2112.575945,...,No,No,0.5,0.0,1.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9995,CUST09996,2018-09-29,39,Female,High,United States,New York,4,60.986389,212.992614,...,Yes,Yes,0.0,0.0,0.0,0.0,0.5,0.0,0.0,1.0
9996,CUST09997,2020-10-25,58,Prefer not to say,Low,Australia,Sydney,6,75.208556,549.352094,...,No,Yes,0.0,0.0,0.0,0.0,1.0,0.5,0.0,0.0
9997,CUST09998,2023-04-07,14,Other,Very High,United States,New York,1,37.196899,117.203509,...,No,Yes,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
9998,CUST09999,2021-05-27,29,Other,High,Germany,Berlin,3,13.191952,-6.655247,...,No,No,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0


# Save

In [None]:
df.to_csv('data/ecommerce_customer_data_cleaned.csv')