# 4.9 Part 1.2 - Customer dataframe cleansing

## Clean & wrangle the customers dataframe

### This script contains:

#### 1. Import libraries
#### 2. Import dataframes
#### 3. Data wrangling
####      3.1 customers dataframe 
#### 4. Consistency checks
####      4.1 customers dataframe 
#### 5. Exporting the customer dataframe

# Import libraries

In [1]:
# import libraries

import pandas as pd
import numpy as np
import os

# Import dataframes

In [2]:
# project folder path
# NOTE: this is an absolute, machine-specific location; the import and export
# cells join their sub-folders onto it, so change it when running elsewhere.

path = r'C:\Users\Odette\Desktop\CareerFoundry\Immersion Courses\Course 4\Instacart Basket Analysis'

In [3]:
# load the raw customers file from '02 Data/Original Data' inside the project folder;
# index_col=False tells pandas not to use the first CSV column as the row index

customers = pd.read_csv(
    os.path.join(path, '02 Data', 'Original Data', 'customers.csv'),
    index_col=False,
)

In [4]:
# preview the first rows of the freshly imported data
customers.head()

Unnamed: 0,user_id,First Name,Surnam,Gender,STATE,Age,date_joined,n_dependants,fam_status,income
0,26711,Deborah,Esquivel,Female,Missouri,48,1/1/2017,3,married,165665
1,33890,Patricia,Hart,Female,New Mexico,36,1/1/2017,0,single,59285
2,65803,Kenneth,Farley,Male,Idaho,35,1/1/2017,2,married,99568
3,125935,Michelle,Hicks,Female,Iowa,40,1/1/2017,0,single,42049
4,130797,Ann,Gilmore,Female,Maryland,26,1/1/2017,1,married,40374


In [5]:
# number of rows and columns in the imported data
customers.shape

(206209, 10)

In [6]:
# preview the last rows of the imported data
customers.tail()

Unnamed: 0,user_id,First Name,Surnam,Gender,STATE,Age,date_joined,n_dependants,fam_status,income
206204,168073,Lisa,Case,Female,North Carolina,44,4/1/2020,1,married,148828
206205,49635,Jeremy,Robbins,Male,Hawaii,62,4/1/2020,3,married,168639
206206,135902,Doris,Richmond,Female,Missouri,66,4/1/2020,2,married,53374
206207,81095,Rose,Rollins,Female,California,27,4/1/2020,1,married,99799
206208,80148,Cynthia,Noble,Female,New York,55,4/1/2020,1,married,57095


# Data Wrangling

## customers dataframe

In [7]:
# standardise the 'Gender' column name to lower case

customers = customers.rename(columns={'Gender': 'gender'})

In [8]:
# standardise the 'STATE' column name to lower case

customers = customers.rename(columns={'STATE': 'state'})

In [9]:
# standardise the 'Age' column name to lower case

customers = customers.rename(columns={'Age': 'age'})

In [10]:
# confirm that the gender, state and age columns now carry lower-case names
customers.head()

Unnamed: 0,user_id,First Name,Surnam,gender,state,age,date_joined,n_dependants,fam_status,income
0,26711,Deborah,Esquivel,Female,Missouri,48,1/1/2017,3,married,165665
1,33890,Patricia,Hart,Female,New Mexico,36,1/1/2017,0,single,59285
2,65803,Kenneth,Farley,Male,Idaho,35,1/1/2017,2,married,99568
3,125935,Michelle,Hicks,Female,Iowa,40,1/1/2017,0,single,42049
4,130797,Ann,Gilmore,Female,Maryland,26,1/1/2017,1,married,40374


In [11]:
# remove the customer name columns ('Surnam' is spelled this way in the source file)

customers = customers.drop(['First Name', 'Surnam'], axis='columns')

In [12]:
# confirm that the name columns are no longer part of the dataframe
customers.head()

Unnamed: 0,user_id,gender,state,age,date_joined,n_dependants,fam_status,income
0,26711,Female,Missouri,48,1/1/2017,3,married,165665
1,33890,Female,New Mexico,36,1/1/2017,0,single,59285
2,65803,Male,Idaho,35,1/1/2017,2,married,99568
3,125935,Female,Iowa,40,1/1/2017,0,single,42049
4,130797,Female,Maryland,26,1/1/2017,1,married,40374


# Consistency checks

## customers dataframe

In [13]:
# single True/False answer: does any cell of the customers dataframe hold a missing value?

customers.isna().to_numpy().any()

False

In [14]:
# per-column count of missing values, as a cross-check of the overall result

customers.isna().sum()

user_id         0
gender          0
state           0
age             0
date_joined     0
n_dependants    0
fam_status      0
income          0
dtype: int64

In [15]:
# check data types of all columns
# (run before any conversion, to see which columns need a different type)

customers.dtypes

user_id          int64
gender          object
state           object
age              int64
date_joined     object
n_dependants     int64
fam_status      object
income           int64
dtype: object

In [16]:
# store user_id as text (object dtype) instead of as an integer

customers['user_id'] = customers['user_id'].astype(str)

In [17]:
# convert the date_joined column to datetime format
# The explicit format pins the month/day/year reading that pandas applies by
# default to values such as '1/1/2017'. Every row is then parsed the same way,
# and a value that does not fit raises an error instead of being silently guessed.
# NOTE(review): assumes the file stores dates as month/day/year - confirm against the data source.

customers['date_joined'] = pd.to_datetime(customers['date_joined'], format='%m/%d/%Y')

In [18]:
# check data types of all columns
# (re-run after the conversions to confirm that user_id and date_joined changed type)

customers.dtypes

user_id                 object
gender                  object
state                   object
age                      int64
date_joined     datetime64[ns]
n_dependants             int64
fam_status              object
income                   int64
dtype: object

In [19]:
# descriptive stats of customers dataframe
# (the recorded output covers age, n_dependants and income)

customers.describe()

Unnamed: 0,age,n_dependants,income
count,206209.0,206209.0,206209.0
mean,49.501646,1.499823,94632.852548
std,18.480962,1.118433,42473.786988
min,18.0,0.0,25903.0
25%,33.0,0.0,59874.0
50%,49.0,1.0,93547.0
75%,66.0,3.0,124244.0
max,81.0,3.0,593901.0


In [20]:
# collect every row that exactly repeats an earlier row across all columns

df_dups = customers.loc[customers.duplicated()]

In [21]:
# show the duplicate rows; an empty result means no full duplicates were found
df_dups

Unnamed: 0,user_id,gender,state,age,date_joined,n_dependants,fam_status,income


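The customers dataframe has been wrangled and cleansed: the gender, state and age columns were renamed to lower case, the name columns were dropped, user_id and date_joined were converted to string and datetime types, and no missing values or full duplicates were found.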

# Exporting the customer dataframe

In [22]:
# write the cleaned customers dataframe to the prepared-data folder, leaving out the row index
# NOTE: CSV holds plain text, so the string/datetime types set earlier are not
# stored in the file and have to be re-applied after re-importing it.

customers.to_csv(
    os.path.join(path, '02 Data', 'Prepared Data', 'customers_clean.csv'),
    index=False,
)