# Script that loads and performs checks on raw customer data then exports it

## The script contains the following sections
### 1. Importing Libraries
### 2. Importing Data
### 3. Wrangling Data
### 4. Consistency Checks
### 5. Exporting Checked Customer Data

# 1. Importing Libraries

In [1]:
# Importing libraries
import pandas as pd
import numpy as np
import os

# 2. Importing Data

In [2]:
# Folder Shortcut
path = r'C:\Users\seank\OneDrive\Dokumente\Career Foundry Data Analytics Course\Data Immersion\4 Python\03-2020_Instacart_Basket _Analysis'

In [3]:
# Importing Raw Customer Data
cust = pd.read_csv(os.path.join(path, '02_Data', 'Original_Data', 'customers.csv'),
                   index_col=False)

# 3. Data Wrangling

In [4]:
cust.head()    # previewing data

Unnamed: 0,user_id,First Name,Surnam,Gender,STATE,Age,date_joined,n_dependants,fam_status,income
0,26711,Deborah,Esquivel,Female,Missouri,48,1/1/2017,3,married,165665
1,33890,Patricia,Hart,Female,New Mexico,36,1/1/2017,0,single,59285
2,65803,Kenneth,Farley,Male,Idaho,35,1/1/2017,2,married,99568
3,125935,Michelle,Hicks,Female,Iowa,40,1/1/2017,0,single,42049
4,130797,Ann,Gilmore,Female,Maryland,26,1/1/2017,1,married,40374


In [5]:
# Renaming variables
cust.rename(columns={'Surnam':'surname',
                      'First Name': 'first_name',
                      'Gender':'gender',
                      'Age':'age',
                      'STATE':'state'},
            inplace=True)

In [6]:
# Checking Variable Types
cust.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 206209 entries, 0 to 206208
Data columns (total 10 columns):
 #   Column        Non-Null Count   Dtype 
---  ------        --------------   ----- 
 0   user_id       206209 non-null  int64 
 1   first_name    194950 non-null  object
 2   surname       206209 non-null  object
 3   gender        206209 non-null  object
 4   state         206209 non-null  object
 5   age           206209 non-null  int64 
 6   date_joined   206209 non-null  object
 7   n_dependants  206209 non-null  int64 
 8   fam_status    206209 non-null  object
 9   income        206209 non-null  int64 
dtypes: int64(4), object(6)
memory usage: 15.7+ MB


In [7]:
# Changing date to datetime format
cust['date_joined'] = pd.to_datetime(cust['date_joined'])
cust.dtypes

user_id                  int64
first_name              object
surname                 object
gender                  object
state                   object
age                      int64
date_joined     datetime64[ns]
n_dependants             int64
fam_status              object
income                   int64
dtype: object

In [8]:
# Changing customer ID to string
cust['user_id'] = cust['user_id'].astype('str')

In [9]:
# examining values
cust.describe()

Unnamed: 0,age,date_joined,n_dependants,income
count,206209.0,206209,206209.0,206209.0
mean,49.501646,2018-08-17 03:06:30.029532928,1.499823,94632.852548
min,18.0,2017-01-01 00:00:00,0.0,25903.0
25%,33.0,2017-10-23 00:00:00,0.0,59874.0
50%,49.0,2018-08-16 00:00:00,1.0,93547.0
75%,66.0,2019-06-10 00:00:00,3.0,124244.0
max,81.0,2020-04-01 00:00:00,3.0,593901.0
std,18.480962,,1.118433,42473.786988


These max and mins look OK

# 4. Consistency Checks

Mixed Data Types

In [10]:
# Check for mixed data types
for column in cust.columns:
    if cust[column].apply(type).nunique() > 1:
        print(f"Column '{column}' has mixed data types.")

Column 'first_name' has mixed data types.


In [11]:
# Converting first name to string
cust['first_name'] = cust['first_name'].astype('str')

In [12]:
print(cust['first_name'].dtype)

object


Missing Values

In [13]:
cust.isnull().sum()

user_id         0
first_name      0
surname         0
gender          0
state           0
age             0
date_joined     0
n_dependants    0
fam_status      0
income          0
dtype: int64

In [None]:
# but, we saw above  in ( cust.info() ) that first_name did have missing values. Checking if these were converted to nan
sorted_names = cust['first_name'].unique()
sorted_names = sorted(sorted_names)
print(sorted_names)

['Aaron', 'Adam', 'Alan', 'Albert', 'Alice', 'Amanda', 'Amy', 'Andrea', 'Andrew', 'Angela', 'Ann', 'Anna', 'Anne', 'Annie', 'Anthony', 'Antonio', 'Arthur', 'Ashley', 'Barbara', 'Benjamin', 'Betty', 'Beverly', 'Billy', 'Bob', 'Bobby', 'Bonnie', 'Brandon', 'Brenda', 'Brian', 'Bruce', 'Carl', 'Carlos', 'Carol', 'Carolyn', 'Catherine', 'Charles', 'Cheryl', 'Chris', 'Christina', 'Christine', 'Christopher', 'Clarence', 'Craig', 'Cynthia', 'Daniel', 'David', 'Deborah', 'Debra', 'Denise', 'Dennis', 'Diana', 'Diane', 'Donald', 'Donna', 'Doris', 'Dorothy', 'Douglas', 'Earl', 'Edward', 'Elizabeth', 'Emily', 'Eric', 'Ernest', 'Eugene', 'Evelyn', 'Frances', 'Frank', 'Fred', 'Garry', 'Gary', 'George', 'Gerald', 'Gloria', 'Gregory', 'Harold', 'Harry', 'Heather', 'Helen', 'Henry', 'Howard', 'Irene', 'Jack', 'Jacqueline', 'James', 'Jane', 'Janet', 'Janice', 'Jason', 'Jean', 'Jeffrey', 'Jennifer', 'Jeremy', 'Jerry', 'Jesse', 'Jessica', 'Jimmy', 'Joan', 'Joe', 'John', 'Johnny', 'Jonathan', 'Jose', 'Josep

they were

In [18]:
# Recoding as Missing
cust['first_name'].replace(['nan'], pd.NA, 
                           inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  cust['first_name'].replace(['nan'], pd.NA,


In [19]:
cust.isnull().sum()

user_id             0
first_name      11259
surname             0
gender              0
state               0
age                 0
date_joined         0
n_dependants        0
fam_status          0
income              0
dtype: int64

Since it is only first name, we can do nothing here to these missings

# Duplicates

In [20]:
cust_dups = cust[cust.duplicated()]
cust_dups

Unnamed: 0,user_id,first_name,surname,gender,state,age,date_joined,n_dependants,fam_status,income


No Duplicates found

## 5. Exporting Cleaned Checked customer Data


In [34]:
# Exporting as CSV
cust.to_csv(os.path.join(path, '02_Data', 'Prepared_Data', 'customers_checked.csv'),
            index = False)