In [2]:
import pandas as pd
import plotly as px
import matplotlib


## CUSTOMER EXPLORATION


In [17]:
# Load the raw customer table from the CSV file in the sibling data directory.
customers_csv = "../data/customers.csv"
customer = pd.read_csv(customers_csv)

### DATA BRIEF

In [18]:
# Report the (rows, columns) dimensions of the loaded table.
dataset_shape = customer.shape
print(f"The shape of the dataset is {dataset_shape}")

The shape of the dataset is (2000, 14)


In [19]:
# Preview the first five rows to get a feel for the raw records.
customer.head(n=5)

Unnamed: 0,customer_id,name,email,phone_number,country,city,age,gender,interests,signup_date,last_purchase_date,total_spent,purchase_frequency,churn
0,CUST0001,Laura Hill,patrickcooke@hamilton.com,+1 (830) 982-9806,CA,South Lisa,61,F,"Beauty, Fashion",2022-08-17,2025-02-13,225.82,1.9,0
1,CUST0002,Kimberly Burnett,stonebrenda@hotmail.com,(938) 909-3033,US,Lake Priscilla,33,M,"Books, Sports, Electronics",2022-08-15,2024-04-17,3001.28,4.2,1
2,CUST0003,Abigail West,williamsbailey@hotmail.com,+1 (265) 189-5206,CA,North Tinahaven,45,Other,"Electronics, Sports",2022-12-18,2025-07-05,3934.91,2.1,0
3,CUST0004,Carolyn Rowland,kevinshah@conway.com,+234 690 602 6127,NG,Catherinehaven,49,M,"Books, Home Appliances",2023-03-19,2024-10-03,162.0,2.4,1
4,CUST0005,Amanda Johnson,melissa32@arnold.org,+44 7914 177971,UK,Jorgeburgh,59,F,"Sports, Home Appliances, Beauty",2023-01-09,2023-06-11,2233.89,3.8,1


In [20]:
# List the column labels to see which fields are available.
column_labels = customer.columns
column_labels

Index(['customer_id', 'name', 'email', 'phone_number', 'country', 'city',
       'age', 'gender', 'interests', 'signup_date', 'last_purchase_date',
       'total_spent', 'purchase_frequency', 'churn'],
      dtype='object')

### COLUMNS DEBRIEF

1. Personally Identifiable Information: name, email, phone_number, location (city & country), age, gender, last_purchase_date

2. Additional Info: customer_id (unique ID), signup_date, total_spent (on products), purchase_frequency (or number_of_orders), interests (tags/keywords of product interest), churn (0/1 flag)

In [21]:
# Print each column's dtype and non-null count to check for missing values.
customer.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2000 entries, 0 to 1999
Data columns (total 14 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   customer_id         2000 non-null   object 
 1   name                2000 non-null   object 
 2   email               2000 non-null   object 
 3   phone_number        2000 non-null   object 
 4   country             2000 non-null   object 
 5   city                2000 non-null   object 
 6   age                 2000 non-null   int64  
 7   gender              2000 non-null   object 
 8   interests           2000 non-null   object 
 9   signup_date         2000 non-null   object 
 10  last_purchase_date  2000 non-null   object 
 11  total_spent         2000 non-null   float64
 12  purchase_frequency  2000 non-null   float64
 13  churn               2000 non-null   int64  
dtypes: float64(2), int64(2), object(10)
memory usage: 218.9+ KB


In [22]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns.
numeric_summary = customer.describe()
numeric_summary

Unnamed: 0,age,total_spent,purchase_frequency,churn
count,2000.0,2000.0,2000.0,2000.0
mean,43.8015,2557.034085,2.7728,0.7455
std,15.471127,1401.000706,1.293432,0.435689
min,18.0,100.17,0.5,0.0
25%,30.0,1387.18,1.7,0.0
50%,44.0,2529.855,2.8,1.0
75%,57.0,3772.4525,3.9,1.0
max,70.0,4997.25,5.0,1.0


From the stats above, the youngest and oldest customers are 18 and 70 years old, respectively. On average, customers spend around $2,557, and the average purchase frequency is about 2.8 (not bad, if you would agree...).

In [23]:
# Summarise the object-typed (string) columns: count, unique, top, and freq.
# Transposed so that each column of the table becomes one row of the summary.
object_summary = customer.describe(include="object")
object_summary.transpose()

Unnamed: 0,count,unique,top,freq
customer_id,2000,2000,CUST0001,1
name,2000,1968,Ashley Thomas,2
email,2000,1998,rjohnson@yahoo.com,2
phone_number,2000,2000,+1 (830) 982-9806,1
country,2000,5,IN,443
city,2000,1869,Lake James,4
gender,2000,3,F,671
interests,2000,154,Books,131
signup_date,2000,673,2022-08-17,11
last_purchase_date,2000,776,2024-11-04,9


From the object stats, only 1968 of the 2000 customer names are unique, which is awkward and deserves a closer look. The same goes for email (1998 unique values). Moving on, it's interesting to note that 5 countries are served, spanning 1869 unique cities (with India, `IN`, having the most customers at 443). Meanwhile, females top our customer leaderboard. Finally, the most frequent `interests` entry among customers is Books (is it safe now to say humans still read?...smiles...)

In [29]:
# Print the distinct values of every column, one array per column,
# in the same order as the columns appear in the table.
for column_name in customer.columns:
    distinct_values = customer[column_name].unique()
    print(distinct_values)

['CUST0001' 'CUST0002' 'CUST0003' ... 'CUST1998' 'CUST1999' 'CUST2000']
['Laura Hill' 'Kimberly Burnett' 'Abigail West' ... 'Ryan Schultz'
 'Tracy Ramos' 'Sharon Krueger']
['patrickcooke@hamilton.com' 'stonebrenda@hotmail.com'
 'williamsbailey@hotmail.com' ... 'jthomas@murray.com'
 'yscott@hotmail.com' 'ian41@hotmail.com']
['+1 (830) 982-9806' '(938) 909-3033' '+1 (265) 189-5206' ...
 '(851) 622-9626' '(492) 731-2367' '+1 (424) 597-4611']
['CA' 'US' 'NG' 'UK' 'IN']
['South Lisa' 'Lake Priscilla' 'North Tinahaven' ... 'South Scott'
 'West Andrewfurt' 'Lake Willieberg']
[61 33 45 49 59 38 57 31 18 66 62 64 35 41 53 46 70 27 56 55 28 63 60 54
 29 24 67 37 25 50 42 39 32 68 34 40 69 30 48 52 20 47 44 19 23 22 21 36
 51 26 58 43 65]
['F' 'M' 'Other']
['Beauty, Fashion' 'Books, Sports, Electronics' 'Electronics, Sports'
 'Books, Home Appliances' 'Sports, Home Appliances, Beauty' 'Books'
 'Books, Electronics' 'Books, Beauty' 'Electronics'
 'Home Appliances, Beauty, Sports' 'Beauty, Sports'
 '