## **Import Required Libraries**

In [10]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Display settings: no cap on the number of columns shown; width left unset (None)
pd.options.display.max_columns = None
pd.options.display.width = None

## **Load the Dataset**

In [11]:
# Read the customer purchase records from CSV into the working DataFrame
DATASET_PATH = 'customer_purchase_dataset.csv'
df = pd.read_csv(DATASET_PATH)

print("Dataset loaded successfully!")

Dataset loaded successfully!


## Display Dataset Shape

In [12]:
# Report the dataset dimensions: the (rows, columns) tuple, then each count on its own line
n_rows, n_cols = df.shape
print(f"Dataset Shape: {df.shape}")
print(f"Number of Rows: {n_rows}")
print(f"Number of Columns: {n_cols}")

Dataset Shape: (500, 10)
Number of Rows: 500
Number of Columns: 10


## Preview First Few Rows

In [13]:
# Preview the leading rows of the dataset; the bare last expression is rendered as the cell output
PREVIEW_ROWS = 10
df.head(PREVIEW_ROWS)

Unnamed: 0,CustomerID,Age,Gender,Location,ProductCategory,PurchaseAmount,PaymentMethod,PurchaseDate,DeviceUsed,ReturningCustomer
0,e555349c-0446-41fc-a343-188e4cbc226a,38,Male,Germany,Books,1969.04,PayPal,2024-08-29,Desktop,Yes
1,042dd0c7-b4ab-42f5-bd1c-04795a375acd,70,Female,Canada,Sports,464.76,Cash,2025-03-23,Mobile,No
2,300b7393-3eff-4268-91d5-97c97fc02a36,70,Male,India,Books,1724.29,PayPal,2024-04-05,Mobile,No
3,a6dc41cf-8117-4df3-aa63-fe9729fb8143,41,Male,Australia,Sports,58.1,Debit Card,2025-12-17,Mobile,No
4,d6369cb1-736d-409e-8276-bd5101928553,24,Female,Canada,Clothing,1571.8,Credit Card,2024-11-10,Desktop,No
5,5ca7ab3d-7f8e-4837-8dcc-f9719c3fcc35,53,Male,UK,Sports,880.5,Credit Card,2025-01-31,Tablet,Yes
6,429f0681-bc60-4fed-b6d0-cfe084315a5f,37,Female,USA,Home,1577.19,Credit Card,2024-08-27,Mobile,No
7,1d2c8988-5e40-4808-a514-37469e7fda62,49,Male,India,Clothing,1279.79,Credit Card,2024-07-18,Mobile,No
8,ad8c9ed5-1814-42ae-88a1-6184a51fa49d,44,Male,Australia,Books,877.9,Credit Card,2024-04-05,Tablet,Yes
9,bf684e3f-94d5-40e4-bcbd-ec44a980fd29,34,Male,USA,Electronics,209.9,Debit Card,2024-06-21,Tablet,No


## Check Column Names and Data Types

In [14]:
# List every column name, followed by a visual divider
divider = "=" * 50
print("Column Names:")
print(df.columns.to_list())
print(f"\n{divider}\n")

# Summarize the dtype and non-null count of each column (info() prints its report directly)
print("Data Types and Information:")
df.info()

Column Names:
['CustomerID', 'Age', 'Gender', 'Location', 'ProductCategory', 'PurchaseAmount', 'PaymentMethod', 'PurchaseDate', 'DeviceUsed', 'ReturningCustomer']


Data Types and Information:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 500 entries, 0 to 499
Data columns (total 10 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   CustomerID         500 non-null    object 
 1   Age                500 non-null    int64  
 2   Gender             500 non-null    object 
 3   Location           500 non-null    object 
 4   ProductCategory    500 non-null    object 
 5   PurchaseAmount     500 non-null    float64
 6   PaymentMethod      500 non-null    object 
 7   PurchaseDate       500 non-null    object 
 8   DeviceUsed         500 non-null    object 
 9   ReturningCustomer  500 non-null    object 
dtypes: float64(1), int64(1), object(8)
memory usage: 39.2+ KB


## Basic Statistical Summary





In [15]:
# Descriptive statistics (count, mean, std, min, quartiles, max) via describe()
print("Statistical Summary of Numerical Features:")
numeric_summary = df.describe()
numeric_summary

Statistical Summary of Numerical Features:


Unnamed: 0,Age,PurchaseAmount
count,500.0,500.0
mean,42.804,1042.29228
std,15.673318,567.661767
min,18.0,53.55
25%,29.0,560.9625
50%,42.0,1067.625
75%,56.0,1495.565
max,70.0,1996.78


## Check for Missing Values

In [16]:
# Count the missing (null) entries in each column
missing_counts = df.isnull().sum()
print("Missing Values:")
print(missing_counts)
print("\n" + "=" * 50 + "\n")

# Express the same per-column counts as a percentage of the total number of rows
missing_pct = missing_counts / len(df) * 100
print("Percentage of Missing Values:")
print(missing_pct)

Missing Values:
CustomerID           0
Age                  0
Gender               0
Location             0
ProductCategory      0
PurchaseAmount       0
PaymentMethod        0
PurchaseDate         0
DeviceUsed           0
ReturningCustomer    0
dtype: int64


Percentage of Missing Values:
CustomerID           0.0
Age                  0.0
Gender               0.0
Location             0.0
ProductCategory      0.0
PurchaseAmount       0.0
PaymentMethod        0.0
PurchaseDate         0.0
DeviceUsed           0.0
ReturningCustomer    0.0
dtype: float64


## Examine Target Variable Distribution

In [17]:
# Class balance of the target column: absolute counts first, then percentage shares
target = df['ReturningCustomer']
print("Target Variable (ReturningCustomer) Distribution:")
print(target.value_counts())
print(f"\n{'=' * 50}\n")

print("Target Variable Percentage:")
print(target.value_counts(normalize=True) * 100)

Target Variable (ReturningCustomer) Distribution:
ReturningCustomer
No     257
Yes    243
Name: count, dtype: int64


Target Variable Percentage:
ReturningCustomer
No     51.4
Yes    48.6
Name: proportion, dtype: float64


## Examine Categorical Features

In [18]:
# Columns treated as categorical in this overview (the target column is included)
categorical_columns = [
    'Gender',
    'Location',
    'ProductCategory',
    'PaymentMethod',
    'DeviceUsed',
    'ReturningCustomer',
]

# For each categorical column, print its distinct-value count and the frequency of every value
for col in categorical_columns:
    counts = df[col].value_counts()
    print(f"\n{col}:")
    print(f"Unique values: {df[col].nunique()}")
    print(counts)
    print("=" * 50)


Gender:
Unique values: 2
Gender
Male      253
Female    247
Name: count, dtype: int64

Location:
Unique values: 6
Location
USA          91
Australia    85
UK           84
India        82
Canada       80
Germany      78
Name: count, dtype: int64

ProductCategory:
Unique values: 5
ProductCategory
Home           107
Electronics    103
Books           98
Clothing        96
Sports          96
Name: count, dtype: int64

PaymentMethod:
Unique values: 4
PaymentMethod
Debit Card     133
Credit Card    130
Cash           119
PayPal         118
Name: count, dtype: int64

DeviceUsed:
Unique values: 3
DeviceUsed
Mobile     172
Tablet     167
Desktop    161
Name: count, dtype: int64

ReturningCustomer:
Unique values: 2
ReturningCustomer
No     257
Yes    243
Name: count, dtype: int64
