In [1]:
# Libraries and data.
import pandas as pd

# Read the raw shopping data from disk and preview its first five rows.
RAW_CSV = 'shopping_data.csv'
df = pd.read_csv(RAW_CSV)
df.head(5)

Unnamed: 0,CustomerID,Card Member,Age,Annual Income,Spending Score (1-100)
0,1,Yes,19.0,15000,39.0
1,2,Yes,21.0,15000,81.0
2,3,No,20.0,16000,6.0
3,4,No,23.0,16000,77.0
4,5,No,31.0,17000,40.0


In [2]:
# Display the column labels of the loaded frame.
df.columns

Index(['CustomerID', 'Card Member', 'Age', 'Annual Income',
       'Spending Score (1-100)'],
      dtype='object')

In [3]:
# Display the dtype pandas assigned to each column.
df.dtypes

CustomerID                  int64
Card Member                object
Age                       float64
Annual Income               int64
Spending Score (1-100)    float64
dtype: object

In [4]:
# Count the non-missing values in each column; a column with missing data
# shows a lower count than the complete columns.
df.count()

CustomerID                203
Card Member               201
Age                       201
Annual Income             203
Spending Score (1-100)    202
dtype: int64

In [5]:
# Count the missing values in each column: isna() flags every missing cell
# and sum() totals the flags per column.
df.isna().sum()

CustomerID                0
Card Member               2
Age                       2
Annual Income             0
Spending Score (1-100)    1
dtype: int64

In [6]:
# Keep only the rows in which every column holds a value.
df = df.dropna(axis=0, how='any')

In [7]:
# Count the rows that pandas flags as duplicates of another row.
df.duplicated().sum()

0

In [8]:
# Remove the CustomerID column before further processing.
# `columns=` already identifies the axis, so no separate `axis` argument is passed.
df = df.drop(columns='CustomerID')
df.head()

Unnamed: 0,Card Member,Age,Annual Income,Spending Score (1-100)
0,Yes,19.0,15000,39.0
1,Yes,21.0,15000,81.0
2,No,20.0,16000,6.0
3,No,23.0,16000,77.0
4,No,31.0,17000,40.0


In [9]:
# Encode Card Member as an integer flag: 1 where the value is exactly 'Yes',
# 0 for anything else. The comparison runs on the whole column at once, so no
# per-row Python lambda is needed.
df['Card Member'] = (df['Card Member'] == 'Yes').astype('int64')
df.head()

Unnamed: 0,Card Member,Age,Annual Income,Spending Score (1-100)
0,1,19.0,15000,39.0
1,1,21.0,15000,81.0
2,0,20.0,16000,6.0
3,0,23.0,16000,77.0
4,0,31.0,17000,40.0


In [10]:
# Express annual income in thousands by dividing the whole column by 1000
# in a single vectorised operation.
df['Annual Income'] = df['Annual Income'] / 1000
df.head()

Unnamed: 0,Card Member,Age,Annual Income,Spending Score (1-100)
0,1,19.0,15.0,39.0
1,1,21.0,15.0,81.0
2,0,20.0,16.0,6.0
3,0,23.0,16.0,77.0
4,0,31.0,17.0,40.0


In [11]:
# Switch to column names without spaces or punctuation. The renamed frame is
# assigned back to `df`, matching the other cleaning cells, rather than being
# mutated with inplace=True.
df = df.rename(
    columns={
        'Card Member': 'CardMember',
        'Annual Income': 'AnnualIncome',
        'Spending Score (1-100)': 'SpendingScore',
    }
)
df.head()

Unnamed: 0,CardMember,Age,AnnualIncome,SpendingScore
0,1,19.0,15.0,39.0
1,1,21.0,15.0,81.0
2,0,20.0,16.0,6.0
3,0,23.0,16.0,77.0
4,0,31.0,17.0,40.0


In [12]:
# Save the cleaned frame as a CSV for downstream use; index=False keeps the
# row index out of the file.
CLEANED_CSV = 'shopping_data_cleaned.csv'
df.to_csv(CLEANED_CSV, index=False)