In [1]:
# Data Cleaning for Mall Customer Dataset - Task 1

# Importing the pandas library
import pandas as pd

# Load data
# The raw CSV is read via a relative path, so it must sit in the notebook's
# working directory.
df = pd.read_csv("Mall_Customers_RAW.csv")
print("Initial Dataset Preview:")
display(df.head())

# Dataset structure: dtypes and non-null counts per column
print("\nDataset Info:")
df.info()

# Check missing values per column before any cleaning is applied
print("\nMissing Values:")
print(df.isnull().sum())

# Remove duplicate rows (exact matches across all columns) and report how
# many were dropped. This runs before imputation so that repeated rows do not
# influence the medians computed below.
initial_rows = df.shape[0]
df.drop_duplicates(inplace=True)
removed = initial_rows - df.shape[0]
print(f"\n✅ Removed {removed} duplicate rows.")

# Standardize Gender: trim surrounding whitespace and lowercase the labels
# so that variants such as "MALE" and "male" collapse to a single value.
df['Gender'] = df['Gender'].str.strip().str.lower()

# Rename columns to lowercase and snake_case
# (e.g. "Annual Income (k$)" -> "annual_income_(k$)").
df.columns = df.columns.str.strip().str.lower().str.replace(" ", "_")

# Handle missing values with median for numerical columns.
# The filled column is assigned back to the frame instead of calling
# `df[col].fillna(..., inplace=True)`: that chained in-place form raises a
# FutureWarning on pandas 2.2+ and, under Copy-on-Write, only modifies a
# temporary Series, leaving the NaNs in `df`.
numeric_columns = ['age', 'annual_income_(k$)', 'spending_score_(1-100)']
for column in numeric_columns:
    df[column] = df[column].fillna(df[column].median())

# Convert customerid to string (categorical): it is an identifier, not a
# quantity, so it should not be treated as a number downstream.
df['customerid'] = df['customerid'].astype(str)

# Final dataset check: preview the cleaned rows and confirm that no missing
# values remain in any column.
print("\nCleaned Dataset Preview:")
display(df.head())
print("\nMissing Values After Cleaning:")
print(df.isnull().sum())

# Export cleaned dataset (index=False keeps the row index out of the file)
df.to_csv("cleaned_mall_customers.csv", index=False)
print("\n✅ Cleaned dataset saved as 'cleaned_mall_customers.csv'")


Initial Dataset Preview:


Unnamed: 0,CustomerID,Gender,Age,Annual Income (k$),Spending Score (1-100)
0,1,MALE,19.0,15.0,39.0
1,2,MALE,21.0,15.0,81.0
2,3,female,20.0,16.0,6.0
3,4,female,23.0,16.0,
4,5,female,31.0,17.0,40.0



Dataset Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 210 entries, 0 to 209
Data columns (total 5 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   CustomerID              210 non-null    int64  
 1   Gender                  210 non-null    object 
 2   Age                     199 non-null    float64
 3   Annual Income (k$)      198 non-null    float64
 4   Spending Score (1-100)  199 non-null    float64
dtypes: float64(3), int64(1), object(1)
memory usage: 8.3+ KB

Missing Values:
CustomerID                 0
Gender                     0
Age                       11
Annual Income (k$)        12
Spending Score (1-100)    11
dtype: int64

✅ Removed 10 duplicate rows.

Cleaned Dataset Preview:


Unnamed: 0,customerid,gender,age,annual_income_(k$),spending_score_(1-100)
0,1,male,19.0,15.0,39.0
1,2,male,21.0,15.0,81.0
2,3,female,20.0,16.0,6.0
3,4,female,23.0,16.0,50.0
4,5,female,31.0,17.0,40.0



Missing Values After Cleaning:
customerid                0
gender                    0
age                       0
annual_income_(k$)        0
spending_score_(1-100)    0
dtype: int64

✅ Cleaned dataset saved as 'cleaned_mall_customers.csv'
