In [15]:
import pandas as pd

INPUT_PATH = "/Users/suhasgowdagr/Downloads/Mall_Customers.csv"
OUTPUT_PATH = "/Users/suhasgowdagr/Downloads/cleaned_mall_customers.csv"

# Rows missing any of these columns are dropped during cleaning.
REQUIRED_COLUMNS = ["Age", "Annual Income (k$)", "Spending Score (1-100)"]


def clean_mall_customers(raw):
    """Return a cleaned copy of the mall-customers DataFrame.

    The input frame is not modified. Steps, in order:

    1. Standardize ``Gender`` (trim, lowercase, expand ``m``/``f``).
    2. Coerce ``Age`` to the nullable ``Int64`` dtype; values that cannot
       be parsed as numbers become missing.
    3. Drop rows with a missing value in any of ``REQUIRED_COLUMNS``.
    4. Drop exact duplicate rows.
    5. Rename columns to lowercase with spaces replaced by underscores.

    Normalization (1-2) deliberately runs before the row filters (3-4):
    otherwise ages that fail numeric coercion would be re-introduced as
    missing values after the missing-value filter, and rows differing
    only by ``Gender`` casing/whitespace would escape de-duplication.

    Raises:
        KeyError: if any of ``REQUIRED_COLUMNS`` is absent from ``raw``.
    """
    cleaned = raw.copy()

    # Standardize Text (Gender column)
    if "Gender" in cleaned.columns:
        gender = cleaned["Gender"].str.strip().str.lower()
        cleaned["Gender"] = gender.replace({"m": "male", "f": "female"})

    # Fix Data Types (Age column)
    if "Age" in cleaned.columns:
        numeric_age = pd.to_numeric(cleaned["Age"], errors="coerce")
        cleaned["Age"] = numeric_age.astype("Int64")

    # Handle missing values, then remove duplicates, now that the values
    # being compared are in their normalized form.
    cleaned = cleaned.dropna(subset=REQUIRED_COLUMNS)
    cleaned = cleaned.drop_duplicates()

    # Rename Columns (lowercase, underscores)
    cleaned.columns = cleaned.columns.str.lower().str.replace(
        " ", "_", regex=False
    )
    return cleaned


if __name__ == "__main__":
    # Step 1: Load Dataset
    raw_df = pd.read_csv(INPUT_PATH)
    print("Original Shape:", raw_df.shape)
    print(raw_df.head())

    # Step 2: Check Missing Values
    print("\nMissing Values:\n", raw_df.isnull().sum())

    # Steps 3-7: Standardize, fix types, drop missing/duplicate rows, rename
    # (no Date column in this dataset, so date conversion is skipped)
    df = clean_mall_customers(raw_df)

    # Step 8: Save Cleaned Dataset
    df.to_csv(OUTPUT_PATH, index=False)
    print("\nCleaned dataset saved as 'cleaned_mall_customers.csv'")
    print("Final Shape:", df.shape)

Original Shape: (200, 5)
   CustomerID  Gender  Age  Annual Income (k$)  Spending Score (1-100)
0           1    Male   19                  15                      39
1           2    Male   21                  15                      81
2           3  Female   20                  16                       6
3           4  Female   23                  16                      77
4           5  Female   31                  17                      40

Missing Values:
 CustomerID                0
Gender                    0
Age                       0
Annual Income (k$)        0
Spending Score (1-100)    0
dtype: int64

Cleaned dataset saved as 'cleaned_mall_customers.csv'
Final Shape: (200, 5)
