In [None]:
# Step 1: Import Required Libraries
import pandas as pd
import numpy as np

# Step 2: Create a Sample Customer Dataset 
data = {
    "Customer_ID": [201, 202, 203, 204, 205, 206, 207, 207],
    "Name": ["Grace", "Hannah", "Ian", None, "Jack", "Kate", "Leo", "Leo"],
    "Age": [27, None, 31, 26, 35, 30, None, 38],
    "Email": ["grace@mail.com", "hannah@mail.com", None, "dylan@mail.com", "jack@mail.com", "kate@mail.com", "leo@mail.com", "leo@mail.com"],
    "Join_Date": ["2024-05-01", "2024-06-12", "2024-07-23", "2024-08-10", None, "2024-09-05", "2024-10-15", "2024-10-15"],
    "Spending ($)": [450, 520, None, 410, 610, None, 700, 700]
}

# Create DataFrame
df = pd.DataFrame(data)
print(df)

# Step 3: Identify Missing Values
print("Missing Values:\n", df.isnull().sum())




   Customer_ID    Name   Age            Email   Join_Date  Spending ($)
0          201   Grace  27.0   grace@mail.com  2024-05-01         450.0
1          202  Hannah   NaN  hannah@mail.com  2024-06-12         520.0
2          203     Ian  31.0             None  2024-07-23           NaN
3          204    None  26.0   dylan@mail.com  2024-08-10         410.0
4          205    Jack  35.0    jack@mail.com        None         610.0
5          206    Kate  30.0    kate@mail.com  2024-09-05           NaN
6          207     Leo   NaN     leo@mail.com  2024-10-15         700.0
7          207     Leo  38.0     leo@mail.com  2024-10-15         700.0
Missing Values:
 Customer_ID     0
Name            1
Age             2
Email           1
Join_Date       1
Spending ($)    2
dtype: int64


In [9]:
# Step 4: Drop Rows with Missing Email (Critical Data)
df = df.dropna(subset=["Email"])
print("\nAfter Dropping Missing Emails:\n", df)

# Step 5: Fill Missing Values in Age & Spending ($)
df["Age"].fillna(df["Age"].mean(), inplace=True)  # Fill Age with mean
df["Spending ($)"].fillna(df["Spending ($)"].median(), inplace=True)  # Fill Spending with median
print("\nAfter Filling Missing Values:\n", df)

# Step 6: Convert Join_Date to Date Format
df["Join_Date"] = pd.to_datetime(df["Join_Date"])
print("\nData Types After Date Conversion:\n", df.dtypes)

# Step 7: Remove Duplicate Customer Records
df = df.drop_duplicates()
print("\nAfter Removing Duplicates:\n", df)

# Step 8: Rename Columns for Clarity
df = df.rename(columns={"Spending ($)": "Total_Spending", "Join_Date": "Registration_Date"})
print("\nFinal Cleaned Data:\n", df.head())


After Dropping Missing Emails:
    Customer_ID    Name   Age            Email   Join_Date  Spending ($)
0          201   Grace  27.0   grace@mail.com  2024-05-01         450.0
1          202  Hannah   NaN  hannah@mail.com  2024-06-12         520.0
3          204    None  26.0   dylan@mail.com  2024-08-10         410.0
4          205    Jack  35.0    jack@mail.com        None         610.0
5          206    Kate  30.0    kate@mail.com  2024-09-05           NaN
6          207     Leo   NaN     leo@mail.com  2024-10-15         700.0
7          207     Leo  38.0     leo@mail.com  2024-10-15         700.0

After Filling Missing Values:
    Customer_ID    Name   Age            Email   Join_Date  Spending ($)
0          201   Grace  27.0   grace@mail.com  2024-05-01         450.0
1          202  Hannah  31.2  hannah@mail.com  2024-06-12         520.0
3          204    None  26.0   dylan@mail.com  2024-08-10         410.0
4          205    Jack  35.0    jack@mail.com        None         610.0

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df["Age"].fillna(df["Age"].mean(), inplace=True)  # Fill Age with mean
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df["Spending ($)"].fillna(df["Spending ($)"].median(), inplace=True)  # Fill Spending with median
