In [1]:
##########################################################################################
#Task 1: The Integrity Audit (Missing Values & Duplicates)

import pandas as pd

# Load dataset
df = pd.read_csv("customer_orders.csv")

# Shape before cleaning
print("Shape BEFORE cleaning:", df.shape)

# Missing values report
print("\nMissing values in each column:")
print(df.isna().sum())

# Fill missing numeric values with median
numeric_cols = df.select_dtypes(include='number').columns
df[numeric_cols] = df[numeric_cols].fillna(df[numeric_cols].median())

# Remove duplicate rows
df_cleaned = df.drop_duplicates()

# Shape after cleaning
print("\nShape AFTER cleaning:", df_cleaned.shape)

# Show cleaned data
print("\nCleaned Data:")
df_cleaned.head()


Shape BEFORE cleaning: (7, 3)

Missing values in each column:
order_id    1
customer    0
amount      1
dtype: int64

Shape AFTER cleaning: (6, 3)

Cleaned Data:


Unnamed: 0,order_id,customer,amount
0,1001.0,Alice,250.0
1,1002.0,Bob,300.0
2,1002.5,Charlie,150.0
3,1003.0,David,200.0
5,1004.0,Eva,275.0


In [2]:
#######################################################################################
#Task 2: The Type Fixer (Data Type Conversion)

import pandas as pd

# Example dataset (you can replace with your CSV later)
data = {
    "Product": ["Phone", "Laptop", "Tablet"],
    "Price": ["$500", "$1200", "$300"],
    "Date": ["2025-01-10", "2025-02-15", "2025-03-20"]
}

df = pd.DataFrame(data)

# 1️⃣ Check initial data types
print("Before conversion:\n")
print(df.dtypes)

# 2️⃣ Remove $ and convert Price → float
df["Price"] = df["Price"].str.replace("$", "", regex=False).astype(float)

# 3️⃣ Convert Date → datetime
df["Date"] = pd.to_datetime(df["Date"])

# Check updated data types
print("\nAfter conversion:\n")
print(df.dtypes)

print("\nUpdated Data:")
print(df)




Before conversion:

Product    object
Price      object
Date       object
dtype: object

After conversion:

Product            object
Price             float64
Date       datetime64[ns]
dtype: object

Updated Data:
  Product   Price       Date
0   Phone   500.0 2025-01-10
1  Laptop  1200.0 2025-02-15
2  Tablet   300.0 2025-03-20


In [None]:
##########################################################################################
#Task 3: The Categorical Standardizer (String Cleaning)

import pandas as pd

# Example dataset
data = {
    "Location": [" New York", "new york", "NEW YORK ", "Chicago", " chicago "]
}

df = pd.DataFrame(data)

# Check original unique values
print("Before cleaning:")
print(df["Location"].unique())

# 1️⃣ Remove leading/trailing spaces
df["Location"] = df["Location"].str.strip()

# 2️⃣ Standardize casing (choose one)
df["Location"] = df["Location"].str.title()
# (You could also use .str.lower())

# Verify cleaning
print("\nAfter cleaning:")
print(df["Location"].unique())
