# 🏠 Simple Missing Data Lab - Ames Housing Dataset

**What we'll do:**
1. Load data and see what's missing
2. Fill missing numbers with the median (middle) value
3. Fill missing text with most common values
4. Save our clean data

**Why this matters:** Real data is messy! We need to handle missing values before analyzing or building models.

## Step 1: Import what we need

In [1]:
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.impute import SimpleImputer

# Let pandas print up to 50 columns of a DataFrame
pd.options.display.max_columns = 50

## Step 2: Load the data

In [None]:


def clean_outliers(df_in: pd.DataFrame, method: str = "cap", k: float = 1.5):
    """Return a copy of ``df_in`` with IQR outliers handled in numeric columns.

    For every numeric column the fences ``Q1 - k * IQR`` and ``Q3 + k * IQR``
    are computed; values outside the fences count as outliers. Columns that
    contain no non-null values are skipped.

    Args:
        df_in: Input frame. It is copied first and never modified.
        method: How to treat outliers:
            ``"cap"`` clips them to the nearest fence,
            ``"median"`` replaces them with the column median,
            ``"remove"`` drops the rows that contain them.
        k: Multiplier applied to the IQR when building the fences.

    Returns:
        A new DataFrame with the outliers handled.

    Raises:
        ValueError: If ``method`` is not ``"cap"``, ``"median"`` or
            ``"remove"``.

    Note:
        With ``"remove"`` rows are dropped one column at a time, so the
        quartiles of later columns are computed on the rows that survived
        the earlier columns.
    """
    valid_methods = ("cap", "median", "remove")
    # Fail loudly on a typo instead of silently returning untouched data.
    if method not in valid_methods:
        raise ValueError(
            f"Unknown method {method!r}; expected one of {valid_methods}"
        )

    df_clean = df_in.copy()
    for col in df_clean.select_dtypes(include="number").columns:
        # Re-read the column each pass: "remove" may have shrunk df_clean.
        s = df_clean[col]
        if not s.notna().any():
            continue
        q1, q3 = s.quantile([0.25, 0.75])
        iqr = q3 - q1
        low, up = q1 - k * iqr, q3 + k * iqr
        if method == "cap":
            df_clean[col] = s.clip(lower=low, upper=up)
            continue
        # NaN compares False on both sides, so missing values are never
        # flagged as outliers here.
        mask = (s < low) | (s > up)
        if method == "median":
            df_clean.loc[mask, col] = s.median()
        else:  # method == "remove"
            df_clean = df_clean.loc[~mask]
    return df_clean


In [2]:
# Load the housing data (the relative path is resolved against the current
# working directory, so the CSV has to be there)
df = pd.read_csv("Ames_outliers_removed.csv")

# Example: cap IQR outliers in every numeric column. The result is kept in a
# separate variable; the following steps continue to work with `df`.
df_cleaned = clean_outliers(df, method="cap", k=1.5)

print(f"Our data has {df.shape[0]} houses and {df.shape[1]} features")
print("\nFirst 5 rows:")
df.head()

FileNotFoundError: [Errno 2] No such file or directory: 'Ames_outliers_removed.csv'

## Step 3: Find missing values

In [None]:
# Tally the nulls per column, keep only the columns that actually have some,
# and order them from most to fewest missing
missing_data = (
    df.isnull()
    .sum()
    .loc[lambda counts: counts > 0]
    .sort_values(ascending=False)
)

print("Columns with missing values:")
print(missing_data)

## Step 4: Visualize missing data

In [None]:
# Bar chart of the per-column missing counts; skipped when nothing is missing
if missing_data.empty:
    print("Great! No missing values found.")
else:
    plt.figure(figsize=(10, 6))
    missing_data.plot.bar()
    plt.title("Missing Values by Column")
    plt.ylabel("Number of Missing Values")
    plt.xticks(rotation=45)
    plt.tight_layout()
    plt.show()

## Step 5: Separate numbers from text

In [None]:
# Split the column names by dtype: numeric columns vs. object (text) columns
numeric_frame = df.select_dtypes(include="number")
text_frame = df.select_dtypes(include="object")

numeric_columns = numeric_frame.columns.tolist()
text_columns = text_frame.columns.tolist()

print(f"Number columns: {len(numeric_columns)}")
print(f"Text columns: {len(text_columns)}")

## Step 6: Fix missing numbers (use median)

In [None]:
# Work on a copy so `df` itself stays untouched
df_clean = df.copy()

# Replace each missing number with the median (middle value) of its column
if not numeric_columns:
    print("No numeric columns to fix")
else:
    number_fixer = SimpleImputer(strategy="median")
    numeric_block = df_clean[numeric_columns]
    df_clean[numeric_columns] = number_fixer.fit_transform(numeric_block)
    print("✅ Fixed missing numbers")

## Step 7: Fix missing text (use most common value)

In [None]:
# Replace each missing text entry with the most frequent value of its column
if not text_columns:
    print("No text columns to fix")
else:
    text_fixer = SimpleImputer(strategy="most_frequent")
    text_block = df_clean[text_columns]
    df_clean[text_columns] = text_fixer.fit_transform(text_block)
    print("✅ Fixed missing text")

## Step 8: Check our work

In [None]:
# Total number of missing cells in the raw frame and in the imputed frame
before, after = (frame.isnull().sum().sum() for frame in (df, df_clean))

print(f"Missing values before: {before}")
print(f"Missing values after: {after}")
print(f"We fixed {before - after} missing values! 🎉")

In [None]:
# Show the imputed DataFrame as the cell's final expression
df_clean