In [None]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression

# --- Configuration ---
# Remote CSV holding the house-price training data (noted by the original
# author as the Kaggle "Ames Housing" training set).
DATA_URL = "https://raw.githubusercontent.com/dataprofessor/data/master/house_price_train.csv"

# TARGET is the column whose gaps get imputed; FEATURES are the predictor
# columns the regression model is fitted on.
TARGET = 'LotFrontage'
FEATURES = ['LotArea', 'OverallQual', 'YearBuilt']

# Widen pandas' console output so the DataFrame previews printed later are
# not truncated too aggressively.
pd.options.display.max_rows = 20
pd.options.display.width = 1000

print("--- Data Imputation Script Starting ---")

# --- Step 1: Load the Dataset ---
# Downloads the CSV at DATA_URL and narrows it to FEATURES + [TARGET].
# On any failure the script prints a diagnostic and stops with exit status 1.
print(f"\n--- Step 1: Loading Dataset from URL ---\n{DATA_URL}")

# Keep the try body limited to the download/parse call so that only genuine
# loading problems are reported as such. OSError covers network/HTTP/file
# errors; ValueError covers pandas' parser, empty-data and decoding errors.
try:
    raw_df = pd.read_csv(DATA_URL)
except (OSError, ValueError) as e:
    print(f"Error loading dataset: {e}")
    print("Please check the DATA_URL or your internet connection.")
    # Non-zero status so callers can detect the failure; unlike exit(),
    # SystemExit does not depend on the interactive `site` helpers.
    raise SystemExit(1)

# Validate the schema separately: a missing column is a configuration
# problem, not a connectivity problem, and deserves its own message.
required_columns = FEATURES + [TARGET]
missing_columns = [col for col in required_columns if col not in raw_df.columns]
if missing_columns:
    print(f"Error: dataset is missing required column(s): {missing_columns}")
    print(f"Available columns: {list(raw_df.columns)}")
    raise SystemExit(1)

# For this exercise, we'll only keep our feature and target columns.
# .copy() makes df an independent frame, so the in-place fill later in the
# script never writes through to (or warns about) the raw download.
df = raw_df[required_columns].copy()

print("\nOriginal Dataset (first 10 rows):")
print(df.head(10))

print("\nMissing values BEFORE imputation:")
print(df.isnull().sum())
print("-----------------------------------")

# --- Step 2: Train Linear Regression Model ---
# Fits a LinearRegression on the rows where TARGET is known, uses it to fill
# the rows where TARGET is missing, then prints the resulting frame.
print(f"\n--- Step 2: Training Model to Predict '{TARGET}' ---")

# LinearRegression cannot fit or predict on NaN inputs, so a row is usable
# only when every feature is present. Compute both masks once, up front, and
# reuse them for the training set and for the rows to impute.
features_present = df[FEATURES].notnull().all(axis=1)
target_missing = df[TARGET].isnull()

# Training set: rows where the target AND all features are known
df_train = df[features_present & ~target_missing]

# Check if we have any data to train on
if df_train.empty:
    print("Error: No complete rows found to train the model. Cannot proceed.")
else:
    X_train = df_train[FEATURES]
    y_train = df_train[TARGET]

    # Initialize and train the model
    model = LinearRegression()
    model.fit(X_train, y_train)

    print(f"Model trained successfully using {FEATURES}.")
    print(f"Model coefficients (for {FEATURES}): {model.coef_}")
    print(f"Model intercept: {model.intercept_}")
    print("-----------------------------------")


    # --- Step 3: Use Trained Model to Fill Missing Values ---
    print(f"\n--- Step 3: Predicting and Filling Missing '{TARGET}' Values ---")

    # Rows that need a value and have everything the model requires
    fillable = target_missing & features_present
    # Rows that need a value but lack at least one feature; they are left
    # as NaN rather than crashing the prediction call
    unfillable_count = int((target_missing & ~features_present).sum())

    if not target_missing.any():
        print("No missing values found to predict.")

    if fillable.any():
        # Use the trained model to predict the target for the fillable rows
        predicted_values = model.predict(df.loc[fillable, FEATURES])

        # Write the predictions back through the same boolean mask so the
        # values line up with the rows they were predicted for
        df.loc[fillable, TARGET] = predicted_values

        print(f"Successfully predicted and filled {len(predicted_values)} missing '{TARGET}' values.")

    if unfillable_count:
        print(f"Warning: {unfillable_count} row(s) with missing '{TARGET}' could not be "
              f"filled because one or more of {FEATURES} is also missing.")
    print("-----------------------------------")


    # --- Step 4: Display the Dataset After Filling ---
    print("\n--- Step 4: Displaying Final Dataset ---")

    print("Dataset AFTER imputation (first 10 rows):")
    # Compare with the "first 10 rows" preview printed in Step 1: entries
    # that were NaN there now hold the model's predictions.
    print(df.head(10))

    print("\nMissing values AFTER imputation:")
    # TARGET shows 0 here unless some rows were reported as unfillable above
    print(df.isnull().sum())
    print("-----------------------------------")
    print("\n--- Data Imputation Script Finished ---")

--- Data Imputation Script Starting ---

--- Step 1: Loading the Dataset ---


KeyError: "None of [Index(['Pclass', 'SibSp', 'Parch', 'Fare', 'Age'], dtype='object')] are in the [columns]"