In [4]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

# Set random seed for reproducibility
np.random.seed(42)

# Create sample data
n_samples = 100
data = {
    'SalePrice': np.random.normal(200000, 50000, n_samples),
    'Bedrooms': np.random.randint(1, 6, n_samples),
    'Location': np.random.choice(['Urban', 'Suburban', 'Rural'], n_samples),
    'SquareFootage': np.random.normal(2000, 500, n_samples)
}

# Create DataFrame
df = pd.DataFrame(data)

# Introduce missing values in SquareFootage (20% of the data)
mask = np.random.rand(n_samples) < 0.2
df.loc[mask, 'SquareFootage'] = np.nan

# Save to CSV
df.to_csv('house_sales.csv', index=False)
print("CSV file 'house_sales.csv' has been created.")

# Display the first few rows of original data
print("\nOriginal data:")
print(df.head())
print(f"\nMissing values in SquareFootage: {df['SquareFootage'].isnull().sum()}")

# Prepare data for regression
X = df[['SalePrice', 'Bedrooms', 'Location']].copy()
y = df['SquareFootage']

# One-hot encode the 'Location' column
encoder = OneHotEncoder(drop='first', sparse_output=False)
location_encoded = encoder.fit_transform(X[['Location']])
location_columns = encoder.get_feature_names_out(['Location'])

X = X.drop('Location', axis=1)
X[location_columns] = location_encoded

# Split data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Create and train the regression model
model = LinearRegression()
model.fit(X_train[y_train.notnull()], y_train[y_train.notnull()])

# Impute missing values
df_imputed = df.copy()
missing_mask = df_imputed['SquareFootage'].isnull()
X_missing = X[missing_mask]
df_imputed.loc[missing_mask, 'SquareFootage'] = model.predict(X_missing)

print("\nImputed data:")
print(df_imputed.head())

# Validate imputation
known_mask = ~df['SquareFootage'].isnull()
mse = mean_squared_error(df.loc[known_mask, 'SquareFootage'], 
                         df_imputed.loc[known_mask, 'SquareFootage'])
print(f"\nMean Squared Error for known SquareFootage values: {mse:.2f}")

# Compare distribution of original and imputed data
print("\nOriginal SquareFootage statistics:")
print(df['SquareFootage'].describe())
print("\nImputed SquareFootage statistics:")
print(df_imputed['SquareFootage'].describe())

# Save the imputed data to a new CSV file
df_imputed.to_csv('house_sales_imputed.csv', index=False)
print("\nImputed data saved to 'house_sales_imputed.csv'")

CSV file 'house_sales.csv' has been created.

Original data:
       SalePrice  Bedrooms  Location  SquareFootage
0  224835.707651         1  Suburban            NaN
1  193086.784941         5     Rural            NaN
2  232384.426905         1     Urban    2944.773828
3  276151.492820         3     Urban    1662.097633
4  188292.331264         2     Urban    1543.293731

Missing values in SquareFootage: 15

Imputed data:
       SalePrice  Bedrooms  Location  SquareFootage
0  224835.707651         1  Suburban    2057.719918
1  193086.784941         5     Rural    1883.315781
2  232384.426905         1     Urban    2944.773828
3  276151.492820         3     Urban    1662.097633
4  188292.331264         2     Urban    1543.293731

Mean Squared Error for known SquareFootage values: 0.00

Original SquareFootage statistics:
count      85.000000
mean     2016.105161
std       471.781877
min       953.102998
25%      1727.481192
50%      1954.960683
75%      2265.700005
max      3626.992563
Na