In [1]:
import warnings

import numpy as np
import pandas as pd
import plotly.graph_objects as go
from plotly.subplots import make_subplots
from sklearn.linear_model import LinearRegression, Ridge
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.model_selection import train_test_split


# Install a blanket "ignore" filter: no message/category/module restriction is
# given, so every warning raised after this point is suppressed.
# NOTE(review): this also hides deprecation and numerical warnings emitted by
# the libraries used below — consider narrowing it to specific categories.
warnings.filterwarnings("ignore")

ImportStringError: import_string() failed for 'config'. Possible reasons are:

- missing __init__.py in a package;
- package or module path not included in sys.path;
- duplicated package or module name taking precedence in sys.path;
- missing module, class, function or variable;

Debugged import:

- 'config' not found.

Original exception:

ModuleNotFoundError: No module named 'config'

In [2]:
# Load data
# NOTE(review): DATA_FOLDER and RAW_DATA_FILENAME are not defined anywhere in
# the visible cells (the recorded run of this cell ended in a NameError).
# They presumably come from a project `config` module — confirm and import
# them in the first cell so the notebook survives Restart & Run All.
raw_csv_path = DATA_FOLDER / RAW_DATA_FILENAME
df = pd.read_csv(raw_csv_path)

# Quick sanity check: overall dimensions and the first ten column names.
print(f"Shape: {df.shape}")
print(f"Columns: {df.columns.tolist()[:10]}...")  # First 10

# Last expression -> rich display of the first rows.
df.head()

NameError: name 'DATA_FOLDER' is not defined

In [None]:
# Quick look at target variable
# Summary statistics of the "SalePrice" column, shown as the cell output.
df.loc[:, "SalePrice"].describe()

In [None]:

# Cell 4: Check missing values (quick overview)
# Count nulls per column, keep only the columns that have at least one,
# and order them from most to fewest missing entries.
null_counts = df.isna().sum()
has_nulls = null_counts > 0
missing = null_counts[has_nulls].sort_values(ascending=False)

print(f"Columns with missing values: {len(missing)}")
print(missing.head(10))

In [None]:

# Cell 5: SIMPLEST POSSIBLE APPROACH - Only use numerical columns
# What this cell actually does:
#   * keeps every numeric-dtype column of df,
#   * removes the target ("SalePrice") from the feature list,
#   * removes the "Id" column when it is present.
# No column is dropped because of its missing-value count here.

numeric_frame = df.select_dtypes(include=[np.number])
numerical_cols = numeric_frame.columns.tolist()

# The target must be among the numeric columns; list.remove raises otherwise.
numerical_cols.remove("SalePrice")

# "Id" is optional: drop it when present, ignore its absence.
try:
    numerical_cols.remove("Id")
except ValueError:
    pass

print(f"Using {len(numerical_cols)} numerical features")

# Independent copies of the feature matrix and the target vector.
X = df.loc[:, numerical_cols].copy()
y = df.loc[:, "SalePrice"].copy()

In [None]:

# Cell 6: Simple imputation - fill missing with median
# NOTE(review): the medians are taken over every row of X, i.e. before any
# train/test partitioning, so all rows contribute to the fill values.
column_medians = X.median()
X = X.fillna(column_medians)

# Total number of nulls still present across the whole feature matrix.
remaining_nulls = X.isnull().sum().sum()

print(f"Final X shape: {X.shape}")
print(f"Any missing values left? {remaining_nulls}")

In [None]:

# Cell 7: Train-test split (with leakage-free median imputation)
# Split the *un-imputed* numeric features taken straight from df, so that the
# fill values can be learned from the training rows only. Filling X with
# medians computed over all rows before splitting lets held-out rows influence
# the training features and makes the test metrics optimistic.
# The input has the same length and the same random_state is used, so the
# same rows end up in each partition as when splitting X directly.
X_train, X_test, y_train, y_test = train_test_split(
    df[numerical_cols], y, test_size=0.2, random_state=42
)

# Learn the per-column medians on the training partition and apply them to
# both partitions (the test set is transformed, never fitted on).
train_medians = X_train.median()
X_train = X_train.fillna(train_medians)
X_test = X_test.fillna(train_medians)

# A column with no observed value in the training rows has a NaN median and
# would stay unfilled; fail here with a clear message instead of inside fit().
assert not X_train.isnull().any().any(), "NaNs remain in X_train after imputation"
assert not X_test.isnull().any().any(), "NaNs remain in X_test after imputation"

print(f"Train size: {len(X_train)}")
print(f"Test size: {len(X_test)}")

In [None]:

# Cell 8: Train baseline model - Simple Linear Regression
# fit() hands back the fitted estimator, so creation and fitting are chained.
model = LinearRegression().fit(X_train, y_train)

# Make predictions for the training rows first, then for the held-out rows.
y_train_pred, y_test_pred = (
    model.predict(features) for features in (X_train, X_test)
)

print("Model trained!")


In [None]:

# Cell 9: Evaluate
def evaluate(y_true, y_pred, dataset_name):
    """Print and return regression metrics for one set of predictions.

    Args:
        y_true: Observed target values.
        y_pred: Predicted target values, aligned with ``y_true``.
        dataset_name: Label used in the printed report header.

    Returns:
        A ``(mae, rmse, r2)`` tuple: mean absolute error, root mean squared
        error, and the R² score, in that order.
    """
    abs_error = mean_absolute_error(y_true, y_pred)
    # RMSE is the square root of the mean squared error.
    root_sq_error = np.sqrt(mean_squared_error(y_true, y_pred))
    fit_quality = r2_score(y_true, y_pred)

    report_lines = (
        f"\n{dataset_name} Results:",
        f"  MAE:  ${abs_error:,.2f}",
        f"  RMSE: ${root_sq_error:,.2f}",
        f"  R²:   {fit_quality:.4f}",
    )
    for report_line in report_lines:
        print(report_line)

    return abs_error, root_sq_error, fit_quality


# Score the in-sample predictions first, then the held-out predictions;
# each result is a (mae, rmse, r2) tuple.
train_results = evaluate(y_true=y_train, y_pred=y_train_pred, dataset_name="TRAIN")
test_results = evaluate(y_true=y_test, y_pred=y_test_pred, dataset_name="TEST")

In [None]:

# Cell 10: Visualize predictions
# Residual = actual minus predicted, for the held-out rows.
residuals = y_test - y_test_pred

# Two side-by-side panels: predicted-vs-actual (left) and residuals (right).
fig = make_subplots(
    rows=1,
    cols=2,
    subplot_titles=("Predictions vs Actual", "Residual Plot"),
)

# Styling shared by both scatter clouds and by both dashed reference lines.
point_style = {"color": "blue", "opacity": 0.5}
reference_style = {"color": "red", "dash": "dash", "width": 2}

# End points of the diagonal (left panel) and of the zero line (right panel).
min_val, max_val = y_test.min(), y_test.max()
predicted_span = [y_test_pred.min(), y_test_pred.max()]

# (trace, subplot column) pairs, listed in the order they are added.
panel_traces = [
    # Left plot: Predictions vs Actual
    (
        go.Scatter(
            x=y_test,
            y=y_test_pred,
            mode="markers",
            marker=dict(point_style),
            name="Predictions",
        ),
        1,
    ),
    # Perfect prediction line (y = x over the range of actual values)
    (
        go.Scatter(
            x=[min_val, max_val],
            y=[min_val, max_val],
            mode="lines",
            line=dict(reference_style),
            name="Perfect Prediction",
        ),
        1,
    ),
    # Right plot: Residuals against predicted values
    (
        go.Scatter(
            x=y_test_pred,
            y=residuals,
            mode="markers",
            marker=dict(point_style),
            name="Residuals",
        ),
        2,
    ),
    # Zero line across the range of predicted values
    (
        go.Scatter(
            x=predicted_span,
            y=[0, 0],
            mode="lines",
            line=dict(reference_style),
            name="Zero Line",
        ),
        2,
    ),
]
for trace, panel_col in panel_traces:
    fig.add_trace(trace, row=1, col=panel_col)

# Axis labels per subplot column: (x title, y title).
axis_titles = {
    1: ("Actual Price", "Predicted Price"),
    2: ("Predicted Price", "Residuals"),
}
for panel_col, (x_title, y_title) in axis_titles.items():
    fig.update_xaxes(title_text=x_title, row=1, col=panel_col)
    fig.update_yaxes(title_text=y_title, row=1, col=panel_col)

# Overall figure size, legend and title.
fig.update_layout(
    height=500,
    width=1200,
    showlegend=True,
    title_text="Baseline Model Results",
)

fig.show()

In [None]:

# Cell 11: Check for overfitting
# Index 2 of each results tuple is the R² score; a train R² that exceeds the
# test R² by more than 0.1 is reported as possible overfitting.
train_r2 = train_results[2]
test_r2 = test_results[2]
r2_gap = train_r2 - test_r2

print("\nOverfitting Check:")
print(f"Train R²: {train_r2:.4f}")
print(f"Test R²:  {test_r2:.4f}")
print(f"Difference: {r2_gap:.4f}")

verdict = (
    "⚠️  Possible overfitting detected!"
    if r2_gap > 0.1
    else "✅ No significant overfitting"
)
print(verdict)

In [None]:

# Cell 12: Save results for comparison
# The summary is collected in an in-memory dict and printed; nothing is
# written to disk in this cell.
# Results tuples are (mae, rmse, r2): index 0 is MAE, index 2 is R².
train_mae, train_r2 = train_results[0], train_results[2]
test_mae, test_r2 = test_results[0], test_results[2]

results = dict(
    model="LinearRegression",
    features="numerical_only",
    n_features=len(numerical_cols),
    train_mae=train_mae,
    test_mae=test_mae,
    train_r2=train_r2,
    test_r2=test_r2,
)

print("\n📊 BASELINE RESULTS:")
for metric_name, metric_value in results.items():
    print(f"  {metric_name}: {metric_value}")
