In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

pd.set_option('display.max_columns', None)

# Global plot styling. `sns.set` is only kept as an alias of `set_theme`,
# so call the documented entry point directly.
sns.set_theme(style="whitegrid", context="talk")


In [None]:
house_data_train = pd.read_csv('data/train.csv')

In [None]:
df = house_data_train.copy()

# Basic df understanding

In [None]:
df.info()

In [None]:
df.describe(include='number')

Many features are heavily skewed: max >> 75th percentile

TODO: Consider transforming those features

In [None]:
df.describe(include='object')

## Studying the target: SalePrice

In [None]:
df['SalePrice'].describe()

In [None]:
sns.histplot(data=df, x='SalePrice', kde=True)

Right skewed SalePrice

# Feature type selection

In [None]:
numerical_features = [
    'LotFrontage',
    'LotArea',
    'MasVnrArea',
    'BsmtFinSF1',
    'BsmtFinSF2',
    'BsmtUnfSF',
    'TotalBsmtSF',
    '1stFlrSF',
    '2ndFlrSF',
    'LowQualFinSF',
    'GrLivArea',
    'BsmtFullBath',
    'BsmtHalfBath',
    'FullBath',
    'TotRmsAbvGrd',
    'Fireplaces',
    'GarageCars',
    'GarageArea',
    'WoodDeckSF',
    'OpenPorchSF',
    'EnclosedPorch',
    '3SsnPorch',
    'ScreenPorch',
    'PoolArea',
    'MiscVal',
    'HalfBath',
    'BedroomAbvGr',
    'KitchenAbvGr',

    # Datetime features
    'YearBuilt',
    'YearRemodAdd', 
    'GarageYrBlt',
    'MoSold',
    'YrSold',
    
    # Target feature
    'SalePrice'
]

cat_ordinal_features = [
    'OverallQual', # Already int
    'OverallCond', # Already int
    'KitchenQual',
    'ExterQual',
    'ExterCond',
    'BsmtQual',
    'BsmtCond',
    'BsmtExposure',
    'BsmtFinType1',
    'BsmtFinType2',
    'HeatingQC',
    'FireplaceQu',
    'GarageQual',
    'GarageCond',
    'PoolQC',
]

cat_nominal_features = [
    'MSSubClass',
    'MSZoning',
    'Alley',
    'LotShape',
    'LandContour',
    'Utilities',
    'LotConfig',
    'LandSlope',
    'Neighborhood',
    'Condition1',
    'Condition2',
    'BldgType',
    'HouseStyle',
    'RoofStyle',
    'RoofMatl',
    'Exterior1st',
    'Exterior2nd',
    'MasVnrType',
    'Foundation',
    'Heating',
    'CentralAir',
    'Electrical',
    'Functional',
    'GarageType',
    'PavedDrive',
    'Fence',
    'SaleType',
    'SaleCondition',
    'Street',
    'GarageFinish',
    'MiscFeature'
]

categorical_features = cat_nominal_features + cat_ordinal_features

In [None]:
# Sanity check: report every DataFrame column that was not assigned to one of
# the three feature groups defined above.

all_features = [*numerical_features, *cat_ordinal_features, *cat_nominal_features]
col_sum = len(all_features)  # total number of selected features across the groups

unassigned_columns = [name for name in df.columns if name not in all_features]
for col in unassigned_columns:
    print(f"{col}: not present in selected features")

Id column is not needed

In [None]:
df = df[all_features]

# Numerical features analysis

In [None]:
df_num = df[numerical_features]

In [None]:
df_num.describe()

In [None]:
df_num.info()

In [None]:
from typing import Iterator

def get_next_feature_from_iter(features: Iterator):
    """
    Pull the next feature name from ``features``.

    Args:
      features: an iterator of feature names; one item is consumed per call.

    Returns:
      The next item, or None (after printing "No features left") when the
      iterator is exhausted.
    """
    exhausted = object()  # unique marker, cannot collide with a real item
    upcoming = next(features, exhausted)
    if upcoming is exhausted:
        print("No features left")
        return None
    return upcoming

def create_numerical_viz_iter(df, features, target_feature):
    """
    Plot a diagnostics grid for the next numerical feature yielded by ``features``.

    Panels (3x2 grid; the sixth cell is unused and hidden):
      1) Histogram + KDE of the feature
      2) Boxplot of the feature
      3) Scatter: feature vs target
      4) Scatter: feature vs log1p(target)
      5) Scatter: log1p(feature) vs log1p(target)

    Args:
      df: DataFrame holding the feature columns and ``target_feature``.
      features: iterator of column names; exactly one name is consumed per call.
      target_feature: name of the target column.

    Returns:
      The name of the feature that was plotted, or None when the iterator is
      exhausted or the next name is the target itself.
    """
    feature = get_next_feature_from_iter(features)

    # Exhausted iterator: the helper has already printed a message. Without this
    # guard the plotting code below would be called with feature=None and fail.
    if feature is None:
        return None

    if feature == target_feature:
        print("All features visualized")
        return None

    fig, axes = plt.subplots(3, 2, figsize=(14, 9))
    ax1, ax2, ax3, ax4, ax5, unused_ax = axes.ravel()
    unused_ax.set_visible(False)  # only five panels are drawn

    fig.suptitle(f'Feature diagnostics: {feature}', fontsize=30)

    # Hist + KDE
    sns.histplot(data=df, x=feature, kde=True, ax=ax1)
    ax1.set_title("Histogram + KDE")
    ax1.set_xlabel(feature)
    ax1.set_ylabel("Count")

    # Boxplot
    sns.boxplot(data=df, x=feature, ax=ax2)
    ax2.set_title("Boxplot")
    ax2.set_xlabel(feature)
    ax2.set_ylabel("Value")

    # Scatter: feature vs target
    sns.scatterplot(data=df, x=feature, y=target_feature, ax=ax3)
    ax3.set_title(f"Scatter: {feature} vs {target_feature}")
    ax3.set_xlabel(feature)
    ax3.set_ylabel(target_feature)

    # The log-scale panels only use rows where both feature and target are present.
    log_feature = f"log1p_{feature}"
    log_target_feature = f"log1p_{target_feature}"
    paired = df[[feature, target_feature]].dropna()
    paired = paired.assign(
        **{
            log_feature: np.log1p(paired[feature]),
            log_target_feature: np.log1p(paired[target_feature]),
        }
    )

    # Scatter feature vs log1p(target)
    sns.scatterplot(data=paired, x=feature, y=log_target_feature, ax=ax4)
    ax4.set_title(f"Scatter: {feature} vs {log_target_feature}")
    ax4.set_xlabel(feature)
    ax4.set_ylabel(log_target_feature)

    # Scatter log1p(feature) vs log1p(target)
    sns.scatterplot(data=paired, x=log_feature, y=log_target_feature, ax=ax5)
    ax5.set_title(f"Scatter: {log_feature} vs {log_target_feature}")
    ax5.set_xlabel(log_feature)
    ax5.set_ylabel(log_target_feature)

    plt.tight_layout()
    plt.show()

    return feature


Shape before removing outliers

In [None]:
df.shape

In [None]:
num_features_iter = iter(df_num.columns)

In [None]:
create_numerical_viz_iter(df, num_features_iter, 'SalePrice')

Some outliers in LotFrontage > 300

In [None]:
df[df['LotFrontage'] > 300]

In [None]:
# Drop the LotFrontage > 300 outliers by condition rather than by hard-coded row
# labels, so the cell does not raise KeyError when it is run a second time.
# NOTE(review): assumes labels 934 and 1298 were exactly the rows matched by the
# `LotFrontage > 300` filter in the previous cell - confirm against its output.
df = df.drop(index=df[df['LotFrontage'] > 300].index)

In [None]:
create_numerical_viz_iter(df, num_features_iter, 'SalePrice')

In [None]:
# Dropping huge outliers
df = df[df['LotArea'] <= 100000]

In [None]:
create_numerical_viz_iter(df, num_features_iter, 'SalePrice')

In [None]:
# Remove only the real outliers. `NaN <= 1250` evaluates to False, so a plain
# `<=` filter would also silently drop every row whose MasVnrArea is missing;
# those rows are kept here so the missing-value handling can deal with them.
df = df[df['MasVnrArea'].isna() | (df['MasVnrArea'] <= 1250)]

In [None]:
df.shape

In [None]:
create_numerical_viz_iter(df, num_features_iter, 'SalePrice')

In [None]:
create_numerical_viz_iter(df, num_features_iter, 'SalePrice')

Will it be useful?

In [None]:
create_numerical_viz_iter(df, num_features_iter, 'SalePrice')

In [None]:
create_numerical_viz_iter(df, num_features_iter, 'SalePrice')

In [None]:
create_numerical_viz_iter(df, num_features_iter, 'SalePrice')

In [None]:
create_numerical_viz_iter(df, num_features_iter, 'SalePrice')

In [None]:
create_numerical_viz_iter(df, num_features_iter, 'SalePrice')

Will it be useful?

In [None]:
create_numerical_viz_iter(df, num_features_iter, 'SalePrice')

In [None]:
create_numerical_viz_iter(df, num_features_iter, 'SalePrice')

TODO: Consider feature engineering for this feature

In [None]:
create_numerical_viz_iter(df, num_features_iter, 'SalePrice')

TODO: Consider feature engineering for this feature

In [None]:
create_numerical_viz_iter(df, num_features_iter, 'SalePrice')

TODO: Consider feature engineering for this feature

In [None]:
create_numerical_viz_iter(df, num_features_iter, 'SalePrice')

In [None]:
create_numerical_viz_iter(df, num_features_iter, 'SalePrice')

In [None]:
create_numerical_viz_iter(df, num_features_iter, 'SalePrice')

In [None]:
create_numerical_viz_iter(df, num_features_iter, 'SalePrice')

TODO: Consider feature engineering for Garage Area

In [None]:
create_numerical_viz_iter(df, num_features_iter, 'SalePrice')

In [None]:
df[df['OpenPorchSF'] > 500][[col for col in df.columns if 'porch' in col.lower()]]

In [None]:
create_numerical_viz_iter(df, num_features_iter, 'SalePrice')

In [None]:
df[df['EnclosedPorch'] > 500][[col for col in df.columns if 'porch' in col.lower()]]

In [None]:
# Drop the EnclosedPorch > 500 outlier by condition rather than by a hard-coded
# row label, so the cell does not raise KeyError when it is run a second time.
# NOTE(review): assumes label 197 was the only row matched by the
# `EnclosedPorch > 500` filter in the previous cell - confirm against its output.
df = df.drop(index=df[df['EnclosedPorch'] > 500].index)

In [None]:
create_numerical_viz_iter(df, num_features_iter, 'SalePrice')

TODO: Will it be useful?

In [None]:
create_numerical_viz_iter(df, num_features_iter, 'SalePrice')

In [None]:
create_numerical_viz_iter(df, num_features_iter, 'SalePrice')

TODO: Consider feature engineering; will this feature be useful?

In [None]:
create_numerical_viz_iter(df, num_features_iter, 'SalePrice')

TODO: Consider feature engineering; will this feature be useful?

In [None]:
create_numerical_viz_iter(df, num_features_iter, 'SalePrice')

In [None]:
create_numerical_viz_iter(df, num_features_iter, 'SalePrice')

In [None]:
create_numerical_viz_iter(df, num_features_iter, 'SalePrice')

In [None]:
create_numerical_viz_iter(df, num_features_iter, 'SalePrice')

In [None]:
df[(df['YearBuilt'] < 1900) & (df['SalePrice'] > 400000)] # OverallQual 10 , OverallCond 9
# House in excellent condition

In [None]:
create_numerical_viz_iter(df, num_features_iter, 'SalePrice')

In [None]:
create_numerical_viz_iter(df, num_features_iter, 'SalePrice')

In [None]:
create_numerical_viz_iter(df, num_features_iter, 'SalePrice')

TODO: Will it be useful?

In [None]:
create_numerical_viz_iter(df, num_features_iter, 'SalePrice')

TODO: Consider feature engineering

In [None]:
create_numerical_viz_iter(df, num_features_iter, 'SalePrice')

In [None]:
df.shape

Removed 17 outliers

In [None]:
# Saving df after numerical features analysis
df.to_csv('data/df_after_numerical.csv', index=False)

# Categorical features analysis

## Nominal feature diagnostics

In [None]:
from typing import Iterator

def get_next_feature_from_iter(features: Iterator):
    """
    Return the next feature name from ``features``, or None once it is exhausted.

    NOTE(review): the same helper is already defined earlier in this notebook;
    this cell redefines it with identical behavior.

    Args:
      features: an iterator of feature names; one item is consumed per call.

    Returns:
      The next item, or None (after printing "No features left") when the
      iterator has no items left.
    """
    try:
        upcoming = next(features)
    except StopIteration:
        print("No features left")
        upcoming = None
    return upcoming

def plot_nominal_feature_diagnostics(
    df: pd.DataFrame,
    features: Iterator,
    target_feature: str,
    max_levels: int = 20,
):
    """
    Nominal (unordered) categorical feature diagnostics with multiple subplots.

    One feature name is consumed from ``features`` per call.

    Subplots (2x2):
    1) Category frequency (bar)
    2) Mean target by category (bar, sorted)
    3) Target distribution by category (boxplot, categories sorted by median)
    4) Count vs median target (bubble-ish scatter; helps spot rare-but-extreme levels)

    Notes:
    - NaNs are treated as "Missing".
    - If there are more than ``max_levels`` levels, keeps the top
      ``max_levels - 1`` by frequency and groups the rest as "Other".

    Args:
      df: DataFrame containing the feature columns and ``target_feature``.
      features: iterator yielding feature (column) names.
      target_feature: name of the target column.
      max_levels: maximum number of categories shown, including "Other".

    Returns:
      The name of the plotted feature, or None when the iterator is exhausted
      or the next name is the target itself.

    Raises:
      ValueError: if ``max_levels`` is smaller than 1.
    """
    # Validate before touching the iterator so a bad call does not skip a feature.
    if max_levels < 1:
        raise ValueError("max_levels must be at least 1")

    feature = get_next_feature_from_iter(features)

    if not feature:
        return None

    if feature == target_feature:
        print('Feature is the same as target feature')
        return None

    d = df[[feature, target_feature]].copy()
    # Cast to object first so the "Missing" label can be stored in any column dtype.
    d[feature] = d[feature].astype("object").where(d[feature].notna(), "Missing")

    # Group rare levels if too many unique
    vc = d[feature].value_counts(dropna=False)
    if len(vc) > max_levels:
        top = set(vc.index[: max_levels - 1])
        d[feature] = d[feature].where(d[feature].isin(top), "Other")

    stats = (
        d.groupby(feature)[target_feature]
        .agg(count="size", mean="mean", median="median", std="std")
        .reset_index()
    )

    # Each panel gets its own category ordering.
    order_count = stats.sort_values("count", ascending=False)[feature].tolist()
    order_mean = stats.sort_values("mean", ascending=False)[feature].tolist()
    order_median = stats.sort_values("median", ascending=False)[feature].tolist()

    fig, axes = plt.subplots(2, 2, figsize=(14, 9))
    ax1, ax2, ax3, ax4 = axes.ravel()

    # 1) Frequency
    sns.countplot(data=d, x=feature, order=order_count, ax=ax1)
    ax1.set_title(f"{feature}: Category frequency")
    ax1.set_xlabel(feature)
    ax1.set_ylabel("Count")
    ax1.tick_params(axis="x", rotation=45)

    # 2) Mean target by category
    sns.barplot(data=d, x=feature, y=target_feature, order=order_mean, estimator=np.mean, ax=ax2)
    ax2.set_title(f"{feature}: Mean {target_feature} by category")
    ax2.set_xlabel(feature)
    ax2.set_ylabel(f"Mean {target_feature}")
    ax2.tick_params(axis="x", rotation=45)

    # 3) Target distribution by category
    sns.boxplot(data=d, x=feature, y=target_feature, order=order_median, ax=ax3)
    ax3.set_title(f"{feature}: {target_feature} distribution (sorted by median)")
    ax3.set_xlabel(feature)
    ax3.set_ylabel(target_feature)
    ax3.tick_params(axis="x", rotation=45)

    # 4) Bubble scatter: count vs median target
    bubble = stats.copy()
    # Clip so a single dominant category does not dwarf the other markers.
    bubble["size"] = bubble["count"].clip(lower=10, upper=300)
    sns.scatterplot(data=bubble, x="count", y="median", size="size", sizes=(40, 400), ax=ax4, legend=False)
    for count, median, label in zip(bubble["count"], bubble["median"], bubble[feature]):
        ax4.text(count, median, str(label), fontsize=8, ha="left", va="bottom")
    ax4.set_title(f"{feature}: Count vs median {target_feature}")
    ax4.set_xlabel("Count")
    ax4.set_ylabel(f"Median {target_feature}")

    fig.tight_layout()
    plt.show()
    return feature

In [None]:
cat_nominal_features_iter = iter(cat_nominal_features)

In [None]:
plot_nominal_feature_diagnostics(df, cat_nominal_features_iter, 'SalePrice')

- Huge outliers for class 60, many outliers in the rest of the classes
- Rare 180 category with low median sale value

In [None]:
df[(df['MSSubClass'] == 60) & (df['SalePrice'] > 600000)]

Houses in excellent condition

In [None]:
df[(df['MSSubClass'] == 20) & (df['SalePrice'] > 600000)]

Again house in excellent condition

In [None]:
df[df['MSSubClass'] == 180]

In [None]:
df[df['MSSubClass'] == 180]['SalePrice']

In [None]:
plot_nominal_feature_diagnostics(df, cat_nominal_features_iter, 'SalePrice')

- Huge outliers for RL class
- low median low count for C(all) class

In [None]:
df[(df['MSZoning'] == 'RL') & (df['SalePrice'] > 620000)].index

In [None]:
df = df.drop(index=df[(df['MSZoning'] == 'RL') & (df['SalePrice'] > 620000)].index)

In [None]:
df.shape

In [None]:
plot_nominal_feature_diagnostics(df, cat_nominal_features_iter, 'SalePrice')

- outliers for IR1 and Reg class
- high median low count for class IR2 IR3

In [None]:
df[(df['LotShape'] == 'IR1') & (df['SalePrice'] > 550000)]

In [None]:
df[df['LotShape'] == 'IR3']

In [None]:
df[df['LotShape'] == 'IR2']


In [None]:
plot_nominal_feature_diagnostics(df, cat_nominal_features_iter, 'SalePrice')


- Some outliers in Lvl class
- Low median low count Bnk
- High median low count HLS

In [None]:
df[(df['LandContour'] == 'Lvl') & (df['SalePrice'] > 550000)]

In [None]:
plot_nominal_feature_diagnostics(df, cat_nominal_features_iter, 'SalePrice')

In [None]:
plot_nominal_feature_diagnostics(df, cat_nominal_features_iter, 'SalePrice')

In [None]:
plot_nominal_feature_diagnostics(df, cat_nominal_features_iter, 'SalePrice')

In [None]:
plot_nominal_feature_diagnostics(df, cat_nominal_features_iter, 'SalePrice')

- huge outlier for OldTown class

In [None]:
df[(df['Neighborhood'] == 'OldTown') & (df['SalePrice'] > 400000)]

In [None]:
plot_nominal_feature_diagnostics(df, cat_nominal_features_iter, 'SalePrice')

In [None]:
def display_df_with_ge_sale_price(df, value):
    """
    Display the rows of ``df`` whose 'SalePrice' is above ``value``.

    Despite the "ge" in the name, the comparison is strict (``>``): rows whose
    price equals ``value`` are not shown.

    Args:
      df: DataFrame with a 'SalePrice' column.
      value: price threshold.
    """
    above_threshold = df['SalePrice'] > value
    display(df.loc[above_threshold])

In [None]:
display_df_with_ge_sale_price(df[df['Condition1'] == 'Norm'], 530000)

In [None]:
display_df_with_ge_sale_price(df[df['Condition1'] == 'Artery'], 400000)

In [None]:
plot_nominal_feature_diagnostics(df, cat_nominal_features_iter, 'SalePrice')

In [None]:
display_df_with_ge_sale_price(df[df['Condition2'] == 'Norm'], 550000)

In [None]:
plot_nominal_feature_diagnostics(df, cat_nominal_features_iter, 'SalePrice')

In [None]:
plot_nominal_feature_diagnostics(df, cat_nominal_features_iter, 'SalePrice')

In [None]:
plot_nominal_feature_diagnostics(df, cat_nominal_features_iter, 'SalePrice')

In [None]:
plot_nominal_feature_diagnostics(df, cat_nominal_features_iter, 'SalePrice')

In [None]:
plot_nominal_feature_diagnostics(df, cat_nominal_features_iter, 'SalePrice')

In [None]:
plot_nominal_feature_diagnostics(df, cat_nominal_features_iter, 'SalePrice')

In [None]:
display_df_with_ge_sale_price(df[df['Exterior2nd'] == 'Wd Shng'], 400000)

In [None]:
plot_nominal_feature_diagnostics(df, cat_nominal_features_iter, 'SalePrice')

In [None]:
plot_nominal_feature_diagnostics(df, cat_nominal_features_iter, 'SalePrice')

In [None]:
plot_nominal_feature_diagnostics(df, cat_nominal_features_iter, 'SalePrice')

In [None]:
plot_nominal_feature_diagnostics(df, cat_nominal_features_iter, 'SalePrice')

In [None]:
plot_nominal_feature_diagnostics(df, cat_nominal_features_iter, 'SalePrice')

In [None]:
plot_nominal_feature_diagnostics(df, cat_nominal_features_iter, 'SalePrice')

In [None]:
display_df_with_ge_sale_price(df[df['Functional'] == 'Mod'], 400000)

In [None]:
plot_nominal_feature_diagnostics(df, cat_nominal_features_iter, 'SalePrice')

In [None]:
plot_nominal_feature_diagnostics(df, cat_nominal_features_iter, 'SalePrice')

In [None]:
plot_nominal_feature_diagnostics(df, cat_nominal_features_iter, 'SalePrice')

In [None]:
display_df_with_ge_sale_price(df[df['Fence'] == 'GdPrv'], 400000)

In [None]:
plot_nominal_feature_diagnostics(df, cat_nominal_features_iter, 'SalePrice')

In [None]:
display_df_with_ge_sale_price(df[df['SaleType'] == 'ConLI'], 400000)

In [None]:
plot_nominal_feature_diagnostics(df, cat_nominal_features_iter, 'SalePrice')

In [None]:
plot_nominal_feature_diagnostics(df, cat_nominal_features_iter, 'SalePrice')

In [None]:
plot_nominal_feature_diagnostics(df, cat_nominal_features_iter, 'SalePrice')


In [None]:
plot_nominal_feature_diagnostics(df, cat_nominal_features_iter, 'SalePrice')

In [None]:
df.to_csv('data/df_after_numerical_category_nom.csv', index=False)

## Ordinal feature diagnostics

In [None]:
def plot_ordinal_feature_diagnostics(df: pd.DataFrame, features: Iterator, target_feature: str):
    """
    Ordinal (ordered) categorical feature diagnostics for integer-coded levels.

    NOTE(review): a later cell in this notebook defines a function with the same
    name; when the notebook is run top to bottom, that later definition replaces
    this one.

    Creates a 2x2 grid:
      1) Frequency by level (in order)
      2) Boxplot of target by level (in order)
      3) Median target trend (line/point)
      4) Mean target with +/- 1 std (error bars)

    Level handling:
      - Values are cast to int; missing values become the sentinel level -1.
        Non-numeric values (e.g. "Gd") make the cast raise ValueError.
      - Levels are shown as strings on the x-axis.

    Ordering:
      - Uses ordered pandas.Categorical order if present (-1 first when present).
      - Else, if every level lies in -1..10, uses ascending numeric order.
      - Else falls back to ordering levels by median target (low->high).

    Args:
      df: DataFrame containing the feature columns and ``target_feature``.
      features: iterator yielding feature names; one name is consumed per call.
      target_feature: target column name.

    Returns:
      The name of the plotted feature, or None when nothing was plotted.
    """
    feature = get_next_feature_from_iter(features)

    if not feature:
        return None

    if feature == target_feature:
        print('Feature is the same as target feature')
        return None

    d = df[[feature, target_feature]].copy()
    orig = df[feature]

    # Replace missing values *before* the integer cast (astype(int) raises on
    # NaN), then use string labels everywhere so the plotted column, `order`
    # and the stats index all share one type.
    d[feature] = (
        d[feature]
        .astype("object")
        .where(d[feature].notna(), -1)
        .astype(int)
        .astype(str)
    )

    present = set(d[feature].unique())
    quality = [str(level) for level in range(-1, 11)]

    # Determine order
    if isinstance(orig.dtype, pd.CategoricalDtype) and orig.dtype.ordered:
        order = [str(int(x)) for x in orig.dtype.categories]
        if "-1" in present and "-1" not in order:
            order = ["-1"] + order
    elif present.issubset(quality):
        # Known scale: ascending numeric order, restricted to observed levels.
        order = [level for level in quality if level in present]
    else:
        # fallback: order by median target
        med = d.groupby(feature)[target_feature].median().sort_values()
        order = med.index.tolist()

    # Ensure all present levels are included
    for lvl in d[feature].unique():
        if lvl not in order:
            order.append(lvl)

    # Precompute stats in order
    stats = (
        d.groupby(feature)[target_feature]
        .agg(count="size", mean="mean", median="median", std="std")
        .reindex(order)
        .rename_axis(feature)
        .reset_index()
    )

    fig, axes = plt.subplots(2, 2, figsize=(14, 9))
    ax1, ax2, ax3, ax4 = axes.ravel()

    # 1) Frequency
    sns.countplot(data=d, x=feature, order=order, ax=ax1)
    ax1.set_title(f"{feature} (ordinal): Frequency by level")
    ax1.set_xlabel(feature)
    ax1.set_ylabel("Count")
    ax1.tick_params(axis="x", rotation=45)

    # 2) Boxplot
    sns.boxplot(data=d, x=feature, y=target_feature, order=order, ax=ax2)
    ax2.set_title(f"{feature} (ordinal): {target_feature} distribution by level")
    ax2.set_xlabel(feature)
    ax2.set_ylabel(target_feature)
    ax2.tick_params(axis="x", rotation=45)

    # 3) Median trend
    sns.pointplot(data=stats, x=feature, y="median", order=order, ax=ax3)
    ax3.set_title(f"{feature} (ordinal): Median {target_feature} trend")
    ax3.set_xlabel(feature)
    ax3.set_ylabel(f"Median {target_feature}")
    ax3.tick_params(axis="x", rotation=45)

    # 4) Mean +/- 1 std (error bars)
    positions = np.arange(len(order))
    ax4.errorbar(
        x=positions,
        y=stats["mean"].values,
        yerr=stats["std"].values,
        fmt="o-",
        capsize=3,
    )
    ax4.set_xticks(positions)
    ax4.set_xticklabels(order, rotation=45, ha="right")
    ax4.set_title(f"{feature} (ordinal): Mean {target_feature} ± 1 std")
    ax4.set_xlabel(feature)
    ax4.set_ylabel(f"Mean {target_feature}")

    fig.tight_layout()
    plt.show()
    return feature

In [None]:
def plot_ordinal_feature_diagnostics(
    df: pd.DataFrame,
    features: Iterator,
    target_feature: str,
):
    """
    Plot a 2x2 diagnostics grid for the next integer-encoded ordinal feature.

    One name is pulled from ``features`` per call. The column is coerced to
    numeric; anything that cannot be parsed, and any missing value, is mapped
    to level -1 and labelled "Unknown/NA (-1)".

    Panels:
      1) Frequency by encoded level
      2) Boxplot of the target by encoded level
      3) Median target per encoded level
      4) Mean target per encoded level with +/- 1 std error bars

    Levels are shown in ascending order, with -1 first when it is present.

    Args:
      df: DataFrame containing the encoded columns and the target.
      features: iterator yielding feature names.
      target_feature: target column name.

    Returns:
      The feature name plotted, or None if nothing was plotted.
    """
    UNKNOWN_LEVEL = -1
    LABEL_COL = "_lvl_label_"

    column = get_next_feature_from_iter(features)
    if not column:
        return None

    if column == target_feature:
        print("Feature is the same as target feature")
        return None

    if any(name not in df.columns for name in (column, target_feature)):
        print("Feature or target_feature not found in df")
        return None

    frame = df[[column, target_feature]].copy()

    # Coerce to numeric, then send NaN / unparsable entries to the unknown level.
    frame[column] = (
        pd.to_numeric(frame[column], errors="coerce")
        .fillna(UNKNOWN_LEVEL)
        .astype(int)
    )

    # Ascending levels, with the unknown level (if any) moved to the front.
    observed = sorted(frame[column].unique().tolist())
    ordered_levels = [lvl for lvl in observed if lvl == UNKNOWN_LEVEL]
    ordered_levels += [lvl for lvl in observed if lvl != UNKNOWN_LEVEL]

    # Human-readable tick label for every level, in display order.
    label_for = {
        lvl: "Unknown/NA (-1)" if lvl == UNKNOWN_LEVEL else str(lvl)
        for lvl in ordered_levels
    }
    frame[LABEL_COL] = frame[column].map(label_for)
    tick_labels = list(label_for.values())

    # Per-level summary statistics, aligned with the display order.
    per_level = frame.groupby(LABEL_COL)[target_feature].agg(
        count="size", mean="mean", median="median", std="std"
    )
    stats = (
        per_level.reindex(tick_labels)
        .reset_index()
        .rename(columns={LABEL_COL: "level"})
    )

    fig, axes = plt.subplots(2, 2, figsize=(14, 9))
    freq_ax, box_ax, median_ax, mean_ax = axes.ravel()

    # 1) Frequency
    sns.countplot(data=frame, x=LABEL_COL, order=tick_labels, ax=freq_ax)
    freq_ax.set_title(f"{column} (encoded ordinal): Frequency by level")
    freq_ax.set_ylabel("Count")

    # 2) Boxplot
    sns.boxplot(data=frame, x=LABEL_COL, y=target_feature, order=tick_labels, ax=box_ax)
    box_ax.set_title(f"{column} (encoded ordinal): {target_feature} distribution by level")
    box_ax.set_ylabel(target_feature)

    for categorical_ax in (freq_ax, box_ax):
        categorical_ax.set_xlabel("Encoded level")
        categorical_ax.tick_params(axis="x", rotation=45)

    # 3) Median trend and 4) mean +/- 1 std are drawn on numeric positions
    # and then labelled with the level names.
    positions = np.arange(len(tick_labels))

    median_ax.plot(positions, stats["median"].values, marker="o")
    median_ax.set_title(f"{column} (encoded ordinal): Median {target_feature} trend")
    median_ax.set_ylabel(f"Median {target_feature}")

    mean_ax.errorbar(
        x=positions,
        y=stats["mean"].values,
        yerr=stats["std"].values,
        fmt="o-",
        capsize=3,
    )
    mean_ax.set_title(f"{column} (encoded ordinal): Mean {target_feature} ± 1 std")
    mean_ax.set_ylabel(f"Mean {target_feature}")

    for trend_ax in (median_ax, mean_ax):
        trend_ax.set_xticks(positions)
        trend_ax.set_xticklabels(tick_labels, rotation=45, ha="right")
        trend_ax.set_xlabel("Encoded level")
        trend_ax.grid(True, alpha=0.3)

    fig.tight_layout()
    plt.show()

    return column

### Ordinal encoding

In [None]:
# First two already int
cat_ordinal_features

In [None]:
from sklearn.preprocessing import OrdinalEncoder
categories = {
    # OverallQual, OverallCond are already numeric 1..10 in this dataset.
    'OverallQual': [1,2,3,4,5,6,7,8,9,10],
    'OverallCond': [1,2,3,4,5,6,7,8,9,10],

    'KitchenQual': ["NA","Po","Fa","TA","Gd","Ex"],
    'ExterQual': ["Po","Fa","TA","Gd","Ex"],
    'ExterCond': ["Po","Fa","TA","Gd","Ex"],

    'BsmtQual': ["NA","Po","Fa","TA","Gd","Ex"],
    'BsmtCond': ["NA","Po","Fa","TA","Gd","Ex"],
    'BsmtExposure': ["NA","No","Mn","Av","Gd"],

    'BsmtFinType1': ["NA","Unf","LwQ","Rec","BLQ","ALQ","GLQ"],
    'BsmtFinType2': ["NA","Unf","LwQ","Rec","BLQ","ALQ","GLQ"],

    'HeatingQC': ["Po","Fa","TA","Gd","Ex"],

    'FireplaceQu': ["NA","Po","Fa","TA","Gd","Ex"],
    'GarageQual': ["NA","Po","Fa","TA","Gd","Ex"],
    'GarageCond': ["NA","Po","Fa","TA","Gd","Ex"],
    'PoolQC': ["NA","Po", "Fa","TA","Gd","Ex"],
}




In [None]:
ordinal_encoder = OrdinalEncoder(
    categories=list(categories.values()),
    handle_unknown="use_encoded_value",
    unknown_value=-1
)

In [None]:
x = df.copy()
x = x[list(categories.keys())]
x = x.fillna('NA')

In [None]:
encoded = ordinal_encoder.fit_transform(x)

In [None]:
df_encoded = df.copy()

ordinal_cols = [f"{c}_ord" for c in categories.keys()]
df_encoded[ordinal_cols] = encoded

In [None]:
df_encoded[[c for c in categories.keys()] + ordinal_cols].head()

In [None]:
df = df_encoded.copy()

In [None]:
oridnal_features_iter = iter(ordinal_cols)

In [None]:
plot_ordinal_feature_diagnostics(df, oridnal_features_iter, 'SalePrice')

- As expected, seems to be a great predictor

In [None]:
plot_ordinal_feature_diagnostics(df, oridnal_features_iter, 'SalePrice')

- Outlier for class 2
- As expected, seems to be a great predictor

In [None]:
display_df_with_ge_sale_price(df[df['OverallCond_ord'] == 1], 390000)
# Looks like a data error: many 'Ex' quality ratings but an overall condition of only 2 -> removing the row

In [None]:
df = df.drop(index=df[(df['OverallCond_ord'] == 1) & (df['SalePrice'] > 390000)].index)

In [None]:
df.shape

In [None]:
plot_ordinal_feature_diagnostics(df, oridnal_features_iter, 'SalePrice')

- May be good predictor too

In [None]:
plot_ordinal_feature_diagnostics(df, oridnal_features_iter, 'SalePrice')

In [None]:
plot_ordinal_feature_diagnostics(df, oridnal_features_iter, 'SalePrice')

In [None]:
plot_ordinal_feature_diagnostics(df, oridnal_features_iter, 'SalePrice')

In [None]:
plot_ordinal_feature_diagnostics(df, oridnal_features_iter, 'SalePrice')

In [None]:
plot_ordinal_feature_diagnostics(df, oridnal_features_iter, 'SalePrice')

In [None]:
plot_ordinal_feature_diagnostics(df, oridnal_features_iter, 'SalePrice')

In [None]:
plot_ordinal_feature_diagnostics(df, oridnal_features_iter, 'SalePrice')

In [None]:
plot_ordinal_feature_diagnostics(df, oridnal_features_iter, 'SalePrice')

In [None]:
plot_ordinal_feature_diagnostics(df, oridnal_features_iter, 'SalePrice')

In [None]:
plot_ordinal_feature_diagnostics(df, oridnal_features_iter, 'SalePrice')

In [None]:
plot_ordinal_feature_diagnostics(df, oridnal_features_iter, 'SalePrice')

In [None]:
plot_ordinal_feature_diagnostics(df, oridnal_features_iter, 'SalePrice')

In [None]:
plot_ordinal_feature_diagnostics(df, oridnal_features_iter, 'SalePrice')

In [None]:
df.shape

In [None]:
df.to_csv('data/final.csv', index=False)

## Correlation Matrix

In [None]:
df_num = df.select_dtypes(include="number")
corr = df_num.corr()['SalePrice']

In [None]:
selected = corr[corr.abs() >= 0.3].sort_values(key=lambda x: x.abs(), ascending=False)

In [None]:
selected

In [None]:
corr_selected = df_num[list(selected.index)].corr()

In [None]:
plt.figure(figsize=(12, 10))
sns.heatmap(corr_selected, cmap='coolwarm', square=True, center=0, linewidths=0.5)
plt.title("Correlation Matrix")
plt.show()

In [None]:
# Write next to the other intermediate snapshots, which are all saved under data/.
df.to_csv('data/df_after_numerical_category_nom_ord_encoded.csv', index=False)

# Missing values

In [None]:
def display_missing_data(df: pd.DataFrame):
    """
    Display a per-column summary of missing values.

    The table has one row per column that contains at least one missing value,
    sorted by missing count (descending), with three columns:
      - 'Missing': number of missing values
      - '% Missing': share of rows that are missing, in percent
      - 'Unique values': number of distinct non-missing values

    Args:
      df: DataFrame to inspect.
    """
    missing_counts = df.isnull().sum()

    summary = pd.concat(
        {
            'Missing': missing_counts.sort_values(ascending=False),
            '% Missing': missing_counts / len(df) * 100,
            'Unique values': df.nunique(),
        },
        axis=1,
        sort=False,
    )

    # Only columns that actually have gaps are of interest.
    has_missing = summary['Missing'] > 0
    display(summary.loc[has_missing].sort_values(by='Missing', ascending=False))

display_missing_data(df)

## Handling missing values feature by feature

### MiscFeature

In [None]:
df['MiscFeature'].unique()

In [None]:
# Filling NA - look description
df['MiscFeature'] = df['MiscFeature'].fillna('NA')

### Alley

In [None]:
df['Alley'].unique()

In [None]:
# Filling NA - look description
df['Alley'] = df['Alley'].fillna('NA')

### Fence

In [None]:
df['Fence'].unique()

In [None]:
# Filling NA - look description
df['Fence'] = df['Fence'].fillna('NA')

### MasVnrType

In [None]:
df['MasVnrType'].unique()

In [None]:
# Filling NA - look description
df['MasVnrType'] = df['MasVnrType'].fillna('NA')

### LotFrontage

In [None]:
df['LotFrontage'].describe()

Will be imputed by median from neighbourhood

In [None]:
display_missing_data(df)

In [None]:
df['GarageType'].unique()

GarageType and GarageFinish will be imputed with a meaningful NA category

In [None]:
df['GarageYrBlt'].describe()

In [None]:
display_missing_data(df)

In [None]:
df['MasVnrArea'].describe()

In [None]:
df['MasVnrArea'].plot(kind='hist')

In [None]:
df[df['MasVnrArea'].isna()][['MasVnrType', 'MasVnrArea']]

In [None]:
display_missing_data(df)

In [None]:
df['Electrical'].unique()

# Feature Engineering

## Boolean features:
1. HasRemod - YearRemodAdd != construction date
2. HasFireplace -> Fireplaces > 0
3. HasGarage -> GarageType != NA
4. HasPool -> PoolQC != NA
5. HasFence -> Fence != NA
6. HasMiscFeature -> MiscFeature != NA
7. IsNormalSaleCondition -> SaleCondition == Normal
8. HasBasement
9. Has2ndFloor


## Square feet
1. Add full square feet of the house floors


## Garage features
1. Area per car -> GarageArea / GarageCars



## Total rooms
1. Add total basement bathrooms

## QuadraticFeatures:
1. OverallQual
2. OverallCond
3. YearBuilt
4. FloorSquareFeet
5. TotalBsmtSF
6. TotalSquareFeet
7. GarageAreaPerCar
8. TotalBsmtBathrooms