# # ⭐Load

In [141]:
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns

import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots


In [142]:
import pandas as pd

# Raw GitHub URLs (direct)
train_url = "https://raw.githubusercontent.com/chiragjagad/House-Prices-Advanced-Regression-Techniques/main/train.csv"
test_url  = "https://raw.githubusercontent.com/chiragjagad/House-Prices-Advanced-Regression-Techniques/main/test.csv"

# Load the data
train_df = pd.read_csv(train_url)
test_df  = pd.read_csv(test_url)

# Print shapes
print("Train set shape:", train_df.shape)
print("Test set shape:", test_df.shape)

# Check for SalePrice
print("Is 'SalePrice' present in train?", 'SalePrice' in train_df.columns)

# Show a sample
print(train_df.head(5))


Train set shape: (1460, 81)
Test set shape: (1459, 80)
Is 'SalePrice' present in train? True
   Id  MSSubClass MSZoning  LotFrontage  LotArea Street Alley LotShape  \
0   1          60       RL         65.0     8450   Pave   NaN      Reg   
1   2          20       RL         80.0     9600   Pave   NaN      Reg   
2   3          60       RL         68.0    11250   Pave   NaN      IR1   
3   4          70       RL         60.0     9550   Pave   NaN      IR1   
4   5          60       RL         84.0    14260   Pave   NaN      IR1   

  LandContour Utilities  ... PoolArea PoolQC Fence MiscFeature MiscVal MoSold  \
0         Lvl    AllPub  ...        0    NaN   NaN         NaN       0      2   
1         Lvl    AllPub  ...        0    NaN   NaN         NaN       0      5   
2         Lvl    AllPub  ...        0    NaN   NaN         NaN       0      9   
3         Lvl    AllPub  ...        0    NaN   NaN         NaN       0      2   
4         Lvl    AllPub  ...        0    NaN   NaN       

In [143]:
# name change 
df = train_df

In [144]:

# Basic info about dataset
print("\nDataset Info:")
print(df.info())

# Basic statistics
print("\nDataset Description:")
print(df.describe())



Dataset Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1460 entries, 0 to 1459
Data columns (total 81 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Id             1460 non-null   int64  
 1   MSSubClass     1460 non-null   int64  
 2   MSZoning       1460 non-null   object 
 3   LotFrontage    1201 non-null   float64
 4   LotArea        1460 non-null   int64  
 5   Street         1460 non-null   object 
 6   Alley          91 non-null     object 
 7   LotShape       1460 non-null   object 
 8   LandContour    1460 non-null   object 
 9   Utilities      1460 non-null   object 
 10  LotConfig      1460 non-null   object 
 11  LandSlope      1460 non-null   object 
 12  Neighborhood   1460 non-null   object 
 13  Condition1     1460 non-null   object 
 14  Condition2     1460 non-null   object 
 15  BldgType       1460 non-null   object 
 16  HouseStyle     1460 non-null   object 
 17  OverallQual    1460 non-null   int64 

# ⭐:Check feature types (numerical, categorical)

In [145]:
numerical_columns = df.select_dtypes(include=['int64', 'float64']).columns
print("\nNumerical Columns:")
print(numerical_columns)
categorical_columns = df.select_dtypes(include=['object']).columns
print("\nCategorical Columns:")
print(categorical_columns)


Numerical Columns:
Index(['Id', 'MSSubClass', 'LotFrontage', 'LotArea', 'OverallQual',
       'OverallCond', 'YearBuilt', 'YearRemodAdd', 'MasVnrArea', 'BsmtFinSF1',
       'BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF', '1stFlrSF', '2ndFlrSF',
       'LowQualFinSF', 'GrLivArea', 'BsmtFullBath', 'BsmtHalfBath', 'FullBath',
       'HalfBath', 'BedroomAbvGr', 'KitchenAbvGr', 'TotRmsAbvGrd',
       'Fireplaces', 'GarageYrBlt', 'GarageCars', 'GarageArea', 'WoodDeckSF',
       'OpenPorchSF', 'EnclosedPorch', '3SsnPorch', 'ScreenPorch', 'PoolArea',
       'MiscVal', 'MoSold', 'YrSold', 'SalePrice'],
      dtype='object')

Categorical Columns:
Index(['MSZoning', 'Street', 'Alley', 'LotShape', 'LandContour', 'Utilities',
       'LotConfig', 'LandSlope', 'Neighborhood', 'Condition1', 'Condition2',
       'BldgType', 'HouseStyle', 'RoofStyle', 'RoofMatl', 'Exterior1st',
       'Exterior2nd', 'MasVnrType', 'ExterQual', 'ExterCond', 'Foundation',
       'BsmtQual', 'BsmtCond', 'BsmtExposure', 'BsmtFinT

In [146]:
# ✅ Numerical columns to drop (Final List)

num_drop = [
    'Id', 'LowQualFinSF', 'BsmtFinSF2', 'BsmtHalfBath',
    'KitchenAbvGr', 'EnclosedPorch', '3SsnPorch',
    'ScreenPorch', 'PoolArea', 'MiscVal'
]

df.drop(columns=num_drop, inplace=True)
df.shape

(1460, 71)

In [147]:
# ✅ Categorical columns to drop (Final List)

cat_drop = [
    'Alley', 'PoolQC', 'Fence', 'MiscFeature',
    'Condition2', 'RoofMatl', 'Street', 'Utilities'
]

df.drop(columns=cat_drop, inplace=True)
df.shape

(1460, 63)

In [148]:
print(df.columns.tolist())


['MSSubClass', 'MSZoning', 'LotFrontage', 'LotArea', 'LotShape', 'LandContour', 'LotConfig', 'LandSlope', 'Neighborhood', 'Condition1', 'BldgType', 'HouseStyle', 'OverallQual', 'OverallCond', 'YearBuilt', 'YearRemodAdd', 'RoofStyle', 'Exterior1st', 'Exterior2nd', 'MasVnrType', 'MasVnrArea', 'ExterQual', 'ExterCond', 'Foundation', 'BsmtQual', 'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinSF1', 'BsmtFinType2', 'BsmtUnfSF', 'TotalBsmtSF', 'Heating', 'HeatingQC', 'CentralAir', 'Electrical', '1stFlrSF', '2ndFlrSF', 'GrLivArea', 'BsmtFullBath', 'FullBath', 'HalfBath', 'BedroomAbvGr', 'KitchenQual', 'TotRmsAbvGrd', 'Functional', 'Fireplaces', 'FireplaceQu', 'GarageType', 'GarageYrBlt', 'GarageFinish', 'GarageCars', 'GarageArea', 'GarageQual', 'GarageCond', 'PavedDrive', 'WoodDeckSF', 'OpenPorchSF', 'MoSold', 'YrSold', 'SaleType', 'SaleCondition', 'SalePrice']


In [149]:
# ✅ Two main types of columns

# Main numerical columns
num_df_main = df[['LotArea', 'OverallQual', 'YearBuilt', 'GrLivArea', 'SalePrice']].head(10)

# Main categorical columns (added 'SaleType')
cat_df_main = df[['MSZoning', 'ExterQual', 'GarageType', 'Neighborhood', 'SaleType']].head(10)

print("Main Numerical Columns:\n", num_df_main)
print("\nMain Categorical Columns:\n", cat_df_main)


Main Numerical Columns:
    LotArea  OverallQual  YearBuilt  GrLivArea  SalePrice
0     8450            7       2003       1710     208500
1     9600            6       1976       1262     181500
2    11250            7       2001       1786     223500
3     9550            7       1915       1717     140000
4    14260            8       2000       2198     250000
5    14115            5       1993       1362     143000
6    10084            8       2004       1694     307000
7    10382            7       1973       2090     200000
8     6120            7       1931       1774     129900
9     7420            5       1939       1077     118000

Main Categorical Columns:
   MSZoning ExterQual GarageType Neighborhood SaleType
0       RL        Gd     Attchd      CollgCr       WD
1       RL        TA     Attchd      Veenker       WD
2       RL        Gd     Attchd      CollgCr       WD
3       RL        TA     Detchd      Crawfor       WD
4       RL        Gd     Attchd      NoRidge      

## ⭐:Check missing values (isnull().sum())

In [150]:
df.isnull().sum()

MSSubClass         0
MSZoning           0
LotFrontage      259
LotArea            0
LotShape           0
                ... 
MoSold             0
YrSold             0
SaleType           0
SaleCondition      0
SalePrice          0
Length: 63, dtype: int64

In [151]:
df['LotFrontage'].shape

(1460,)

In [152]:
# LotFrontage fill with mean
LotFrontage_fill = df['LotFrontage'].fillna(df['LotFrontage'].mean(), inplace=True)
print(LotFrontage_fill)

None



A value is trying to be set on a copy of a DataFrame or Series through chained assignment using an inplace method.
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.





# ⭐:Visualize distributions (SalePrice, numeric features)

In [153]:
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns

import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots

In [154]:
# ✅ Visualize distributions of main numerical features

# List of numerical columns to visualize
numeric_cols = ['LotArea', 'OverallQual', 'YearBuilt', 'GrLivArea', 'SalePrice']

for col in numeric_cols:
    fig = px.histogram(df, x=col, nbins=30, title=f'Distribution of {col}',
                       marginal="box", # adds boxplot on top
                       color_discrete_sequence=['teal'])
    fig.show()


In [155]:
import plotly.express as px

# 1️⃣ Create SalePrice bins
bins = [0, 100000, 150000, 200000, 250000, 300000, 400000, df['SalePrice'].max()]
labels = ['0-100k', '100k-150k', '150k-200k', '200k-250k', '250k-300k', '300k-400k', '400k+']
df['PriceRange'] = pd.cut(df['SalePrice'], bins=bins, labels=labels)

# 2️⃣ Create pie chart
fig = px.pie(df, names='PriceRange', title='SalePrice Distribution by Range',
             color='PriceRange', color_discrete_sequence=px.colors.qualitative.Pastel)

fig.update_layout(
    template='plotly_dark',  # Dark theme
    legend=dict(
        orientation="h",
        yanchor="bottom",
        y=1.02,
        xanchor="right",
        x=1
    )  # Adjust legend position
)

fig.update_traces(textinfo='percent+label')

fig.show()


In [156]:
import plotly.express as px

# 1️⃣ Create SalePrice bins
bins = [0, 100000, 150000, 200000, 250000, 300000, 400000, df['SalePrice'].max()]
labels = ['0-100k', '100k-150k', '150k-200k', '200k-250k', '250k-300k', '300k-400k', '400k+']
df['PriceRange'] = pd.cut(df['SalePrice'], bins=bins, labels=labels)

# 2️⃣ Create pie chart
fig = px.pie(df, names='PriceRange', title='SalePrice Distribution by Range',
             color='PriceRange', color_discrete_sequence=px.colors.qualitative.Pastel)

fig.update_layout(
    template='plotly_dark',  # Dark theme
    legend=dict(
        orientation="h",
        yanchor="bottom",
        y=1.02,
        xanchor="right",
        x=1
    )  # Adjust legend position
    # title=dict(
    #     text='SalePrice Distribution by Range',
    #     font=dict(size=20),
    #     x=0.5,
    #     y=0.95
    # )
)

fig.update_traces(textinfo='percent+label')

fig.show()


In [157]:
# Line Plot
# Optional: Aggregate SalePrice by YearBuilt (mean per year)
yearly_avg = df.groupby('YearBuilt')['SalePrice'].mean().reset_index()

# Create enhanced line plot
fig = px.line(
    yearly_avg, 
    x='YearBuilt', 
    y='SalePrice', 
    title='Average SalePrice Over Year Built',
    markers=True,  # Show points
    template='plotly_dark'
)

# Update layout
fig.update_layout(
    xaxis_title='Year Built',
    yaxis_title='Average Sale Price',
    legend=dict(
        orientation="h",
        yanchor="bottom",
        y=1.02,
        xanchor="right",
        x=1
    ),
    title=dict(
        x=0.5,  # Center title
        font=dict(size=22)
    )
)

# Update line style
fig.update_traces(
    line=dict(color='teal', width=3),
    marker=dict(size=6),
    hovertemplate='Year: %{x}<br>Avg SalePrice: %{y:$,.0f}<extra></extra>'
)

fig.show()


# ⭐: Correlation matrix to see most important features

In [158]:
# Select only numerical columns
numeric_cols = df.select_dtypes(include=np.number)

# Compute correlation matrix
corr_matrix = numeric_cols.corr()

# Show correlation with SalePrice
print(corr_matrix['SalePrice'].sort_values(ascending=False))


SalePrice       1.000000
OverallQual     0.790982
GrLivArea       0.708624
GarageCars      0.640409
GarageArea      0.623431
TotalBsmtSF     0.613581
1stFlrSF        0.605852
FullBath        0.560664
TotRmsAbvGrd    0.533723
YearBuilt       0.522897
YearRemodAdd    0.507101
GarageYrBlt     0.486362
MasVnrArea      0.477493
Fireplaces      0.466929
BsmtFinSF1      0.386420
LotFrontage     0.334901
WoodDeckSF      0.324413
2ndFlrSF        0.319334
OpenPorchSF     0.315856
HalfBath        0.284108
LotArea         0.263843
BsmtFullBath    0.227122
BsmtUnfSF       0.214479
BedroomAbvGr    0.168213
MoSold          0.046432
YrSold         -0.028923
OverallCond    -0.077856
MSSubClass     -0.084284
Name: SalePrice, dtype: float64


In [159]:
# HeatMap

fig = px.imshow(
    corr_matrix,
    text_auto=True,                    # Show correlation values on each cell
    aspect="auto",
    color_continuous_scale='teal',     # Color theme
    labels=dict(x="Features", y="Features", color="Correlation")  # Axis labels
)

fig.update_layout(
    title='Correlation Matrix of Numeric Features vs SalePrice',
    template='plotly_dark',
    width=800,
    height=700
)

# Custom hover info
fig.update_traces(
    hovertemplate='Feature 1: %{x}<br>Feature 2: %{y}<br>Correlation: %{z:.2f}<extra></extra>'
)

fig.show()


# ⭐: Visualize categorical vs target (Neighborhood, OverallQual)

In [160]:
# Box plot: Neighborhood vs SalePrice

import plotly.express as px

fig = px.box(df, 
             x='Neighborhood', 
             y='SalePrice',
             color='Neighborhood',
             title='SalePrice Distribution by Neighborhood',
             labels={'SalePrice':'Sale Price', 'Neighborhood':'Neighborhood'},
             points='all')  # show all points

fig.update_layout(
    template='plotly_dark',
    legend=dict(
        orientation="h",
        yanchor="bottom",
        y=1.02,
        xanchor="right",
        x=1
    ),
    title=dict(
        x=0.5,  # Center title
        font=dict(size=22)
    )
)

fig.update_layout(xaxis_tickangle=-45)  # rotate x labels for better readability
fig.show()


In [161]:
# Box plot: OverallQual vs SalePrice

fig = px.box(
    df,
    x='OverallQual',
    y='SalePrice',
    color='OverallQual',
    title='SalePrice Distribution by Overall Quality',
    labels={'OverallQual':'Overall Quality', 'SalePrice':'Sale Price'},
    points='all'
)

fig.update_layout(
    template='plotly_dark',
    legend=dict(
        orientation="h",
        yanchor="bottom",
        y=1.02,
        xanchor="right",
        x=1
    ),
    title=dict(
        x=0.5,  # Center title
        font=dict(size=22)
    )
)

fig.update_traces(
    hovertemplate="Overall Quality: %{x}<br>Sale Price: %{y}<extra></extra>"
)

fig.show()


In [162]:
# Neighborhood average sale price
avg_price_neigh = df.groupby('Neighborhood')['SalePrice'].mean().sort_values(ascending=False)
fig = px.bar(x=avg_price_neigh.index, y=avg_price_neigh.values, 
             color=avg_price_neigh.values, color_continuous_scale='Viridis',
             title='Average SalePrice by Neighborhood',
             labels={'x':'Neighborhood', 'y':'Average SalePrice'})
fig.update_layout(template='plotly_dark',
                  xaxis_tickangle=-45,
                   
                  title=dict(
                      x=0.5,  # Center title
                      font=dict(size=22)
                  )
                  )
fig.update_traces(
    hovertemplate="Neighborhood: %{x}<br>Avg SalePrice: %{y:$,.0f}<extra></extra>"
)

fig.show()


In [163]:
# OverallQual average sale price
avg_price_qual = df.groupby('OverallQual')['SalePrice'].mean().sort_values(ascending=False)
fig = px.bar(x=avg_price_qual.index, y=avg_price_qual.values,
             color=avg_price_qual.values, color_continuous_scale='Viridis',
             title='Average SalePrice by Overall Quality',
             labels={'x':'Overall Quality', 'y':'Average SalePrice'})
fig.update_layout(template='plotly_dark',
                  title=dict(
                      x=0.5,  # Center title
                      font=dict(size=22)
                  )
                  )
fig.update_traces(
    hovertemplate="Overall Quality: %{x}<br>Avg SalePrice: %{y:$,.0f}<extra></extra>"
)
fig.show()

# ⭐:Identify skewed numeric variables

In [164]:
# Select numeric columns
numeric_cols = df.select_dtypes(include=np.number)

# Compute skewness
skewness = numeric_cols.skew().sort_values(ascending=False)

# Choose numeric columns with high skew (|skew| > 0.5)
skewed_cols = skewness[abs(skewness) > 0.5].index.tolist()

print("numerical_columns:" , numeric_cols.head(2))
print("Skewness of numeric columns:\n", skewness.head(2))
print("Skewed numeric columns:", skewed_cols)


numerical_columns:    MSSubClass  LotFrontage  LotArea  OverallQual  OverallCond  YearBuilt  \
0          60         65.0     8450            7            5       2003   
1          20         80.0     9600            6            8       1976   

   YearRemodAdd  MasVnrArea  BsmtFinSF1  BsmtUnfSF  ...  TotRmsAbvGrd  \
0          2003       196.0         706        150  ...             8   
1          1976         0.0         978        284  ...             6   

   Fireplaces  GarageYrBlt  GarageCars  GarageArea  WoodDeckSF  OpenPorchSF  \
0           0       2003.0           2         548           0           61   
1           1       1976.0           2         460         298            0   

   MoSold  YrSold  SalePrice  
0       2    2008     208500  
1       5    2007     181500  

[2 rows x 28 columns]
Skewness of numeric columns:
 LotArea       12.207688
MasVnrArea     2.669084
dtype: float64
Skewed numeric columns: ['LotArea', 'MasVnrArea', 'LotFrontage', 'OpenPorchSF', 'Sale

In [165]:

# Determine number of rows/columns for subplot grid
n = len(skewed_cols)
cols = 2  # number of columns in grid
rows = (n + cols - 1) // cols  # calculate rows needed

print("Number of skewed columns:", n)
print("Number of columns in grid:", cols)
print("Number of rows in grid:", rows)

Number of skewed columns: 21
Number of columns in grid: 2
Number of rows in grid: 11


In [166]:
# Create subplot figure
fig = make_subplots(
    rows=rows, cols=cols,
    subplot_titles=[f"{col} (Skew={skewness[col]:.2f})" for col in skewed_cols]
)

# Add each skewed variable as a histogram
for i, col in enumerate(skewed_cols):
    row = i // cols + 1
    col_pos = i % cols + 1
    fig.add_trace(
        go.Histogram(
            x=df[col],
            nbinsx=30,
            marker_color='orange',
            name=col,
            hovertemplate=f"{col}: %%{{x}}<br>Count: %%{{y}}<extra></extra>"
        ),
        row=row, col=col_pos
    )

# Update layout
fig.update_layout(
    height=300*rows,  # adjust height dynamically
    width=900,
    template='plotly_dark',
    title_text="Distributions of Skewed Numeric Variables",
    title=dict(x=0.5, font=dict(size=22))
)

# Update axes labels
for i in range(rows*cols):
    fig.update_xaxes(title_text=skewed_cols[i] if i < n else "", row=(i//cols)+1, col=(i%cols)+1)
    fig.update_yaxes(title_text="Count", row=(i//cols)+1, col=(i%cols)+1)

fig.show()


In [167]:
import numpy as np

# Example: log transform SalePrice
df['SalePrice_log'] = np.log1p(df['SalePrice'])

# Visualize log-transformed SalePrice
fig = px.histogram(df, x='SalePrice_log', nbins=30, 
                   title='Log-transformed SalePrice', 
                   marginal='box', color_discrete_sequence=['green'])
fig.update_layout(template='plotly_dark')
fig.update_traces(marker_line_width=0.5, marker_line_color='black')
fig.show()


# ⭐:Handle missing values (median, mode, or “None”)

In [168]:
# Count missing values per column
missing = df.isnull().sum()
missing = missing[missing > 0].sort_values(ascending=False)
print("Columns with missing values:\n", missing)


Columns with missing values:
 MasVnrType      872
FireplaceQu     690
GarageType       81
GarageYrBlt      81
GarageFinish     81
GarageQual       81
GarageCond       81
BsmtExposure     38
BsmtFinType2     38
BsmtQual         37
BsmtCond         37
BsmtFinType1     37
MasVnrArea        8
Electrical        1
dtype: int64


In [169]:
# 1️ Fill numerical columns with median
num_cols = df.select_dtypes(include=np.number).columns
for col in num_cols:
    if df[col].isnull().sum() > 0:
        df[col] = df[col].fillna(df[col].median())

# 2️ Fill categorical columns with mode or "None"
cat_cols = df.select_dtypes(include='object').columns
for col in cat_cols:
    if df[col].isnull().sum() > 0:
        # If column is like PoolQC, Alley, Fence → fill with "None"
        if col in ['Alley', 'PoolQC', 'Fence', 'MiscFeature', 'FireplaceQu']:
            df[col] = df[col].fillna('None')
        else:
            df[col] = df[col].fillna(df[col].mode()[0])


In [170]:
print(df.isnull().sum().sum())  # should print 0


0


# ⭐:Encode categorical features (One-hot or LabelEncoder)

In [171]:
# Categorical columns
cat_cols = df.select_dtypes(include='object').columns
print("Categorical columns:", cat_cols.tolist())


Categorical columns: ['MSZoning', 'LotShape', 'LandContour', 'LotConfig', 'LandSlope', 'Neighborhood', 'Condition1', 'BldgType', 'HouseStyle', 'RoofStyle', 'Exterior1st', 'Exterior2nd', 'MasVnrType', 'ExterQual', 'ExterCond', 'Foundation', 'BsmtQual', 'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinType2', 'Heating', 'HeatingQC', 'CentralAir', 'Electrical', 'KitchenQual', 'Functional', 'FireplaceQu', 'GarageType', 'GarageFinish', 'GarageQual', 'GarageCond', 'PavedDrive', 'SaleType', 'SaleCondition']


In [172]:
from sklearn.preprocessing import LabelEncoder

# Example ordinal columns
ordinal_cols = ['ExterQual', 'ExterCond', 'BsmtQual', 'BsmtCond', 'HeatingQC', 'KitchenQual', 'FireplaceQu', 'GarageQual', 'GarageCond']

le = LabelEncoder()
for col in ordinal_cols:
    df[col] = le.fit_transform(df[col])
    
print("Ordinal columns:", ordinal_cols)


Ordinal columns: ['ExterQual', 'ExterCond', 'BsmtQual', 'BsmtCond', 'HeatingQC', 'KitchenQual', 'FireplaceQu', 'GarageQual', 'GarageCond']


In [173]:
# Step 1: Identify all categorical columns still in df
cat_cols = df.select_dtypes(include='object').columns.tolist()

# Step 2: Identify nominal columns (exclude ordinal columns if they exist)
ordinal_cols = ['ExterQual', 'ExterCond', 'BsmtQual', 'BsmtCond', 'HeatingQC', 'KitchenQual', 'FireplaceQu', 'GarageQual', 'GarageCond']
nominal_cols = [col for col in cat_cols if col not in ordinal_cols]

print("Nominal columns to encode:", nominal_cols)


Nominal columns to encode: ['MSZoning', 'LotShape', 'LandContour', 'LotConfig', 'LandSlope', 'Neighborhood', 'Condition1', 'BldgType', 'HouseStyle', 'RoofStyle', 'Exterior1st', 'Exterior2nd', 'MasVnrType', 'Foundation', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinType2', 'Heating', 'CentralAir', 'Electrical', 'Functional', 'GarageType', 'GarageFinish', 'PavedDrive', 'SaleType', 'SaleCondition']


In [174]:
# One-hot encode nominal columns
df = pd.get_dummies(df, columns=nominal_cols, drop_first=True)

print("DataFrame shape after encoding:", df.shape)


DataFrame shape after encoding: (1460, 190)


In [175]:
print(df.head())
print("Shape after encoding:", df.shape)


   MSSubClass  LotFrontage  LotArea  OverallQual  OverallCond  YearBuilt  \
0          60         65.0     8450            7            5       2003   
1          20         80.0     9600            6            8       1976   
2          60         68.0    11250            7            5       2001   
3          70         60.0     9550            7            5       1915   
4          60         84.0    14260            8            5       2000   

   YearRemodAdd  MasVnrArea  ExterQual  ExterCond  ...  SaleType_ConLI  \
0          2003       196.0          2          4  ...           False   
1          1976         0.0          3          4  ...           False   
2          2002       162.0          2          4  ...           False   
3          1970         0.0          3          4  ...           False   
4          2000       350.0          2          4  ...           False   

   SaleType_ConLw  SaleType_New  SaleType_Oth  SaleType_WD  \
0           False         False     

# ⭐: Log-transform SalePrice if skewed

In [176]:
skewness = df['SalePrice'].skew()
print(f"Skewness of SalePrice: {skewness:.2f}")


Skewness of SalePrice: 1.88


In [177]:
# Log transform (log1p = log(1 + x) avoids log(0) errors)
df['SalePrice_log'] = np.log1p(df['SalePrice'])
print("Skewness of log-transformed SalePrice:", df['SalePrice_log'].skew())

Skewness of log-transformed SalePrice: 0.12134661989685333


In [178]:
fig = px.histogram(df, x='SalePrice_log', nbins=30, marginal='box',
                   title='Log-transformed SalePrice Distribution',
                   color_discrete_sequence=['green'])
fig.update_layout(template='plotly_dark', title=dict(x=0.5, font=dict(size=22)))
fig.update_traces(marker_line_width=0.5, marker_line_color='black')
fig.show()


In [179]:
# Total square footage = Basement + 1st floor + 2nd floor
df['TotalSF'] = df['TotalBsmtSF'] + df['1stFlrSF'] + df['2ndFlrSF']

# Check the new feature
print(df[['TotalBsmtSF', '1stFlrSF', '2ndFlrSF', 'TotalSF']].head())


   TotalBsmtSF  1stFlrSF  2ndFlrSF  TotalSF
0          856       856       854     2566
1         1262      1262         0     2524
2          920       920       866     2706
3          756       961       756     2473
4         1145      1145      1053     3343


In [180]:

# Total bathrooms (only existing columns)
df['TotalBath'] = df['FullBath'] + 0.5*df['HalfBath'] + df['BsmtFullBath']

# Age of house at time of sale
df['Age'] = df['YrSold'] - df['YearBuilt']

# Quick check
df[['TotalBath', 'Age']].head()


Unnamed: 0,TotalBath,Age
0,3.5,5
1,2.0,31
2,3.5,7
3,2.0,91
4,3.5,8


In [181]:

# TotalSF vs SalePrice
fig = px.scatter(df, x='TotalSF', y='SalePrice', trendline='ols', 
                 title='TotalSF vs SalePrice', color='OverallQual')
fig.update_layout(template='plotly_dark', title=dict(x=0.5, font=dict(size=22)))
fig.update_traces(marker=dict(size=5, opacity=0.5), mode='markers')
fig.show()


# ⭐: Train-test split 80/20(Separate X & y)

In [182]:
# Target variable
y = df['SalePrice_log']

# Features (drop original SalePrice and SalePrice_log)
X = df.drop(['SalePrice', 'SalePrice_log'], axis=1)

# Check shapes
print("Features shape:", X.shape)
print("Target shape:", y.shape)


Features shape: (1460, 191)
Target shape: (1460,)


In [183]:
from sklearn.model_selection import train_test_split

# Split into 80% train, 20% test
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

print("X_train shape:", X_train.shape)
print("X_test shape:", X_test.shape)
print("y_train shape:", y_train.shape)
print("y_test shape:", y_test.shape)


X_train shape: (1168, 191)
X_test shape: (292, 191)
y_train shape: (1168,)
y_test shape: (292,)


# ⭐: Linear Regression baseline

In [184]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score


In [187]:
# Find columns that are still non-numeric
non_numeric_cols = X_train.select_dtypes(exclude=np.number).columns
print("Non-numeric columns:", non_numeric_cols.tolist())


Non-numeric columns: ['PriceRange', 'MSZoning_FV', 'MSZoning_RH', 'MSZoning_RL', 'MSZoning_RM', 'LotShape_IR2', 'LotShape_IR3', 'LotShape_Reg', 'LandContour_HLS', 'LandContour_Low', 'LandContour_Lvl', 'LotConfig_CulDSac', 'LotConfig_FR2', 'LotConfig_FR3', 'LotConfig_Inside', 'LandSlope_Mod', 'LandSlope_Sev', 'Neighborhood_Blueste', 'Neighborhood_BrDale', 'Neighborhood_BrkSide', 'Neighborhood_ClearCr', 'Neighborhood_CollgCr', 'Neighborhood_Crawfor', 'Neighborhood_Edwards', 'Neighborhood_Gilbert', 'Neighborhood_IDOTRR', 'Neighborhood_MeadowV', 'Neighborhood_Mitchel', 'Neighborhood_NAmes', 'Neighborhood_NPkVill', 'Neighborhood_NWAmes', 'Neighborhood_NoRidge', 'Neighborhood_NridgHt', 'Neighborhood_OldTown', 'Neighborhood_SWISU', 'Neighborhood_Sawyer', 'Neighborhood_SawyerW', 'Neighborhood_Somerst', 'Neighborhood_StoneBr', 'Neighborhood_Timber', 'Neighborhood_Veenker', 'Condition1_Feedr', 'Condition1_Norm', 'Condition1_PosA', 'Condition1_PosN', 'Condition1_RRAe', 'Condition1_RRAn', 'Conditi

In [188]:
# Drop columns that are not numeric
X_train = X_train.drop(columns=non_numeric_cols, errors='ignore')
X_test = X_test.drop(columns=non_numeric_cols, errors='ignore')


In [189]:
# Initialize model
lr_model = LinearRegression()

# Fit on training data
lr_model.fit(X_train, y_train)


0,1,2
,fit_intercept,True
,copy_X,True
,tol,1e-06
,n_jobs,
,positive,False


In [191]:
# Predict on training and test sets
y_train_pred = lr_model.predict(X_train)
y_test_pred = lr_model.predict(X_test)

print("y_train_pred shape:", y_train_pred.shape)
print("y_test_pred shape:", y_test_pred.shape)

y_train_pred shape: (1168,)
y_test_pred shape: (292,)


In [192]:
# RMSE (root mean squared error) on log scale
train_rmse = np.sqrt(mean_squared_error(y_train, y_train_pred))
test_rmse = np.sqrt(mean_squared_error(y_test, y_test_pred))

# R² score
train_r2 = r2_score(y_train, y_train_pred)
test_r2 = r2_score(y_test, y_test_pred)

print(f"Train RMSE: {train_rmse:.4f}, R²: {train_r2:.4f}")
print(f"Test RMSE: {test_rmse:.4f}, R²: {test_r2:.4f}")


Train RMSE: 0.1431, R²: 0.8657
Test RMSE: 0.1549, R²: 0.8714


In [193]:
# Inverse log transformation
y_test_pred_orig = np.expm1(y_test_pred)
y_test_orig = np.expm1(y_test)

# Example: compare first 5 predictions
print("Predicted SalePrice:", y_test_pred_orig[:5])
print("Actual SalePrice:", y_test_orig[:5])


Predicted SalePrice: [147944.56400316 312084.02755535 119484.5794221  177497.49359862
 330689.92051168]
Actual SalePrice: 892     154500.0
1105    325000.0
413     115000.0
522     159000.0
1036    315500.0
Name: SalePrice_log, dtype: float64


# ⭐:Random Forest Regressor

In [194]:
from sklearn.ensemble import RandomForestRegressor

In [195]:
# Initialize Random Forest
rf_model = RandomForestRegressor(
    n_estimators=200,  # number of trees
    max_depth=None,    # allow full depth
    random_state=42,
    n_jobs=-1          # use all cores
)

# Fit model on training data
rf_model.fit(X_train, y_train)


0,1,2
,n_estimators,200
,criterion,'squared_error'
,max_depth,
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,1.0
,max_leaf_nodes,
,min_impurity_decrease,0.0
,bootstrap,True


In [196]:
# Predict on training and test sets
y_train_pred = rf_model.predict(X_train)
y_test_pred = rf_model.predict(X_test)

print("y_train_pred shape:", y_train_pred.shape)
print("y_test_pred shape:", y_test_pred.shape)

y_train_pred shape: (1168,)
y_test_pred shape: (292,)


In [197]:
# RMSE on log scale
train_rmse = np.sqrt(mean_squared_error(y_train, y_train_pred))
test_rmse = np.sqrt(mean_squared_error(y_test, y_test_pred))

# R² score
train_r2 = r2_score(y_train, y_train_pred)
test_r2 = r2_score(y_test, y_test_pred)

print(f"Random Forest - Train RMSE: {train_rmse:.4f}, R²: {train_r2:.4f}")
print(f"Random Forest - Test RMSE: {test_rmse:.4f}, R²: {test_r2:.4f}")


Random Forest - Train RMSE: 0.0526, R²: 0.9818
Random Forest - Test RMSE: 0.1496, R²: 0.8800


In [198]:
# Inverse log transform
y_test_pred_orig = np.expm1(y_test_pred)
y_test_orig = np.expm1(y_test)

# Compare first 5 predictions
print("Predicted SalePrice:", y_test_pred_orig[:5])
print("Actual SalePrice:", y_test_orig[:5])


Predicted SalePrice: [138912.89984288 320711.21471997 112128.05204646 158760.55573066
 301015.18874503]
Actual SalePrice: 892     154500.0
1105    325000.0
413     115000.0
522     159000.0
1036    315500.0
Name: SalePrice_log, dtype: float64


# ⭐: Gradient Boosting if confident

In [199]:
from sklearn.ensemble import GradientBoostingRegressor

In [200]:
# Initialize Gradient Boosting
gb_model = GradientBoostingRegressor(
    n_estimators=500,    # number of boosting stages
    learning_rate=0.05,  # shrinkage factor
    max_depth=4,         # depth of each tree
    random_state=42
)

# Fit model on training data
gb_model.fit(X_train, y_train)


0,1,2
,loss,'squared_error'
,learning_rate,0.05
,n_estimators,500
,subsample,1.0
,criterion,'friedman_mse'
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_depth,4
,min_impurity_decrease,0.0


In [201]:
# Predict on training and test sets
y_train_pred = gb_model.predict(X_train)
y_test_pred = gb_model.predict(X_test)

print("y_train_pred shape:", y_train_pred.shape)
print("y_test_pred shape:", y_test_pred.shape)

y_train_pred shape: (1168,)
y_test_pred shape: (292,)


In [202]:
# RMSE on log scale
train_rmse = np.sqrt(mean_squared_error(y_train, y_train_pred))
test_rmse = np.sqrt(mean_squared_error(y_test, y_test_pred))

# R² score
train_r2 = r2_score(y_train, y_train_pred)
test_r2 = r2_score(y_test, y_test_pred)

print(f"Gradient Boosting - Train RMSE: {train_rmse:.4f}, R²: {train_r2:.4f}")
print(f"Gradient Boosting - Test RMSE: {test_rmse:.4f}, R²: {test_r2:.4f}")


Gradient Boosting - Train RMSE: 0.0336, R²: 0.9926
Gradient Boosting - Test RMSE: 0.1430, R²: 0.8903


In [203]:
# Inverse log transform
y_test_pred_orig = np.expm1(y_test_pred)
y_test_orig = np.expm1(y_test)

# Compare first 5 predictions
print("Predicted SalePrice:", y_test_pred_orig[:5])
print("Actual SalePrice:", y_test_orig[:5])


Predicted SalePrice: [138810.72314268 345044.67863559 111135.66006021 157900.29842933
 319353.56418883]
Actual SalePrice: 892     154500.0
1105    325000.0
413     115000.0
522     159000.0
1036    315500.0
Name: SalePrice_log, dtype: float64


# ⭐:RMSE (Root Mean Squared Error)

In [None]:
# 1️⃣ What is RMSE?
# RMSE measures the average difference between predicted values and actual values, in the same units as your target (SalePrice_log or original SalePrice).

# Where:✅ Lower RMSE → better predictions.

In [204]:
from sklearn.metrics import mean_squared_error


# RMSE
rmse = np.sqrt(mean_squared_error(y_test, y_test_pred))
print(f"Test RMSE: {rmse:.4f}")


Test RMSE: 0.1430


In [205]:
# Linear Regression
lr_rmse = np.sqrt(mean_squared_error(y_test, lr_model.predict(X_test)))

# Random Forest
rf_rmse = np.sqrt(mean_squared_error(y_test, rf_model.predict(X_test)))

# Gradient Boosting
gb_rmse = np.sqrt(mean_squared_error(y_test, gb_model.predict(X_test)))

print(f"Linear Regression RMSE: {lr_rmse:.4f}")
print(f"Random Forest RMSE: {rf_rmse:.4f}")
print(f"Gradient Boosting RMSE: {gb_rmse:.4f}")


Linear Regression RMSE: 0.1549
Random Forest RMSE: 0.1496
Gradient Boosting RMSE: 0.1430


In [206]:
# Convert predictions back to original scale
y_test_pred_orig = np.expm1(y_test_pred)  # inverse log1p
y_test_orig = np.expm1(y_test)

# RMSE in original SalePrice scale
rmse_orig = np.sqrt(mean_squared_error(y_test_orig, y_test_pred_orig))
print(f"Test RMSE (original SalePrice): {rmse_orig:.2f} USD")


Test RMSE (original SalePrice): 29086.95 USD


# ⭐: R²

In [None]:
# what is R² (R-squared)?
# R² measures how much of the variance in the target variable is explained by the model.

# Interpretation:
# R² = 1 → perfect prediction
# R² = 0 → model predicts as well as the mean
# R² < 0 → model is worse than just predicting the mean

In [208]:
from sklearn.metrics import r2_score

# Example: predictions from your model
r2 = r2_score(y_test, y_test_pred)
print(f"R² score: {r2:.4f}")


R² score: 0.8903


In [209]:
# Linear Regression
lr_r2 = r2_score(y_test, lr_model.predict(X_test))

# Random Forest
rf_r2 = r2_score(y_test, rf_model.predict(X_test))

# Gradient Boosting
gb_r2 = r2_score(y_test, gb_model.predict(X_test))

print(f"Linear Regression R²: {lr_r2:.4f}")
print(f"Random Forest R²: {rf_r2:.4f}")
print(f"Gradient Boosting R²: {gb_r2:.4f}")


Linear Regression R²: 0.8714
Random Forest R²: 0.8800
Gradient Boosting R²: 0.8903


# ⭐:Compare baseline vs Random Forest

In [210]:
from sklearn.metrics import mean_squared_error, r2_score
import numpy as np

# Linear Regression
y_train_pred_lr = lr_model.predict(X_train)
y_test_pred_lr = lr_model.predict(X_test)

lr_train_rmse = np.sqrt(mean_squared_error(y_train, y_train_pred_lr))
lr_test_rmse  = np.sqrt(mean_squared_error(y_test, y_test_pred_lr))
lr_train_r2 = r2_score(y_train, y_train_pred_lr)
lr_test_r2  = r2_score(y_test, y_test_pred_lr)

# Random Forest
y_train_pred_rf = rf_model.predict(X_train)
y_test_pred_rf = rf_model.predict(X_test)

rf_train_rmse = np.sqrt(mean_squared_error(y_train, y_train_pred_rf))
rf_test_rmse  = np.sqrt(mean_squared_error(y_test, y_test_pred_rf))
rf_train_r2 = r2_score(y_train, y_train_pred_rf)
rf_test_r2  = r2_score(y_test, y_test_pred_rf)


In [211]:
import pandas as pd

comparison = pd.DataFrame({
    'Model': ['Linear Regression', 'Random Forest'],
    'Train RMSE': [lr_train_rmse, rf_train_rmse],
    'Test RMSE': [lr_test_rmse, rf_test_rmse],
    'Train R²': [lr_train_r2, rf_train_r2],
    'Test R²': [lr_test_r2, rf_test_r2]
})

print(comparison)


               Model  Train RMSE  Test RMSE  Train R²   Test R²
0  Linear Regression    0.143098   0.154921  0.865673  0.871387
1      Random Forest    0.052638   0.149649  0.981824  0.879991


In [214]:
import plotly.express as px

# Combine test results
results = pd.DataFrame({
    'Actual': np.expm1(y_test),  # convert back to original SalePrice
    'LR_Predicted': np.expm1(y_test_pred_lr),
    'RF_Predicted': np.expm1(y_test_pred_rf)
})

fig = px.scatter(results, x='Actual', y='LR_Predicted', color_discrete_sequence=['blue'], 
                 title='Predicted vs Actual SalePrice (Linear Regression)')
fig.add_scatter(x=results['Actual'], y=results['RF_Predicted'], mode='markers', name='Random Forest', marker_color='red')
fig.add_scatter(x=results['Actual'], y=results['Actual'], mode='lines', name='Perfect Prediction', line=dict(color='black', dash='dash'))
fig.update_layout( title=dict(x=0.5, font=dict(size=22)))
fig.update_traces(marker=dict(size=5, opacity=0.5), mode='markers')
fig.show()


In [None]:
# Blue points → Linear Regression
# Red points → Random Forest
# Black dashed line → perfect prediction