# House Price Prediction: Advanced Regression Techniques- Data Wrangling

In [240]:
#Import necessary modules

import requests
import pandas as pd
import opendatasets as od
import matplotlib.pyplot as plt
import seaborn as sns
import os
from sklearn.preprocessing import OrdinalEncoder
from sklearn.impute import SimpleImputer

In [241]:
#Using the kaggle api to download the dataset

od.download('https://www.kaggle.com/competitions/house-prices-advanced-regression-techniques/data?select=train.csv')

Skipping, found downloaded files in ".\house-prices-advanced-regression-techniques" (use force=True to force download)


In [242]:
# Load train.csv into a DataFrame
train_df = pd.read_csv("train.csv")

# Load test.csv into a DataFrame
test_df = pd.read_csv("test.csv")

Now we have the dataframes made for the training and testing datasets and will take a look at the data

In [243]:
train_df.head()

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,...,0,,,,0,2,2008,WD,Normal,208500
1,2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,...,0,,,,0,5,2007,WD,Normal,181500
2,3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,...,0,,,,0,9,2008,WD,Normal,223500
3,4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,...,0,,,,0,2,2006,WD,Abnorml,140000
4,5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,...,0,,,,0,12,2008,WD,Normal,250000


In [244]:
train_df.shape

(1460, 81)

In [245]:
test_df.head()

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,ScreenPorch,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition
0,1461,20,RH,80.0,11622,Pave,,Reg,Lvl,AllPub,...,120,0,,MnPrv,,0,6,2010,WD,Normal
1,1462,20,RL,81.0,14267,Pave,,IR1,Lvl,AllPub,...,0,0,,,Gar2,12500,6,2010,WD,Normal
2,1463,60,RL,74.0,13830,Pave,,IR1,Lvl,AllPub,...,0,0,,MnPrv,,0,3,2010,WD,Normal
3,1464,60,RL,78.0,9978,Pave,,IR1,Lvl,AllPub,...,0,0,,,,0,6,2010,WD,Normal
4,1465,120,RL,43.0,5005,Pave,,IR1,HLS,AllPub,...,144,0,,,,0,1,2010,WD,Normal


In [246]:
train_df.shape

(1460, 81)

In [247]:
#New dataframe with the combined datasets in order to view all of the data

full_df = pd.concat([train_df, test_df], ignore_index=True)
full_df.head()

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,...,0,,,,0,2,2008,WD,Normal,208500.0
1,2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,...,0,,,,0,5,2007,WD,Normal,181500.0
2,3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,...,0,,,,0,9,2008,WD,Normal,223500.0
3,4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,...,0,,,,0,2,2006,WD,Abnorml,140000.0
4,5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,...,0,,,,0,12,2008,WD,Normal,250000.0


In [248]:
full_df.shape

(2919, 81)

In [249]:
full_df.columns

Index(['Id', 'MSSubClass', 'MSZoning', 'LotFrontage', 'LotArea', 'Street',
       'Alley', 'LotShape', 'LandContour', 'Utilities', 'LotConfig',
       'LandSlope', 'Neighborhood', 'Condition1', 'Condition2', 'BldgType',
       'HouseStyle', 'OverallQual', 'OverallCond', 'YearBuilt', 'YearRemodAdd',
       'RoofStyle', 'RoofMatl', 'Exterior1st', 'Exterior2nd', 'MasVnrType',
       'MasVnrArea', 'ExterQual', 'ExterCond', 'Foundation', 'BsmtQual',
       'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinSF1',
       'BsmtFinType2', 'BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF', 'Heating',
       'HeatingQC', 'CentralAir', 'Electrical', '1stFlrSF', '2ndFlrSF',
       'LowQualFinSF', 'GrLivArea', 'BsmtFullBath', 'BsmtHalfBath', 'FullBath',
       'HalfBath', 'BedroomAbvGr', 'KitchenAbvGr', 'KitchenQual',
       'TotRmsAbvGrd', 'Functional', 'Fireplaces', 'FireplaceQu', 'GarageType',
       'GarageYrBlt', 'GarageFinish', 'GarageCars', 'GarageArea', 'GarageQual',
       'GarageCond', 'PavedDrive

In [250]:
full_df.dtypes

Id                 int64
MSSubClass         int64
MSZoning          object
LotFrontage      float64
LotArea            int64
                  ...   
MoSold             int64
YrSold             int64
SaleType          object
SaleCondition     object
SalePrice        float64
Length: 81, dtype: object

In [251]:
full_df.set_index("Id", inplace=True)

In [252]:
full_df.head()

Unnamed: 0_level_0,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,LotConfig,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,Inside,...,0,,,,0,2,2008,WD,Normal,208500.0
2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,FR2,...,0,,,,0,5,2007,WD,Normal,181500.0
3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,Inside,...,0,,,,0,9,2008,WD,Normal,223500.0
4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,Corner,...,0,,,,0,2,2006,WD,Abnorml,140000.0
5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,FR2,...,0,,,,0,12,2008,WD,Normal,250000.0


In [253]:
full_df.info

<bound method DataFrame.info of       MSSubClass MSZoning  LotFrontage  LotArea Street Alley LotShape  \
Id                                                                      
1             60       RL         65.0     8450   Pave   NaN      Reg   
2             20       RL         80.0     9600   Pave   NaN      Reg   
3             60       RL         68.0    11250   Pave   NaN      IR1   
4             70       RL         60.0     9550   Pave   NaN      IR1   
5             60       RL         84.0    14260   Pave   NaN      IR1   
...          ...      ...          ...      ...    ...   ...      ...   
2915         160       RM         21.0     1936   Pave   NaN      Reg   
2916         160       RM         21.0     1894   Pave   NaN      Reg   
2917          20       RL        160.0    20000   Pave   NaN      Reg   
2918          85       RL         62.0    10441   Pave   NaN      Reg   
2919          60       RL         74.0     9627   Pave   NaN      Reg   

     LandContour U

In [254]:
#Print out the first row transposed for sample home submission

full_df.iloc[0].T

MSSubClass             60
MSZoning               RL
LotFrontage          65.0
LotArea              8450
Street               Pave
                   ...   
MoSold                  2
YrSold               2008
SaleType               WD
SaleCondition      Normal
SalePrice        208500.0
Name: 1, Length: 80, dtype: object

In [255]:
#Count the number of missing values in each column and then sort. Count column shows us the number of instances that are
# missing and the % will show the percent of each column missing

missing = pd.concat([full_df.isnull().sum(), 100 * full_df.isnull().mean()], axis=1)
missing.columns=['count', '%']
missing.sort_values(by='count', ascending=False)

Unnamed: 0,count,%
PoolQC,2909,99.657417
MiscFeature,2814,96.402878
Alley,2721,93.216855
Fence,2348,80.438506
SalePrice,1459,49.982871
...,...,...
1stFlrSF,0,0.000000
2ndFlrSF,0,0.000000
LowQualFinSF,0,0.000000
GrLivArea,0,0.000000


In [256]:
#Pool quality has the most missing values. We will check what the unique missing values are and the counts of each.

poolqc_counts = full_df['PoolQC'].value_counts()
print(poolqc_counts)

Ex    4
Gd    4
Fa    2
Name: PoolQC, dtype: int64


In [257]:
#Drop the pool quality column with very little information to help us

full_df = full_df.drop('PoolQC', axis=1)

In [258]:
full_df.shape

(2919, 79)

In [259]:
misc_feature_counts = full_df['MiscFeature'].value_counts()
print(misc_feature_counts)

Shed    95
Gar2     5
Othr     4
TenC     1
Name: MiscFeature, dtype: int64


In [260]:
# Create a new column 'HasShed' indicating the presence of a shed as most non missing values of misc feature are shed

full_df['HasShed'] = full_df['MiscFeature'].apply(lambda x: 1 if x == 'Shed' else 0)

In [261]:
#Drop the misc feature column for the new has shed column

full_df = full_df.drop('MiscFeature', axis=1)
full_df.head()

Unnamed: 0_level_0,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,LotConfig,...,ScreenPorch,PoolArea,Fence,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice,HasShed
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,Inside,...,0,0,,0,2,2008,WD,Normal,208500.0,0
2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,FR2,...,0,0,,0,5,2007,WD,Normal,181500.0,0
3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,Inside,...,0,0,,0,9,2008,WD,Normal,223500.0,0
4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,Corner,...,0,0,,0,2,2006,WD,Abnorml,140000.0,0
5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,FR2,...,0,0,,0,12,2008,WD,Normal,250000.0,0


In [262]:
#Alley column unlikely to provide significant insights into our predictions with over 90% missing values

full_df = full_df.drop('Alley', axis=1)
full_df.shape

(2919, 78)

In [263]:
full_df.select_dtypes(object)

Unnamed: 0_level_0,MSZoning,Street,LotShape,LandContour,Utilities,LotConfig,LandSlope,Neighborhood,Condition1,Condition2,...,Functional,FireplaceQu,GarageType,GarageFinish,GarageQual,GarageCond,PavedDrive,Fence,SaleType,SaleCondition
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,RL,Pave,Reg,Lvl,AllPub,Inside,Gtl,CollgCr,Norm,Norm,...,Typ,,Attchd,RFn,TA,TA,Y,,WD,Normal
2,RL,Pave,Reg,Lvl,AllPub,FR2,Gtl,Veenker,Feedr,Norm,...,Typ,TA,Attchd,RFn,TA,TA,Y,,WD,Normal
3,RL,Pave,IR1,Lvl,AllPub,Inside,Gtl,CollgCr,Norm,Norm,...,Typ,TA,Attchd,RFn,TA,TA,Y,,WD,Normal
4,RL,Pave,IR1,Lvl,AllPub,Corner,Gtl,Crawfor,Norm,Norm,...,Typ,Gd,Detchd,Unf,TA,TA,Y,,WD,Abnorml
5,RL,Pave,IR1,Lvl,AllPub,FR2,Gtl,NoRidge,Norm,Norm,...,Typ,TA,Attchd,RFn,TA,TA,Y,,WD,Normal
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2915,RM,Pave,Reg,Lvl,AllPub,Inside,Gtl,MeadowV,Norm,Norm,...,Typ,,,,,,Y,,WD,Normal
2916,RM,Pave,Reg,Lvl,AllPub,Inside,Gtl,MeadowV,Norm,Norm,...,Typ,,CarPort,Unf,TA,TA,Y,,WD,Abnorml
2917,RL,Pave,Reg,Lvl,AllPub,Inside,Gtl,Mitchel,Norm,Norm,...,Typ,TA,Detchd,Unf,TA,TA,Y,,WD,Abnorml
2918,RL,Pave,Reg,Lvl,AllPub,Inside,Gtl,Mitchel,Norm,Norm,...,Typ,,,,,,Y,MnPrv,WD,Normal


In [264]:
#Checking for unique values in utilities as there is no variation seen. Drop column since only 1 value not AllPub

utility_count = full_df['Utilities'].value_counts()
print(utility_count)
full_df = full_df.drop('Utilities', axis=1)
full_df.shape

AllPub    2916
NoSeWa       1
Name: Utilities, dtype: int64


(2919, 77)

In [265]:
#Loop through the columns of type object and show the unique counts of each

for column in full_df.columns:
    if full_df[column].dtype == 'object':
        unique_counts = full_df[column].value_counts()
        print(f"Column: {column}")
        print(unique_counts)

Column: MSZoning
RL         2265
RM          460
FV          139
RH           26
C (all)      25
Name: MSZoning, dtype: int64
Column: Street
Pave    2907
Grvl      12
Name: Street, dtype: int64
Column: LotShape
Reg    1859
IR1     968
IR2      76
IR3      16
Name: LotShape, dtype: int64
Column: LandContour
Lvl    2622
HLS     120
Bnk     117
Low      60
Name: LandContour, dtype: int64
Column: LotConfig
Inside     2133
Corner      511
CulDSac     176
FR2          85
FR3          14
Name: LotConfig, dtype: int64
Column: LandSlope
Gtl    2778
Mod     125
Sev      16
Name: LandSlope, dtype: int64
Column: Neighborhood
NAmes      443
CollgCr    267
OldTown    239
Edwards    194
Somerst    182
NridgHt    166
Gilbert    165
Sawyer     151
NWAmes     131
SawyerW    125
Mitchel    114
BrkSide    108
Crawfor    103
IDOTRR      93
Timber      72
NoRidge     71
StoneBr     51
SWISU       48
ClearCr     44
MeadowV     37
BrDale      30
Blmngtn     28
Veenker     24
NPkVill     23
Blueste     10
Name

In [266]:
#Drop columns with very low variability with not much information to provide

columns_to_drop = ['Street', 'RoofMatl', 'Heating', 'Condition2']
full_df.drop(columns_to_drop, axis=1, inplace=True)
full_df.head()

Unnamed: 0_level_0,MSSubClass,MSZoning,LotFrontage,LotArea,LotShape,LandContour,LotConfig,LandSlope,Neighborhood,Condition1,...,ScreenPorch,PoolArea,Fence,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice,HasShed
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,60,RL,65.0,8450,Reg,Lvl,Inside,Gtl,CollgCr,Norm,...,0,0,,0,2,2008,WD,Normal,208500.0,0
2,20,RL,80.0,9600,Reg,Lvl,FR2,Gtl,Veenker,Feedr,...,0,0,,0,5,2007,WD,Normal,181500.0,0
3,60,RL,68.0,11250,IR1,Lvl,Inside,Gtl,CollgCr,Norm,...,0,0,,0,9,2008,WD,Normal,223500.0,0
4,70,RL,60.0,9550,IR1,Lvl,Corner,Gtl,Crawfor,Norm,...,0,0,,0,2,2006,WD,Abnorml,140000.0,0
5,60,RL,84.0,14260,IR1,Lvl,FR2,Gtl,NoRidge,Norm,...,0,0,,0,12,2008,WD,Normal,250000.0,0


In [270]:
# Define the columns to encode
columns_to_encode = ['ExterQual', 'ExterCond', 'BsmtQual', 'BsmtCond', 'HeatingQC', 'KitchenQual', 'FireplaceQu', 'GarageQual', 'GarageCond']

# Create a copy of the DataFrame to avoid modifying the original data
encoded_df = full_df.copy()

# Iterate over the columns and perform imputation and encoding
for column in columns_to_encode:
    # Create a new category 'NA' to handle missing values
    categories = ['Po', 'Fa', 'TA', 'Gd', 'Ex', 'NA']
    
    # Impute missing values with 'NA'
    imputer = SimpleImputer(strategy='constant', fill_value='NA')
    encoded_df[column] = imputer.fit_transform(encoded_df[[column]])
    
    # Perform ordinal encoding
    ordinal_encoder = OrdinalEncoder(categories=[categories])
    encoded_df[column] = ordinal_encoder.fit_transform(encoded_df[[column]])


In [None]:
['BsmtExposure','BsmtFinType1', 'BsmtFinType2', 'Functional', 'LandSlope']

In [271]:
#Check if the types of the columns switched from object to int

for column_name in columns_to_encode:
    column_type = full_df[column_name].dtypes
    print(f"Column '{column_name}': {column_type}")

Column 'ExterQual': object
Column 'ExterCond': object
Column 'BsmtQual': object
Column 'BsmtCond': object
Column 'HeatingQC': object
Column 'KitchenQual': object
Column 'FireplaceQu': object
Column 'GarageQual': object
Column 'GarageCond': object


In [269]:
poolqc_counts = full_df['ExterCond'].value_counts()
print(poolqc_counts)

TA    2538
Gd     299
Fa      67
Ex      12
Po       3
Name: ExterCond, dtype: int64


In [111]:
full_df.select_dtypes(object)

Unnamed: 0_level_0,MSZoning,LotShape,LandContour,LotConfig,Neighborhood,Condition1,BldgType,HouseStyle,RoofStyle,Exterior1st,...,MasVnrType,Foundation,CentralAir,Electrical,GarageType,GarageFinish,PavedDrive,Fence,SaleType,SaleCondition
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,RL,Reg,Lvl,Inside,CollgCr,Norm,1Fam,2Story,Gable,VinylSd,...,BrkFace,PConc,Y,SBrkr,Attchd,RFn,Y,,WD,Normal
2,RL,Reg,Lvl,FR2,Veenker,Feedr,1Fam,1Story,Gable,MetalSd,...,,CBlock,Y,SBrkr,Attchd,RFn,Y,,WD,Normal
3,RL,IR1,Lvl,Inside,CollgCr,Norm,1Fam,2Story,Gable,VinylSd,...,BrkFace,PConc,Y,SBrkr,Attchd,RFn,Y,,WD,Normal
4,RL,IR1,Lvl,Corner,Crawfor,Norm,1Fam,2Story,Gable,Wd Sdng,...,,BrkTil,Y,SBrkr,Detchd,Unf,Y,,WD,Abnorml
5,RL,IR1,Lvl,FR2,NoRidge,Norm,1Fam,2Story,Gable,VinylSd,...,BrkFace,PConc,Y,SBrkr,Attchd,RFn,Y,,WD,Normal
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2915,RM,Reg,Lvl,Inside,MeadowV,Norm,Twnhs,2Story,Gable,CemntBd,...,,CBlock,Y,SBrkr,,,Y,,WD,Normal
2916,RM,Reg,Lvl,Inside,MeadowV,Norm,TwnhsE,2Story,Gable,CemntBd,...,,CBlock,Y,SBrkr,CarPort,Unf,Y,,WD,Abnorml
2917,RL,Reg,Lvl,Inside,Mitchel,Norm,1Fam,1Story,Gable,VinylSd,...,,CBlock,Y,SBrkr,Detchd,Unf,Y,,WD,Abnorml
2918,RL,Reg,Lvl,Inside,Mitchel,Norm,1Fam,SFoyer,Gable,HdBoard,...,,PConc,Y,SBrkr,,,Y,MnPrv,WD,Normal


In [114]:
# Filter the dataset for normal sale conditions
filtered_df = full_df[full_df['SaleCondition'] == 'Normal']

# Remove the SaleCondition column
filtered_df = filtered_df.drop('SaleCondition', axis=1)

In [115]:
# Calculate mean and median sales prices for full_df
mean_sales_price_full = full_df['SalePrice'].mean()
median_sales_price_full = full_df['SalePrice'].median()

# Calculate mean and median sales prices for filtered_df
mean_sales_price_filtered = filtered_df['SalePrice'].mean()
median_sales_price_filtered = filtered_df['SalePrice'].median()

# Print the results
print("Mean sales price - Full dataset:", mean_sales_price_full)
print("Median sales price - Full dataset:", median_sales_price_full)
print("Mean sales price - Filtered dataset:", mean_sales_price_filtered)
print("Median sales price - Filtered dataset:", median_sales_price_filtered)

Mean sales price - Full dataset: 180921.19589041095
Median sales price - Full dataset: 163000.0
Mean sales price - Filtered dataset: 175202.21953255427
Median sales price - Filtered dataset: 160000.0
