In [1]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

In [7]:
df = pd.read_csv('../Data/train.csv')
df.head()

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,...,0,,,,0,2,2008,WD,Normal,208500
1,2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,...,0,,,,0,5,2007,WD,Normal,181500
2,3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,...,0,,,,0,9,2008,WD,Normal,223500
3,4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,...,0,,,,0,2,2006,WD,Abnorml,140000
4,5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,...,0,,,,0,12,2008,WD,Normal,250000


In [8]:
missing_percentage = df.isnull().mean() * 100
print(missing_percentage.sort_values(ascending=False))

PoolQC         99.520548
MiscFeature    96.301370
Alley          93.767123
Fence          80.753425
MasVnrType     59.726027
                 ...    
ExterQual       0.000000
Exterior2nd     0.000000
Exterior1st     0.000000
RoofMatl        0.000000
SalePrice       0.000000
Length: 81, dtype: float64


In [10]:
df.drop(columns = ['Alley', 'PoolQC', 'MiscFeature', 'Fence'], inplace=True)

In [11]:
missing_percentage = df.isnull().mean() * 100
print(missing_percentage.sort_values(ascending=False))

MasVnrType     59.726027
FireplaceQu    47.260274
LotFrontage    17.739726
GarageCond      5.547945
GarageYrBlt     5.547945
                 ...    
BsmtUnfSF       0.000000
TotalBsmtSF     0.000000
MSSubClass      0.000000
HeatingQC       0.000000
SalePrice       0.000000
Length: 77, dtype: float64


In [14]:
features_to_check = ['MasVnrType', 'FireplaceQu', 'LotFrontage', 'GarageCond', 'GarageYrBlt']

for feature in features_to_check:
    if df[feature].dtype == 'object':
        print(f"{feature} is categorical")
    else:
        corr_value = df[feature].corr(df['SalePrice'])
        print(f"Correlation of {feature} with SalePrice: {corr_value:.3f}")


MasVnrType is categorical
FireplaceQu is categorical
Correlation of LotFrontage with SalePrice: 0.352
GarageCond is categorical
Correlation of GarageYrBlt with SalePrice: 0.486


In [15]:
from sklearn.preprocessing import LabelEncoder

df_encoded = df.copy()

# Convert MasVnrType to string first (to handle NaNs)
df_encoded['MasVnrType'] = df_encoded['MasVnrType'].astype(str)

# Label encode
le = LabelEncoder()
df_encoded['MasVnrType_encoded'] = le.fit_transform(df_encoded['MasVnrType'])

# Check correlation
corr_value = df_encoded['MasVnrType_encoded'].corr(df_encoded['SalePrice'])
print(f"Correlation of MasVnrType with SalePrice: {corr_value:.3f}")


Correlation of MasVnrType with SalePrice: -0.282


In [17]:
df.drop(columns=['MasVnrType'], inplace=True)

In [None]:
missing_percentage = df.isnull().mean() * 100
print(missing_percentage.sort_values(ascending=False))

FireplaceQu     47.260274
LotFrontage     17.739726
GarageType       5.547945
GarageYrBlt      5.547945
GarageFinish     5.547945
                  ...    
BsmtUnfSF        0.000000
TotalBsmtSF      0.000000
Heating          0.000000
MSSubClass       0.000000
SalePrice        0.000000
Length: 76, dtype: float64


In [23]:
df['FireplaceQu'] = df['FireplaceQu'].fillna('None')
fill_garage_none = ['GarageCond', 'GarageQual', 'GarageFinish', 'GarageType', 'GarageYrBlt']
for col in fill_garage_none:
    df[col] = df[col].fillna('None')
    
missing_percentage = df.isnull().mean() * 100
print(missing_percentage.sort_values(ascending=False))

LotFrontage     17.739726
BsmtFinType2     2.602740
BsmtExposure     2.602740
BsmtFinType1     2.534247
BsmtQual         2.534247
                  ...    
BsmtUnfSF        0.000000
TotalBsmtSF      0.000000
Heating          0.000000
MSSubClass       0.000000
SalePrice        0.000000
Length: 76, dtype: float64


In [None]:
df['LotFrontage'] = df.groupby('Neighborhood')['LotFrontage'].transform(
    lambda x: x.fillna(x.median())
)
df['LotFrontage'].isnull().sum()

0

In [25]:
fill_basement_none = ['BsmtQual', 'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinType2']
for col in fill_basement_none:
    df[col] = df[col].fillna('None')
    
missing_percentage = df.isnull().mean() * 100
print(missing_percentage.sort_values(ascending=False))

MasVnrArea      0.547945
Electrical      0.068493
BedroomAbvGr    0.000000
FireplaceQu     0.000000
Fireplaces      0.000000
                  ...   
ExterQual       0.000000
Exterior2nd     0.000000
Exterior1st     0.000000
RoofMatl        0.000000
SalePrice       0.000000
Length: 76, dtype: float64


In [32]:
MasVnrArea = df['MasVnrArea'].fillna(0.0, inplace=True)
Elecrtical = df['Electrical'].fillna(df['Electrical'].mode()[0], inplace=True)

missing_percentage = df.isnull().mean() * 100
print(missing_percentage.sort_values(ascending=False))

Id             0.0
HalfBath       0.0
FireplaceQu    0.0
Fireplaces     0.0
Functional     0.0
              ... 
MasVnrArea     0.0
Exterior2nd    0.0
Exterior1st    0.0
RoofMatl       0.0
SalePrice      0.0
Length: 76, dtype: float64


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  MasVnrArea = df['MasVnrArea'].fillna(0.0, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  Elecrtical = df['Electrical'].fillna(df['Electrical'].mode()[0], inplace=True)


In [33]:
df.head()

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,LotShape,LandContour,Utilities,LotConfig,...,EnclosedPorch,3SsnPorch,ScreenPorch,PoolArea,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,RL,65.0,8450,Pave,Reg,Lvl,AllPub,Inside,...,0,0,0,0,0,2,2008,WD,Normal,208500
1,2,20,RL,80.0,9600,Pave,Reg,Lvl,AllPub,FR2,...,0,0,0,0,0,5,2007,WD,Normal,181500
2,3,60,RL,68.0,11250,Pave,IR1,Lvl,AllPub,Inside,...,0,0,0,0,0,9,2008,WD,Normal,223500
3,4,70,RL,60.0,9550,Pave,IR1,Lvl,AllPub,Corner,...,272,0,0,0,0,2,2006,WD,Abnorml,140000
4,5,60,RL,84.0,14260,Pave,IR1,Lvl,AllPub,FR2,...,0,0,0,0,0,12,2008,WD,Normal,250000
