Data  loading and finding missing values 

In [53]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

In [54]:
df = pd.read_csv("new_dataset_part2_picu_laura.csv")

In [55]:
df.columns

Index(['BsmtFullBath', 'BsmtHalfBath', 'FullBath', 'HalfBath', 'BedroomAbvGr',
       'KitchenAbvGr', 'KitchenQual', 'TotRmsAbvGrd', 'Functional',
       'Fireplaces', 'FireplaceQu', 'GarageType', 'GarageYrBlt',
       'GarageFinish', 'GarageCars', 'GarageArea', 'GarageQual', 'GarageCond',
       'PavedDrive', 'SalePrice'],
      dtype='object')

In [56]:
df.shape

(1460, 20)

In [57]:
df.isna().any().any()


np.True_

In [58]:
df.isna().sum()

BsmtFullBath      0
BsmtHalfBath      0
FullBath          0
HalfBath          0
BedroomAbvGr      0
KitchenAbvGr      0
KitchenQual       0
TotRmsAbvGrd      0
Functional        0
Fireplaces        0
FireplaceQu     690
GarageType       81
GarageYrBlt      81
GarageFinish     81
GarageCars        0
GarageArea        0
GarageQual       81
GarageCond       81
PavedDrive        0
SalePrice         0
dtype: int64

In [59]:
df.drop(columns=["FireplaceQu"], inplace=True)


In [60]:
df.shape

(1460, 19)

In [61]:
df.columns

Index(['BsmtFullBath', 'BsmtHalfBath', 'FullBath', 'HalfBath', 'BedroomAbvGr',
       'KitchenAbvGr', 'KitchenQual', 'TotRmsAbvGrd', 'Functional',
       'Fireplaces', 'GarageType', 'GarageYrBlt', 'GarageFinish', 'GarageCars',
       'GarageArea', 'GarageQual', 'GarageCond', 'PavedDrive', 'SalePrice'],
      dtype='object')

I identified the columns that don't have many missing values and changed their missing values with the implicit value "None".

In [62]:
garage_cols = ["GarageType", "GarageYrBlt", "GarageFinish", "GarageQual", "GarageCond"]

for col in garage_cols:
    df[col].fillna("None", inplace=True)


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[col].fillna("None", inplace=True)
  df[col].fillna("None", inplace=True)


In [63]:
df.dtypes

BsmtFullBath     int64
BsmtHalfBath     int64
FullBath         int64
HalfBath         int64
BedroomAbvGr     int64
KitchenAbvGr     int64
KitchenQual     object
TotRmsAbvGrd     int64
Functional      object
Fireplaces       int64
GarageType      object
GarageYrBlt     object
GarageFinish    object
GarageCars       int64
GarageArea       int64
GarageQual      object
GarageCond      object
PavedDrive      object
SalePrice        int64
dtype: object

Changed back GarageYrBlt from object to int.

In [64]:
df["GarageYrBlt"] = pd.to_numeric(df["GarageYrBlt"], errors='coerce')


In [65]:
df["GarageYrBlt"] = df["GarageYrBlt"].fillna(0).astype(int)


In [66]:
df.dtypes

BsmtFullBath     int64
BsmtHalfBath     int64
FullBath         int64
HalfBath         int64
BedroomAbvGr     int64
KitchenAbvGr     int64
KitchenQual     object
TotRmsAbvGrd     int64
Functional      object
Fireplaces       int64
GarageType      object
GarageYrBlt      int64
GarageFinish    object
GarageCars       int64
GarageArea       int64
GarageQual      object
GarageCond      object
PavedDrive      object
SalePrice        int64
dtype: object

In [67]:
df.isna().sum()

BsmtFullBath    0
BsmtHalfBath    0
FullBath        0
HalfBath        0
BedroomAbvGr    0
KitchenAbvGr    0
KitchenQual     0
TotRmsAbvGrd    0
Functional      0
Fireplaces      0
GarageType      0
GarageYrBlt     0
GarageFinish    0
GarageCars      0
GarageArea      0
GarageQual      0
GarageCond      0
PavedDrive      0
SalePrice       0
dtype: int64

Data preprocessing

In [68]:
numeric_cols = df.select_dtypes(include=["number"]).columns.tolist()
categorical_cols = df.select_dtypes(include=["object"]).columns.tolist()


In [69]:
print(numeric_cols)

['BsmtFullBath', 'BsmtHalfBath', 'FullBath', 'HalfBath', 'BedroomAbvGr', 'KitchenAbvGr', 'TotRmsAbvGrd', 'Fireplaces', 'GarageYrBlt', 'GarageCars', 'GarageArea', 'SalePrice']


In [70]:
print(categorical_cols)

['KitchenQual', 'Functional', 'GarageType', 'GarageFinish', 'GarageQual', 'GarageCond', 'PavedDrive']


In [71]:
df.nunique()


BsmtFullBath      4
BsmtHalfBath      3
FullBath          4
HalfBath          3
BedroomAbvGr      8
KitchenAbvGr      4
KitchenQual       4
TotRmsAbvGrd     12
Functional        7
Fireplaces        4
GarageType        7
GarageYrBlt      98
GarageFinish      4
GarageCars        5
GarageArea      441
GarageQual        6
GarageCond        6
PavedDrive        3
SalePrice       663
dtype: int64

In [72]:
df[categorical_cols].nunique()


KitchenQual     4
Functional      7
GarageType      7
GarageFinish    4
GarageQual      6
GarageCond      6
PavedDrive      3
dtype: int64