In [40]:
#### Importing Libraries 
import pandas as pd
import os
from kaggle.api.kaggle_api_extended import KaggleApi
import zipfile

import numpy as np
import math
from tqdm.notebook import tqdm
from sklearn.exceptions import ConvergenceWarning
import warnings
from matplotlib import pyplot as plt
import seaborn as sns
from scipy import stats as scs

#### Download dataset

In [43]:

api = KaggleApi()
api.authenticate()


competition_name = 'house-prices-advanced-regression-techniques'

api.competition_download_files(competition=competition_name, path="dataset", force=True)
with zipfile.ZipFile(f'dataset/{competition_name}.zip', 'r') as zip_ref:
    zip_ref.extractall(f'dataset')



#### loading dataset

In [44]:

train_data = pd.read_csv("dataset/train.csv")
test_data = pd.read_csv("dataset/test.csv")

In [53]:
train_data.head()

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,...,0,,,,0,2,2008,WD,Normal,208500
1,2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,...,0,,,,0,5,2007,WD,Normal,181500
2,3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,...,0,,,,0,9,2008,WD,Normal,223500
3,4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,...,0,,,,0,2,2006,WD,Abnorml,140000
4,5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,...,0,,,,0,12,2008,WD,Normal,250000


#### Data constants

In [54]:
# categorical that support 'NA'
features_cat_with_na = [
    'Alley', 'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinType2', 
    'BsmtQual', 'Fence', 'FireplaceQu', 'GarageCond', 'GarageFinish', 
    'GarageQual', 'GarageType',  'MiscFeature', 'PoolQC', 
]

# numerical that have NaNs
features_cont_with_na = [
    'BsmtFinSF1', 'BsmtFinSF2', 'BsmtFullBath',  'BsmtUnfSF', 
    'BsmtHalfBath', 'GarageArea', 'GarageCars',  'GarageYrBlt',
    'LotFrontage', 'MasVnrArea', 'TotalBsmtSF', 
]

# possibly being removed
features_with_too_much_nas = [
    'MiscFeature', 'PoolQC', 'Fence', 'FireplaceQu', 'Alley'
]

outliers_dict = {
    "LotArea": '< 30000', 
    "GrLivArea": '< 4000', 
    "TotalBsmtSF": '< 2800'
}

mode_dict = {
    "MSZoning": train_data["MSZoning"].mode()[0], 
    "Utilities": train_data["Utilities"].mode()[0]
}

#### Data preperation


In [93]:


# fill nans
def fillna(dataset, col, fill_with='NA'):
    dataset[col].fillna(fill_with, inplace=True)
    return dataset

def remove_outliers(dataset):

    dataset = dataset[eval(f"dataset['LotArea'] {outliers['LotArea']}")]
    # dataset = dataset[eval(f"dataset['GrLivArea'] {outliers['GrLivArea']}")]
    # dataset = dataset[eval(f"dataset['TotalBsmtSF'] {outliers['TotalBsmtSF']}")]
    return dataset

def prepare_dataset(dataset, type = 'train'):

    for feat in features_cat_with_na:
        dataset = fillna(dataset, feat)
        
    for feat in features_cont_with_na:
        dataset = fillna(dataset, feat, dataset[feat].mean())

    # these features don't support 'NA', they have different values
    dataset = fillna(dataset, 'Electrical', 'SBrkr')
    dataset = fillna(dataset, 'Exterior1st', 'Other')
    dataset = fillna(dataset, 'Exterior2nd', 'Other')
    dataset = fillna(dataset, 'Functional', 'Typ')
    dataset = fillna(dataset, 'KitchenQual', 'TA')
    dataset = fillna(dataset, 'SaleType', 'Oth')
    dataset = fillna(dataset, 'MasVnrType', 'None')

    # just with mode
    dataset = fillna(dataset, 'MSZoning', mode_dict["MSZoning"])
    dataset = fillna(dataset, 'Utilities', mode_dict["Utilities"])

    # remove outliers based on plots above
    dataset = remove_outliers(dataset)
    print(dataset.head())
    # extract target and ids for test set
    target = dataset['SalePrice']
    Ids = dataset['Id'] if type == 'test' else None

    # remove target and ids
    dataset.drop(columns=['SalePrice', 'Id'], inplace=True)
    dataset.drop(columns=['Id'], inplace=True)

    return dataset, target, Ids


In [94]:
prepare_dataset(dataset=train_data, type = 'train')

   MSSubClass MSZoning  LotFrontage  LotArea Street Alley LotShape  \
0          60       RL         65.0     8450   Pave    NA      Reg   
1          20       RL         80.0     9600   Pave    NA      Reg   
2          60       RL         68.0    11250   Pave    NA      IR1   
3          70       RL         60.0     9550   Pave    NA      IR1   
4          60       RL         84.0    14260   Pave    NA      IR1   

  LandContour Utilities LotConfig  ... ScreenPorch PoolArea PoolQC Fence  \
0         Lvl    AllPub    Inside  ...           0        0     NA    NA   
1         Lvl    AllPub       FR2  ...           0        0     NA    NA   
2         Lvl    AllPub    Inside  ...           0        0     NA    NA   
3         Lvl    AllPub    Corner  ...           0        0     NA    NA   
4         Lvl    AllPub       FR2  ...           0        0     NA    NA   

  MiscFeature MiscVal  MoSold  YrSold  SaleType  SaleCondition  
0          NA       0       2    2008        WD         N

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  dataset[col].fillna(fill_with, inplace=True)


KeyError: 'SalePrice'

In [85]:
train_data[col].fillna(fill_with, inplace=True)

NameError: name 'col' is not defined