#### Importing Libraries

In [151]:
#### Importing Libraries 
import warnings 
warnings.filterwarnings("ignore")
import pandas as pd
import os
from kaggle.api.kaggle_api_extended import KaggleApi
import zipfile
import pickle

import numpy as np
import math
from tqdm.notebook import tqdm
from sklearn.exceptions import ConvergenceWarning
import warnings
from matplotlib import pyplot as plt
import seaborn as sns
from scipy import stats as scs

#### Download dataset

In [152]:

# Create a Kaggle API client and authenticate it.
# NOTE(review): presumably credentials are picked up from the local Kaggle
# configuration (kaggle.json / environment variables) — confirm.
api = KaggleApi()
api.authenticate()


competition_name = 'house-prices-advanced-regression-techniques'

# Download the competition archive into ./dataset and unpack it next to the zip.
# NOTE(review): force=True presumably re-downloads even when the archive is
# already present, so every full re-run needs network access — confirm.
api.competition_download_files(competition=competition_name, path="dataset", force=True)
with zipfile.ZipFile(f'dataset/{competition_name}.zip', 'r') as zip_ref:
    zip_ref.extractall(f'dataset')



#### Loading dataset

In [153]:

# Read the raw train/test tables from the dataset/ directory that the download
# cell extracts the competition archive into.
train_data = pd.read_csv("dataset/train.csv")
test_data = pd.read_csv("dataset/test.csv")

In [154]:
train_data.head()

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,...,0,,,,0,2,2008,WD,Normal,208500
1,2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,...,0,,,,0,5,2007,WD,Normal,181500
2,3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,...,0,,,,0,9,2008,WD,Normal,223500
3,4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,...,0,,,,0,2,2006,WD,Abnorml,140000
4,5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,...,0,,,,0,12,2008,WD,Normal,250000


In [197]:
# NOTE(review): `features_with_too_much_na` is not assigned in any cell visible
# here; the "Data constants" cell defines `features_with_too_much_nas` (trailing
# "s"). This cell's execution count (197) is also higher than every other cell
# shown, so it may depend on leftover kernel state — confirm it survives
# Restart & Run All.
features_with_too_much_na

['Alley', 'MasVnrType', 'PoolQC', 'Fence', 'MiscFeature']

#### Data constants

In [158]:
# Categorical features: every column whose dtype is neither int nor float.
cat_features = train_data.select_dtypes(exclude=['int', 'float']).columns.tolist()
len(cat_features)

# Categorical columns whose missing values get filled with the literal 'NA'.
features_cat_with_na = [
    'Alley',
    'BsmtCond',
    'BsmtExposure',
    'BsmtFinType1',
    'BsmtFinType2',
    'BsmtQual',
    'Fence',
    'FireplaceQu',
    'GarageCond',
    'GarageFinish',
    'GarageQual',
    'GarageType',
    'MiscFeature',
    'PoolQC',
]

# Numerical columns that contain NaNs.
features_cont_with_na = [
    'BsmtFinSF1',
    'BsmtFinSF2',
    'BsmtFullBath',
    'BsmtUnfSF',
    'BsmtHalfBath',
    'GarageArea',
    'GarageCars',
    'GarageYrBlt',
    'LotFrontage',
    'MasVnrArea',
    'TotalBsmtSF',
]

# Candidates for removal because of their share of missing values.
features_with_too_much_nas = ['Alley', 'MasVnrType', 'PoolQC', 'Fence', 'MiscFeature']

# Column -> comparison (as text) that a row must satisfy to be kept.
outliers_dict = dict(
    LotArea='< 30000',
    GrLivArea='< 4000',
    TotalBsmtSF='< 2800',
)

# Most frequent training value for the columns that are imputed with the mode.
mode_dict = {
    feature: train_data[feature].mode()[0]
    for feature in ("MSZoning", "Utilities")
}

#### Data preparation


In [161]:


# fill nans
def fillna(dataset, col, fill_with='NA'):
    """Replace the missing values of one column and return the frame.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Frame to update. The column is reassigned on this same object, so the
        caller's frame is modified (callers in this notebook pass a copy).
    col : str
        Name of the column to fill.
    fill_with : scalar, default 'NA'
        Value written wherever ``dataset[col]`` is missing.

    Returns
    -------
    pandas.DataFrame
        The same ``dataset`` object, returned for convenient reassignment.
    """
    # Assign the filled column back instead of `dataset[col].fillna(..., inplace=True)`:
    # the in-place form is chained assignment on a temporary Series, which pandas
    # deprecates and which stops updating `dataset` under Copy-on-Write.
    dataset[col] = dataset[col].fillna(fill_with)
    return dataset

def remove_outliers(dataset, outliers=None):
    """Keep only the rows that satisfy every outlier condition.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Frame to filter; it is not modified.
    outliers : dict[str, str], optional
        Maps a column name to a comparison written as text, e.g.
        ``{"LotArea": "< 30000"}``. Defaults to the notebook-level
        ``outliers_dict``.

    Returns
    -------
    pandas.DataFrame
        The rows for which all conditions hold, with their original index
        labels.
    """
    # The original body read an undefined name (`outliers`); fall back to the
    # constant that the notebook actually defines.
    if outliers is None:
        outliers = outliers_dict

    for column, condition in outliers.items():
        # query() replaces the bare eval(); backticks allow column names that are
        # not valid Python identifiers. The condition text is still evaluated as
        # an expression, so it must come from trusted constants only.
        dataset = dataset.query(f"`{column}` {condition}")
    return dataset

def prepare_dataset(dataset, type = 'train'):
    """Impute missing values, drop outlier rows and split off target and ids.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Raw frame containing an ``Id`` column and, for ``type='train'``, a
        ``SalePrice`` column. Missing values are filled on this object, so pass
        a copy to keep the raw data intact.
    type : str, default 'train'
        ``'train'`` extracts and transforms the target; any other value is
        handled as a test set without a target.

    Returns
    -------
    tuple
        ``(features, target, ids)``: the feature frame without ``Id`` (and
        without ``SalePrice`` for train), the transformed target (``None`` for
        non-train data) and the ``Id`` column of the retained rows.
    """
    is_train = type == 'train'

    for feat in features_cat_with_na:
        dataset = fillna(dataset, feat)

    # Numeric gaps are filled with the mean of the frame being prepared.
    for feat in features_cont_with_na:
        dataset = fillna(dataset, feat, dataset[feat].mean())

    # These features don't support 'NA'; each one gets a specific fill value.
    explicit_fills = {
        'Electrical': 'SBrkr',
        'Exterior1st': 'Other',
        'Exterior2nd': 'Other',
        'Functional': 'Typ',
        'KitchenQual': 'TA',
        'SaleType': 'Oth',
        'MasVnrType': 'None',
    }
    for feat, value in explicit_fills.items():
        dataset = fillna(dataset, feat, value)

    # Filled with the mode stored in mode_dict.
    for feat in ('MSZoning', 'Utilities'):
        dataset = fillna(dataset, feat, mode_dict[feat])

    # NOTE(review): the outlier filter runs for every `type`, so rows are also
    # dropped from a test frame — confirm that is intended.
    dataset = remove_outliers(dataset)

    # Target transform is sqrt(log1p(price)); invert predictions with
    # np.expm1(pred ** 2). It is computed from the frame passed in (the original
    # read an undefined global `train`) and only when a target exists.
    target = np.log1p(dataset['SalePrice']) ** .5 if is_train else None
    Ids = dataset['Id']

    # Remove target and ids from the feature frame.
    columns_to_drop = ['SalePrice', 'Id'] if is_train else ['Id']
    dataset = dataset.drop(columns=columns_to_drop)

    return dataset, target, Ids


In [162]:
# Pass copies so the raw train_data / test_data frames stay untouched:
# prepare_dataset fills and drops columns on the frame it receives.
# test_target is None because type='test' has no SalePrice to extract.
train_transformed, train_target, train_ids = prepare_dataset(dataset=train_data.copy(), type = 'train')
test_transformed, test_target, test_ids = prepare_dataset(dataset=test_data.copy(), type = 'test')

In [163]:
train_transformed.head()

Unnamed: 0,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,LotConfig,...,ScreenPorch,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition
0,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,Inside,...,0,0,,,,0,2,2008,WD,Normal
1,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,FR2,...,0,0,,,,0,5,2007,WD,Normal
2,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,Inside,...,0,0,,,,0,9,2008,WD,Normal
3,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,Corner,...,0,0,,,,0,2,2006,WD,Abnorml
4,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,FR2,...,0,0,,,,0,12,2008,WD,Normal


In [164]:
test_transformed.head()

Unnamed: 0,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,LotConfig,...,ScreenPorch,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition
0,20,RH,80.0,11622,Pave,,Reg,Lvl,AllPub,Inside,...,120,0,,MnPrv,,0,6,2010,WD,Normal
1,20,RL,81.0,14267,Pave,,IR1,Lvl,AllPub,Corner,...,0,0,,,Gar2,12500,6,2010,WD,Normal
2,60,RL,74.0,13830,Pave,,IR1,Lvl,AllPub,Inside,...,0,0,,MnPrv,,0,3,2010,WD,Normal
3,60,RL,78.0,9978,Pave,,IR1,Lvl,AllPub,Inside,...,0,0,,,,0,6,2010,WD,Normal
4,120,RL,43.0,5005,Pave,,IR1,HLS,AllPub,Inside,...,144,0,,,,0,1,2010,WD,Normal


In [165]:
train_transformed.shape

(1433, 79)

In [166]:
test_transformed.shape

(1445, 79)

In [167]:
train_transformed.isna().sum()

MSSubClass       0
MSZoning         0
LotFrontage      0
LotArea          0
Street           0
                ..
MiscVal          0
MoSold           0
YrSold           0
SaleType         0
SaleCondition    0
Length: 79, dtype: int64

In [168]:
test_transformed.isna().sum()

MSSubClass       0
MSZoning         0
LotFrontage      0
LotArea          0
Street           0
                ..
MiscVal          0
MoSold           0
YrSold           0
SaleType         0
SaleCondition    0
Length: 79, dtype: int64

In [179]:
## Storing variables as pickle files
# open(..., 'wb') does not create missing parent directories, so make sure the
# output directory exists before writing.
artifacts_dir = '../artifacts'
os.makedirs(artifacts_dir, exist_ok=True)

# File stem -> object to persist; each is written to <artifacts_dir>/<stem>.pkl.
artifacts = {
    'train_data': train_transformed,
    'train_target': train_target,
    'test_data': test_transformed,
    'cat_features': cat_features,
}

for artifact_name, artifact in artifacts.items():
    with open(f'{artifacts_dir}/{artifact_name}.pkl', 'wb') as file:
        pickle.dump(artifact, file)



In [185]:
train_transformed

Unnamed: 0,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,LotConfig,...,ScreenPorch,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition
0,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,Inside,...,0,0,,,,0,2,2008,WD,Normal
1,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,FR2,...,0,0,,,,0,5,2007,WD,Normal
2,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,Inside,...,0,0,,,,0,9,2008,WD,Normal
3,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,Corner,...,0,0,,,,0,2,2006,WD,Abnorml
4,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,FR2,...,0,0,,,,0,12,2008,WD,Normal
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1455,60,RL,62.0,7917,Pave,,Reg,Lvl,AllPub,Inside,...,0,0,,,,0,8,2007,WD,Normal
1456,20,RL,85.0,13175,Pave,,Reg,Lvl,AllPub,Inside,...,0,0,,MnPrv,,0,2,2010,WD,Normal
1457,70,RL,66.0,9042,Pave,,Reg,Lvl,AllPub,Inside,...,0,0,,GdPrv,Shed,2500,5,2010,WD,Normal
1458,20,RL,68.0,9717,Pave,,Reg,Lvl,AllPub,Inside,...,0,0,,,,0,4,2010,WD,Normal
