In [1]:
import logging

import pandas as pd
from sklearn.preprocessing import LabelEncoder

from split_data_type import SplitDataType
from pipelines import TreeSimilarPipeline
from feature_manager import FeatureManager

In [2]:
logging.basicConfig(level=logging.DEBUG)
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

In [3]:
target_column = "SalePrice"

df = pd.read_csv("../data/HousePrice_train.csv")
X_train = df.drop(columns=[target_column])
X_test = pd.read_csv("../data/HousePrice_test.csv")

In [4]:
FeatureManager.get_features_with_none(X_train)

'LotFrontage, Alley, MasVnrType, MasVnrArea, BsmtQual, BsmtCond, BsmtExposure, BsmtFinType1, BsmtFinType2, Electrical, FireplaceQu, GarageType, GarageYrBlt, GarageFinish, GarageQual, GarageCond, PoolQC, Fence, MiscFeature'

In [5]:
FeatureManager.get_smallest_category(X_train)

Unnamed: 0_level_0,value,frequency(%)
column,Unnamed: 1_level_1,Unnamed: 2_level_1
Utilities,NoSeWa,0.068493
RoofMatl,Metal,0.068493
Exterior1st,AsphShn,0.068493
Condition2,PosA,0.068493
HeatingQC,Po,0.068493
Heating,Floor,0.068493
ExterCond,Po,0.068493
Exterior2nd,Other,0.068493
Functional,Sev,0.068493
Electrical,Mix,0.06854


In [6]:
FeatureManager.get_high_entropy(X_train)

['Id']

# Досліджую фічі, де є маленький % значень та заповнюю нулі

In [7]:
# заповню середнім арифметичним
print(X_train['LotFrontage'].isnull().sum())
X_train['LotFrontage'].value_counts().head()

259


LotFrontage
60.0    143
70.0     70
80.0     69
50.0     57
75.0     53
Name: count, dtype: int64

In [8]:
# в with_alley
print(X_train['Alley'].isnull().sum())
X_train['Alley'].value_counts() 

1369


Alley
Grvl    50
Pave    41
Name: count, dtype: int64

In [9]:
# в with masonry
print(X_train['MasVnrType'].isnull().sum())
X_train['MasVnrType'].value_counts() 

872


MasVnrType
BrkFace    445
Stone      128
BrkCmn      15
Name: count, dtype: int64

In [10]:
# заповню середнім арифметичним
print(X_train['MasVnrArea'].isnull().sum())
X_train['MasVnrArea'].value_counts().head()

8


MasVnrArea
0.0      861
108.0      8
72.0       8
180.0      8
16.0       7
Name: count, dtype: int64

In [11]:
# оцінічне
print(X_train['BsmtQual'].isnull().sum())
X_train['BsmtQual'].value_counts()

37


BsmtQual
TA    649
Gd    618
Ex    121
Fa     35
Name: count, dtype: int64

In [12]:
# оцінічне
print(X_train['BsmtCond'].isnull().sum())
X_train['BsmtCond'].value_counts()

37


BsmtCond
TA    1311
Gd      65
Fa      45
Po       2
Name: count, dtype: int64

In [13]:
# оцінічне
print(X_train['BsmtExposure'].isnull().sum())
X_train['BsmtExposure'].value_counts()

38


BsmtExposure
No    953
Av    221
Gd    134
Mn    114
Name: count, dtype: int64

In [14]:
# в is_finished_bsmt_fin_type(1, 2)
print(X_train['BsmtFinType1'].isnull().sum())
X_train['BsmtFinType1'].value_counts()
print(X_train['BsmtFinType2'].isnull().sum())
X_train['BsmtFinType2'].value_counts()

37
38


BsmtFinType2
Unf    1256
Rec      54
LwQ      46
BLQ      33
ALQ      19
GLQ      14
Name: count, dtype: int64

In [15]:
# в is_standard_electrical
print(X_train['Electrical'].isnull().sum())
X_train['Electrical'].value_counts() 

1


Electrical
SBrkr    1334
FuseA      94
FuseF      27
FuseP       3
Mix         1
Name: count, dtype: int64

In [16]:
# оцінка
print(X_train['FireplaceQu'].isnull().sum())
X_train['FireplaceQu'].value_counts()

690


FireplaceQu
Gd    380
TA    313
Fa     33
Ex     24
Po     20
Name: count, dtype: int64

In [17]:
# в attchd, detchd, others
print(X_train['GarageType'].isnull().sum())
X_train['GarageType'].value_counts()

81


GarageType
Attchd     870
Detchd     387
BuiltIn     88
Basment     19
CarPort      9
2Types       6
Name: count, dtype: int64

In [18]:
# -1 бо нема гаража
print(X_train['GarageYrBlt'].isnull().sum())
X_train['GarageYrBlt'].value_counts().head()

81


GarageYrBlt
2005.0    65
2006.0    59
2004.0    53
2003.0    50
2007.0    49
Name: count, dtype: int64

In [19]:
# No garage бо нема гаража
print(X_train['GarageFinish'].isnull().sum())
X_train['GarageFinish'].value_counts()

81


GarageFinish
Unf    605
RFn    422
Fin    352
Name: count, dtype: int64

In [20]:
# оцінічне
print(X_train['GarageQual'].isnull().sum())
X_train['GarageQual'].value_counts() 

81


GarageQual
TA    1311
Fa      48
Gd      14
Ex       3
Po       3
Name: count, dtype: int64

In [21]:
# оцінічне
print(X_train['GarageCond'].isnull().sum())
X_train['GarageCond'].value_counts() 

81


GarageCond
TA    1326
Fa      35
Gd       9
Po       7
Ex       2
Name: count, dtype: int64

In [22]:
# в with pool
print(X_train['PoolQC'].isnull().sum())
X_train['PoolQC'].value_counts() 

1453


PoolQC
Gd    3
Ex    2
Fa    2
Name: count, dtype: int64

In [23]:
# оцінка
print(X_train['Fence'].isnull().sum())
X_train['Fence'].value_counts()

1179


Fence
MnPrv    157
GdPrv     59
GdWo      54
MnWw      11
Name: count, dtype: int64

In [24]:
# в with feature
print(X_train['MiscFeature'].isnull().sum())
X_train['MiscFeature'].value_counts()

1406


MiscFeature
Shed    49
Gar2     2
Othr     2
TenC     1
Name: count, dtype: int64

In [25]:
# Мусор фіча, в смітник
X_train['Utilities'].value_counts() 

Utilities
AllPub    1459
NoSeWa       1
Name: count, dtype: int64

In [26]:
# Мусор фіча
X_train['RoofMatl'].value_counts() 

RoofMatl
CompShg    1434
Tar&Grv      11
WdShngl       6
WdShake       5
Metal         1
Membran       1
Roll          1
ClyTile       1
Name: count, dtype: int64

In [27]:
# перетворю < 200 в others
X_train['Exterior1st'].value_counts() 

Exterior1st
VinylSd    515
HdBoard    222
MetalSd    220
Wd Sdng    206
Plywood    108
CemntBd     61
BrkFace     50
WdShing     26
Stucco      25
AsbShng     20
BrkComm      2
Stone        2
AsphShn      1
ImStucc      1
CBlock       1
Name: count, dtype: int64

In [28]:
# мусорка
X_train['Condition2'].value_counts() 

Condition2
Norm      1445
Feedr        6
Artery       2
RRNn         2
PosN         2
PosA         1
RRAn         1
RRAe         1
Name: count, dtype: int64

In [29]:
# Зведу всі оціночні категорії в positive, neutral, negative
X_train['HeatingQC'].value_counts() 

HeatingQC
Ex    741
TA    428
Gd    241
Fa     49
Po      1
Name: count, dtype: int64

In [30]:
# мусор
X_train['Heating'].value_counts() 

Heating
GasA     1428
GasW       18
Grav        7
Wall        4
OthW        2
Floor       1
Name: count, dtype: int64

In [31]:
X_train['ExterCond'].value_counts() 

ExterCond
TA    1282
Gd     146
Fa      28
Ex       3
Po       1
Name: count, dtype: int64

In [32]:
# < 100 в others
X_train['Exterior2nd'].value_counts() 

Exterior2nd
VinylSd    504
MetalSd    214
HdBoard    207
Wd Sdng    197
Plywood    142
CmentBd     60
Wd Shng     38
Stucco      26
BrkFace     25
AsbShng     20
ImStucc     10
Brk Cmn      7
Stone        5
AsphShn      3
Other        1
CBlock       1
Name: count, dtype: int64

In [33]:
# в is_typical_functional
X_train['Functional'].value_counts() 

Functional
Typ     1360
Min2      34
Min1      31
Mod       15
Maj1      14
Maj2       5
Sev        1
Name: count, dtype: int64

In [34]:
# в is_standard_electrical
X_train['Electrical'].value_counts() 

Electrical
SBrkr    1334
FuseA      94
FuseF      27
FuseP       3
Mix         1
Name: count, dtype: int64

In [35]:
# нічого не роблю
X_train['Neighborhood'].value_counts()

Neighborhood
NAmes      225
CollgCr    150
OldTown    113
Edwards    100
Somerst     86
Gilbert     79
NridgHt     77
Sawyer      74
NWAmes      73
SawyerW     59
BrkSide     58
Crawfor     51
Mitchel     49
NoRidge     41
Timber      38
IDOTRR      37
ClearCr     28
SWISU       25
StoneBr     25
Blmngtn     17
MeadowV     17
BrDale      16
Veenker     11
NPkVill      9
Blueste      2
Name: count, dtype: int64

In [36]:
# в is_gable_roof_style
X_train['RoofStyle'].value_counts() 

RoofStyle
Gable      1141
Hip         286
Flat         13
Gambrel      11
Mansard       7
Shed          2
Name: count, dtype: int64

In [37]:
# в is_norm_condition1
X_train['Condition1'].value_counts() 

Condition1
Norm      1260
Feedr       81
Artery      48
RRAn        26
PosN        19
RRAe        11
PosA         8
RRNn         5
RRNe         2
Name: count, dtype: int64

In [38]:
# в new, warrante_deed, others
X_train['SaleType'].value_counts() 

SaleType
WD       1267
New       122
COD        43
ConLD       9
ConLI       5
ConLw       5
CWD         4
Oth         3
Con         2
Name: count, dtype: int64

In [39]:
# < 200 в others
X_train['Foundation'].value_counts() 

Foundation
PConc     647
CBlock    634
BrkTil    146
Slab       24
Stone       6
Wood        3
Name: count, dtype: int64

In [40]:
# в is_normal_sale_condition
X_train['SaleCondition'].value_counts() 

SaleCondition
Normal     1198
Partial     125
Abnorml     101
Family       20
Alloca       12
AdjLand       4
Name: count, dtype: int64

In [41]:
# в inside, corner, others
X_train['LotConfig'].value_counts() 

LotConfig
Inside     1052
Corner      263
CulDSac      94
FR2          47
FR3           4
Name: count, dtype: int64

In [42]:
# мусор фіча
X_train['Street'].value_counts()

Street
Pave    1454
Grvl       6
Name: count, dtype: int64

In [43]:
# звужую до 1 этажний, 2 этажний, інші
X_train['HouseStyle'].value_counts()

HouseStyle
1Story    726
2Story    445
1.5Fin    154
SLvl       65
SFoyer     37
1.5Unf     14
2.5Unf     11
2.5Fin      8
Name: count, dtype: int64

In [44]:
# в is_reg
X_train['LotShape'].value_counts()

LotShape
Reg    925
IR1    484
IR2     41
IR3     10
Name: count, dtype: int64

In [45]:
# в is_residential
X_train['MSZoning'].value_counts()

MSZoning
RL         1151
RM          218
FV           65
RH           16
C (all)      10
Name: count, dtype: int64

In [46]:
# в is_gtl
X_train['LandSlope'].value_counts()

LandSlope
Gtl    1382
Mod      65
Sev      13
Name: count, dtype: int64

In [47]:
# оцінічне
X_train['ExterQual'].value_counts()

ExterQual
TA    906
Gd    488
Ex     52
Fa     14
Name: count, dtype: int64

In [48]:
# в is_paved
X_train['PavedDrive'].value_counts()

PavedDrive
Y    1340
N      90
P      30
Name: count, dtype: int64

In [49]:
# нічого не роблю
X_train['BldgType'].value_counts()

BldgType
1Fam      1220
TwnhsE     114
Duplex      52
Twnhs       43
2fmCon      31
Name: count, dtype: int64

In [50]:
# в is_level
X_train['LandContour'].value_counts()

LandContour
Lvl    1311
Bnk      63
HLS      50
Low      36
Name: count, dtype: int64

In [51]:
X_train['MasVnrType'].value_counts()

MasVnrType
BrkFace    445
Stone      128
BrkCmn      15
Name: count, dtype: int64

In [52]:
# оцінка
X_train['KitchenQual'].value_counts()

KitchenQual
TA    735
Gd    586
Ex    100
Fa     39
Name: count, dtype: int64

In [53]:
# в is_central_air
X_train['CentralAir'].value_counts()

CentralAir
Y    1365
N      95
Name: count, dtype: int64

In [54]:
# оцінка
X_train['BsmtExposure'].value_counts()

BsmtExposure
No    953
Av    221
Gd    134
Mn    114
Name: count, dtype: int64

In [55]:
label_encoder = LabelEncoder()
X_train = TreeSimilarPipeline(X_train, SplitDataType.TRAIN, label_encoder).build()

DEBUG:root:TRAIN - Function: _drop_not_needed
Size: (1460, 74)
Columns with None: LotFrontage, Alley, MasVnrType, MasVnrArea, BsmtQual, BsmtCond, BsmtExposure, BsmtFinType1, BsmtFinType2, Electrical, FireplaceQu, GarageType, GarageYrBlt, GarageFinish, GarageQual, GarageCond, PoolQC, Fence, MiscFeature



DEBUG:root:TRAIN - Function: _fill_null
Size: (1460, 74)
Columns with None: BsmtQual, BsmtCond, BsmtExposure, BsmtFinType1, BsmtFinType2, Electrical, FireplaceQu, GarageType, GarageQual, GarageCond, Fence

DEBUG:root:TRAIN - Function: _preprocess_features
Size: (1460, 74)
Columns with None: None

DEBUG:root:TRAIN - Function: _encode
Size: (1460, 74)
Columns with None: None

DEBUG:root:TRAIN - Function: _normalize
Size: (1460, 74)
Columns with None: None

DEBUG:root:TRAIN - Function: _drop_high_correlation
Size: (1460, 63)
Columns with None: None



In [56]:
X_train.head()

Unnamed: 0,MSSubClass,LotFrontage,LotArea,LotConfig,Neighborhood,BldgType,HouseStyle,OverallQual,OverallCond,YearBuilt,YearRemodAdd,Exterior1st,MasVnrArea,ExterQual,ExterCond,Foundation,BsmtQual,BsmtCond,BsmtExposure,BsmtFinSF1,BsmtUnfSF,HeatingQC,1stFlrSF,2ndFlrSF,BsmtFullBath,BsmtHalfBath,FullBath,HalfBath,BedroomAbvGr,KitchenAbvGr,KitchenQual,Fireplaces,GarageType,GarageYrBlt,GarageFinish,GarageCars,WoodDeckSF,OpenPorchSF,EnclosedPorch,3SsnPorch,ScreenPorch,PoolArea,Fence,MiscVal,MoSold,YrSold,SaleType,WithAlley,WithMasonry,WithFeature,Is_finished_bsmt_fintype1,Is_finished_bsmt_fintype2,Is_typical_functional,Is_standard_electrical,Is_gable_roofstyle,Is_norm_condition1,Is_normal_sale_condition,Is_reg_lotshape,Is_residential_mszoning,Is_Gtl_landslope,Is_paved,Is_level_landContour,Is_central_air
0,60,65.0,8450,1,5,0,1,7,5,2003,2003,4,196.0,2,1,2,3,2,0,706,150,2,856,854,1,0,2,1,3,1,2,0,0,2003.0,2,2,0,61,0,0,0,0,0,0,2,2008,2,0,1,0,1,0,1,1,1,1,1,1,1,1,1,1,1
1,20,80.0,9600,2,24,0,0,6,8,1976,1976,1,0.0,1,1,0,3,2,3,978,284,2,1262,0,0,1,2,0,3,1,1,1,0,1976.0,2,2,298,0,0,0,0,0,0,0,5,2007,2,0,0,0,1,0,1,1,1,0,1,1,1,1,1,1,1
2,60,68.0,11250,1,5,0,1,7,5,2001,2002,4,162.0,2,1,2,3,2,1,486,434,2,920,866,1,0,2,1,3,1,2,1,0,2001.0,2,2,0,42,0,0,0,0,0,0,9,2008,2,0,1,0,1,0,1,1,1,1,1,0,1,1,1,1,1
3,70,60.0,9550,0,6,0,1,7,5,1915,1970,5,0.0,1,1,1,2,3,0,216,540,2,961,756,1,0,1,0,3,1,2,1,1,1998.0,3,3,0,35,272,0,0,0,0,0,2,2006,2,0,0,0,1,0,1,1,1,1,0,0,1,1,1,1,1
4,60,84.0,14260,2,15,0,1,8,5,2000,2000,4,350.0,2,1,2,3,2,2,655,490,2,1145,1053,1,0,2,1,4,1,2,1,0,2000.0,2,3,192,84,0,0,0,0,0,0,12,2008,2,0,1,0,1,0,1,1,1,1,1,0,1,1,1,1,1


# Шукаю висококорельовані фічі (вже будуть дропнуті, бо зверху побудова)

In [57]:
FeatureManager.get_high_correlation_features(X_train).head()

Series([], dtype: float64)

In [58]:
FeatureManager.get_VIF_correlation_features(X_train).head()

Unnamed: 0,feature,VIF
19,BsmtFinSF1,6.755808
9,YearBuilt,6.136741
0,MSSubClass,6.119281
22,1stFlrSF,5.873152
20,BsmtUnfSF,5.69421


# Досліджую тестову

In [59]:
X_test = TreeSimilarPipeline(X_test, SplitDataType.TEST, label_encoder).build()

DEBUG:root:TEST - Function: _drop_not_needed
Size: (1459, 74)
Columns with None: MSZoning, LotFrontage, Alley, Exterior1st, Exterior2nd, MasVnrType, MasVnrArea, BsmtQual, BsmtCond, BsmtExposure, BsmtFinType1, BsmtFinSF1, BsmtFinType2, BsmtFinSF2, BsmtUnfSF, TotalBsmtSF, BsmtFullBath, BsmtHalfBath, KitchenQual, Functional, FireplaceQu, GarageType, GarageYrBlt, GarageFinish, GarageCars, GarageArea, GarageQual, GarageCond, PoolQC, Fence, MiscFeature, SaleType

DEBUG:root:TEST - Function: _fill_null
Size: (1459, 74)
Columns with None: MSZoning, Exterior1st, Exterior2nd, BsmtQual, BsmtCond, BsmtExposure, BsmtFinType1, BsmtFinType2, TotalBsmtSF, KitchenQual, Functional, FireplaceQu, GarageType, GarageQual, GarageCond, Fence, SaleType

DEBUG:root:TEST - Function: _preprocess_features
Size: (1459, 74)
Columns with None: TotalBsmtSF

DEBUG:root:TEST - Function: _encode
Size: (1459, 74)
Columns with None: TotalBsmtSF

DEBUG:root:TEST - Function: _normalize
Size: (1459, 74)
Columns with None: Tot

In [60]:
X_test.head()

Unnamed: 0,MSSubClass,LotFrontage,LotArea,LotConfig,Neighborhood,BldgType,HouseStyle,OverallQual,OverallCond,YearBuilt,YearRemodAdd,Exterior1st,MasVnrArea,ExterQual,ExterCond,Foundation,BsmtQual,BsmtCond,BsmtExposure,BsmtFinSF1,BsmtUnfSF,HeatingQC,1stFlrSF,2ndFlrSF,BsmtFullBath,BsmtHalfBath,FullBath,HalfBath,BedroomAbvGr,KitchenAbvGr,KitchenQual,Fireplaces,GarageType,GarageYrBlt,GarageFinish,GarageCars,WoodDeckSF,OpenPorchSF,EnclosedPorch,3SsnPorch,ScreenPorch,PoolArea,Fence,MiscVal,MoSold,YrSold,SaleType,WithAlley,WithMasonry,WithFeature,Is_finished_bsmt_fintype1,Is_finished_bsmt_fintype2,Is_typical_functional,Is_standard_electrical,Is_gable_roofstyle,Is_norm_condition1,Is_normal_sale_condition,Is_reg_lotshape,Is_residential_mszoning,Is_Gtl_landslope,Is_paved,Is_level_landContour,Is_central_air
0,20,80.0,11622,1,12,0,0,5,6,1961,1961,4,0.0,1,1,0,2,2,0,468.0,270.0,1,896,0,0.0,0.0,1,0,2,1,2,0,0,1961.0,3,1.0,140,0,0,0,120,0,1,0,6,2010,2,0,0,0,1,1,1,1,1,0,1,1,1,1,1,1,1
1,20,81.0,14267,0,12,0,0,6,6,1958,1958,5,108.0,1,1,0,2,2,0,923.0,406.0,1,1329,0,0.0,0.0,1,1,3,1,3,0,0,1958.0,3,1.0,393,36,0,0,0,0,0,12500,6,2010,2,0,1,1,1,0,1,1,0,1,1,0,1,1,1,1,1
2,60,74.0,13830,1,8,0,1,5,5,1997,1998,4,0.0,1,1,2,3,2,0,791.0,137.0,2,928,701,0.0,0.0,2,1,3,1,2,1,0,1997.0,0,2.0,212,34,0,0,0,0,1,0,3,2010,2,0,0,0,1,0,1,1,1,1,1,0,1,1,1,1,1
3,60,78.0,9978,1,8,0,1,6,6,1998,1998,4,20.0,1,1,2,2,2,0,602.0,324.0,2,926,678,0.0,0.0,2,1,3,1,3,1,0,1998.0,0,2.0,360,36,0,0,0,0,0,0,6,2010,2,0,1,0,1,0,1,1,1,1,1,0,1,1,1,1,1
4,120,43.0,5005,1,22,4,0,8,5,1992,1992,0,0.0,2,1,2,3,2,0,263.0,1017.0,2,1280,0,0.0,0.0,2,0,2,1,3,0,0,1992.0,2,2.0,0,82,0,0,144,0,0,0,1,2010,2,0,0,0,1,0,1,1,1,1,1,0,1,1,1,0,1


In [61]:
FeatureManager.get_features_with_none(X_test)

'None'

In [62]:
# 0, бо нема обліцовки
print(X_test['BsmtFinSF1'].isnull().sum())
X_test['BsmtFinSF1'].value_counts().head()

0


BsmtFinSF1
0.0      463
24.0      15
602.0      6
276.0      6
288.0      5
Name: count, dtype: int64

In [63]:
# 0, бо нема обліцовки
print(X_test['BsmtUnfSF'].isnull().sum())
X_test['BsmtUnfSF'].value_counts().head()

0


BsmtUnfSF
0.0      124
384.0     11
624.0      8
348.0      7
738.0      7
Name: count, dtype: int64

In [64]:
# середнім, бо нема даних, хоча ванни є, BsmtHalfBath теж саме
print(X_test['BsmtFullBath'].isnull().sum())
X_test['BsmtFullBath'].value_counts().head()

0


BsmtFullBath
0.000000    849
1.000000    584
2.000000     23
0.434454      2
3.000000      1
Name: count, dtype: int64

In [65]:
# 0 бо нема гаражу
print(X_test['GarageCars'].isnull().sum())
X_test['GarageCars'].value_counts().head()

0


GarageCars
2.0    770
1.0    407
3.0    193
0.0     77
4.0     11
Name: count, dtype: int64

In [66]:
FeatureManager.get_high_correlation_features(X_test).head()

Series([], dtype: float64)

In [67]:
FeatureManager.get_VIF_correlation_features(X_test).head()

Unnamed: 0,feature,VIF
9,YearBuilt,6.193133
19,BsmtFinSF1,5.95997
22,1stFlrSF,5.757492
23,2ndFlrSF,5.263341
0,MSSubClass,5.140468
