In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score, classification_report

In [2]:
df = pd.read_csv('Home_prices.csv')
df.columns

Index(['Id', 'MSSubClass', 'MSZoning', 'LotFrontage', 'LotArea', 'Street',
       'Alley', 'LotShape', 'LandContour', 'Utilities', 'LotConfig',
       'LandSlope', 'Neighborhood', 'Condition1', 'Condition2', 'BldgType',
       'HouseStyle', 'OverallQual', 'OverallCond', 'YearBuilt', 'YearRemodAdd',
       'RoofStyle', 'RoofMatl', 'Exterior1st', 'Exterior2nd', 'MasVnrType',
       'MasVnrArea', 'ExterQual', 'ExterCond', 'Foundation', 'BsmtQual',
       'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinSF1',
       'BsmtFinType2', 'BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF', 'Heating',
       'HeatingQC', 'CentralAir', 'Electrical', '1stFlrSF', '2ndFlrSF',
       'LowQualFinSF', 'GrLivArea', 'BsmtFullBath', 'BsmtHalfBath', 'FullBath',
       'HalfBath', 'BedroomAbvGr', 'KitchenAbvGr', 'KitchenQual',
       'TotRmsAbvGrd', 'Functional', 'Fireplaces', 'FireplaceQu', 'GarageType',
       'GarageYrBlt', 'GarageFinish', 'GarageCars', 'GarageArea', 'GarageQual',
       'GarageCond', 'PavedDrive

Usuwanie brakujących wartości

In [3]:
df = df.dropna(axis=1)
print(f'Rozmiar: {df.size / df.columns.size} x {df.columns.size}')
df.head()

Rozmiar: 1460.0 x 62


Unnamed: 0,Id,MSSubClass,MSZoning,LotArea,Street,LotShape,LandContour,Utilities,LotConfig,LandSlope,...,EnclosedPorch,3SsnPorch,ScreenPorch,PoolArea,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,RL,8450,Pave,Reg,Lvl,AllPub,Inside,Gtl,...,0,0,0,0,0,2,2008,WD,Normal,208500
1,2,20,RL,9600,Pave,Reg,Lvl,AllPub,FR2,Gtl,...,0,0,0,0,0,5,2007,WD,Normal,181500
2,3,60,RL,11250,Pave,IR1,Lvl,AllPub,Inside,Gtl,...,0,0,0,0,0,9,2008,WD,Normal,223500
3,4,70,RL,9550,Pave,IR1,Lvl,AllPub,Corner,Gtl,...,272,0,0,0,0,2,2006,WD,Abnorml,140000
4,5,60,RL,14260,Pave,IR1,Lvl,AllPub,FR2,Gtl,...,0,0,0,0,0,12,2008,WD,Normal,250000


Wybór macierzy predykcji 'Y' i macierzy cech 'X'

In [4]:
y_data = df["SalePrice"]
for name, series in df.items():
    print(f'Nazwa: {name}. Ilosc unikatowych wartosci: {series.nunique()}')

Nazwa: Id. Ilosc unikatowych wartosci: 1460
Nazwa: MSSubClass. Ilosc unikatowych wartosci: 15
Nazwa: MSZoning. Ilosc unikatowych wartosci: 5
Nazwa: LotArea. Ilosc unikatowych wartosci: 1073
Nazwa: Street. Ilosc unikatowych wartosci: 2
Nazwa: LotShape. Ilosc unikatowych wartosci: 4
Nazwa: LandContour. Ilosc unikatowych wartosci: 4
Nazwa: Utilities. Ilosc unikatowych wartosci: 2
Nazwa: LotConfig. Ilosc unikatowych wartosci: 5
Nazwa: LandSlope. Ilosc unikatowych wartosci: 3
Nazwa: Neighborhood. Ilosc unikatowych wartosci: 25
Nazwa: Condition1. Ilosc unikatowych wartosci: 9
Nazwa: Condition2. Ilosc unikatowych wartosci: 8
Nazwa: BldgType. Ilosc unikatowych wartosci: 5
Nazwa: HouseStyle. Ilosc unikatowych wartosci: 8
Nazwa: OverallQual. Ilosc unikatowych wartosci: 10
Nazwa: OverallCond. Ilosc unikatowych wartosci: 9
Nazwa: YearBuilt. Ilosc unikatowych wartosci: 112
Nazwa: YearRemodAdd. Ilosc unikatowych wartosci: 61
Nazwa: RoofStyle. Ilosc unikatowych wartosci: 6
Nazwa: RoofMatl. Ilosc unik

In [5]:
num_features = ['MSSubClass', 'OverallQual',
                'YearBuilt', 'YearRemodAdd']
# string_features = ['MSZoning', 'Neighborhood', 'Condition1', 'Condition2', 'ExterQual', 'Heating', 'CentralAir', 'KitchenQual']
string_features = ['Condition1', 'ExterQual', 'Heating', 'CentralAir', 'KitchenQual']


In [6]:
for feature in string_features:
    print(f'{feature}: {df[feature].unique()}')

Condition1: ['Norm' 'Feedr' 'PosN' 'Artery' 'RRAe' 'RRNn' 'RRAn' 'PosA' 'RRNe']
ExterQual: ['Gd' 'TA' 'Ex' 'Fa']
Heating: ['GasA' 'GasW' 'Grav' 'Wall' 'OthW' 'Floor']
CentralAir: ['Y' 'N']
KitchenQual: ['Gd' 'TA' 'Ex' 'Fa']


In [7]:
df_str_dummy = pd.get_dummies(data=df[string_features], drop_first=True)
X = pd.concat([df_str_dummy, df[num_features]])
X.head()
# df[num_features].size / df[num_features].columns.size, df_str_dummy.size / df_str_dummy.columns.size

Unnamed: 0,Condition1_Feedr,Condition1_Norm,Condition1_PosA,Condition1_PosN,Condition1_RRAe,Condition1_RRAn,Condition1_RRNe,Condition1_RRNn,ExterQual_Fa,ExterQual_Gd,...,Heating_OthW,Heating_Wall,CentralAir_Y,KitchenQual_Fa,KitchenQual_Gd,KitchenQual_TA,MSSubClass,OverallQual,YearBuilt,YearRemodAdd
0,False,True,False,False,False,False,False,False,False,True,...,False,False,True,False,True,False,,,,
1,True,False,False,False,False,False,False,False,False,False,...,False,False,True,False,False,True,,,,
2,False,True,False,False,False,False,False,False,False,True,...,False,False,True,False,True,False,,,,
3,False,True,False,False,False,False,False,False,False,False,...,False,False,True,False,True,False,,,,
4,False,True,False,False,False,False,False,False,False,True,...,False,False,True,False,True,False,,,,


In [8]:
X.describe()

Unnamed: 0,MSSubClass,OverallQual,YearBuilt,YearRemodAdd
count,1460.0,1460.0,1460.0,1460.0
mean,56.89726,6.099315,1971.267808,1984.865753
std,42.300571,1.382997,30.202904,20.645407
min,20.0,1.0,1872.0,1950.0
25%,20.0,5.0,1954.0,1967.0
50%,50.0,6.0,1973.0,1994.0
75%,70.0,7.0,2000.0,2004.0
max,190.0,10.0,2010.0,2010.0


In [9]:
x_train, y_train, x_test, y_test = train_test_split(X, y_data, test_size=0.2, random_state=31)

ValueError: Found input variables with inconsistent numbers of samples: [2920, 1460]

In [None]:
model = RandomForestClassifier()
model.fit(x_train, y_train)
y_pred = model.predict(x_test)

accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred)
print(accuracy, report)

ValueError: Found input variables with inconsistent numbers of samples: [1168, 292]