In [1]:
import pandas

In [2]:
def get_data():
    return pandas.read_csv('train.csv')

data: pandas.DataFrame = get_data()
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1460 entries, 0 to 1459
Data columns (total 81 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Id             1460 non-null   int64  
 1   MSSubClass     1460 non-null   int64  
 2   MSZoning       1460 non-null   object 
 3   LotFrontage    1201 non-null   float64
 4   LotArea        1460 non-null   int64  
 5   Street         1460 non-null   object 
 6   Alley          91 non-null     object 
 7   LotShape       1460 non-null   object 
 8   LandContour    1460 non-null   object 
 9   Utilities      1460 non-null   object 
 10  LotConfig      1460 non-null   object 
 11  LandSlope      1460 non-null   object 
 12  Neighborhood   1460 non-null   object 
 13  Condition1     1460 non-null   object 
 14  Condition2     1460 non-null   object 
 15  BldgType       1460 non-null   object 
 16  HouseStyle     1460 non-null   object 
 17  OverallQual    1460 non-null   int64  
 18  OverallC

#### Обработка данных

In [3]:
# удаляем колонки с большим кол. пропусков

try:
    data = data.drop(columns=[
        'Fence', 'PoolQC', 'Alley', 'MiscFeature', 'FireplaceQu',
        'GarageQual', 'GarageCond', 'GarageType', 'GarageYrBlt', 'GarageFinish'
    ])
except:
    pass

data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1460 entries, 0 to 1459
Data columns (total 71 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Id             1460 non-null   int64  
 1   MSSubClass     1460 non-null   int64  
 2   MSZoning       1460 non-null   object 
 3   LotFrontage    1201 non-null   float64
 4   LotArea        1460 non-null   int64  
 5   Street         1460 non-null   object 
 6   LotShape       1460 non-null   object 
 7   LandContour    1460 non-null   object 
 8   Utilities      1460 non-null   object 
 9   LotConfig      1460 non-null   object 
 10  LandSlope      1460 non-null   object 
 11  Neighborhood   1460 non-null   object 
 12  Condition1     1460 non-null   object 
 13  Condition2     1460 non-null   object 
 14  BldgType       1460 non-null   object 
 15  HouseStyle     1460 non-null   object 
 16  OverallQual    1460 non-null   int64  
 17  OverallCond    1460 non-null   int64  
 18  YearBuil

In [4]:
# тарнсформируем категориальные признаки в не категориальные

from sklearn.preprocessing import OneHotEncoder

def transform_categorical_columns(data):
    object_series = data.dtypes[data.dtypes == 'object']
    categorical_column_names = list(object_series.reset_index()['index'])

    one_hot_encoder = OneHotEncoder(handle_unknown='ignore')

    transform_array = one_hot_encoder.fit_transform(data[categorical_column_names])
    transform_data = pandas.DataFrame(
        transform_array.toarray(),
        columns=one_hot_encoder.get_feature_names())

    try:
        data = data.drop(columns=categorical_column_names)
        return data.join(transform_data)
    except:
        return data

data = transform_categorical_columns(data=data)

try:
    data = data.dropna()
except:
    pass

data.dtypes



Id               int64
MSSubClass       int64
LotFrontage    float64
LotArea          int64
OverallQual      int64
                ...   
x33_AdjLand    float64
x33_Alloca     float64
x33_Family     float64
x33_Normal     float64
x33_Partial    float64
Length: 259, dtype: object

### Сплит данных

In [5]:
from sklearn.model_selection import train_test_split

x_train, x_test, y_train, y_test = train_test_split(
    data.drop(columns='SalePrice'),
    data['SalePrice'],
    test_size=0.3,
    random_state=42)

### Linear Regression

In [6]:
from sklearn.linear_model import LinearRegression

def get_linear_regression_score(x_train, x_test, y_train, y_test):
    linearRegression = LinearRegression()
    linearRegression.fit(x_train, y_train)

    return linearRegression.score(x_test, y_test)

get_linear_regression_score(
    x_train=x_train,
    x_test=x_test,
    y_train=y_train,
    y_test=y_test)

-210842972.899306

### Случайный лес

In [7]:
from sklearn.ensemble import RandomForestRegressor

def get_random_forest_score(x_train, x_test, y_train, y_test):
    randomForest = RandomForestRegressor()
    randomForest.fit(x_train, y_train)

    return randomForest.score(x_test, y_test)

random_forest_score = get_random_forest_score(
    x_train=x_train,
    x_test=x_test,
    y_train=y_train,
    y_test=y_test)

print('score:', random_forest_score)

score: 0.8695289918988931


### Stacking

In [8]:
from sklearn.ensemble import StackingRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import RidgeCV

In [9]:
def get_stacking_score(x_train, x_test, y_train, y_test):
    stacking = StackingRegressor(
        [
            ('LinearRegression', LinearRegression()),
            ('KNeighborsRegressor', KNeighborsRegressor()),
            ('DecisionTree', DecisionTreeRegressor())
        ],
        RidgeCV())

    stacking.fit(x_train, y_train)

    return stacking.score(x_test, y_test)

stacking_score = get_stacking_score(
    x_train=x_train,
    x_test=x_test,
    y_train=y_train,
    y_test=y_test)

print('score:', stacking_score)

score: -725541009.9778752


In [10]:
# Убираем Линейную ригрессию

def get_without_line_regression_score(x_train, x_test, y_train, y_test):

    stacking = StackingRegressor(
        [
            ('KNeighborsRegressor', KNeighborsRegressor()),
            ('DecisionTree', DecisionTreeRegressor())
        ],
        RidgeCV())

    stacking.fit(x_train, y_train)

    return stacking.score(x_test, y_test)

stacking_without_line_regression_score = get_without_line_regression_score(
    x_train=x_train,
    x_test=x_test,
    y_train=y_train,
    y_test=y_test)

print('score:', stacking_without_line_regression_score)

score: 0.7994060480667107
