# Формирование модели

In [177]:
import random
import numpy as np 
import pandas as pd 


from sklearn.ensemble import RandomForestRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.linear_model import ElasticNetCV
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import cross_validate
from sklearn.compose import TransformedTargetRegressor
from sklearn.model_selection import RandomizedSearchCV
from sklearn.linear_model import SGDRegressor
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder, LabelEncoder
from sklearn.model_selection import KFold
from sklearn import metrics
from tqdm.notebook import tqdm
from category_encoders import TargetEncoder, CatBoostEncoder
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.preprocessing import StandardScaler, PolynomialFeatures

In [178]:
# считаем преобработанные данные
df = pd.read_csv('data/data_2.csv')
display(df.head())
df.info()

Unnamed: 0,status,baths,sqft,zipcode,beds,target,PoolPrivate,latitude,longitude,Year built,lotsize,school_rate,school_dist,PrType
0,active,3.5,2900.0,28387,4.0,418000,0,35.18,-79.4,2019-01-01,0.0,5.2,2.7,single family
1,for sale,3.0,1947.0,99216,3.0,310000,0,47.69,-117.19,2019-01-01,5828.0,4.0,1.0,single family
2,for sale,2.0,3000.0,90049,3.0,2895000,1,34.08,-118.49,1961-01-01,8626.0,6.7,1.2,single family
3,for sale,8.0,6457.0,75205,5.0,2395000,0,32.79,-96.76,2006-01-01,8220.0,9.0,0.1,single family
4,for sale,0.0,0.0,32908,0.0,5000,0,27.98,-80.66,1800-01-01,10019.0,4.7,3.0,land


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 352505 entries, 0 to 352504
Data columns (total 14 columns):
 #   Column       Non-Null Count   Dtype  
---  ------       --------------   -----  
 0   status       352505 non-null  object 
 1   baths        352505 non-null  float64
 2   sqft         352505 non-null  float64
 3   zipcode      352505 non-null  int64  
 4   beds         352505 non-null  float64
 5   target       352505 non-null  int64  
 6   PoolPrivate  352505 non-null  int64  
 7   latitude     352505 non-null  float64
 8   longitude    352505 non-null  float64
 9   Year built   352505 non-null  object 
 10  lotsize      352505 non-null  float64
 11  school_rate  352505 non-null  float64
 12  school_dist  352505 non-null  float64
 13  PrType       352505 non-null  object 
dtypes: float64(8), int64(3), object(3)
memory usage: 37.7+ MB


In [179]:
# переведем почтовый индекс в категориальный
df['zipcode'] = df['zipcode'].astype(str)

In [180]:
# стандартизируем числовые признаки
scaler_baths = StandardScaler()
df['baths'] = scaler_baths.fit_transform(df[['baths']])

scaler_sqft = StandardScaler()
df['sqft'] = scaler_sqft.fit_transform(df[['sqft']])

scaler_beds = StandardScaler()
df['beds'] = scaler_beds.fit_transform(df[['beds']])

scaler_lotsize = StandardScaler()
df['lotsize'] = scaler_lotsize.fit_transform(df[['lotsize']])

scaler_school_rate = StandardScaler() 
df['school_rate'] = scaler_school_rate.fit_transform(df[['school_rate']])

scaler_school_dist = StandardScaler()
df['school_dist'] = scaler_school_dist.fit_transform(df[['school_dist']])

df.head()

Unnamed: 0,status,baths,sqft,zipcode,beds,target,PoolPrivate,latitude,longitude,Year built,lotsize,school_rate,school_dist,PrType
0,active,0.912296,0.020741,28387,0.838383,418000,0,35.18,-79.4,2019-01-01,-0.074443,0.193948,0.723802,single family
1,for sale,0.614043,-0.04121,99216,0.312933,310000,0,47.69,-117.19,2019-01-01,-0.064913,-0.432747,-0.091598,single family
2,for sale,0.017536,0.027242,90049,0.312933,2895000,1,34.08,-118.49,1961-01-01,-0.060337,0.977316,0.004331,single family
3,for sale,3.596578,0.251967,75205,1.363833,2395000,0,32.79,-96.76,2006-01-01,-0.061001,2.178481,-0.523281,single family
4,for sale,-1.175478,-0.167777,32908,-1.263417,5000,0,27.98,-80.66,1800-01-01,-0.05806,-0.067175,0.867697,land


In [181]:
# закодируем категорийные признаки
encoder_status = OneHotEncoder(handle_unknown='ignore')
encoder_df = pd.DataFrame(encoder_status.fit_transform(df[['status']]).toarray())
encoder_df = encoder_df.add_prefix('status_')
df = df.join(encoder_df)

encoder_type = OneHotEncoder(handle_unknown='ignore')
encoder_df = pd.DataFrame(encoder_type.fit_transform(df[['PrType']]).toarray())
encoder_df = encoder_df.add_prefix('PrType_')
df = df.join(encoder_df)

df.head()

Unnamed: 0,status,baths,sqft,zipcode,beds,target,PoolPrivate,latitude,longitude,Year built,...,PrType_1,PrType_2,PrType_3,PrType_4,PrType_5,PrType_6,PrType_7,PrType_8,PrType_9,PrType_10
0,active,0.912296,0.020741,28387,0.838383,418000,0,35.18,-79.4,2019-01-01,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
1,for sale,0.614043,-0.04121,99216,0.312933,310000,0,47.69,-117.19,2019-01-01,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
2,for sale,0.017536,0.027242,90049,0.312933,2895000,1,34.08,-118.49,1961-01-01,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
3,for sale,3.596578,0.251967,75205,1.363833,2395000,0,32.79,-96.76,2006-01-01,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
4,for sale,-1.175478,-0.167777,32908,-1.263417,5000,0,27.98,-80.66,1800-01-01,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [182]:
# Закодируем часть признаков через LabelEncoder

le_year = LabelEncoder()
df['Year_built_enc'] = le_year.fit_transform(df['Year built'])

le_zip = LabelEncoder()
df['zipcode_enc'] = le_zip.fit_transform(df['zipcode'])

In [183]:
# удалим исходные столбцы категорийных признаков, которые закодировали выше
df = df.drop(['status', 'zipcode', 'Year built', 'PrType'], axis=1)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 352505 entries, 0 to 352504
Data columns (total 34 columns):
 #   Column          Non-Null Count   Dtype  
---  ------          --------------   -----  
 0   baths           352505 non-null  float64
 1   sqft            352505 non-null  float64
 2   beds            352505 non-null  float64
 3   target          352505 non-null  int64  
 4   PoolPrivate     352505 non-null  int64  
 5   latitude        352505 non-null  float64
 6   longitude       352505 non-null  float64
 7   lotsize         352505 non-null  float64
 8   school_rate     352505 non-null  float64
 9   school_dist     352505 non-null  float64
 10  status_0        352505 non-null  float64
 11  status_1        352505 non-null  float64
 12  status_2        352505 non-null  float64
 13  status_3        352505 non-null  float64
 14  status_4        352505 non-null  float64
 15  status_5        352505 non-null  float64
 16  status_6        352505 non-null  float64
 17  status_7  

In [184]:
# разделим датасет
# целевую переменную логарифмируем
y = df['target'].apply(lambda x: np.log(x))
X = df.drop(['target'], axis=1)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, shuffle=True, random_state=42)

##### Линейная регрессия

In [185]:
# возьмем в качестве базового прогноза линейную регрессию
reg = LinearRegression()
reg.fit(X_train, y_train)
y_pred_train = reg.predict(X_train)
y_pred_test = reg.predict(X_test)

mse_train = mean_squared_error(y_train, y_pred_train)
mse_test = mean_squared_error(y_test, y_pred_test)
mae_train = mean_absolute_error(y_train, y_pred_train)
mae_test = mean_absolute_error(y_test, y_pred_test)
r2_train = r2_score(y_train, y_pred_train)
r2_test = r2_score(y_test, y_pred_test)

print(f"Train MSE: {mse_train:.2f}")
print(f"Test MSE: {mse_test:.2f}")
print(f"Train MAE: {mae_train:.2f}")
print(f"Test MAE: {mae_test:.2f}")
print(f"Train R2: {r2_train:.2f}")
print(f"Test R2: {r2_test:.2f}")

Train MSE: 0.80
Test MSE: 0.79
Train MAE: 0.64
Test MAE: 0.64
Train R2: 0.41
Test R2: 0.41


Пока можно сказать, что результаты по обеим выборкам сопоставимы,  
т.е. не происходит явного недообучения или переобучения модели.

##### Дерево решений без указания параметров

In [186]:
dtr = DecisionTreeRegressor(random_state=42)
dtr.fit(X_train, y_train)

y_pred_train = dtr.predict(X_train)
y_pred_test = dtr.predict(X_test)

mse_train = mean_squared_error(y_train, y_pred_train)
mse_test = mean_squared_error(y_test, y_pred_test)
mae_train = mean_absolute_error(y_train, y_pred_train)
mae_test = mean_absolute_error(y_test, y_pred_test)
r2_train = r2_score(y_train, y_pred_train)
r2_test = r2_score(y_test, y_pred_test)

print(f"Train MSE: {mse_train:.2f}")
print(f"Test MSE: {mse_test:.2f}")
print(f"Train MAE: {mae_train:.2f}")
print(f"Test MAE: {mae_test:.2f}")
print(f"Train R2: {r2_train:.2f}")
print(f"Test R2: {r2_test:.2f}")

Train MSE: 0.00
Test MSE: 0.32
Train MAE: 0.00
Test MAE: 0.31
Train R2: 1.00
Test R2: 0.76


Значения метрик улучшились по сравнению с линейной регрессией в т.ч. на тестовой выборке,  
однако явно видно переобучение модели.

##### Дерево решений с подбором глубины

In [187]:
# Задаем словарь гиперпараметров для GridSearchCV
param_grid_DTR = {
              'max_depth': list(np.arange(6,21)) # максимальная глубина дерева
              }

# Создаем объект класса            
grid_search_DTR = GridSearchCV(
    estimator=DecisionTreeRegressor(), 
    param_grid=param_grid_DTR,
    n_jobs = -1)

# Запускаем подбор гиперпараметров
grid_search_DTR.fit(X_train, y_train) 

# посмотрим наилучшую найденную комбинацию гиперпараметров
grid_search_DTR.best_params_

{'max_depth': 18}

In [188]:
# запустим модель с оптимальным параметром
dtr = DecisionTreeRegressor(max_depth=19, random_state=42)
dtr.fit(X_train, y_train)

y_pred_train = dtr.predict(X_train)
y_pred_test = dtr.predict(X_test)

mse_train = mean_squared_error(y_train, y_pred_train)
mse_test = mean_squared_error(y_test, y_pred_test)
mae_train = mean_absolute_error(y_train, y_pred_train)
mae_test = mean_absolute_error(y_test, y_pred_test)
r2_train = r2_score(y_train, y_pred_train)
r2_test = r2_score(y_test, y_pred_test)

print(f"Train MSE: {mse_train:.2f}")
print(f"Test MSE: {mse_test:.2f}")
print(f"Train MAE: {mae_train:.2f}")
print(f"Test MAE: {mae_test:.2f}")
print(f"Train R2: {r2_train:.2f}")
print(f"Test R2: {r2_test:.2f}")

Train MSE: 0.11
Test MSE: 0.30
Train MAE: 0.20
Test MAE: 0.33
Train R2: 0.92
Test R2: 0.78


Значения лучше, чем у модели линейной регрессии, и при этом разница между выборками меньше.  
Однако, всё равно наблюдается эффект переобучения.

##### Random Forest

In [189]:
rf_regressor = RandomForestRegressor(max_depth=19, random_state=42)
rf_regressor.fit(X_train, y_train)
y_pred_train = rf_regressor.predict(X_train)
y_pred_test = rf_regressor.predict(X_test)

mse_train = mean_squared_error(y_train, y_pred_train)
mse_test = mean_squared_error(y_test, y_pred_test)
mae_train = mean_absolute_error(y_train, y_pred_train)
mae_test = mean_absolute_error(y_test, y_pred_test)
r2_train = r2_score(y_train, y_pred_train)
r2_test = r2_score(y_test, y_pred_test)

print(f"Train MSE: {mse_train:.2f}")
print(f"Test MSE: {mse_test:.2f}")
print(f"Train MAE: {mae_train:.2f}")
print(f"Test MAE: {mae_test:.2f}")
print(f"Train R2: {r2_train:.2f}")
print(f"Test R2: {r2_test:.2f}")

Train MSE: 0.09
Test MSE: 0.19
Train MAE: 0.20
Test MAE: 0.27
Train R2: 0.93
Test R2: 0.86


Значения существенно улучшились по сравнению с baseline, однако прослеживается переобучение модели.

##### ElasticNetCV

In [190]:
# Создаем и тренируем модель ElasticNetCV с кросс-валидацией по 5 фолдам
model_el = ElasticNetCV(cv=5, random_state=42)
model_el.fit(X_train, y_train)

# Предсказания для обучающей и тестовой выборок
y_pred_train = model_el.predict(X_train)
y_pred_test = model_el.predict(X_test)

mse_train = mean_squared_error(y_train, y_pred_train)
mse_test = mean_squared_error(y_test, y_pred_test)
mae_train = mean_absolute_error(y_train, y_pred_train)
mae_test = mean_absolute_error(y_test, y_pred_test)
r2_train = r2_score(y_train, y_pred_train)
r2_test = r2_score(y_test, y_pred_test)

print(f"Train MSE: {mse_train:.2f}")
print(f"Test MSE: {mse_test:.2f}")
print(f"Train MAE: {mae_train:.2f}")
print(f"Test MAE: {mae_test:.2f}")
print(f"Train R2: {r2_train:.2f}")
print(f"Test R2: {r2_test:.2f}")

Train MSE: 0.93
Test MSE: 0.92
Train MAE: 0.70
Test MAE: 0.70
Train R2: 0.31
Test R2: 0.31


Как ни странно, это худший результат из полученных.

In [191]:
data = {'Metric': ['Train MSE', 'Test MSE', 'Train MAE', 'Test MAE', 'Train R2', 'Test R2'],
        'LinearRegression': [0.80, 0.79, 0.64, 0.64, 0.41, 0.41],
        'DecisionTree - base': [0.00, 0.32, 0.00, 0.31, 1.00, 0.76],
        'DecisionTree - GridCV': [0.11, 0.30, 0.20, 0.33, 0.92, 0.78],
        'RandomForestRegressor': [0.09, 0.19, 0.20, 0.27, 0.93, 0.86],
        'ElasticNetCV': [0.93, 0.92, 0.70, 0.70, 0.31, 0.31]}

df_metric = pd.DataFrame(data)
df_metric

Unnamed: 0,Metric,LinearRegression,DecisionTree - base,DecisionTree - GridCV,RandomForestRegressor,ElasticNetCV
0,Train MSE,0.8,0.0,0.11,0.09,0.93
1,Test MSE,0.79,0.32,0.3,0.19,0.92
2,Train MAE,0.64,0.0,0.2,0.2,0.7
3,Test MAE,0.64,0.31,0.33,0.27,0.7
4,Train R2,0.41,1.0,0.92,0.93,0.31
5,Test R2,0.41,0.76,0.78,0.86,0.31


Исходя из результатов, показанных вышеуказанными моделями, наиболее качественно отработала  
Random Forest. В идеале надо подобрать самую эффективную сетку параметров.  
К сожалению, при отработке подбора параметров с помощью GridCV моя техника зависает,  
прошу принять во внимание этот фактор.  
Модель случайного леса и рекомендуется использовать в дальнейшем для прогнозирования значений.

# Воспроизводимость

In [207]:
# Проверим на тестовой выборке (представим, что это новые данные, но возьмем кусочек старой)
df_test = pd.read_csv('data/data_2.csv').sample(5).reset_index().drop(['index','target'], axis=1)
# снова переведем почтовый индекс в строковое значение
df_test['zipcode'] = df_test['zipcode'].astype(str)
df_test

Unnamed: 0,status,baths,sqft,zipcode,beds,PoolPrivate,latitude,longitude,Year built,lotsize,school_rate,school_dist,PrType
0,for sale,2.0,1275.0,32514,4.0,0,30.44,-87.18,1974-01-01,9583.0,5.3,0.7,single family
1,active,6.5,6338.0,37215,5.0,0,36.17,-86.78,2019-01-01,22216.0,4.2,0.3,single family
2,active,0.0,0.0,27105,0.0,0,36.1,-80.24,1800-01-01,28750.0,3.0,0.7,land
3,active,1.5,2262.0,77009,3.0,0,29.76,-95.38,1948-01-01,16636.0,4.7,0.3,single family
4,active,0.0,1227.0,33131,0.0,0,25.77,-80.2,1973-01-01,0.0,4.7,0.4,other


In [208]:
# Произведем манипуляции в том же порядке:

# стандартизируем числовые признаки
df_test['baths'] = scaler_baths.transform(df_test[['baths']])
df_test['sqft'] = scaler_sqft.transform(df_test[['sqft']])
df_test['beds'] = scaler_beds.transform(df_test[['beds']])
df_test['lotsize'] = scaler_lotsize.transform(df_test[['lotsize']])
df_test['school_rate'] = scaler_school_rate.transform(df_test[['school_rate']])
df_test['school_dist'] = scaler_school_dist.transform(df_test[['school_dist']])

# закодируем категорийные признаки
encoder_df_st = pd.DataFrame(encoder_status.transform(df_test[['status']]).toarray())
encoder_df_st = encoder_df_st.add_prefix('status_')
df_test = df_test.join(encoder_df_st)

encoder_df_ty = pd.DataFrame(encoder_type.transform(df_test[['PrType']]).toarray())
encoder_df_ty = encoder_df_ty.add_prefix('PrType_')
df_test = df_test.join(encoder_df_ty)

# Закодируем часть признаков через LabelEncoder
df_test['Year_built_enc'] = le_year.transform(df_test['Year built'])
df_test['zipcode_enc'] = le_zip.transform(df_test['zipcode'])

# удалим исходные столбцы категорийных признаков, которые закодировали выше
df_test = df_test.drop(['status', 'zipcode', 'Year built', 'PrType'], axis=1)
df_test.head()


Unnamed: 0,baths,sqft,beds,PoolPrivate,latitude,longitude,lotsize,school_rate,school_dist,status_0,...,PrType_3,PrType_4,PrType_5,PrType_6,PrType_7,PrType_8,PrType_9,PrType_10,Year_built_enc,zipcode_enc
0,0.017536,-0.084894,0.838383,0,30.44,-87.18,-0.058773,0.246172,-0.235493,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,170,1028
1,2.701818,0.244232,1.363833,0,36.17,-86.78,-0.038115,-0.328298,-0.427352,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,215,1449
2,-1.175478,-0.167777,-1.263417,0,36.1,-80.24,-0.027431,-0.954992,-0.235493,1.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,11,669
3,-0.280718,-0.020733,0.312933,0,29.76,-95.38,-0.04724,-0.067175,-0.427352,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,144,2569
4,-1.175478,-0.088014,-1.263417,0,25.77,-80.2,-0.074443,-0.067175,-0.379387,1.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,169,1131


In [212]:
y_pred = dtr.predict(df_test)
np.exp(y_pred)


array([ 173905.4413433 , 2038934.99272974,   51749.82681127,
        481909.60345578,  336261.80956485])

Модель при соблюдении последовательности производит прогноз стоимости объекта.