# Формирование модели

In [76]:
import random
import numpy as np 
import pandas as pd 
import sys
import optuna

from sklearn.ensemble import RandomForestRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.linear_model import ElasticNetCV
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import cross_validate
from sklearn.compose import TransformedTargetRegressor
from sklearn.model_selection import RandomizedSearchCV
from sklearn.linear_model import SGDRegressor
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder, LabelEncoder
from sklearn.model_selection import KFold
from sklearn import metrics
from tqdm.notebook import tqdm
from category_encoders import TargetEncoder, CatBoostEncoder
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.preprocessing import StandardScaler, PolynomialFeatures

In [60]:
# Read the preprocessed data, then preview the first rows and print the
# column / dtype summary.
df = pd.read_csv(filepath_or_buffer='data/data_2.csv')
display(df.head(n=5))
df.info()

Unnamed: 0,status,baths,sqft,zipcode,beds,target,PoolPrivate,latitude,longitude,Year built,lotsize,school_rate,school_dist,PrType
0,active,3.5,2900.0,28387,4.0,418000,0,35.18,-79.4,2019-01-01,0.0,5.2,2.7,single family
1,for sale,3.0,1947.0,99216,3.0,310000,0,47.69,-117.19,2019-01-01,5828.0,4.0,1.0,single family
2,for sale,2.0,3000.0,90049,3.0,2895000,1,34.08,-118.49,1961-01-01,8626.0,6.7,1.2,single family
3,for sale,8.0,6457.0,75205,5.0,2395000,0,32.79,-96.76,2006-01-01,8220.0,9.0,0.1,single family
4,for sale,0.0,0.0,32908,0.0,5000,0,27.98,-80.66,1800-01-01,10019.0,4.7,3.0,land


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 352505 entries, 0 to 352504
Data columns (total 14 columns):
 #   Column       Non-Null Count   Dtype  
---  ------       --------------   -----  
 0   status       352505 non-null  object 
 1   baths        352505 non-null  float64
 2   sqft         352505 non-null  float64
 3   zipcode      352505 non-null  int64  
 4   beds         352505 non-null  float64
 5   target       352505 non-null  int64  
 6   PoolPrivate  352505 non-null  int64  
 7   latitude     352505 non-null  float64
 8   longitude    352505 non-null  float64
 9   Year built   352505 non-null  object 
 10  lotsize      352505 non-null  float64
 11  school_rate  352505 non-null  float64
 12  school_dist  352505 non-null  float64
 13  PrType       352505 non-null  object 
dtypes: float64(8), int64(3), object(3)
memory usage: 37.7+ MB


In [61]:
# Treat the postal code as a categorical value: cast it to string.
df = df.astype({'zipcode': str})

In [62]:
# Standardize the numeric columns (including `target`) with StandardScaler.
# One fitted scaler is kept per column in `scalers`, so any column -- most
# importantly `target` -- can later be mapped back to its original units via
# scalers[column].inverse_transform(...). Previously a single scaler object
# was refit for every column, which discarded all fits except the last one.
# NOTE(review): the scalers are fit on the full dataset, before the train/test
# split performed further below, so test-set statistics leak into the scaled
# values -- consider fitting on the training split only.
scalers = {}
for column in ['baths', 'sqft', 'beds', 'target', 'lotsize', 'school_rate', 'school_dist']:
    scalers[column] = StandardScaler()
    df[column] = scalers[column].fit_transform(df[[column]])
# Keep `scaler` bound to the most recently fitted scaler, as before.
scaler = scalers[column]
df.head()

Unnamed: 0,status,baths,sqft,zipcode,beds,target,PoolPrivate,latitude,longitude,Year built,lotsize,school_rate,school_dist,PrType
0,active,0.912296,0.020741,28387,0.838383,-0.146296,0,35.18,-79.4,2019-01-01,-0.074443,0.193948,0.723802,single family
1,for sale,0.614043,-0.04121,99216,0.312933,-0.259833,0,47.69,-117.19,2019-01-01,-0.064913,-0.432747,-0.091598,single family
2,for sale,0.017536,0.027242,90049,0.312933,2.457713,1,34.08,-118.49,1961-01-01,-0.060337,0.977316,0.004331,single family
3,for sale,3.596578,0.251967,75205,1.363833,1.932075,0,32.79,-96.76,2006-01-01,-0.061001,2.178481,-0.523281,single family
4,for sale,-1.175478,-0.167777,32908,-1.263417,-0.580472,0,27.98,-80.66,1800-01-01,-0.05806,-0.067175,0.867697,land


In [63]:
# One-hot encode the categorical columns `status` and `PrType`.
# Each encoded matrix is turned into a frame whose integer column labels get
# the given prefix (status_0, status_1, ... / PrType_0, PrType_1, ...) and is
# joined to `df` on the index.
for source_col, prefix in (('status', 'status_'), ('PrType', 'PrType_')):
    encoder = OneHotEncoder(handle_unknown='ignore')
    dense_matrix = encoder.fit_transform(df[[source_col]]).toarray()
    encoder_df = pd.DataFrame(dense_matrix).add_prefix(prefix)
    df = df.join(encoder_df)

df.head()

Unnamed: 0,status,baths,sqft,zipcode,beds,target,PoolPrivate,latitude,longitude,Year built,...,PrType_1,PrType_2,PrType_3,PrType_4,PrType_5,PrType_6,PrType_7,PrType_8,PrType_9,PrType_10
0,active,0.912296,0.020741,28387,0.838383,-0.146296,0,35.18,-79.4,2019-01-01,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
1,for sale,0.614043,-0.04121,99216,0.312933,-0.259833,0,47.69,-117.19,2019-01-01,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
2,for sale,0.017536,0.027242,90049,0.312933,2.457713,1,34.08,-118.49,1961-01-01,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
3,for sale,3.596578,0.251967,75205,1.363833,1.932075,0,32.79,-96.76,2006-01-01,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
4,for sale,-1.175478,-0.167777,32908,-1.263417,-0.580472,0,27.98,-80.66,1800-01-01,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [64]:
# Encode the remaining categorical columns as integer labels with
# LabelEncoder; a fresh encoder is fitted for each column.
for source_col, encoded_col in (('Year built', 'Year_built_enc'), ('zipcode', 'zipcode_enc')):
    le = LabelEncoder()
    df[encoded_col] = le.fit_transform(df[source_col])

In [65]:
# Remove the original categorical columns that were encoded above, then
# print the resulting column / dtype summary.
df = df.drop(columns=['status', 'zipcode', 'Year built', 'PrType'])
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 352505 entries, 0 to 352504
Data columns (total 34 columns):
 #   Column          Non-Null Count   Dtype  
---  ------          --------------   -----  
 0   baths           352505 non-null  float64
 1   sqft            352505 non-null  float64
 2   beds            352505 non-null  float64
 3   target          352505 non-null  float64
 4   PoolPrivate     352505 non-null  int64  
 5   latitude        352505 non-null  float64
 6   longitude       352505 non-null  float64
 7   lotsize         352505 non-null  float64
 8   school_rate     352505 non-null  float64
 9   school_dist     352505 non-null  float64
 10  status_0        352505 non-null  float64
 11  status_1        352505 non-null  float64
 12  status_2        352505 non-null  float64
 13  status_3        352505 non-null  float64
 14  status_4        352505 non-null  float64
 15  status_5        352505 non-null  float64
 16  status_6        352505 non-null  float64
 17  status_7  

In [66]:
# Split the dataset: `target` is the label, every other column is a feature;
# 20% of the shuffled rows are held out for testing (fixed seed).
X = df.drop(columns=['target'])
y = df['target']
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    shuffle=True,
    random_state=42,
)

##### Линейная регрессия

In [82]:
# Baseline forecast: plain linear regression fitted on the training split.
reg = LinearRegression()
reg.fit(X_train, y_train)
y_pred_train, y_pred_test = reg.predict(X_train), reg.predict(X_test)

# Error metrics on both splits.
mse_train, mse_test = (
    mean_squared_error(y_train, y_pred_train),
    mean_squared_error(y_test, y_pred_test),
)
mae_train, mae_test = (
    mean_absolute_error(y_train, y_pred_train),
    mean_absolute_error(y_test, y_pred_test),
)
r2_train, r2_test = (
    r2_score(y_train, y_pred_train),
    r2_score(y_test, y_pred_test),
)

# One metric per output line, train value first.
print(
    f"Train MSE: {mse_train:.2f}",
    f"Test MSE: {mse_test:.2f}",
    f"Train MAE: {mae_train:.2f}",
    f"Test MAE: {mae_test:.2f}",
    f"Train R2: {r2_train:.2f}",
    f"Test R2: {r2_test:.2f}",
    sep="\n",
)

Train MSE: 0.80
Test MSE: 0.77
Train MAE: 0.42
Test MAE: 0.42
Train R2: 0.21
Test R2: 0.22


Результаты на обучающей и тестовой выборках сопоставимы, т.е. явного переобучения нет.  
При этом низкий R² (≈0.2) говорит о том, что линейная модель недообучена: она объясняет лишь малую долю дисперсии целевой переменной.

##### Дерево решений

In [83]:
# Decision tree regressor with depth capped at 10 and a fixed seed.
dtr = DecisionTreeRegressor(random_state=42, max_depth=10)
dtr.fit(X_train, y_train)

y_pred_train, y_pred_test = dtr.predict(X_train), dtr.predict(X_test)

# Error metrics on both splits.
mse_train, mse_test = (
    mean_squared_error(y_train, y_pred_train),
    mean_squared_error(y_test, y_pred_test),
)
mae_train, mae_test = (
    mean_absolute_error(y_train, y_pred_train),
    mean_absolute_error(y_test, y_pred_test),
)
r2_train, r2_test = (
    r2_score(y_train, y_pred_train),
    r2_score(y_test, y_pred_test),
)

# One metric per output line, train value first.
print(
    f"Train MSE: {mse_train:.2f}",
    f"Test MSE: {mse_test:.2f}",
    f"Train MAE: {mae_train:.2f}",
    f"Test MAE: {mae_test:.2f}",
    f"Train R2: {r2_train:.2f}",
    f"Test R2: {r2_test:.2f}",
    sep="\n",
)

Train MSE: 0.30
Test MSE: 0.36
Train MAE: 0.25
Test MAE: 0.26
Train R2: 0.70
Test R2: 0.64


Значения метрик улучшились по сравнению с линейной регрессией.

##### Random Forest

In [81]:
# Random forest regressor with default hyperparameters and a fixed seed.
# n_jobs=-1 lets scikit-learn build and query the trees in parallel on all
# available CPU cores, which shortens this otherwise slow cell; the fixed
# random_state still controls the randomness of the fit.
rf_regressor = RandomForestRegressor(random_state=42, n_jobs=-1)
rf_regressor.fit(X_train, y_train)
y_pred_train = rf_regressor.predict(X_train)
y_pred_test = rf_regressor.predict(X_test)

# Error metrics on both splits.
mse_train = mean_squared_error(y_train, y_pred_train)
mse_test = mean_squared_error(y_test, y_pred_test)
mae_train = mean_absolute_error(y_train, y_pred_train)
mae_test = mean_absolute_error(y_test, y_pred_test)
r2_train = r2_score(y_train, y_pred_train)
r2_test = r2_score(y_test, y_pred_test)

print(f"Train MSE: {mse_train:.2f}")
print(f"Test MSE: {mse_test:.2f}")
print(f"Train MAE: {mae_train:.2f}")
print(f"Test MAE: {mae_test:.2f}")
print(f"Train R2: {r2_train:.2f}")
print(f"Test R2: {r2_test:.2f}")

Train MSE: 0.03
Test MSE: 0.20
Train MAE: 0.06
Test MAE: 0.15
Train R2: 0.97
Test R2: 0.80


##### ElasticNetCV

In [None]:
# Build and fit an ElasticNetCV model using 5-fold cross-validation.
model_el = ElasticNetCV(random_state=42, cv=5)
model_el.fit(X_train, y_train)

# Predictions for the training and test splits.
y_pred_train, y_pred_test = model_el.predict(X_train), model_el.predict(X_test)

# Error metrics on both splits.
mse_train, mse_test = (
    mean_squared_error(y_train, y_pred_train),
    mean_squared_error(y_test, y_pred_test),
)
mae_train, mae_test = (
    mean_absolute_error(y_train, y_pred_train),
    mean_absolute_error(y_test, y_pred_test),
)
r2_train, r2_test = (
    r2_score(y_train, y_pred_train),
    r2_score(y_test, y_pred_test),
)

# One metric per output line, train value first.
print(
    f"Train MSE: {mse_train:.2f}",
    f"Test MSE: {mse_test:.2f}",
    f"Train MAE: {mae_train:.2f}",
    f"Test MAE: {mae_test:.2f}",
    f"Train R2: {r2_train:.2f}",
    f"Test R2: {r2_test:.2f}",
    sep="\n",
)