In [1]:
import pandas as pd
import torch
import seaborn as sns
from data_loader import HouseDataset
from torch.utils.data import DataLoader
from model import get_model
import torch
import pickle
import numpy as np
from tqdm import tqdm
from sklearn.metrics import mean_squared_error as mse

У меня есть данные: средняя цена за квадратный метр по районам для домов, построенных в периоды (-1990], (1990-2000], (2000-2010], (2010-] годов.
Самой выгодной будет та квартира, у которой разница между предсказанной ценой за квадратный метр (pred_price/sq_m) и средней ценой в соответствующем районе для соответствующего периода постройки наибольшая.  

In [2]:
# Reference table of mean prices; the preview shows one row per
# (area, year_div) pair with its price_per_sqm.
df_mean_price = pd.read_csv(filepath_or_buffer='./data/mean_price.csv')
df_mean_price.head(n=5)

Unnamed: 0,area,year_div,price_per_sqm
0,Алатауский,1990_2000,537248.511347
1,Алатауский,2000_2010,575455.819848
2,Алатауский,2010_,551040.822096
3,Алатауский,_1990,641945.976353
4,Алмалинский,1990_2000,650770.505993


In [3]:
def _load_pickle(path):
    """Unpickle the object stored at ``path`` and close the file afterwards.

    NOTE(review): ``pickle.load`` can execute arbitrary code, so only load
    artefacts that were produced by this project.
    """
    with open(path, 'rb') as handle:
        return pickle.load(handle)


# Dataset used for inference (presumably every listing, given mode='all_data'
# -- confirm against data_loader.HouseDataset).
data = HouseDataset(mode='all_data')

# Fitted pre-processing objects; their inverse_transform methods are used
# later to bring predictions and features back to their original units.
target_scaler = _load_pickle('./utils/target_scaler0.pkl')
one_hot_enc = _load_pickle('./utils/one_hot_enc0.pkl')
feature_scaler = _load_pickle('./utils/feature_scaler0.pkl')

# batch_size=1 and shuffle=False keep predictions in the dataset's row order.
data_loader = DataLoader(data, batch_size=1, shuffle=False)
criterion = torch.nn.L1Loss()

# Use the GPU when there is one, otherwise fall back to the CPU instead of crashing.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# The freshly built network is replaced right away by the saved checkpoint;
# the call is kept because it prints the model summary shown below.
model = get_model(model_name='net3')
model = torch.load(
    './best_models/net3_fold4_train18545373.3_val18185701.1_test17261524.6.pth',
    map_location=device,
)
model = model.to(device)
model.eval()

The model net3 is ready, number of parameters = 241377


net3(
  (net): Sequential(
    (0): Linear(in_features=13, out_features=128, bias=True)
    (1): ReLU()
    (2): Linear(in_features=128, out_features=256, bias=True)
    (3): ReLU()
    (4): Linear(in_features=256, out_features=512, bias=True)
    (5): ReLU()
    (6): Linear(in_features=512, out_features=128, bias=True)
    (7): ReLU()
    (8): Linear(in_features=128, out_features=64, bias=True)
    (9): ReLU()
    (10): Linear(in_features=64, out_features=16, bias=True)
    (11): ReLU()
    (12): Linear(in_features=16, out_features=1, bias=True)
  )
)

In [4]:
from math import sqrt
from sklearn.metrics import mean_squared_error as mse
def inference(model, loader, criterion, target_scaler):
    """Run ``model`` over ``loader`` and return the de-normalised predictions.

    Parameters
    ----------
    model : torch.nn.Module
        Network to evaluate; it is switched to eval mode here.
    loader : iterable
        Yields ``(X, y)`` batches and exposes ``loader.dataset`` (its length
        is the denominator of the reported metrics). ``y`` is indexed as
        ``y[:, None]``, so it is expected to be 1-D per batch.
    criterion : callable
        Loss computed on the scaled predictions and scaled targets.
    target_scaler : object
        Provides ``inverse_transform`` to map scaled targets and predictions
        back to original units.

    Returns
    -------
    list
        One de-normalised prediction per sample, in loader order.

    Side effects
    ------------
    Prints the sample-weighted mean loss (scaled units) and the RMSE
    (original units).
    """
    model.eval()

    # Follow the device the model lives on instead of hard-coding CUDA.
    first_param = next(model.parameters(), None)
    device = first_param.device if first_param is not None else torch.device('cpu')

    n_samples = len(loader.dataset)
    tqdm_loader = tqdm(loader, desc='Test...')
    running_loss = 0.0
    running_sq_err = 0.0  # sum of squared errors in original units
    pred_labels = []

    with torch.no_grad():
        for X, y in tqdm_loader:
            X, y = X.to(device), y.to(device)
            pred = model(X)
            loss = criterion(pred, y[:, None].float())

            y_inverse = np.asarray(
                target_scaler.inverse_transform(y.cpu().numpy()[:, None]))
            y_pred_inverse = np.asarray(
                target_scaler.inverse_transform(pred.cpu().numpy()))

            running_loss += loss.item() * X.size(0)
            # Equals mean_squared_error(y, pred) * batch_size for (batch, 1) arrays.
            running_sq_err += float(np.sum((y_inverse - y_pred_inverse) ** 2))
            # Keep every sample of the batch, not only the first one.
            pred_labels.extend(y_pred_inverse[:, 0])

    if n_samples == 0:
        # Nothing to average over; avoid a ZeroDivisionError.
        return pred_labels

    epoch_loss = running_loss / n_samples
    epoch_rmse = np.sqrt(running_sq_err / n_samples)
    print(f'test: Loss: {epoch_loss:.2f}, rmse = {epoch_rmse:.2f}')

    return pred_labels

In [5]:
# Predictions in original price units, one per row of the loader's dataset.
pred_labels = inference(
    model=model,
    loader=data_loader,
    criterion=criterion,
    target_scaler=target_scaler,
)

Test...: 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 11891/11891 [00:45<00:00, 260.57it/s]

test: Loss: 0.21, rmse = 19092358.09





In [6]:
df = pd.read_csv('./data/all_data.csv')
features = ['rooms', 'sq_m', 'floor', 'floors_all', 'year']
target = ['price']
one_hot_feat = ['area1', 'area2', 'area3', 'area4', 'area5', 'area6', 'area7', 'area8']

# Collapse the one-hot district columns back into a single 'area' column.
# inverse_transform returns a 2-D array; take its only column explicitly.
df['area'] = np.asarray(one_hot_enc.inverse_transform(df[one_hot_feat]))[:, 0]
df = df.drop(columns=one_hot_feat)

# Undo the feature scaling. The reconstruction carries floating-point noise
# (e.g. 2001 may come back as 2000.9999999), so round that noise away before
# truncating to int; otherwise a year can slip into the wrong bucket below.
# Genuine fractions are still truncated, as before.
FLOAT_NOISE_DECIMALS = 6
df[features] = feature_scaler.inverse_transform(df[features])
df[features] = df[features].round(FLOAT_NOISE_DECIMALS).astype(int)
df['price'] = np.asarray(
    target_scaler.inverse_transform(df['price'].to_numpy()[:, None]))[:, 0]

# Predictions must line up one-to-one with the rows of df.
if len(pred_labels) != len(df):
    raise ValueError(
        f'got {len(pred_labels)} predictions for {len(df)} rows; '
        'run inference over the same data with shuffle=False'
    )
df['pred_price_per_sqm'] = np.asarray(pred_labels, dtype=float) / df['sq_m']

# Construction-year bucket, using the same labels as the mean-price table.
# np.select takes the first matching condition, mirroring the if/elif chain.
year = df['year']
df['year_div'] = np.select(
    [year <= 1990, year <= 2000, year <= 2010],
    ['_1990', '1990_2000', '2000_2010'],
    default='2010_',
)

In [7]:
MIN_SQ_M = 40  # flats smaller than this are not considered
# Preferred display order of the districts in the result dictionaries.
DISTRICT_ORDER = ('Алмалинский', 'Район9', 'Ауэзовский', 'Турксибский', 'Бостандыкский',
                  'Медеуский', 'Алатауский', 'Наурызбайский', 'Жетысуский')


def find_best_per_district(flats, mean_prices, min_sq_m=MIN_SQ_M,
                           district_order=DISTRICT_ORDER):
    """Find, per district, the flat whose predicted price per sq_m exceeds
    the mean price of its (area, year_div) group by the largest margin.

    Parameters
    ----------
    flats : pandas.DataFrame
        Needs columns 'sq_m', 'area', 'year_div' and 'pred_price_per_sqm'.
    mean_prices : pandas.DataFrame
        Needs columns 'area', 'year_div' and 'price_per_sqm', with at most
        one row per (area, year_div) pair.
    min_sq_m : number
        Only flats with ``sq_m >= min_sq_m`` are considered.
    district_order : sequence of str
        Districts listed here come first, in this order; any other district
        found in ``flats`` follows.

    Returns
    -------
    (dict, dict)
        ``max_diff`` maps district -> largest (predicted - mean) difference;
        ``best_options`` maps district -> positional row index in ``flats``
        (suitable for ``flats.iloc``). Districts without any eligible flat
        are omitted instead of silently pointing at row 0, and a district
        whose best flat is still below the mean reports that negative value.

    Raises
    ------
    ValueError
        If an eligible flat has no matching row in ``mean_prices``.
    """
    candidates = flats.assign(_pos=range(len(flats)))
    candidates = candidates.loc[
        candidates['sq_m'] >= min_sq_m,
        ['_pos', 'area', 'year_div', 'pred_price_per_sqm'],
    ]
    # One join instead of a full-table lookup per row.
    candidates = candidates.merge(
        mean_prices[['area', 'year_div', 'price_per_sqm']],
        on=['area', 'year_div'],
        how='left',
        validate='many_to_one',
        indicator=True,
    )

    unmatched = candidates.loc[candidates['_merge'] == 'left_only', ['area', 'year_div']]
    if not unmatched.empty:
        pairs = sorted(set(map(tuple, unmatched.to_numpy().tolist())), key=str)
        raise ValueError(f'no mean price for (area, year_div) pairs: {pairs}')

    candidates['diff'] = candidates['pred_price_per_sqm'] - candidates['price_per_sqm']
    candidates = candidates.dropna(subset=['diff'])
    if candidates.empty:
        return {}, {}

    # idxmax returns the first occurrence, so ties keep the earliest flat.
    winners = candidates.loc[candidates.groupby('area')['diff'].idxmax()].set_index('area')

    ordered = [area for area in district_order if area in winners.index]
    ordered += [area for area in winners.index if area not in district_order]

    max_diff = {area: float(winners.at[area, 'diff']) for area in ordered}
    best_options = {area: int(winners.at[area, '_pos']) for area in ordered}
    return max_diff, best_options


max_diff, best_options = find_best_per_district(df, df_mean_price)

In [8]:
# Per district: the largest value of (predicted price per sq_m) minus
# (mean price per sq_m for the same area and construction-year bucket).
max_diff

{'Алмалинский': 338596.8159932636,
 'Район9': 409872.61973726365,
 'Ауэзовский': 225802.9497062734,
 'Турксибский': 97465.18551146553,
 'Бостандыкский': 1447348.318690986,
 'Медеуский': 563076.696376565,
 'Алатауский': 640726.0744560347,
 'Наурызбайский': 322266.6251079787,
 'Жетысуский': 224269.22275373782}

In [9]:
# Rows of the selected flat for each district, in best_options order.
df.iloc[[*best_options.values()]]

Unnamed: 0,rooms,sq_m,floor,floors_all,year,price,area,pred_price_per_sqm,year_div
4951,1,44,4,12,2020,33452420.0,Алмалинский,1039975.0,2010_
8643,1,44,2,19,2013,34545940.0,Район9,1025924.0,2010_
5990,1,66,17,18,2011,77853620.0,Ауэзовский,884466.1,2010_
11592,3,90,8,9,1989,45372860.0,Турксибский,659825.2,_1990
10334,4,213,30,39,1989,1526171000.0,Бостандыкский,2150148.0,_1990
4246,1,53,13,27,2021,73522850.0,Медеуский,1307092.0,2010_
3063,1,58,10,111,2018,35628630.0,Алатауский,1191767.0,2010_
2427,3,85,1,3,2020,104920900.0,Наурызбайский,871848.2,2010_
2311,1,46,3,12,2021,37794020.0,Жетысуский,776652.1,2010_


In [10]:
# District whose best flat has the largest difference overall.
max(max_diff.items(), key=lambda entry: entry[1])[0]

'Бостандыкский'

In [11]:
# Of the per-district picks, keep the one located in the top district.
df.iloc[list(best_options.values())].loc[
    lambda picked: picked['area'] == max(max_diff, key=max_diff.get)
]

Unnamed: 0,rooms,sq_m,floor,floors_all,year,price,area,pred_price_per_sqm,year_div
10334,4,213,30,39,1989,1526171000.0,Бостандыкский,2150148.0,_1990
