In [3]:
import pandas as pd
import sqlalchemy
import datetime
import numpy as np
import pandas_profiling


from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import cross_val_score
from sklearn.metrics import mean_squared_error, make_scorer
from sklearn.ensemble import RandomForestRegressor
from sklearn.feature_selection import SelectFromModel
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.preprocessing import OneHotEncoder, LabelEncoder




from sklearn.model_selection import train_test_split

from string import punctuation

In [4]:
from sqlalchemy import create_engine
engine = create_engine('postgresql://simple_user@localhost:5432/apartments')

In [5]:
data = pd.read_sql_query('select * from apartments', con=engine)

In [6]:
data.set_index('id', inplace=True)

In [9]:
report = pandas_profiling.ProfileReport(data)

In [11]:
# report.to_file('./data_report.html')

In [7]:
data.drop(['title','absolute_url', 'city_name', 'price_usd'],
          axis=1, inplace=True)

In [8]:
def count_upper(string):
    return sum(map(lambda x:x.isupper(), string)) if string else 0

In [9]:
def count_punctuations(string):
    return sum(map(lambda x:x in punctuation, string)) if string else 0

In [10]:
data['number_of_images_attached'] = data['image_urls'].str.len()
data['len_of_description'] = data['description'].str.len()
data['num_of_uppercase_letters_in_description'] = data['description'].apply(count_upper) 
data['num_of_punctuations_in_description'] = data['description'].apply(count_upper) 

In [11]:
data.drop(['image_urls', 'description'],
          axis=1,
          inplace=True)

In [12]:
data['construction_year'] = data.construction_period.str.extract("([\d]{4})")

In [13]:
data['years_elapsed'] =  datetime.datetime.today().year - data['construction_year'].astype(np.float)

In [14]:
data.drop(['construction_year', 'construction_period'], axis=1, inplace=True)

In [15]:
data['is_bargain'] = data.tags.apply(lambda  row : 'Торг' in row)
data['is_used'] = data.tags.apply(lambda  row : 'Вторичное жилье' in row)
data['is_not_used'] = data.tags.apply(lambda  row : 'Первичное жилье' in row)
data['in_installments'] = data.tags.apply(lambda  row : 'Рассрочка/Кредит' in row)

In [16]:
# data

In [17]:
data.drop('tags',axis=1, inplace=True)

In [18]:
categorical_columns = ['offer_type','wall_type','heating','city_id']

In [19]:
# features, target = data.drop(['price_uah'],axis=1), data[['price_uah']]

In [20]:
def get_inliers_ind(data, feature_name, is_positive=True):

    """
    Generate inliers indecies using IQR

    :param data: pd.Dataframe
    :param feature_name: feature
    :param is_positive:
    :return:
    """

    q1 = data[feature_name].quantile(q=0.25)
    q3 = data[feature_name].quantile(q=0.75)
    iqr = q3 - q1
    lower_bound = q1 - 1.5 * iqr
    upper_bound = q3 + 1.5 * iqr
    if is_positive:
        lower_bound = max(0, lower_bound)
    mask = (data[feature_name] > upper_bound) | (data[feature_name] < lower_bound)
    indecies = mask[mask == False].index
    return set(indecies)

In [21]:
def get_nan_ind(data, feature_name):
    return set(data[data[feature_name].isna()==False].index)


In [22]:
# clean_indecies = list(drop_outliers('price_uah'))

In [23]:
inliers_ind = get_inliers_ind(data,'price_uah')
not_nan_ind = get_nan_ind(data, 'price_uah')
inliers_ind = list(not_nan_ind & inliers_ind)
data = data.loc[inliers_ind]

In [24]:
# data

## preprocessing  part

In [25]:
cat_features = ['offer_type','wall_type','heating','city_id']
bool_features = ['is_bargain','is_used', 'is_not_used', 'in_installments']
num_features = [
    "position",
    "len_of_description",
    "floor_located",
    "number_of_floors_in_the_house",
    "longitude",
    "apartment_area",
    "years_elapsed",
    "num_of_punctuations_in_description",
    "number_rooms",
    "latitude",
    "num_of_uppercase_letters_in_description",
    "number_of_images_attached",
]


In [26]:
from sklearn.base import TransformerMixin, BaseEstimator
from sklearn.compose import ColumnTransformer

In [27]:
class BoolTranformer(TransformerMixin, BaseEstimator):
    def fit(self,X, y=None):
        return self
    def transform(self,X,y=None):
        return np.array(X).astype(np.float)

In [28]:
class MissingValuesImputer(TransformerMixin, BaseEstimator):
    
    def __init__(self, cat_features, num_features, bool_features, 
                 categorical_imputation_dict=None,
                 num_features_imputation_dict=None,
                 bool_imputation_dict=None,
                 default_cat_value = "Uknown",
                 default_bool_value = False,
                 default_num_value = 0
                 ):
        
        self.cat_features = cat_features
        self.num_features = num_features
        self.bool_features = bool_features
       
        dict_initializer = lambda given_dict: given_dict if type(given_dict) is dict else dict()
        self.cat_feature_imputer = dict_initializer(categorical_imputation_dict)
        self.num_feature_imputer = dict_initializer(num_features_imputation_dict)
        self.bool_feature_imputer = dict_initializer(bool_imputation_dict)
        self.default_cat_value = default_cat_value
        self.default_num_value = default_num_value
        self.default_bool_value = default_bool_value
        
    def fit(self, X, y = None):
        return self 
    
    def transform(self, X, y=None):
        X_transformed = X.copy()
        for category in self.cat_features:
            X_transformed[category].fillna(self.cat_feature_imputer.get(category,
                                                                        self.default_cat_value),
                                           inplace=True)
        for num_feature in self.num_features:
            X_transformed[num_feature].fillna(self.num_feature_imputer.get(num_feature,
                                                                           self.default_num_value),
                                              inplace=True)
        for bool_feature in self.bool_features:
            X_transformed[bool_feature].fillna(self.bool_feature_imputer.get(bool_feature,
                                                                             self.default_bool_value),
                                               inplace=True)
        return X_transformed

In [29]:
from sklearn.pipeline import Pipeline

In [30]:
imputer = MissingValuesImputer(cat_features, num_features, bool_features,
                     num_features_imputation_dict={'years_elapsed':1000})

In [52]:
column_transformer = ColumnTransformer([
                                ('one_hot_encoder', OneHotEncoder(categories='auto',
                                                                  handle_unknown='ignore',
                                                                  sparse=False), cat_features),
                                ('scaler', StandardScaler(), num_features),
                                ('bool_encoder', BoolTranformer(), bool_features)],
                                remainder='passthrough',
                               verbose=2)

In [53]:
imputer = MissingValuesImputer(cat_features, num_features, bool_features,
                     num_features_imputation_dict={'years_elapsed':1000})

preprocessing_pipeline = Pipeline([('missing_values_imputer', imputer),
          ('column_transformer',  column_transformer)], verbose=2)

In [54]:
features_transformed = preprocessing_pipeline.fit_transform(data)

[Pipeline]  (step 1 of 2) Processing missing_values_imputer, total=   0.0s
[ColumnTransformer]  (1 of 4) Processing one_hot_encoder, total=   0.2s
[ColumnTransformer] ........ (2 of 4) Processing scaler, total=   0.0s
[ColumnTransformer] .. (3 of 4) Processing bool_encoder, total=   0.0s
[ColumnTransformer] ..... (4 of 4) Processing remainder, total=   0.0s
[Pipeline]  (step 2 of 2) Processing column_transformer, total=   0.4s


In [55]:
# column_transformer._n_features

In [56]:
features, target = features_transformed[:,:-1], features_transformed[:,[-1]]

In [634]:
tree = DecisionTreeRegressor(max_depth=10, random_state=42)

In [635]:
# tree.__class__.__name__

In [636]:
rmse = make_scorer(lambda y,preds : np.sqrt(mean_squared_error(y,preds)),
                   greater_is_better=False)

In [637]:
from functools import partial

In [639]:
np.mean(np.abs(cross_val_score(tree, features, target, cv=10, scoring = rmse)))

366525.077890595

In [69]:
tree.fit(with_dummies, target)

DecisionTreeRegressor(criterion='mse', max_depth=10, max_features=None,
                      max_leaf_nodes=None, min_impurity_decrease=0.0,
                      min_impurity_split=None, min_samples_leaf=1,
                      min_samples_split=2, min_weight_fraction_leaf=0.0,
                      presort=False, random_state=42, splitter='best')

In [70]:
list(zip(with_dummies.columns, tree.feature_importances_))

[('position', 0.0028728565598219397),
 ('number_rooms', 0.019173453126587636),
 ('floor_located', 0.0069971832574371885),
 ('number_of_floors_in_the_house', 0.02540956427815293),
 ('apartment_area', 0.5546683688167596),
 ('latitude', 0.002598432438930921),
 ('longitude', 0.005442356696076232),
 ('number_of_images_attached', 0.019544378686287174),
 ('len_of_description', 0.01215137866061125),
 ('num_of_uppercase_letters_in_description', 0.0036143076609968126),
 ('num_of_punctuations_in_description', 0.0027946062771268664),
 ('years_elapsed', 0.002186456075026997),
 ('is_bargain', 0.0013167599168236754),
 ('is_used', 0.00393724272409022),
 ('is_not_used', 0.0006789462932872677),
 ('in_installments', 0.010780911191414945),
 ('offer_type_Unknown', 0.0025883818984795133),
 ('offer_type_от застройщика', 2.671216353650564e-05),
 ('offer_type_от посредника', 0.003043048745137054),
 ('offer_type_от представителя застройщика', 0.0008794296993334059),
 ('offer_type_от представителя хозяина (без к

In [641]:
from lightgbm import LGBMRegressor

This means that in case of installing LightGBM from PyPI via the ``pip install lightgbm`` command, you don't need to install the gcc compiler anymore.
Instead of that, you need to install the OpenMP library, which is required for running LightGBM on the system with the Apple Clang compiler.
You can install the OpenMP library by the following command: ``brew install libomp``.


In [642]:
lgb_reg = LGBMRegressor(max_depth=30,
                        num_leaves=128,
                        learning_rate=.05,
                        n_estimators=500,
                        bagging_fraction=.6,
                        feature_fraction=.6,
                        random_state=42)

In [647]:
np.mean(np.abs(cross_val_score(lgb_reg, features, target, cv=5, scoring =rmse)))

  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


308943.55295862857

In [873]:
np.mean(cross_val_score(lgb_reg, with_dummies, target, cv=5))

0.7569108979589639

In [110]:
from sklearn.externals import joblib



In [114]:
joblib.dump(['hello_there'],'test.jblib')

['test.jblib']

In [115]:
joblib.load('test.jblib')

['hello_there']

In [108]:
sample = with_dummies.sample(1000)

In [109]:
%%timeit
lgb_reg.predict(sample)

95.9 ms ± 4.84 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)


In [90]:
lgb_reg.fit(with_dummies, target)

LGBMRegressor(bagging_fraction=0.6, boosting_type='gbdt', class_weight=None,
              colsample_bytree=1.0, feature_fraction=0.6,
              importance_type='split', learning_rate=0.05, max_depth=30,
              min_child_samples=20, min_child_weight=0.001, min_split_gain=0.0,
              n_estimators=500, n_jobs=-1, num_leaves=128, objective=None,
              random_state=42, reg_alpha=0.0, reg_lambda=0.0, silent=True,
              subsample=1.0, subsample_for_bin=200000, subsample_freq=0)

In [92]:
lgb_reg.__class__.__name__

'LGBMRegressor'

# Torch fully connected

In [106]:
import torch
from torch import nn
from torch.utils.data import DataLoader, TensorDataset

In [138]:
torch.manual_seed(42)

<torch._C.Generator at 0x122c438f0>

In [139]:
train_x, test_x, train_y, test_y = train_test_split(features, 
                                                    target,
                                                    test_size=.2)

In [140]:
X_train, X_test, Y_train, Y_test = map(
    lambda x: torch.from_numpy(x).float(), [train_x, test_x, train_y, test_y]
)

In [141]:
train_dataset = TensorDataset(X_train,
                              Y_train)

test_dataset = TensorDataset(X_test,
                             Y_test)

In [142]:
batch_size = 128

train_loader = torch.utils.data.DataLoader(dataset=train_dataset, 
                                           batch_size=batch_size, 
                                           shuffle=True)

test_loader = torch.utils.data.DataLoader(dataset=test_dataset, 
                                          batch_size=batch_size, 
                                          shuffle=False)

In [154]:
class PriceRegressorDNN(nn.Module):
    def __init__(self, input_dim, 
                activation_function):
        
#         assert(type(neurons) is list, f"{neurons} should a list of neurons")
        
        super(PriceRegressorDNN, self).__init__()
        
        self.fc1 = nn.Linear(input_dim, 1024)
        self.activation_function = activation_function
        self.fc2 = nn.Linear(1024, 1)
#         self.fc3 = nn.Linear(128, 1)
        
    def forward(self, x):
        # Linear function  # LINEAR
        x = self.fc1(x)
        x = self.activation_function(x)
        x = self.fc2(x)
#         x = self.activation_function(x)
#         x = self.fc3(x)
        return x

In [155]:
model = PriceRegressorDNN(X_train.shape[1], nn.ReLU())

In [156]:
criterion = nn.L1Loss()
learning_rate = 1e-2
optimizer = torch.optim.SGD(model.parameters(),
                            lr=learning_rate,
                            momentum=.99)

In [157]:
num_epochs=100

In [158]:
def rmse(Y_pred, Y_true):
    rmse_val =  torch.sqrt(torch.mean((torch.pow(Y_pred - Y_true, 2))))
#     print(rmse_val.item())
#     if np.isnan(rmse_val.item()):
#         print(Y_pred)
#         print(Y_true)
    return rmse_val

In [159]:
def r2_score(Y_pred, Y_true):
    mse = torch.mean((torch.pow(Y_pred - Y_true, 2)))
    return 1 - mse/torch.var(Y_true)

In [160]:
# def mean_squared_logarithmic_loss(output, target):
#     loss = torch.mean((torch.log(target+1) - torch.log(output+1))**2)
#     return loss

In [161]:
# mean_squared_logarithmic_loss(torch.Tensor([100,100]),torch.Tensor([1000,100]))

In [162]:
# criterion(torch.zeros(64,1),torch.ones(64,1)*10)

In [163]:
# X_train.shape

In [164]:
iteration = 0

for epoch in range(num_epochs):
    
    for i, (train_features, labels_train) in enumerate(train_loader):
#         print(labels)

        # Clear gradients w.r.t. parameters
            
        # Forward pass to get output/logits
        outputs = model(train_features)
        
        loss = criterion(outputs, labels_train)
#         print(loss)
        
        optimizer.zero_grad()   # clear gradients for next train
        loss.backward()         # backpropagation, compute gradients
        optimizer.step()        

        iteration += 1
    
        
        if iteration % 500 == 0:
            
            rmses = []
            # Calculate RMSE
            # Iterate through test dataset
            for test_features, labels_test in test_loader:
                # Load images with gradient accumulation capabilities

                # Forward pass only to get logits/output
                outputs = model(test_features)

                # Get predictions from the maximum value
                rmses.append(rmse(outputs, labels_test).item())
                
            mean_rmse = np.mean(np.array(rmses))

            # Print Loss
            print('Iteration: {}. Loss: {:.3f}. RMSE: {:.3f}'.format(iteration, loss.item(), mean_rmse))

Iteration: 500. Loss: 296236.750. RMSE: 471544.946
Iteration: 1000. Loss: 302228.594. RMSE: 497495.805
Iteration: 1500. Loss: 269063.125. RMSE: 439105.572
Iteration: 2000. Loss: 265578.625. RMSE: 436933.191
Iteration: 2500. Loss: 400204.406. RMSE: 488300.735
Iteration: 3000. Loss: 240989.672. RMSE: 425107.865
Iteration: 3500. Loss: 323623.500. RMSE: 501193.130
Iteration: 4000. Loss: 337388.750. RMSE: 449636.922
Iteration: 4500. Loss: 276032.562. RMSE: 414188.242
Iteration: 5000. Loss: 330324.812. RMSE: 472789.098
Iteration: 5500. Loss: 260807.500. RMSE: 412609.455
Iteration: 6000. Loss: 319650.156. RMSE: 502228.542
Iteration: 6500. Loss: 294953.875. RMSE: 433779.196
Iteration: 7000. Loss: 303054.094. RMSE: 445753.643
Iteration: 7500. Loss: 290048.250. RMSE: 419041.165
Iteration: 8000. Loss: 269924.438. RMSE: 410508.481
Iteration: 8500. Loss: 351788.250. RMSE: 488137.569
Iteration: 9000. Loss: 286592.250. RMSE: 396365.756
Iteration: 9500. Loss: 237609.344. RMSE: 389933.280
Iteration: 10

KeyboardInterrupt: 

In [1036]:
np.sqrt(mean_squared_error(lgb_reg.predict(test_x) ,test_y))

295401.4066815665

In [65]:
# list(model.parameters())

In [1034]:
from sklearn.metrics import mean_squared_error

In [1032]:
r2_score(lgb_reg.predict(test_x), test_y)

0.7092784156813776

In [1016]:
rmse(model(X_test),Y_test)

tensor(222835.9062, grad_fn=<MeanBackward0>)

In [1017]:
from sklearn.metrics import r2_score

In [1028]:
preds = model(X_test).data.cpu().numpy()

In [1043]:
rmse(model(X_test), Y_test)

tensor(332933.1562, grad_fn=<SqrtBackward>)

In [54]:
torch.save(model.state_dict() ,"test.pt")

In [62]:
model.load_state_dict(state_dict)

IncompatibleKeys(missing_keys=[], unexpected_keys=[])

In [56]:
state_dict = torch.load('test.pt')

In [5]:
model_loaded

PriceRegressorDNN(
  (fc1): Linear(in_features=284, out_features=512, bias=True)
  (activation_function): ReLU()
  (fc2): Linear(in_features=512, out_features=1, bias=True)
)

# Test encoder

In [712]:
joblib.load('assets/column_transformer.jblib')

ColumnTransformer(n_jobs=None, remainder='passthrough', sparse_threshold=0.3,
                  transformer_weights=None,
                  transformers=[('one_hot_encoder',
                                 OneHotEncoder(categorical_features=None,
                                               categories='auto', drop=None,
                                               dtype=<class 'numpy.float64'>,
                                               handle_unknown='ignore',
                                               n_values=None, sparse=False),
                                 ['offer_type', 'wall_type', 'heating',
                                  'city_id']),
                                ('scaler',
                                 StandardSc...
                                 ['position', 'len_of_description',
                                  'floor_located',
                                  'number_of_floors_in_the_house', 'longitude',
                                  'apart