In [1]:
#import modules
import pandas as pd
import numpy as np
from sklearn.metrics import mean_squared_error
from scipy.stats import loguniform
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.model_selection import GridSearchCV

In [2]:
def process(file_name, isTrain):
    """Load a listings CSV and turn it into numeric/boolean model features.

    Parameters
    ----------
    file_name : str or path-like
        Location of the CSV file; handed straight to ``pandas.read_csv``.
    isTrain : bool
        When true, the ``price`` column is kept in the returned frame.

    Returns
    -------
    pandas.DataFrame
        The selected fields with ``floor`` replaced by integer
        ``current_floor`` / ``max_floor`` columns and ``publication_date``
        replaced by integer ``day`` / ``month`` columns.  The
        ``location attributes`` column is passed through untouched.

    Raises
    ------
    ValueError
        If a publication date does not contain one of the supported month
        names ('Yanvar' or 'Dekabr').
    """
    fields = ['publication_date', 'latitude', 'longitude','category', 'floor', 'area', 'kupcha', 'rooms', 'repairs', 'hypothec', 'location attributes']
    if isTrain:
        fields.append('price')

    # Selecting the known fields also discards 'link', which is present only
    # in the train dataset, so no separate drop is needed.
    train = pd.read_csv(file_name)[fields]

    # Clean data to make it readable for the algorithm.  The replacements are
    # applied to every column, in this order, without mutating in place.
    train = (
        train
        .replace('var', True)
        .replace('yoxdur', False)
        .replace(np.nan, False)
        .replace({'m²': ''}, regex=True)
        .replace('Yeni tikili', True)
        .replace('Köhnə tikili', False)
        .astype({"area": float})
    )

    # 'floor' looks like "<current> / <max>"; keep both halves as integers.
    floor_tokens = train['floor'].str.split(' / ')
    train["current_floor"] = floor_tokens.str[0].astype(int)
    train["max_floor"] = floor_tokens.str[1].astype(int)
    train = train.drop(columns=['floor'])

    # 'publication_date' starts with "<day> <month name>".  Dekabr is coded 0
    # and Yanvar 1 (presumably so December orders before January - TODO
    # confirm against the data's date range).
    month_codes = {'Dekabr': 0, 'Yanvar': 1}
    date_tokens = train['publication_date'].str.split(' ')
    month_names = date_tokens.str[1]
    month = month_names.map(month_codes)

    # Fail loudly on an unsupported month instead of producing a month list
    # that is shorter than the frame.
    unsupported = month_names[month.isna()]
    if not unsupported.empty:
        raise ValueError(
            "unsupported month name(s) in publication_date: "
            + ", ".join(sorted({str(name) for name in unsupported}))
        )

    train = train.drop(columns=['publication_date'])
    train['day'] = date_tokens.str[0].astype(int)
    train['month'] = month.astype(int)

    return train
# Load and clean the training listings.
train = process('modified_train_binaaz.csv', True)

# Each 'location attributes' cell is a stringified list such as "['a', 'b']":
# strip the outer brackets/quotes and split on the quote-comma-quote separator.
tags = train["location attributes"]
clean_tags = tags.str.strip('[]\'').str.split('\', \'')

# One indicator column per tag: spread the lists into columns, stack them into
# one long series, one-hot encode it and sum the indicators back per row.
train_tag_dummies = (
    pd.get_dummies(clean_tags.apply(pd.Series).stack())
    .groupby(level=0)
    .sum()
)

# Attach the indicator columns and discard the raw text column.
train = train.join(train_tag_dummies).drop(columns=['location attributes'])



In [3]:
# Load the test listings with the same cleaning as the training data.
test = process('test_binaaz_updated.csv', False)

# Parse the stringified tag lists and one-hot encode them, one column per tag.
test_clean_tags = test["location attributes"].str.strip('[]\'').str.split('\', \'')
d = (
    pd.get_dummies(test_clean_tags.apply(pd.Series).stack())
    .groupby(level=0)
    .sum()
)

# Distinct tag names seen in the training data and in the test data.
tags = {tag for tag_list in clean_tags for tag in tag_list}
test_tags = {tag for tag_list in test_clean_tags for tag in tag_list}

# Tags the training frame has but the test frame lacks are added as all-zero
# columns, together with a placeholder 'price' column.  The frame is built on
# test's own index so the join below aligns row for row, and the tag names are
# sorted so the column order does not depend on set iteration order.
to_add = tags.difference(test_tags)
additional = pd.DataFrame(0, index=test.index, columns=['price'] + sorted(to_add))

test = test.join(d)
test = test.join(additional)

# Drop the raw text column and any tag column the training data never saw.
to_drop = ['location attributes'] + list(test_tags.difference(tags))
test = test.drop(to_drop, axis=1)

# Name of the regression target and the list of feature columns.
target = 'price'
columns = [c for c in train.columns.tolist() if c not in ["price"]]

In [4]:
# Hyper-parameter grid handed to GridSearchCV; only n_estimators is active.
# NOTE(review): the commented-out entries look like grids from other tuning
# rounds (the numeric prefixes presumably number those rounds) - confirm.
param_distributions = {
    'n_estimators': range(100, 1501, 200),
    # 2. 'max_depth': range(5, 71, 5),
    # 2. 'min_samples_split': range(100, 1101, 200),
    # 3. 'max_features': range(7, 20, 2),
    # 4. 'subsample': [0.6,0.7,0.75,0.8,0.85,0.9]
}


# subsample < 1 and max_features='sqrt' make fitting stochastic, so the
# estimator is seeded to keep the ranking of candidates repeatable.
gbr = GradientBoostingRegressor(learning_rate = 0.1, min_samples_split=180, min_samples_leaf = 50,
                                 max_features='sqrt', subsample=0.8, max_depth= 8,
                                 random_state=0)
search_cv = GridSearchCV(estimator=gbr, 
                   param_grid=param_distributions,
                   scoring = 'neg_mean_squared_error', 
                   verbose=3,
                   error_score='raise',
                   n_jobs=-1)

search_cv.fit(train[columns], train[target])

# Columns of the CV report.  A separate name is used so the global `columns`
# (the feature list that later cells still index the data with) is not
# overwritten.
result_columns = [f"param_{name}" for name in param_distributions.keys()]
result_columns += ["mean_test_error", "std_test_error"]
cv_results = pd.DataFrame(search_cv.cv_results_)
# The scorer is *negative* MSE, so flip the sign to report a plain error.
cv_results["mean_test_error"] = -cv_results["mean_test_score"]
cv_results["std_test_error"] = cv_results["std_test_score"]
cv_results[result_columns].sort_values(by="mean_test_error")

Fitting 5 folds for each of 8 candidates, totalling 40 fits


Unnamed: 0,param_n_estimators,mean_test_error,std_test_error
6,1300,6520662000.0,2454553000.0
5,1100,6653452000.0,2479403000.0
7,1500,6657138000.0,2535673000.0
4,900,6828559000.0,2557727000.0
3,700,6957095000.0,2539434000.0
2,500,7181199000.0,2618706000.0
1,300,7616456000.0,2559157000.0
0,100,9092941000.0,2504205000.0


In [4]:

# Hold-out evaluation: fit on a 70% sample of the training data and score on
# the remaining rows.
model = GradientBoostingRegressor(learning_rate=0.1, n_estimators = 900, max_leaf_nodes= 50, min_samples_split=25, subsample=0.8, random_state= 0, max_features=17, max_depth=50)

# Derive the feature list here rather than relying on the global `columns`,
# which the grid-search cell rebinds to its report column names.
feature_columns = [c for c in train.columns if c != target]

training = train.sample(frac=0.7, random_state=5)
# Kept under its own name so the real test frame in `test` is not clobbered.
validation = train.loc[~train.index.isin(training.index)]

model.fit(training[feature_columns], training[target])
# `clf` holds the predictions (not the model) because the saving cell reads
# them from that name.
clf = model.predict(validation[feature_columns])

# Compute error between our validation predictions and the actual values.
# This is the root of the MSE (an RMSE, despite the variable name); taking the
# square root explicitly avoids the `squared=False` keyword, which newer
# scikit-learn releases no longer accept.
lin_mse = np.sqrt(mean_squared_error(validation[target], clf))
print("Computed error:", lin_mse)

Computed error: 72165.93501543833


In [7]:
# Write the predictions held in `clf` to result.csv, labelling the single
# value column 'price' (the row index is written alongside it).
result = pd.DataFrame(data=clf)
result.to_csv(path_or_buf="result.csv", header=['price'])