In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import OneHotEncoder
import mpl_toolkits

%matplotlib inline

In [2]:
def process(file_name, price):
    train = pd.read_csv(file_name)
    fields = ['publication_date', 'latitude', 'longitude', 'category', 'floor', 'area', 'kupcha', 'rooms', 'repairs', 'hypothec', 'location attributes']
    if price:
        fields.append('price')
    train = train[fields]
    train.replace('var', True, inplace=True)
    train.replace('yoxdur', False, inplace=True)
    train.replace(np.nan, False, inplace=True)
    train.replace({'m²': ''}, regex=True, inplace=True)
    train.replace('Yeni tikili', True, inplace=True)
    train.replace('Köhnə tikili', False, inplace=True)
    train = train.astype({"area": float})
    currentFloors = []
    maxFloors = []
    for floor in train['floor'].array:
        floors = floor.split(' / ')
        currentFloors.append(floors[0])
        maxFloors.append(floors[1])
    train["current_floor"] = currentFloors
    train["max_floor"] = maxFloors

    train = train.astype({"current_floor": int})
    train = train.astype({"max_floor": int})
    del train['floor']

    def get_normalized_series(df, col):
        return (df[col] - df[col].mean()) / df[col].std()
    #train["NormArea"]= get_normalized_series(train, "area")
    #train["NormCurrent"]= get_normalized_series(train, "current_floor")
    #train["NormMax"]= get_normalized_series(train, "max_floor")
    #if price:
       #train["NormPrice"]= get_normalized_series(train, "price")
       #del train['price']
    #del train['area']
    #del train['current_floor']
    #del train['max_floor']
    for index in range(0, len(train['publication_date'].array)):
        element = train['publication_date'].array[index]
        values = element.split(' ')
        if values[1] == 'Yanvar':
            values[1] = '/01/'
        elif values[1] == 'Dekabr':
            values[1] = '/12/'
        train['publication_date'].array[index] = values[0] + values[1] + values[2]

    train['publication_date'] = pd.to_datetime(train.publication_date, format="%d/%m/%Y", errors='coerce')
    train["year"] = train["publication_date"].dt.year
    train["month"] = train["publication_date"].dt.month
    train["day"] = train["publication_date"].dt.day
    return train
train = process('modified_train_binaaz.csv', True)
tags = train["location attributes"]
clean_tags = tags.str.strip('[]\'').str.split('\', \'')
#tag_columns = pd.get_dummies(clean_tags.apply(pd.Series).stack()).groupby(level=0).sum()
a = clean_tags.apply(pd.Series)
b = a.stack()
c = pd.get_dummies(b)
d = c.groupby(level=0).sum()

train = train.join(d)
train = train.drop(['location attributes', 'publication_date'], axis=1)


In [3]:
train.to_csv("clear_data.csv", index=False)

In [4]:

test = process('test_binaaz_updated.csv', False)
tags = test["location attributes"]
test_clean_tags = tags.str.strip('[]\'').str.split('\', \'')
#tag_columns = pd.get_dummies(test_clean_tags.apply(pd.Series).stack()).groupby(level=0).sum()
a = test_clean_tags.apply(pd.Series)
b = a.stack()
c = pd.get_dummies(b)
d = c.groupby(level=0).sum()

tags = set()
for ls in clean_tags:
   for tag in ls:
    tags.add(tag)

test_tags = set()
for ls in test_clean_tags:
   for tag in ls:
    test_tags.add(tag)

to_add = tags.difference(test_tags)
empty = []
for i in range(0, len(d.index)):
    empty.append(0)

map = {'price': empty}
for add in to_add:
    map[add] = empty

additional = pd.DataFrame(map)

test = test.join(d)
test = test.join(additional)
to_drop = ['location attributes', 'publication_date'] + list(test_tags.difference(tags))
test = test.drop(to_drop, axis=1)

In [5]:
test.to_csv("test_clear.csv", index=False)

In [7]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error

target = "price"
columns = train.columns.tolist()
columns = [c for c in columns if c not in ["price"]]
#training = train.sample(frac=0.7, random_state=5)
#lin_model = LinearRegression()
# Fit the model to the training data.
#lin_model.fit(train[columns], train[target])

In [8]:
# Generate our predictions for the test set.
#test = train.loc[~train.index.isin(training.index)]
#lin_predictions = lin_model.predict(test[columns])
# Compute error between our test predictions and the actual values.
#lin_mse = mean_squared_error(lin_predictions, test[target], squared=False)
#print("Computed error:", lin_mse)

In [15]:
from scipy.stats import loguniform
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.model_selection import RandomizedSearchCV

param_distributions = {
    #"n_estimators": [5, 10, 20, 50, 100, 200, 300, 400, 500, 600,650, 700, 800, 900, 1000],
    #"max_leaf_nodes": [2, 3, 4, 5, 10, 20, 50, 100, 150, 200, 250],
    #"learning_rate": loguniform(0.01, 1),
    #"min_samples_split": [10, 25, 50, 75, 100, 125, 150, 175, 200, 225, 250]
    #"max_depth":range(9,20,2),
    #"min_samples_split": list(range(5, 25, 5)) + list(range(25, 100, 10)) + [125, 150, 200, 250, 300]
    #'max_features': range(47, 52, 3)
    #"subsample": [0.6,0.7,0.75,0.8,0.85,0.9]
}
search_cv = RandomizedSearchCV(
    GradientBoostingRegressor(max_features=50, n_estimators = 900, min_samples_split=25, max_depth=15, max_leaf_nodes= 50), param_distributions=param_distributions,
     n_iter=12, random_state=0, n_jobs=2,
)
search_cv.fit(train[columns], train[target])

columns = [f"param_{name}" for name in param_distributions.keys()]
columns += ["mean_test_error", "std_test_error"]
cv_results = pd.DataFrame(search_cv.cv_results_)
cv_results["mean_test_error"] = -cv_results["mean_test_score"]
cv_results["std_test_error"] = cv_results["std_test_score"]
cv_results[columns].sort_values(by="mean_test_error")



Unnamed: 0,param_subsample,mean_test_error,std_test_error
3,0.8,-0.838726,0.041361
4,0.85,-0.836915,0.046917
2,0.75,-0.836839,0.046674
1,0.7,-0.834879,0.046136
5,0.9,-0.833659,0.047175
0,0.6,-0.824562,0.046748


In [None]:
cv_results

In [9]:
from sklearn.ensemble import GradientBoostingRegressor
clf = GradientBoostingRegressor(learning_rate=0.1, n_estimators = 900, max_leaf_nodes= 50, min_samples_split=25, random_state=0, subsample=0.8, max_features=17, max_depth=50)

target = 'price'
columns = train.columns.tolist()
columns = [c for c in columns if c not in ["price"]]

#training = train.sample(frac=0.7, random_state=5)
#test = train.loc[~train.index.isin(training.index)]
clf.fit(train[columns], train[target])
clf = clf.predict(test[columns])
# Compute error between our test predictions and the actual values.
#lin_mse = mean_squared_error(clf, test[target], squared=False)
#print("Computed error:", lin_mse)

In [19]:
#for index in range(100, 120):
    #print(lin_predictions[index], test[target].array[index])

In [7]:
#np.savetxt("result.csv", lin_predictions, header=['price'])
result = pd.DataFrame(clf)
result.to_csv("result.csv", header=['price'])