In [10]:
# import neccessary libary
import pandas as pd
from sklearn.metrics import classification_report, confusion_matrix, mean_absolute_error
from sklearn.model_selection import train_test_split
from sklearn import ensemble

In [11]:
# import dataset 
df = pd.read_csv('listings_berlin.csv')
df.head()

Unnamed: 0,id,name,host_id,host_name,neighbourhood_group,neighbourhood,latitude,longitude,room_type,price,minimum_nights,number_of_reviews,last_review,reviews_per_month,calculated_host_listings_count,availability_365
0,1944,bright & airy Pberg/Mitte 3 months or more,2164,Lulah,Mitte,Brunnenstr. Nord,52.54433,13.39761,Private room,28,60,18,2018-11-11,0.21,3,101
1,2015,Berlin-Mitte Value! Quiet courtyard/very central,2217,Ion,Mitte,Brunnenstr. Süd,52.53305,13.40394,Entire home/apt,74,90,141,2020-04-03,2.2,6,357
2,3176,Fabulous Flat in great Location,3718,Britta,Pankow,Prenzlauer Berg Südwest,52.53471,13.4181,Entire home/apt,90,62,147,2017-03-20,1.14,1,254
3,3309,BerlinSpot Schöneberg near KaDeWe,4108,Jana,Tempelhof - Schöneberg,Schöneberg-Nord,52.49884,13.3494,Private room,29,7,27,2018-08-16,0.28,1,285
4,6883,Stylish East Side Loft in Center with AC & 2 b...,16149,Steffen,Friedrichshain-Kreuzberg,Frankfurter Allee Süd FK,52.51163,13.45289,Entire home/apt,79,7,135,2021-01-02,1.02,1,0


In [12]:
# remove unneccessary variable
colToRemove = ['id', 'name', 'host_name', 'last_review', 'calculated_host_listings_count', 'availability_365', 'longitude', 'latitude', 'neighbourhood']
df.drop(columns=colToRemove, errors='ignore', inplace=True)
df.head()

Unnamed: 0,host_id,neighbourhood_group,room_type,price,minimum_nights,number_of_reviews,reviews_per_month
0,2164,Mitte,Private room,28,60,18,0.21
1,2217,Mitte,Entire home/apt,74,90,141,2.2
2,3718,Pankow,Entire home/apt,90,62,147,1.14
3,4108,Tempelhof - Schöneberg,Private room,29,7,27,0.28
4,16149,Friedrichshain-Kreuzberg,Entire home/apt,79,7,135,1.02


In [13]:
# convert non-numeric value
df = pd.get_dummies(df, columns = ['neighbourhood_group', 'room_type'])
df.dropna(axis=0, how='any', inplace=True)
df.head()

Unnamed: 0,host_id,price,minimum_nights,number_of_reviews,reviews_per_month,neighbourhood_group_Charlottenburg-Wilm.,neighbourhood_group_Friedrichshain-Kreuzberg,neighbourhood_group_Lichtenberg,neighbourhood_group_Marzahn - Hellersdorf,neighbourhood_group_Mitte,...,neighbourhood_group_Pankow,neighbourhood_group_Reinickendorf,neighbourhood_group_Spandau,neighbourhood_group_Steglitz - Zehlendorf,neighbourhood_group_Tempelhof - Schöneberg,neighbourhood_group_Treptow - Köpenick,room_type_Entire home/apt,room_type_Hotel room,room_type_Private room,room_type_Shared room
0,2164,28,60,18,0.21,False,False,False,False,True,...,False,False,False,False,False,False,False,False,True,False
1,2217,74,90,141,2.2,False,False,False,False,True,...,False,False,False,False,False,False,True,False,False,False
2,3718,90,62,147,1.14,False,False,False,False,False,...,True,False,False,False,False,False,True,False,False,False
3,4108,29,7,27,0.28,False,False,False,False,False,...,False,False,False,False,True,False,False,False,True,False
4,16149,79,7,135,1.02,False,True,False,False,False,...,False,False,False,False,False,False,True,False,False,False


In [16]:
# Set X and y variable
X = df.drop('price', axis=1)
y = df['price']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=10, shuffle=True)

In [17]:
# Set algorithm
model = ensemble.GradientBoostingRegressor(
    n_estimators = 350, 
    learning_rate = 0.1,
    max_depth = 5,
    min_samples_split = 4,
    min_samples_leaf = 6,
    max_features = 0.6,
    loss = 'huber'
)

model.fit(X_train, y_train)

In [22]:
# Evaluate
mae_train = mean_absolute_error(y_train, model.predict(X_train))
print(f'Training Set Mean Absolute Error {mae_train:.2f}')

mae_test = mean_absolute_error(y_test, model.predict(X_test))
print(f'Test Set Mean Absolute Error {mae_test:.2f}')

Training Set Mean Absolute Error 22.61
Test Set Mean Absolute Error 28.55


In [28]:
#8- Predict
new_property = [
    2217, #host_id
    4, #minimum_nights
    118, #number_of_reviews
    3.76, #reviews_per_month
    0, #neighbourhood_group_Charlottenburg-Wilm.
    0, #neighbourhood_group_Friedrichshain-Kreuzberg
    0, #neighbourhood_group_Lichtenberg
    0, #neighbourhood_group_Marzahn - Hellersdorf
    1, #neighbourhood_group_Mitte
    0, #neighbourhood_group_Neukölln
    0, #neighbourhood_group_Pankow
    0, #neighbourhood_group_Reinickendorf
    0, #neighbourhood_group_Spandau
    0, #neighbourhood_group_Steglitz - Zehlendorf
    0, #neighbourhood_group_Tempelhof - Schöneberg
    0, #neighbourhood_group_Treptow - Köpenick
    1, #room_type_Entire home/apt
    0, #room_type_Private room
    0, #room_type_Shared room
    1, #number_of_bathrooms
]

new_pred = model.predict([new_property])
print(new_pred)

[67.68629419]


