In [None]:
# Import the packages used throughout the notebook.
import numpy as np  # dtype casts and element-wise transforms of the target
import pandas as pd  # loading and manipulating the tabular data
import matplotlib.pyplot as plt  # figure / axes creation
import seaborn as sns  # histogram and bar plots
import matplotlib.colors as mcolors  # named colour tables used for the pie chart
from sklearn.model_selection import train_test_split  # train / validation / test splitting

# Printed once every import above has succeeded.
print("Set up completed.")

# Apartment Rental Prediction in Germany

This notebook is a machine learning exercise that aims to predict the rental price (baseRent) of apartments in Germany, using data collected from ImmobilienScout24. 

## Input and preprocess the data
We want to predict the base rent of apartments in Germany, so we need to split the data into train, validation and test sets. 
1. Remove some columns to prevent data leakage
2. Remove columns irrelevant to machine learning

In [None]:
# Read the raw listings and index every row by its `scoutId` value.
filepath = "../input/apartment-rental-offers-in-germany/immo_data.csv" # input data
df = pd.read_csv(filepath).set_index("scoutId")

# Report the raw table size, then show per-column dtypes and non-null counts.
print('The raw data is ', df.shape[0], 'rows and', df.shape[1], 'columns.' )
df.info()

In [None]:
# Columns removed to prevent data leakage.
leakage_columns = ['totalRent', 'pricetrend', 'baseRentRange']

# Columns judged irrelevant or redundant for the prediction.
redundant_columns = [
    'geo_bln', 'houseNumber', 'street', 'streetPlain', 'description',
    'geo_krs', 'geo_plz', 'date', 'noRooms', 'newlyConst',
    'yearConstructed', 'livingSpace',
]

# Column with too many missing values.
sparse_columns = ['telekomHybridUploadSpeed']

X = df.drop(columns=leakage_columns + redundant_columns + sparse_columns)

# Explicit column order for the working frame; the target `baseRent` is kept
# last and is only separated from the features after cleaning.
cols = [
    # location
    'regio1', 'regio2', 'regio3',
    # apartment layout and amenities
    'typeOfFlat', 'livingSpaceRange', 'noRoomsRange', 'floor', 'numberOfFloors',
    'lift', 'balcony', 'garden', 'hasKitchen', 'cellar', 'noParkSpaces',
    # condition and quality
    'petsAllowed', 'condition', 'facilities', 'interiorQual',
    # running costs and energy
    'serviceCharge', 'heatingType', 'firingTypes', 'thermalChar', 'heatingCosts',
    'electricityBasePrice', 'electricityKwhPrice', 'energyEfficiencyClass',
    # telecom offers
    'telekomTvOffer', 'telekomUploadSpeed',
    # building age
    'yearConstructedRange', 'lastRefurbish',
    # listing metadata
    'picturecount',
    # target
    'baseRent',
]
X = X[cols]

X.head()

## Dealing with NA values

In [None]:
# Missing-value check: share of NA entries per column, as a percentage of all rows.
na = X.isna().sum().div(X.shape[0]).mul(100)
print('percentage of na values in each columns:')
# Displayed with the most incomplete columns first.
na.sort_values(ascending=False)

In [None]:
# Columns with too many missing values to be worth imputing.
X = X.drop(columns=[
    'electricityKwhPrice', 'electricityBasePrice', 'energyEfficiencyClass',
    'heatingCosts', 'facilities', 'lastRefurbish',
])

# Fixed per-column defaults for missing entries.
constant_defaults = {
    'noParkSpaces': 0,
    'petsAllowed': 'no',
    'interiorQual': 'normal',
    'condition': 'unknown',
    'yearConstructedRange': 5,
    'firingTypes': 'unknown',
    'heatingType': 'unknown',
    'typeOfFlat': 'unknown',
    'telekomUploadSpeed': 40,
    'telekomTvOffer': 'NONE',
}
X = X.fillna(value=constant_defaults)

# Where the number of floors in the building is missing, fall back to the
# apartment's own `floor` value from the raw frame. This must happen before
# `floor` itself is defaulted to 0 below.
X = X.assign(numberOfFloors=X.numberOfFloors.fillna(df.floor))

# Anything still missing in the two floor columns becomes 0; missing
# thermalChar values are replaced by the column mean over the whole data set.
thermalChar_mean = X.thermalChar.mean()
X = X.fillna(value={'floor': 0, 'numberOfFloors': 0, 'thermalChar': thermalChar_mean})

# Rows that still contain a missing value in any remaining column are discarded.
X = X.dropna()

X.info()

## Data cleaning


In [None]:
# Rows treated as outliers: a floor or building height above 30, or a base
# rent outside the 200-1300 range.
outlier_mask = (
    (X.floor > 30)
    | (X.numberOfFloors > 30)
    | (X.baseRent > 1300)
    | (X.baseRent < 200)
)
X = X.drop(X[outlier_mask].index)

# Store the lift flag as 0/1 integers.
X = X.astype({'lift': np.int64})

# Separate the prediction target from the feature columns.
y = X.baseRent
X = X.drop(columns=['baseRent'])

X.head()


## Data Visualization

In [None]:
# Histogram of the target together with its skewness and mean.
y.plot(kind='hist', title='Base rent per month', bins=20)
print('The skewness of y is:',  y.skew())
print('Mean of y is:', y.mean())

In [None]:
# Compare the distribution of the target under several candidate transforms.
# Every panel gets its own title; without it the four histograms all carry
# the same x-label and cannot be told apart.
fig, (ax1, ax2, ax3, ax4) = plt.subplots(1, 4, figsize=(40, 10), constrained_layout=True)
fig.suptitle('log(y) and sqrt(y)')

panel_transforms = (
    (ax1, 'log(y)', np.log),
    (ax2, 'log2(y)', np.log2),
    (ax3, 'log10(y)', np.log10),
    (ax4, 'sqrt(y)', np.sqrt),
)
for panel_ax, panel_label, transform in panel_transforms:
    sns.histplot(transform(y), kde=True, ax=panel_ax, bins=20)
    panel_ax.set_title(panel_label)
    panel_ax.set_xlabel(panel_label)


In [None]:
# Pie chart of listings per `regio1` value. After `count()` the `regio2`
# column holds the number of non-null `regio2` entries in each group, which
# is used as the wedge size.
counts_by_regio1 = X.groupby(by='regio1').count()
plot = counts_by_regio1.plot.pie(
    y='regio2',
    figsize=(10, 10),
    legend=False,
    label="",
    labeldistance=1.1,
    title='Apartments in all Bundesländer',
)


In [None]:

# Pie chart of listings per flat type; wedge sizes are the per-group counts
# of the `regio2` column.
# NOTE(review): BASE_COLORS is a dict that includes white ('w'), so one wedge
# may be hard to see on a white background - confirm this palette is intended.
counts_by_flat_type = X.groupby(by='typeOfFlat').count()
plot = counts_by_flat_type.plot.pie(
    y='regio2',
    figsize=(10, 10),
    legend=False,
    label="",
    labeldistance=1.1,
    title='Types of Flats',
    colors=mcolors.BASE_COLORS,
)


In [None]:
# Number of apartments in each living-space range.
# The x positions come from the group keys themselves rather than a
# hard-coded 1..7 range, so the plot stays correct if a range is absent from
# the cleaned data. The y axis is labelled with what it shows instead of the
# name of an arbitrary counted column.
living_space_counts = X.groupby(by='livingSpaceRange').size()
living_space_ax = sns.barplot(x=living_space_counts.index, y=living_space_counts.to_numpy())
living_space_ax.set_title("Living Space Range")
living_space_ax.set_xlabel("livingSpaceRange")
living_space_ax.set_ylabel("Number of apartments");


## Modeling

In [None]:
# firingTypes has too many distinct categories to encode, so it is removed
# first; that way none of the column lists below refers to a column that no
# longer exists in X.
X = X.drop(columns = ['firingTypes'])

# Group the remaining columns by dtype.
bool_cols = X.select_dtypes("bool").columns.tolist()
cat_cols = X.select_dtypes("object").columns.tolist()
num_cols = X.select_dtypes("number").columns.tolist()

# Numeric columns listed explicitly by the author; every other numeric column
# ends up in num_cat_cols.
bivariate_cols = ['floor', 'numberOfFloors', 'noParkSpaces', 'serviceCharge', 'thermalChar', 'picturecount']
num_cat_cols = [c for c in num_cols if c not in bivariate_cols]
print(num_cols)

# Categories of the columns that still need encoding, shown together so that
# every list is visible (not only the last expression of the cell).
# Planned encodings:
#   typeOfFlat     - one-hot
#   petsAllowed    - ordinal
#   condition      - ordinal
#   interiorQual   - one-hot or ordinal
#   heatingType    - one-hot
#   telekomTvOffer - ordinal
columns_to_encode = ['typeOfFlat', 'petsAllowed', 'condition', 'interiorQual', 'heatingType', 'telekomTvOffer']
{column: X[column].unique().tolist() for column in columns_to_encode}

### Ordinal encoding

In [None]:
# Boolean amenity flags become 0/1 integers.
flag_columns = ['balcony', 'garden', 'hasKitchen', 'cellar']
X = X.astype({column: np.int64 for column in flag_columns})

# Hand-assigned numeric scores for the ordered categorical columns.
pets_dict = {'no': 0, 'negotiable': 0.5, 'yes' : 1}

condition_dict = {
    'ripe_for_demolition': -2,
    'need_of_renovation': -1,
    'unknown': 0,
    'negotiable': 0.5,
    'modernized': 1,
    'first_time_use_after_refurbishment': 2,
    'mint_condition': 2,
    'fully_renovated': 3,
    'first_time_use': 4,
    'refurbished': 1.5,
    'well_kept': 2.5,
}

tv_dict = {'NONE': 0, 'ON_DEMAND': 0.5, 'ONE_YEAR_FREE' : 1}

# Replace each category by its score; a category missing from its dictionary
# is mapped to NaN by Series.map.
ordinal_mappings = {
    'petsAllowed': pets_dict,
    'condition': condition_dict,
    'telekomTvOffer': tv_dict,
}
for column, mapping in ordinal_mappings.items():
    X[column] = X[column].map(mapping)


### One-hot encoding

In [None]:
# Split into train (60%), validation (20%) and test (20%) sets.
# Step 1: hold out 40% of the rows for validation and test together.
X_train, X_valid_test, y_train, y_valid_test = train_test_split(
    X, y,
    train_size=0.6, test_size=0.4,
    random_state=0,
)

# Step 2: cut the held-out part in half. The first half returned by the
# splitter is used as the test set, the second as the validation set.
X_test, X_valid, y_test, y_valid = train_test_split(
    X_valid_test, y_valid_test,
    train_size=0.5, test_size=0.5,
    random_state=0,
)

print('shapes of train, validation and test sets: ',X_train.shape, X_valid.shape, X_test.shape)

In [None]:
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor

# Both baselines are seeded so that a fresh "Restart & Run All" reproduces the
# same scores; an unseeded tree can break ties between equally good splits
# differently from run to run.
tree_model = DecisionTreeRegressor(random_state=1)
forest_model = RandomForestRegressor(random_state=1)

# First baseline: only the columns that were numeric when the column groups
# were built.
features = num_cat_cols + bivariate_cols 

tree_model.fit(X_train[features],y_train)
forest_model.fit(X_train[features],y_train)

In [None]:
from sklearn.metrics import mean_absolute_error

# Mean absolute error of the decision tree on the validation set ...
tree_predicted_y_valid_features = tree_model.predict(X_valid[features])
tree_valid_mae = mean_absolute_error(y_valid, tree_predicted_y_valid_features)
print(tree_valid_mae)

# ... and on the test set.
tree_predicted_y_test_features = tree_model.predict(X_test[features])
tree_test_mae = mean_absolute_error(y_test, tree_predicted_y_test_features)
print(tree_test_mae)

In [None]:
# Mean absolute error of the random forest on the validation set ...
forest_predicted_y_valid_features = forest_model.predict(X_valid[features])
forest_valid_mae = mean_absolute_error(y_valid, forest_predicted_y_valid_features)
print(forest_valid_mae)

# ... and on the test set.
forest_predicted_y_test_features = forest_model.predict(X_test[features])
forest_test_mae = mean_absolute_error(y_test, forest_predicted_y_test_features)
print(forest_test_mae)

In [None]:
OH_cols = ['regio1', 'typeOfFlat' ]

from sklearn.preprocessing import OneHotEncoder

# Fit the encoder ONCE, on the training rows only. Re-fitting it on the
# validation rows would build the indicator columns from whatever categories
# happen to occur there, so the validation matrix could have a different
# number or order of columns than the matrix the models were trained on.
# The encoder is left at its default (sparse) output and densified with
# .toarray(), which avoids the `sparse` / `sparse_output` keyword that
# differs between scikit-learn versions.
OH_encoder = OneHotEncoder(handle_unknown='ignore')
OH_encoder.fit(X_train[OH_cols])

# String column names of the form "<column>_<category>". Integer names would
# be mixed with the string feature names after concatenation, which recent
# scikit-learn versions reject.
OH_feature_names = [
    column + '_' + str(category)
    for column, categories in zip(OH_cols, OH_encoder.categories_)
    for category in categories
]


def _one_hot_frame(frame):
    """Return the one-hot indicators of `frame[OH_cols]` as a DataFrame.

    Uses the encoder fitted on the training data; categories not seen during
    fitting produce all-zero indicators (handle_unknown='ignore'). The result
    keeps the index of `frame` so it can be concatenated column-wise.
    """
    indicators = OH_encoder.transform(frame[OH_cols]).toarray()
    return pd.DataFrame(indicators, columns=OH_feature_names, index=frame.index)


OHed_cols_train = _one_hot_frame(X_train)
OHed_cols_valid = _one_hot_frame(X_valid)

OH_X_train = pd.concat([X_train[features], OHed_cols_train], axis=1)





In [None]:
# Extended feature list: the numeric columns, the 0/1 amenity flags and the
# three ordinally encoded columns.
features = [
    *num_cat_cols,
    *bivariate_cols,
    *bool_cols,
    'petsAllowed', 'condition', 'telekomTvOffer',
]

# Append the one-hot indicator columns to the selected features, row by row.
labeled_X_train = pd.concat([X_train[features], OHed_cols_train], axis=1)
labeled_X_valid = pd.concat([X_valid[features], OHed_cols_valid], axis=1)


In [None]:
# Refit the decision tree on the extended feature matrix and report its
# validation mean absolute error.
tree_model.fit(labeled_X_train, y_train)
tree_predicted_y_valid_features = tree_model.predict(labeled_X_valid)
tree_labeled_valid_mae = mean_absolute_error(y_valid, tree_predicted_y_valid_features)
print(tree_labeled_valid_mae)

# Same for the random forest.
forest_model.fit(labeled_X_train, y_train)
forest_predicted_y_valid_features = forest_model.predict(labeled_X_valid)
forest_labeled_valid_mae = mean_absolute_error(y_valid, forest_predicted_y_valid_features)
print(forest_labeled_valid_mae)



In [None]:
from xgboost import XGBRegressor

# Gradient-boosted trees with default hyper-parameters, trained on the
# extended feature matrix.
XGB_model = XGBRegressor()
XGB_model.fit(labeled_X_train, y_train)

# Validation mean absolute error.
XGB_predicted_y_valid_features = XGB_model.predict(labeled_X_valid)
XGB_valid_mae = mean_absolute_error(y_valid, XGB_predicted_y_valid_features)
print(XGB_valid_mae)

