In [1]:
import sklearn
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import time 
import itertools
from sklearn.preprocessing import StandardScaler
from sklearn.impute import KNNImputer
import xgboost as xgb
import math 
from sklearn import metrics
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split

In [2]:
def drop_virgule(x):
    """Return the part of ``x`` that precedes the first ``'.'``.

    Used to strip the trailing ``'.0'`` from identifiers that were read as
    strings (``'12345.0'`` -> ``'12345'``).

    Parameters
    ----------
    x : object
        Usually a ``str``. Anything that cannot be split on a ``str``
        separator (``NaN``, ``None``, numbers, ``bytes``) is tolerated.

    Returns
    -------
    object
        ``x.split('.')[0]`` when that call succeeds, otherwise ``x`` itself,
        unchanged.
    """
    try:
        return x.split('.')[0]
    except (AttributeError, TypeError):
        # Non-string input: hand it back untouched. Deliberately narrower than
        # a bare ``except`` so KeyboardInterrupt / SystemExit and genuine bugs
        # still propagate instead of being silently swallowed.
        return x

def processing(df, drop_columns=None):
    """First cleaning pass over the raw listings table.

    Steps, in order:

    1. drop the irrelevant columns and the rows that have no ``Price``;
    2. strip the ``'.0'`` suffix from ``Listing ID``, ``Host ID`` and
       ``Postal Code`` (via ``drop_virgule``);
    3. parse ``Host Since``, ``First Review`` and ``Last Review`` as datetimes;
    4. replace the ``'*'`` placeholder with ``NaN``;
    5. convert ``Host Response Rate`` from ``'98%'`` to ``0.98``;
    6. cast the count-like columns to float;
    7. rename every column to lower case with ``'_'`` instead of ``' '``.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw table. It is not modified: all work happens on a new frame, so
        the function can be called again on the same input.
    drop_columns : list of str, optional
        Columns to discard in step 1. When omitted, the notebook-level
        ``columns_to_drop`` list is used (the original behaviour).

    Returns
    -------
    pandas.DataFrame
        The cleaned table with renamed columns.
    """
    if drop_columns is None:
        # Backward-compatible default: the list defined at notebook level.
        drop_columns = columns_to_drop

    # Build a fresh frame instead of dropping in place; the trailing copy()
    # guarantees the column assignments below never write into a view.
    df = df.drop(columns=drop_columns).dropna(subset=['Price']).copy()

    # Drop the '.0' in IDs and Postal Code
    for col in ('Listing ID', 'Host ID', 'Postal Code'):
        df[col] = df[col].apply(drop_virgule)

    # Convert values to datetime
    for col in ('Host Since', 'First Review', 'Last Review'):
        df[col] = pd.to_datetime(df[col])

    # Put NaN where the source marks a missing value with '*'.
    # Assign the result back rather than using ``df[col].replace(...,
    # inplace=True)``: that chained in-place form operates on a temporary
    # Series and has no effect on ``df`` under pandas Copy-on-Write.
    placeholder_columns = (
        'Accomodates', 'Bathrooms', 'Bedrooms', 'Beds', 'Guests Included',
        'Min Nights', 'Postal Code', 'neighbourhood', 'Property Type',
        'Host Response Rate',
    )
    for col in placeholder_columns:
        df[col] = df[col].replace('*', np.nan)

    # From '98%' to 0.98; the literal string 'nan' is treated as missing too.
    rate = df['Host Response Rate'].replace('nan', np.nan)
    df['Host Response Rate'] = rate.str.split('%').str[0].astype(float) / 100

    # Convert values to float (NaN cannot live in an integer column)
    for col in ('Accomodates', 'Bathrooms', 'Bedrooms', 'Beds',
                'Guests Included', 'Min Nights'):
        df[col] = df[col].astype('float')

    # Rename the columns with '_' instead of ' '
    df.columns = df.columns.str.lower().str.replace(' ', '_')

    return df

def processing_2(df, n_neighbors=5):
    """Second pass: standardise, one-hot encode and impute missing values.

    * Every column whose dtype is not ``object`` is standardised with
      ``StandardScaler``.
    * Every ``object`` column is one-hot encoded with ``pd.get_dummies``
      (prefix = column name). The dummy groups are laid out in reverse
      column order, last categorical column first.
    * Remaining missing values are filled with ``KNNImputer``.

    NOTE(review): the scaler and the imputer are fitted on every row and every
    non-object column passed in. If ``df`` still holds the prediction target
    or rows later used for testing, their statistics leak into the features -
    confirm this is intended.
    NOTE(review): non-object columns presumably must be numeric here (e.g.
    date columns dropped beforehand) - TODO confirm against the caller.

    Parameters
    ----------
    df : pandas.DataFrame
        Cleaned table; it is not modified.
    n_neighbors : int, default 5
        Number of neighbours used by ``KNNImputer``.

    Returns
    -------
    pandas.DataFrame
        Scaled numeric columns followed by the dummy columns, with a fresh
        0..n-1 index.
    """
    numerical_columns = df.select_dtypes(exclude=object).columns    # numeric column names
    categorical_columns = df.select_dtypes(include=object).columns  # categorical column names

    # Standardisation of the numeric columns. Building the frame from the bare
    # array gives it a fresh 0..n-1 index.
    scaler = StandardScaler()
    df_scaled = pd.DataFrame(
        scaler.fit_transform(df[numerical_columns]),
        columns=numerical_columns,
    )

    # One-hot encoding: build every dummy block first and concatenate once
    # (instead of growing a frame inside the loop). Reversed so the column
    # layout matches what prepending each block used to produce.
    dummy_blocks = [
        pd.get_dummies(df[cat], prefix=cat)
        for cat in list(categorical_columns)[::-1]
    ]
    df_encoded = pd.concat(dummy_blocks, axis=1) if dummy_blocks else pd.DataFrame()

    # Align positionally with df_scaled before joining side by side.
    df_encoded = df_encoded.reset_index(drop=True)
    df_final = pd.concat([df_scaled, df_encoded], axis=1)

    # KNN imputation of the remaining missing values
    imputer = KNNImputer(n_neighbors=n_neighbors)
    return pd.DataFrame(imputer.fit_transform(df_final), columns=df_final.columns)



In [3]:
path = 'train_airbnb_berlin.csv'
my_sep, my_encoding = ',', 'utf-8'
# Columns treated as irrelevant; processing() drops them.
columns_to_drop = ['Listing Name', 'Host Name', 'City', 'Country Code', 'Country']
# Read these as strings; processing() then strips their trailing '.0'.
types = {'Listing ID': 'str', 'Host ID': 'str', 'Postal Code': 'str'}
data = pd.read_csv(path, sep=my_sep, encoding=my_encoding, dtype=types)

# Processing 1: drop the irrelevant columns / replace '*' values by NaN /
# standardise the column names
df = processing(data)

# Columns not used as model features. Each label is listed once (the list
# used to name the three date columns twice).
unused_columns = ['host_since', 'last_review', 'first_review', 'square_feet',
                  'business_travel_ready', 'host_id', 'listing_id',
                  'neighborhood_group']
# Re-bind instead of dropping in place, so the frame returned by processing()
# is not mutated behind the `data` name.
df = df.drop(columns=unused_columns)

# Dropping outliers: listings priced above PRICE_CAP
PRICE_CAP = 300
df = df[~(df['price'] > PRICE_CAP)]

# Dropping rows that have more than MAX_MISSING_PER_ROW NaN values
# (from the dataviz we saw that it was the best option)
MAX_MISSING_PER_ROW = 3
df = df[df.isnull().sum(axis=1) <= MAX_MISSING_PER_ROW]

# Processing 2: standardisation / one-hot encoding / KNN imputation
df = processing_2(df)
print(df.shape)
df

(12622, 227)


Unnamed: 0,host_response_rate,latitude,longitude,accomodates,bathrooms,bedrooms,beds,guests_included,min_nights,reviews,...,neighbourhood_Wilhelmstadt,neighbourhood_Wilmersdorf,neighbourhood_Wittenau,neighbourhood_Zehlendorf,is_superhost_f,is_superhost_t,host_response_time_a few days or more,host_response_time_within a day,host_response_time_within a few hours,host_response_time_within an hour
0,0.394467,1.175537,0.172116,-0.471371,-0.314926,-0.238267,-0.561690,-0.472320,-0.238624,-0.421896,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
1,-0.505893,1.776806,0.245246,-0.471371,-0.314926,1.404082,0.360710,0.913122,0.132964,-0.421896,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
2,0.281922,1.204307,0.298446,0.245177,-0.314926,-0.238267,0.360710,-0.472320,-0.312942,-0.515852,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
3,0.203140,-0.018597,0.752990,-0.471371,-0.314926,-0.238267,-0.561690,-0.472320,-0.238624,-0.468874,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
4,0.394467,-2.000847,-0.033061,0.245177,-0.314926,-0.238267,0.360710,0.913122,-0.312942,-0.327940,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12617,-0.393348,0.563600,-1.223574,-1.187919,-0.314926,-0.238267,-0.561690,-0.472320,-0.015672,-0.492363,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
12618,-0.730982,0.524485,-0.190756,-0.471371,-0.314926,-0.238267,-0.561690,-0.472320,-0.312942,-0.539341,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
12619,-3.375787,-0.646374,-0.269950,-0.471371,1.490561,-0.238267,-0.561690,-0.472320,0.058646,-0.304451,...,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0
12620,-0.359584,-1.108641,0.790767,3.827915,3.296048,3.046432,3.127911,-0.472320,-0.312942,-0.468874,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0


In [4]:
# Separate the target from the feature matrix.
# NOTE(review): `price` went through processing_2 with the other non-object
# columns, so `y` is presumably in standardised units, not currency - confirm.
y = df['price'].copy()
X = df.drop(columns=['price'])

# Hold out 20 % of the rows; the fixed seed keeps the split reproducible.
# (train_test_split is already imported in the first cell.)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

In [5]:
# Gradient-boosted trees with shallow trees and mild regularisation.
# (`xgb` is already imported in the first cell.)
xgb_params = dict(n_estimators=150, max_depth=3, gamma=1, reg_lambda=2)
regressor = xgb.XGBRegressor(**xgb_params)
regressor.fit(X_train, y_train)

XGBRegressor(base_score=0.5, booster='gbtree', callbacks=None,
             colsample_bylevel=1, colsample_bynode=1, colsample_bytree=1,
             early_stopping_rounds=None, enable_categorical=False,
             eval_metric=None, gamma=1, gpu_id=-1, grow_policy='depthwise',
             importance_type=None, interaction_constraints='',
             learning_rate=0.300000012, max_bin=256, max_cat_to_onehot=4,
             max_delta_step=0, max_depth=3, max_leaves=0, min_child_weight=1,
             missing=nan, monotone_constraints='()', n_estimators=150, n_jobs=0,
             num_parallel_tree=1, predictor='auto', random_state=0, reg_alpha=0,
             reg_lambda=2, ...)

In [6]:
# Score the boosted model on the held-out rows.
# (`math` and `metrics` are already imported in the first cell.)
y_pred = regressor.predict(X_test)
xgb_mse = metrics.mean_squared_error(y_test, y_pred)
xgb_r2 = metrics.r2_score(y_test, y_pred)
print('rmse ', math.sqrt(xgb_mse))
print('r2   ', xgb_r2)

rmse  0.6958388187664527
r2    0.5248109522329976


In [7]:
# Baseline: ordinary least-squares regression on the same features.
# (LinearRegression, train_test_split, metrics and math are already imported
# in the first cell.)
# NOTE(review): this re-splits with test_size=0.3 and rebinds X_train / X_test /
# y_train / y_test / y_pred. The scores below are therefore computed on a
# different hold-out set than the XGBoost ones, and re-running the XGBoost
# evaluation cell after this one would score it on the new X_test - confirm
# whether that is intended.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42
)
reg = LinearRegression().fit(X_train, y_train)

y_pred = reg.predict(X_test)
print(y_pred)

mse = metrics.mean_squared_error(y_test, y_pred)
mae = metrics.mean_absolute_error(y_test, y_pred)
r2 = metrics.r2_score(y_test, y_pred)
rmse = math.sqrt(mse)
print ('mse = {}, rmse = {} \nmae = {} r2 = {}'.format(mse, rmse, mae, r2))

[-0.7756958   2.10760498 -1.14654541 ... -0.68218994 -0.38592529
  1.34729004]
mse = 0.5175430189036458, rmse = 0.7194046280805023 
mae = 0.4838898947043751 r2 = 0.4926788294601854
