In [1]:
import sklearn
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import time 
import itertools
from sklearn.preprocessing import StandardScaler
from sklearn.impute import KNNImputer
import xgboost as xgb
import math 
from sklearn import metrics
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from geopy.distance import great_circle

In [2]:
def drop_virgule(x):
    """Return the part of ``x`` before the first '.', e.g. ``'123.0'`` -> ``'123'``.

    Parameters
    ----------
    x : object
        Usually a string identifier; anything else (NaN, None, numbers, ...)
        is passed through untouched.

    Returns
    -------
    object
        The truncated string for string input, otherwise ``x`` itself.
    """
    # Explicit type check instead of a bare ``except``: a bare except would
    # also swallow KeyboardInterrupt / SystemExit and hide unrelated errors.
    if isinstance(x, str):
        return x.split('.')[0]
    return x

def distance_to_mid_center(lat, lon):
    """Great-circle distance in kilometres from (lat, lon) to the Berlin reference point.

    The reference point is hard-coded as latitude 52.5200, longitude 13.4050.
    """
    reference_point = (52.5200, 13.4050)
    return great_circle(reference_point, (lat, lon)).km


def processing(df):
    """Run the first cleaning pass on the raw listings frame.

    Steps, in order:
      1. drop the columns listed in the notebook-level ``columns_to_drop``
         (it must be defined before this function is called);
      2. drop rows with a missing ``Price``;
      3. keep only the part before the first '.' in the ID / postal-code columns;
      4. parse the three date columns with ``pd.to_datetime``;
      5. replace the ``'*'`` placeholder with NaN;
      6. turn ``Host Response Rate`` strings such as ``'98%'`` into ``0.98``;
      7. cast the count-like columns to float;
      8. lower-case the column names and replace spaces with underscores;
      9. add ``distance_to_midcenter`` (km) computed from ``latitude`` / ``longitude``.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw listings using the original column names (e.g. ``'Listing ID'``).

    Returns
    -------
    pandas.DataFrame
        The cleaned frame. The frame passed in is not modified.
    """
    # Build a new frame instead of mutating the caller's one, so re-running
    # the calling cell on the same raw frame gives the same result.
    # The explicit copy avoids chained-assignment warnings on the column
    # assignments below.
    df = df.drop(columns=columns_to_drop)          # specified irrelevant columns
    df = df.dropna(subset=['Price']).copy()        # rows without a price are unusable

    # Drop the '.0' suffix in IDs and postal code
    for col in ['Listing ID', 'Host ID', 'Postal Code']:
        df[col] = df[col].apply(drop_virgule)

    # Convert values to datetime
    for col in ['Host Since', 'First Review', 'Last Review']:
        df[col] = pd.to_datetime(df[col])

    # Put NaN where the data uses '*' as a missing-value marker.
    # The result is assigned back: `df[col].replace(..., inplace=True)` acts on
    # a temporary Series and does not update the frame under Copy-on-Write.
    star_columns = ['Accomodates', 'Bathrooms', 'Bedrooms', 'Beds',
                    'Guests Included', 'Min Nights', 'Postal Code',
                    'neighbourhood', 'Property Type', 'Host Response Rate']
    for col in star_columns:
        df[col] = df[col].replace('*', np.nan)

    # From '98%' to 0.98; missing values (including the literal string 'nan') stay NaN
    response_rate = df['Host Response Rate'].replace('nan', np.nan)
    df['Host Response Rate'] = response_rate.str.split('%').str[0].astype(float) / 100

    # Convert values to float
    for col in ['Accomodates', 'Bathrooms', 'Bedrooms', 'Beds',
                'Guests Included', 'Min Nights']:
        df[col] = df[col].astype('float')

    # Rename the columns with '_' instead of ' '
    df.columns = df.columns.str.lower().str.replace(' ', '_')

    # Distance from the centre of Berlin, computed pairwise from the two
    # coordinate columns (avoids building a full row object per listing).
    df['distance_to_midcenter'] = [
        distance_to_mid_center(lat, lon)
        for lat, lon in zip(df['latitude'], df['longitude'])
    ]
    return df

def processing_2(df):
    """Standardise, one-hot encode and impute ``df``; return an all-numeric frame.

    * Every non-object column is standardised with ``StandardScaler``. This
      applies to all such columns present in ``df``, whatever their role.
    * Every object column is expanded with ``pd.get_dummies`` (column name as
      prefix). Each new dummy block is placed in front of the previous ones.
    * Both parts get a fresh 0..n-1 index and are joined side by side.
    * Remaining NaNs are filled with ``KNNImputer(n_neighbors=5)``.
    """
    num_cols = df.select_dtypes(exclude=object).columns   # numeric column names
    cat_cols = df.select_dtypes(include=object).columns   # categorical column names

    # Standardisation of the numeric columns
    numeric_part = pd.DataFrame(
        StandardScaler().fit_transform(df[num_cols]),
        columns=num_cols,
    )

    # One-hot encoding of the categorical columns, newest block first
    dummy_part = pd.DataFrame()
    for col in cat_cols:
        dummy_part = pd.concat(
            [pd.get_dummies(df[col], prefix=col), dummy_part],
            axis=1,
        )

    # Join numeric and categorical parts row by row (positional alignment)
    numeric_part = numeric_part.reset_index(drop=True)
    dummy_part = dummy_part.reset_index(drop=True)
    combined = pd.concat([numeric_part, dummy_part], axis=1)

    # KNN imputation; n_neighbors=5 was left as an open question by the author
    filled = KNNImputer(n_neighbors=5).fit_transform(combined)
    return pd.DataFrame(filled, columns=combined.columns)


In [None]:
# Load the raw training file
path = 'train_airbnb_berlin.csv'
my_sep, my_encoding = ',', 'utf-8'
# Read by processing() as a notebook-level global
columns_to_drop = ['Listing Name', 'Host Name', 'City', 'Country Code', 'Country']
# Read the identifier-like columns as text
types = {'Listing ID': 'str', 'Host ID': 'str', 'Postal Code': 'str'}
data = pd.read_csv(path, sep=my_sep, encoding=my_encoding, dtype=types)

# processing 1: drop the irrelevant columns / replace '*' values by nan / standardisation of columns name
df = processing(data)

# Columns not used as model features (each label listed once).
# Non in-place drop: `data` is left as processing() returned it.
unused_columns = ['host_since', 'first_review', 'last_review', 'square_feet',
                  'business_travel_ready', 'host_id', 'listing_id',
                  'neighborhood_group']
df = df.drop(columns=unused_columns)

# dropping outliers: listings with a price above 300
df = df[~(df['price'] > 300)]

# dropping rows with more than 3 missing values
# (threshold chosen by the author from earlier data visualisation)
df = df[df.isnull().sum(axis=1) < 4]

# processing 2: standardisation / one-hot-encoder / KNN imputer
df = processing_2(df)
print(df.shape)
df.head()

In [None]:
from sklearn.model_selection import train_test_split

# Target is the 'price' column as returned by processing_2 (which scales every
# non-object column); the features are all remaining columns.
y = df['price'].copy()
X = df.drop(columns=['price'])

# 80/20 hold-out split with a fixed seed
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [None]:
import xgboost as xgb

# Hyper-parameters of the boosted-tree regressor
xgb_params = dict(
    n_estimators=150,
    max_depth=3,
    gamma=1,
    reg_lambda=2,
)
regressor = xgb.XGBRegressor(**xgb_params)
regressor.fit(X_train, y_train)

In [None]:
import math
from sklearn import metrics

# Score the fitted XGBoost model on the held-out rows
y_pred = regressor.predict(X_test)
xgb_mse = metrics.mean_squared_error(y_test, y_pred)
xgb_r2 = metrics.r2_score(y_test, y_pred)
print('rmse ', math.sqrt(xgb_mse))
print('r2   ', xgb_r2)

In [None]:
from sklearn.linear_model import LinearRegression

# NOTE: this draws a new 70/30 split and rebinds X_train / X_test / y_train /
# y_test, replacing the 80/20 split the XGBoost cells used. The two models are
# therefore not scored on the same test rows, and later cells see this split.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42
)

# Ordinary least-squares baseline
reg = LinearRegression().fit(X_train, y_train)

y_pred = reg.predict(X_test)
print(y_pred)

mae = metrics.mean_absolute_error(y_test, y_pred)
mse = metrics.mean_squared_error(y_test, y_pred)
r2 = metrics.r2_score(y_test, y_pred)
rmse = math.sqrt(mse)
print ('mse = {}, rmse = {} \nmae = {} r2 = {}'.format(mse, rmse, mae, r2))

In [None]:
from lazypredict.Supervised import LazyRegressor

# `reg` is rebound here from the LinearRegression model to a LazyRegressor.
# It is fitted on whichever split X_train / X_test currently hold.
reg = LazyRegressor(
    verbose=0,
    ignore_warnings=False,
    custom_metric=None,
)
models, predictions = reg.fit(X_train, X_test, y_train, y_test)