In [57]:
import sklearn
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import time 
import itertools
from sklearn.preprocessing import MinMaxScaler
from sklearn.impute import KNNImputer
import xgboost as xgb
import math 
import datetime as dt
from sklearn import metrics
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from geopy.distance import great_circle
from math import sin, cos, sqrt, atan2
import statsmodels.api as sm
from sklearn import metrics
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestRegressor


In [48]:
def drop_virgule(x):
    try:return x.split('.')[0]
    except:return x

def distance_to_mid_center(lat, lon):
    berlin_center = (52.5200, 13.4050)
    accommodation = (lat, lon)
    return great_circle(berlin_center, accommodation).km


def processing(df):
    df.drop(columns=columns_to_drop,inplace=True) #specified irrelevant columns 
    
    df.dropna(subset=['Price'], inplace=True)    #drop price missing (9rows)
    
    df['Listing ID']=df['Listing ID'].apply(drop_virgule)              # Drop the '.0' in IDs and Postal Code
    df['Host ID']=df['Host ID'].apply(drop_virgule)
    df['Postal Code']=df['Postal Code'].apply(drop_virgule)
    
    df['Host Since']=pd.to_datetime(df['Host Since'])                  # Convert values to datetime
    df['First Review']=pd.to_datetime(df['First Review'])
    df['Last Review']=pd.to_datetime(df['Last Review'])
    df['Host Since']=df['Host Since'].map(dt.datetime.toordinal)
    df['First Review']=df['First Review'].map(dt.datetime.toordinal)
    df['Last Review']=df['Last Review'].map(dt.datetime.toordinal)
    
    df['Accomodates'].replace('*', np.nan,inplace=True)                 # Put NaN in missing values where we have *
    df['Bathrooms'].replace('*', np.nan,inplace=True)
    df['Bedrooms'].replace('*', np.nan,inplace=True)
    df['Beds'].replace('*', np.nan,inplace=True)
    df['Guests Included'].replace('*', np.nan,inplace=True)
    df['Min Nights'].replace('*', np.nan,inplace=True)
    df['Postal Code'].replace('*', np.nan,inplace=True)
    df['neighbourhood'].replace('*', np.nan,inplace=True)
    df['Property Type'].replace('*', np.nan,inplace=True)
    df['Host Response Rate'].replace('*', np.nan,inplace=True)

    #from 98% to 0.98
    df['Host Response Rate'].replace('nan', np.nan,inplace=True)
    df['Host Response Rate']=df[~df['Host Response Rate'].isnull()]["Host Response Rate"].str.split('%').apply(lambda x:float(x[0])/100)

    df['Accomodates'] = df['Accomodates'].astype('float')              # Convert values to float
    df['Bathrooms'] = df['Bathrooms'].astype('float')
    df['Bedrooms'] = df['Bedrooms'].astype('float')
    df['Beds'] = df['Beds'].astype('float')
    df['Guests Included'] = df['Guests Included'].astype('float')
    df['Min Nights'] = df['Min Nights'].astype('float')    
    
    df.columns = df.columns.str.lower().str.replace(' ','_')           # Rename the columns with '_' instead of ' '

    #distance from center of berlin
    df['distance_to_midcenter'] = df.apply(lambda x: distance_to_mid_center(x.latitude, x.longitude), axis=1)
    for lieu in centres_berlins:                                       # Add distances 
        df = add_distance_feature(df, lieu)

    return df

def processing_2(df):
    
    numerical_columns = df.select_dtypes(exclude=object).columns   # numeric columns names
    categorical_columns = df.select_dtypes(include=object).columns # categorical columns names
    
    df_cleaned_num = df[numerical_columns]    # dataframe with numeric columns only
    df_cleaned_cat = df[categorical_columns]  # dataframe with categorical columns only
    
    #standardisation for numeric columns
    #scalerx = MinMaxScaler() 
    #df_scaled=pd.DataFrame(scalerx.fit_transform(df_cleaned_num), columns = numerical_columns)
    
    #one hot encoder for categorical columns
    df_encoded=pd.DataFrame() 
    for cat in categorical_columns:
        df_temp = pd.get_dummies(df_cleaned_cat[cat], prefix=cat)
        df_encoded=pd.concat([df_temp, df_encoded], axis=1)
    
    #concatenation of numeric and categorical dataframes
    df_cleaned_num.reset_index(drop=True, inplace=True)
    df_encoded.reset_index(drop=True, inplace=True)
    df_final=pd.concat([df_cleaned_num, df_encoded], axis=1) 
    
    #KNN imputer (n=5 ????)
    imputer = KNNImputer(n_neighbors=5)
    df_final = pd.DataFrame(imputer.fit_transform(df_final),columns = df_final.columns)
    
    return df_final 


In [55]:
def processSubset(feature_set):
    # Fit OLS (Ordinary Least Squares) model on feature_set and calculate RSS
    model = sm.OLS(y,X[list(feature_set)])
    regr = model.fit()
    RSS = ((regr.predict(X[list(feature_set)]) - y) ** 2).sum()
    return {"model":regr, "RSS":RSS}

## For Forward step wise selection approach
def forward(features):

    # Pull out features we still need to process
    remaining_features = [d for d in X.columns if d not in features]
    
    tic = time.time()
    
    results = []
    
    for d in remaining_features:
        results.append(processSubset(features+[d]))
    
    # Wrap everything up in a nice dataframe
    models = pd.DataFrame(results)
    
    # Choose the model with the highest RSS
    best_model = models.loc[models['RSS'].argmin()]
    
    toc = time.time()
    print("Processed ", models.shape[0], "models on", len(features)+1, "features in", (toc-tic), "seconds.")
    
    # Return the best model, along with some other useful information about the model
    return best_model

In [43]:

#liste des lieux emblématiques de Berlin [nom,lat,long]
centres_berlins = [["charlottenburg", 52.516602, 13.304105],
                   ["kreuzberg", 52.498605, 13.391799],
                   ["wedding", 52.561559, 13.35002],
                   ["mitte", 52.531677, 13.381777],
                   ["mariendorf", 52.4333316, 13.3833318],
                   ["tegel", 52.558833, 13.288437], 
                   ["tempelhof", 52.472160, 13.370287],
                   ["spandau", 52.534080, 13.181716],
                   ["schöneberg", 52.497161, 13.346865],
                   ["wilmersdorf",  52.48333, 13.31667],
                   ["biesdorf", 52.508429, 13.563317],
                   ["moabit", 52.530832, 13.345876],
                   ["britz", 52.45, 13.433333],
                   ["neukölln", 52.440771, 13.444507],
                   ["dahlem", 52.466562, 13.300082], 
                   ["tiergarten", 52.51449, 13.350091],
                   ["hellersdorf", 52.536107, 13.604973],
                   ["prenzlauer_berg", 52.550113, 13.423125], 
                   ["friedrichshain", 52.515816, 13.454293],
                   ["reinickendorf", 52.566667, 13.333333],
                   ["friedrichsfelde", 52.503664652, 13.507664636],
                   ["friedenau", 52.47133, 13.32813],
                   ["gesundbrunnen", 52.548611, 13.390278],
                   ["charlottenbourg_nord", 52.53048, 13.29371],
                   ["hansaviertel", 52.5166646, 13.33666532],
                   ["haselhorst", 52.54409, 13.23743],
                   ["gropiusstadt", 52.425, 13.46667],
                   ["westend", 52.5166646, 13.2833322],
                   ["wittenau", 52.592455, 13.329694],
                   ["zehlendorf", 52.435077, 13.260425],
                   ["lichtenberg", 52.534306, 13.502326],
                   ["pankow", 52.592879, 13.431700],
                   ["steglitz", 52.453096, 13.331171],
                   ["siemenstadt", 52.537664516, 13.257832302],
                   ["rudow", 52.402310, 13.509220],
                   ["kaulsdorf", 52.506512, 13.593946]]

In [44]:
def add_distance_feature(df, lieu):
    name_column = "distance_" + str(lieu[0])
    df[name_column] = df.apply(lambda x : distance(x['latitude'], x['longitude'], lieu[1], lieu[2]), axis = 1)
    return df
def distance(lat1,lon1,lat2,lon2):
    R = 6373.0
    dlon = lon2 - lon1
    dlat = lat2 - lat1
    a = (sin(dlat/2))**2 + cos(lat1) * cos(lat2) * (sin(dlon/2))**2
    c = 2 * atan2(sqrt(a), sqrt(1-a))
    return R * c

In [None]:
def missing(df,detail=False):
    total = 0
    for col in df.columns:
        miss = df[col].isnull().sum()
        pct = df[col].isna().mean() * 100
        total += miss
        if miss != 0:
            if pct>10: color=Fore.RED
            else: color=Fore.YELLOW
            print(color+'{} => {} [{}%]'.format(col, miss, round(pct, 2)))
        
        elif (total == 0) and(detail):
            print(Fore.GREEN+'{} => no missing values [{}%]'.format(col, 0))
        total=0

In [49]:
#path='train_airbnb_berlin.csv'
path='/Users/Pierr/OneDrive/Documents/CentraleSupelec/MLAirBnb/train_airbnb_berlin.csv'
my_sep,my_encoding=',','utf-8'
columns_to_drop=['Listing Name','Host Name','City','Country Code','Country']
types={'Listing ID':'str','Host ID':'str','Postal Code':'str'}
data=pd.read_csv(path,sep=my_sep,encoding=my_encoding,dtype=types)
data.describe()

Unnamed: 0,Latitude,Longitude,Square Feet,Reviews,Overall Rating,Accuracy Rating,Cleanliness Rating,Checkin Rating,Communication Rating,Location Rating,Value Rating,Price
count,15692.0,15692.0,303.0,15692.0,12730.0,12721.0,12722.0,12719.0,12722.0,12721.0,12720.0,15683.0
mean,52.509893,13.407334,445.90099,19.452014,94.717282,9.717475,9.328407,9.769164,9.779201,9.556874,9.458097,60.342983
std,0.031286,0.058646,414.817342,39.483853,7.069787,0.671793,1.023807,0.620347,0.619545,0.731109,0.788891,48.829687
min,52.36927,13.1214,0.0,0.0,20.0,2.0,2.0,2.0,2.0,2.0,2.0,8.0
25%,52.48892,13.376025,0.0,1.0,93.0,10.0,9.0,10.0,10.0,9.0,9.0,32.0
50%,52.5091,13.41725,440.0,5.0,97.0,10.0,10.0,10.0,10.0,10.0,10.0,49.0
75%,52.532713,13.43975,700.0,17.0,100.0,10.0,10.0,10.0,10.0,10.0,10.0,70.0
max,52.63967,13.70902,1912.0,424.0,100.0,10.0,10.0,10.0,10.0,10.0,10.0,900.0


In [None]:
missing(data)

In [50]:

#processing 1: drop the irrelevant columns / replace '*' values by nan / standardisation of columns name
df=processing(data)

df.drop(columns=['square_feet','business_travel_ready','host_id','listing_id','neighborhood_group'],inplace=True)

# droping outlighers for prices >300
df = df[~(df['price'] > 300)]

#droping rows where there is more than 3 nan 
#from dataviz we saw that it was the best option
df=df[df.isnull().sum(1)<4]

#processing 2: standardisation / one-hot-encoder / KNN imputer
df=processing_2(df)
print(df.shape)
df

(12622, 267)


Unnamed: 0,host_since,host_response_rate,latitude,longitude,accomodates,bathrooms,bedrooms,beds,guests_included,min_nights,...,neighbourhood_Wilhelmstadt,neighbourhood_Wilmersdorf,neighbourhood_Wittenau,neighbourhood_Zehlendorf,is_superhost_f,is_superhost_t,host_response_time_a few days or more,host_response_time_within a day,host_response_time_within a few hours,host_response_time_within an hour
0,735983.0,0.780,52.54652,13.41792,2.0,1.0,1.0,1.0,1.0,2.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
1,734965.0,0.980,52.56512,13.42214,2.0,1.0,2.0,2.0,2.0,7.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
2,736001.0,0.630,52.54741,13.42521,3.0,1.0,1.0,2.0,1.0,1.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
3,736871.0,0.780,52.50958,13.45144,2.0,1.0,1.0,1.0,1.0,2.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
4,736198.0,1.000,52.44826,13.40608,3.0,1.0,1.0,2.0,2.0,1.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12617,735605.0,0.808,52.52759,13.33738,1.0,1.0,1.0,1.0,1.0,5.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
12618,735351.0,0.936,52.52638,13.39698,2.0,1.0,1.0,1.0,1.0,1.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
12619,735041.0,0.330,52.49016,13.39241,2.0,1.5,1.0,1.0,1.0,6.0,...,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0
12620,735707.0,0.834,52.47586,13.45362,8.0,2.0,3.0,5.0,1.0,1.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0


In [58]:
models_fwd = pd.DataFrame(columns=["RSS", "model"])

tic = time.time()
features = []

for i in range(1,len(df.columns)+1):    
    models_fwd.loc[i] = forward(features)
    features = models_fwd.loc[i]["model"].model.exog_names

Processed  266 models on 1 features in 0.8915071487426758 seconds.
Processed  265 models on 2 features in 0.7423319816589355 seconds.
Processed  264 models on 3 features in 0.8554203510284424 seconds.
Processed  263 models on 4 features in 0.9408206939697266 seconds.
Processed  262 models on 5 features in 1.3603081703186035 seconds.
Processed  261 models on 6 features in 1.6947126388549805 seconds.
Processed  260 models on 7 features in 1.980072021484375 seconds.
Processed  259 models on 8 features in 2.189037561416626 seconds.
Processed  258 models on 9 features in 2.393483877182007 seconds.
Processed  257 models on 10 features in 3.2242801189422607 seconds.
Processed  256 models on 11 features in 4.095274209976196 seconds.
Processed  255 models on 12 features in 3.8356738090515137 seconds.
Processed  254 models on 13 features in 3.9871277809143066 seconds.
Processed  253 models on 14 features in 4.233303070068359 seconds.
Processed  252 models on 15 features in 4.576990842819214 seco

KeyboardInterrupt: 

In [51]:
X=df.drop(columns=['price'])
y=df['price'].copy()

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [52]:
regressor = xgb.XGBRegressor(
    n_estimators=500,
    reg_alpha=0.05,
    reg_lambda = 0.05,
    colsample_bytree = 0.3,
    max_depth=3
)
regressor.fit(X_train, y_train)
y_pred = regressor.predict(X_test)
print('rmse ',math.sqrt(metrics.mean_squared_error(y_test, y_pred)))
print('r2   ',metrics.r2_score(y_test, y_pred))

In [None]:
params = { 'max_depth': [3,6],
           'learning_rate': [0.0001,0.01, 0.05, 0.1, 0.15],
           'n_estimators': [100, 250, 500, 600],
           'colsample_bytree': [0.3]}
xgbr = xgb.XGBRegressor()
clf = GridSearchCV(estimator=xgbr, 
                   param_grid=params,
                   scoring=two_scorer(), 
                   verbose=1)
clf.fit(X_train, y_train)
best_params = clf.best_params_
model = clf.best_estimator_
score = clf.best_score_

In [54]:
from sklearn.linear_model import LinearRegression
reg = LinearRegression()
reg.fit(X_train, y_train)

y_pred = reg.predict(X_test) 
mse = metrics.mean_squared_error(y_test, y_pred)
mae = metrics.mean_absolute_error(y_test, y_pred)
r2 = metrics.r2_score(y_test, y_pred)
print ('mse = {}, rmse = {} \nmae = {} r2 = {}'.format(mse,math.sqrt(mse), mae, r2))

mse = 820.1187274812519, rmse = 28.637715123264492 
mae = 19.35108061471563 r2 = 0.506884079734975


In [34]:
# Create the parameter grid based on the results of random search 
param_grid = {
    'bootstrap': [True],
    'max_depth': [10,20,30,40],
    'max_features': [2, 3],
    'min_samples_leaf': [3, 4, 5],
    'min_samples_split': [8, 10, 12],
    'n_estimators': [100, 200, 300, 1000]
}
# Create a based model
rf = RandomForestRegressor()
# Instantiate the grid search model
grid_search = GridSearchCV(estimator = rf, param_grid = param_grid, 
                          cv = 3, n_jobs = -1, verbose = 2)

grid_search.fit(X_train, y_train)
grid_search.best_params_

Fitting 3 folds for each of 288 candidates, totalling 864 fits


KeyboardInterrupt: 