# Introduction

In [47]:
# Third-party imports for data handling, the KNN model and the error metric.
import numpy as np
import pandas as pd
from sklearn.neighbors import KNeighborsRegressor
# Disable pandas' chained-assignment (SettingWithCopy) warning for the whole notebook.
pd.options.mode.chained_assignment = None
# NOTE(review): the star import pulls every `math` name into the namespace;
# only `sqrt` is used in the cells visible here.
from math import *
from sklearn.metrics import mean_squared_error
# Seed NumPy's global RNG so np.random.permutation below gives a reproducible shuffle.
np.random.seed(1)

In [48]:
# Load the listings from a CSV file resolved relative to the current working directory.
paris_listings = pd.read_csv('paris_airbnb.csv') 

In [49]:
# Convert the price strings to floats by removing the thousands separator and
# the currency symbol. regex=False forces literal matching: read as a regular
# expression, '$' is the end-of-string anchor rather than a dollar sign, and the
# default value of `regex` is not the same across pandas versions.
stripped = (
    paris_listings['price']
    .str.replace(',', '', regex=False)
    .str.replace('$', '', regex=False)
)
paris_listings['price'] = stripped.astype(float)

In [50]:
# Random permutation of the frame's index labels, drawn from NumPy's global RNG.
shuffled_index = np.random.permutation(paris_listings.index)

In [51]:
# Reorder the rows to follow the shuffled labels; the index is not reset, so
# positional slicing with .iloc below operates on the shuffled order.
paris_listings = paris_listings.reindex(shuffled_index)

In [52]:
# Holdout halves: the first 4000 shuffled rows, then every row after them.
split_one, split_two = paris_listings.iloc[:4000], paris_listings.iloc[4000:]

# Validation croisée Holdout

#### Model one

In [53]:
# Model for the first holdout round: 5 nearest neighbours, search algorithm chosen automatically.
knn_one = KNeighborsRegressor(algorithm='auto', n_neighbors=5)

In [54]:
# Each half is the training set in one round and the test set in the other.
train_one, test_one = split_one, split_two
train_two, test_two = split_two, split_one

In [55]:
# Round one: fit on the first half using only `accommodates`, then predict
# prices for the second half. fit() returns the estimator, so the calls chain.
prediction_one = knn_one.fit(
    train_one[['accommodates']], train_one['price']
).predict(test_one[['accommodates']])

In [56]:
# Mean squared error of the round-one predictions against the held-out prices.
mse_one = mean_squared_error(y_true=test_one['price'], y_pred=prediction_one)

In [57]:
# RMSE for round one: square root of the MSE, i.e. the error in the same units as `price`.
iteration_one_rmse = sqrt(mse_one)

#### Model two

In [58]:
# Separate estimator for the second holdout round, configured like the first one.
knn_two = KNeighborsRegressor(algorithm='auto', n_neighbors=5)

In [59]:
# Round two: roles swapped, so fit on the second half and score on the first.
prediction_two = knn_two.fit(
    train_two[['accommodates']], train_two['price']
).predict(test_two[['accommodates']])
mse_two = mean_squared_error(y_true=test_two['price'], y_pred=prediction_two)
iteration_two_rmse = sqrt(mse_two)

In [60]:
# Holdout estimate: arithmetic mean of the two per-round RMSE values.
data = [iteration_one_rmse, iteration_two_rmse]
avg_rmse = np.asarray(data).mean()

In [61]:
avg_rmse

102.07284610848862

In [62]:
print(iteration_one_rmse, iteration_two_rmse)

88.96592437557203 115.17976784140521


# Validation croisée K-Fold

In [63]:
# Assign fold ids 1-5 to consecutive 1600-row slices of the shuffled frame; any
# row past position 8000 keeps fold 0. The ids are built in a NumPy array and
# written with one column assignment instead of `df['fold'].iloc[a:b] = v`.
# That chained form writes through an intermediate Series, which pandas flags
# as a possible copy and which leaves the frame unchanged under copy-on-write.
fold_size = 1600
n_folds = 5
fold_labels = np.zeros(len(paris_listings), dtype='int64')
for fold_number in range(1, n_folds + 1):
    # Slices that run past the end of the array are truncated, like .iloc slices.
    fold_labels[(fold_number - 1) * fold_size:fold_number * fold_size] = fold_number
# Assigning an ndarray is positional, so the shuffled index labels do not matter.
paris_listings['fold'] = fold_labels

In [64]:
print(paris_listings['fold'].value_counts())

3    1600
2    1600
5    1600
1    1600
4    1600
Name: fold, dtype: int64


# Première itération

In [67]:
# KNN regressor with scikit-learn's default hyperparameters, used for the first K-fold iteration.
knn = KNeighborsRegressor()

In [69]:
# First K-fold iteration: fold 1 is held out for testing, every other fold trains the model.
test_iteration_one = paris_listings.loc[paris_listings['fold'].eq(1)]
train_iteration_one = paris_listings.loc[paris_listings['fold'].ne(1)]
labels = knn.fit(
    train_iteration_one[['accommodates']], train_iteration_one['price']
).predict(test_iteration_one[['accommodates']])

# NOTE(review): this reuses the name `iteration_one_rmse` from the holdout section above.
iteration_one_rmse = sqrt(
    mean_squared_error(y_true=test_iteration_one['price'], y_pred=labels)
)
print(iteration_one_rmse)

81.94523308283405


# Fonction pour entraîner des modèles

In [91]:
# Ids of the five folds, in the order they will be held out.
fold_ids = list(range(1, 6))

def train_and_validate(df, folds, features=('accommodates',), target='price', n_neighbors=5):
    """Cross-validate a KNN regressor over pre-assigned folds.

    For each id in ``folds``, the rows whose ``fold`` column equals that id
    form the test set and all remaining rows form the training set. A new
    ``KNeighborsRegressor`` is created for every fold.

    Args:
        df: DataFrame holding a ``fold`` column plus the feature and target
            columns.
        folds: Iterable of fold ids; each one is held out once.
        features: Column name, or sequence of column names, used as model
            inputs. Defaults to ``('accommodates',)``.
        target: Name of the column to predict. Defaults to ``'price'``.
        n_neighbors: Number of neighbours passed to the regressor.
            Defaults to 5.

    Returns:
        list: One RMSE per fold, in the order the ids appear in ``folds``.
    """
    # Accept a bare column name as well; list('abc') would split it into characters.
    feature_columns = [features] if isinstance(features, str) else list(features)

    fold_rmses = []
    for fold_id in folds:
        is_test = df['fold'] == fold_id
        train_rows = df[~is_test]
        test_rows = df[is_test]

        model = KNeighborsRegressor(n_neighbors=n_neighbors)
        model.fit(train_rows[feature_columns], train_rows[target])
        predictions = model.predict(test_rows[feature_columns])

        fold_rmses.append(sqrt(mean_squared_error(test_rows[target], predictions)))
    return fold_rmses


In [94]:
# Cross-validate over every fold id, then show the per-fold RMSEs followed by their mean.
rmses = train_and_validate(df=paris_listings, folds=fold_ids)
avg_rmse = np.mean(rmses)

print(rmses, avg_rmse, sep='\n')

[81.94523308283405, 156.1902075995803, 72.58622217749041, 99.10605291807357, 83.16789539840478]
98.59912223527662


# Exécuter une validation croisée K-Fold avec scikit-learn

In [108]:
from sklearn.model_selection import KFold, cross_val_score

# Five shuffled folds with a fixed seed so the partition is reproducible.
kf = KFold(n_splits=5, shuffle=True, random_state=1)
knn = KNeighborsRegressor()

# With scoring='neg_mean_squared_error' the scores come back as negated MSEs.
mses = cross_val_score(
    knn,
    paris_listings[['accommodates']],
    paris_listings['price'],
    scoring='neg_mean_squared_error',
    cv=kf,
)

# Undo the negation, take the root of each fold's MSE, then average the per-fold RMSEs.
avg_rmse = np.mean(np.sqrt(np.abs(mses)))

In [109]:
avg_rmse

98.26413709965395

In [107]:
mses

array([ -5683.678775,  -6180.884725,  -8394.137675,  -7635.3341  ,
       -25062.68305 ])