 # Récapitulatif

In [1]:
import pandas as pd
import numpy as np
# Seed NumPy's global RNG so the shuffle below yields the same row order on
# every run.
np.random.seed(1)

paris_listings = pd.read_csv('paris_airbnb.csv')

# Shuffle the rows. np.random.permutation(n) yields row *positions*, so index
# positionally with .iloc instead of by label with .loc.
paris_listings = paris_listings.iloc[np.random.permutation(len(paris_listings))]

# 'price' is read as text: remove the thousands separators and the dollar sign,
# then cast to float. '$' is a regex metacharacter (end-of-string anchor), so
# regex=False is passed explicitly to force a literal match whatever the
# default of the installed pandas version is.
stripped_commas = paris_listings['price'].str.replace(',', '', regex=False)
stripped_dollars = stripped_commas.str.replace('$', '', regex=False)
paris_listings['price'] = stripped_dollars.astype('float')

In [2]:
# Print the per-column dtype and non-null count of the loaded listings.
paris_listings.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 8000 entries, 4740 to 5157
Data columns (total 19 columns):
host_response_rate      5000 non-null object
host_acceptance_rate    0 non-null float64
host_listings_count     7999 non-null float64
latitude                8000 non-null float64
longitude               8000 non-null float64
city                    7997 non-null object
zipcode                 7930 non-null object
state                   7977 non-null object
accommodates            8000 non-null int64
room_type               8000 non-null object
bedrooms                7976 non-null float64
bathrooms               7942 non-null float64
beds                    7986 non-null float64
price                   8000 non-null float64
cleaning_fee            6250 non-null object
security_deposit        6320 non-null object
minimum_nights          8000 non-null int64
maximum_nights          8000 non-null int64
number_of_reviews       8000 non-null int64
dtypes: float64(8), int64(4), object(7)

In [3]:
# Show the first five rows of the shuffled listings.
paris_listings.head()

Unnamed: 0,host_response_rate,host_acceptance_rate,host_listings_count,latitude,longitude,city,zipcode,state,accommodates,room_type,bedrooms,bathrooms,beds,price,cleaning_fee,security_deposit,minimum_nights,maximum_nights,number_of_reviews
4740,,,1.0,48.88285,2.33852,Paris,75018,Île-de-France,4,Entire home/apt,1.0,1.0,2.0,65.0,$30.00,$200.00,2,1125,6
5606,,,1.0,48.86006,2.34768,Paris,75001,Île-de-France,3,Entire home/apt,2.0,1.5,2.0,98.0,,,5,1124,1
4824,100%,,3.0,48.88794,2.34532,Paris,75018,Île-de-France,2,Entire home/apt,0.0,1.0,1.0,65.0,$30.00,"$1,000.00",10,1125,0
4205,60%,,1.0,48.88868,2.34111,Paris,75018,Île-de-France,2,Private room,1.0,1.0,1.0,45.0,$30.00,$200.00,5,1125,83
3228,90%,,1.0,48.88686,2.3367,Paris,75018,Île-de-France,3,Entire home/apt,1.0,1.0,2.0,65.0,,$200.00,5,365,5


# Supprimer des caractéristiques et afficher la somme des valeurs manquantes

In [48]:
# Remove the columns that are not kept as model inputs. This statement must
# actually run: the null counts recorded for the next cell list only the
# remaining columns, and a later cell standardises the whole frame with
# mean()/std(), which assumes these columns are gone.
paris_listings = paris_listings.drop(
    [
        'room_type',
        'city',
        'state',
        'longitude',
        'latitude',
        'zipcode',
        'host_response_rate',
        'host_acceptance_rate',
        'host_listings_count',
    ],
    axis=1,
)

In [50]:
# Print, for every column, how many values are missing.
print(paris_listings.isna().sum())

accommodates            0
bedrooms               24
bathrooms              58
beds                   14
price                   0
cleaning_fee         1750
security_deposit     1680
minimum_nights          0
maximum_nights          0
number_of_reviews       0
dtype: int64


# Gérer les valeurs manquantes

In [57]:
# Discard the two fee columns entirely instead of dropping their rows.
paris_listings = paris_listings.drop(columns=['cleaning_fee', 'security_deposit'])

In [67]:
# Remove every row that lacks a value in any of the three room-count columns.
paris_listings = paris_listings.dropna(
    axis='index',
    subset=['bedrooms', 'bathrooms', 'beds'],
)

In [70]:
# Count the missing values per column once more after the cleaning steps.
paris_listings.isna().sum()

accommodates         0
bedrooms             0
bathrooms            0
beds                 0
price                0
minimum_nights       0
maximum_nights       0
number_of_reviews    0
dtype: int64

# Normaliser les colonnes

In [71]:
# Show the first five rows before normalising the columns.
paris_listings.head()

Unnamed: 0,accommodates,bedrooms,bathrooms,beds,price,minimum_nights,maximum_nights,number_of_reviews
4740,4,1.0,1.0,2.0,65.0,2,1125,6
5606,3,2.0,1.5,2.0,98.0,5,1124,1
4824,2,0.0,1.0,1.0,65.0,10,1125,0
4205,2,1.0,1.0,1.0,45.0,5,1125,83
3228,3,1.0,1.0,2.0,65.0,5,365,5


In [72]:
# Step 1: centre 'maximum_nights' by subtracting the column mean from each value.
first_transformation = paris_listings['maximum_nights'].sub(
    paris_listings['maximum_nights'].mean()
)

# Step 2: divide each centred value by the standard deviation of the centred column.
normalized_col = first_transformation.div(first_transformation.std())

In [73]:
# Standardise every column at once: subtract the column mean, then divide by
# the column standard deviation. Both statistics are taken over all rows of
# paris_listings.
normalized_listings = paris_listings.sub(paris_listings.mean()).div(paris_listings.std())

In [75]:
# Replace the standardised 'price' column with the original, unscaled prices.
normalized_listings = normalized_listings.assign(price=paris_listings['price'])

In [76]:
# Show the first three rows of the normalised frame.
normalized_listings.head(3)

Unnamed: 0,accommodates,bedrooms,bathrooms,beds,price,minimum_nights,maximum_nights,number_of_reviews
4740,0.503868,-0.296884,-0.293537,0.20531,65.0,-0.184601,1.062858,-0.564545
5606,-0.131849,0.892605,0.843973,0.20531,98.0,-0.101183,1.061018,-0.636924
4824,-0.767566,-1.486372,-0.293537,-0.64526,65.0,0.037847,1.062858,-0.6514


# Distance euclidienne pour le cas multivarié

In [78]:
from scipy.spatial import distance
# Two points given as [accommodates, bedrooms] pairs, written out by hand.
first_listing, second_listing = [0.503868, -0.296884], [-0.131849, 0.892605]

# Euclidean distance between the two points; the bare name on the last line
# makes the notebook display the value.
dist = distance.euclidean(first_listing, second_listing)
dist

1.3487105639128063

In [85]:
# Euclidean distance between the first and the fifth row (positions 0 and 4),
# measured on the 'accommodates' and 'bedrooms' columns only.
# The spelling 'fith' is kept as is because the following cell prints this name.
first_fith_distance = distance.euclidean(
    normalized_listings.loc[:, ['accommodates', 'bedrooms']].iloc[0],
    normalized_listings.loc[:, ['accommodates', 'bedrooms']].iloc[4],
)

In [86]:
# Print the distance between the first and fifth rows.
print(first_fith_distance)

0.6357172321498359


# Introduction à la bibliothèque Scikit-learn

In [89]:
from sklearn.neighbors import KNeighborsRegressor
# Brute-force k-nearest-neighbours regressor; n_neighbors=5 spells out the
# library default that this cell previously relied on implicitly.
knn = KNeighborsRegressor(n_neighbors=5, algorithm='brute')

# Adapter un modèle et faire des prédictions

In [90]:
# Split the full dataset in two: the first 6000 rows form the training set,
# all remaining rows form the test set.
train_df = normalized_listings.iloc[:6000]
test_df = normalized_listings.iloc[6000:]

# DataFrame holding only the two training-set columns used as features.
train_features = train_df.loc[:, ['accommodates', 'bedrooms']]

# Series holding only the target column 'price'.
train_target = train_df.loc[:, 'price']

# fit() receives the training features and the matching target values.
knn.fit(X=train_features, y=train_target)

KNeighborsRegressor(algorithm='brute', leaf_size=30, metric='minkowski',
                    metric_params=None, n_jobs=None, n_neighbors=5, p=2,
                    weights='uniform')

In [91]:
# Predict a price for every test row from the same two feature columns.
predictions = knn.predict(X=test_df.loc[:, ['accommodates', 'bedrooms']])

In [92]:
# Display the array of predicted prices.
predictions

array([ 72. ,  79.2,  79.2, ...,  72. , 148.4,  79.2])

In [93]:
# Fresh regressor: brute-force neighbour search, five neighbours.
knn = KNeighborsRegressor(algorithm='brute', n_neighbors=5)

In [94]:
# Same split as before: first 6000 rows for training, the rest for testing.
train_df = normalized_listings.iloc[:6000]
test_df = normalized_listings.iloc[6000:]

# Fit on the two feature columns against the 'price' target.
knn.fit(
    X=train_df.loc[:, ['accommodates', 'bedrooms']],
    y=train_df.loc[:, 'price'],
)

KNeighborsRegressor(algorithm='brute', leaf_size=30, metric='minkowski',
                    metric_params=None, n_jobs=None, n_neighbors=5, p=2,
                    weights='uniform')

In [97]:
# Predict test-set prices from 'accommodates' and 'bedrooms'.
predictions = knn.predict(X=test_df.loc[:, ['accommodates', 'bedrooms']])

In [98]:
# Display the array of predicted prices.
predictions

array([ 72. ,  79.2,  79.2, ...,  72. , 148.4,  79.2])

# Calculer l'erreur quadratique moyenne

In [109]:
# Import only the name that the notebook's cells use; a star import would
# flood the namespace with every public name of the math module.
from math import sqrt

In [100]:
from sklearn.metrics import mean_squared_error

In [107]:
# Mean squared error between the true test prices and the two-feature predictions.
two_features_mse = mean_squared_error(y_true=test_df['price'], y_pred=predictions)

In [110]:
# Root mean squared error: the square root of the two-feature MSE.
two_features_rmse = sqrt(two_features_mse)

In [111]:
# Print the MSE and the RMSE of the two-feature model, one value per line.
print(two_features_mse, two_features_rmse, sep='\n')

6067.834762649973
77.89630776005993


# Utiliser plus de caractéristiques

In [113]:
from sklearn.neighbors import KNeighborsRegressor
# New regressor for the four-feature model, with the same settings as before.
knn = KNeighborsRegressor(algorithm='brute', n_neighbors=5)

# First 6000 rows for training, the remaining rows for testing.
train_df = normalized_listings.iloc[:6000]
test_df = normalized_listings.iloc[6000:]

In [115]:
# Fit on four feature columns against the 'price' target.
knn.fit(
    X=train_df.loc[:, ['accommodates', 'bedrooms', 'bathrooms', 'number_of_reviews']],
    y=train_df.loc[:, 'price'],
)

KNeighborsRegressor(algorithm='brute', leaf_size=30, metric='minkowski',
                    metric_params=None, n_jobs=None, n_neighbors=5, p=2,
                    weights='uniform')

In [117]:
# Predict test-set prices from the same four feature columns used for fitting.
four_predictions = knn.predict(
    X=test_df.loc[:, ['accommodates', 'bedrooms', 'bathrooms', 'number_of_reviews']]
)

In [119]:
# Mean squared error of the four-feature predictions on the test set.
four_mse = mean_squared_error(y_true=test_df['price'], y_pred=four_predictions)

In [120]:
# Root mean squared error: the square root of the four-feature MSE.
four_rmse = sqrt(four_mse)

In [121]:
# Print the MSE and the RMSE of the four-feature model, one value per line.
print(four_mse, four_rmse, sep='\n')

5488.169827856025
74.08218293122864


# Utiliser toutes les caractéristiques

In [122]:
from sklearn.neighbors import KNeighborsRegressor
# New regressor for the all-features model, with the same settings as before.
knn = KNeighborsRegressor(algorithm='brute', n_neighbors=5)

# First 6000 rows for training, the remaining rows for testing.
train_df = normalized_listings.iloc[:6000]
test_df = normalized_listings.iloc[6000:]

In [133]:
# Every column except the target 'price' is used as a feature. The list is
# written out explicitly; an alternative is to start from
# train_df.columns.tolist() and remove 'price'.
features = [
    'accommodates',
    'bedrooms',
    'bathrooms',
    'beds',
    'minimum_nights',
    'maximum_nights',
    'number_of_reviews',
]

# fit() returns the fitted estimator itself, so predict() can be chained onto it.
all_features_predictions = knn.fit(train_df[features], train_df['price']).predict(test_df[features])

In [125]:
# Display the array of prices predicted by the all-features model.
all_features_predictions

array([239.8,  70.6, 106.8, ...,  54.8, 159.4, 163.6])

In [128]:
# Mean squared error of the all-features predictions on the test set.
all_features_mse = mean_squared_error(
    y_true=test_df['price'],
    y_pred=all_features_predictions,
)

In [129]:
# Root mean squared error: the square root of the all-features MSE.
all_features_rmse = sqrt(all_features_mse)

In [131]:
# Print the MSE and the RMSE of the all-features model, one value per line.
print(all_features_mse, all_features_rmse, sep='\n')

7241.331684924361
85.0960145066992
