In [1]:
from pathlib import Path

import pandas as pd
import numpy as np

from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report
%matplotlib inline

# Needed for decision tree visualization
import pydotplus
from IPython.display import Image

In [2]:
# Locations of the Paris weekday and weekend CSV files, relative to the notebook.
paris_weekday_file, paris_weekend_file = (
    "Resources/paris_weekdays.csv",
    "Resources/paris_weekends.csv",
)

# Read each CSV into its own DataFrame.
paris_weekday_data = pd.read_csv(filepath_or_buffer=paris_weekday_file)
paris_weekend_data = pd.read_csv(filepath_or_buffer=paris_weekend_file)

In [3]:
# List the columns available in the weekday file.
paris_weekday_data.columns

Index(['Unnamed: 0', 'realSum', 'room_type', 'room_shared', 'room_private',
       'person_capacity', 'host_is_superhost', 'multi', 'biz',
       'cleanliness_rating', 'guest_satisfaction_overall', 'bedrooms', 'dist',
       'metro_dist', 'attr_index', 'attr_index_norm', 'rest_index',
       'rest_index_norm', 'lng', 'lat'],
      dtype='object')

In [4]:
# List the columns available in the weekend file.
paris_weekend_data.columns

Index(['Unnamed: 0', 'realSum', 'room_type', 'room_shared', 'room_private',
       'person_capacity', 'host_is_superhost', 'multi', 'biz',
       'cleanliness_rating', 'guest_satisfaction_overall', 'bedrooms', 'dist',
       'metro_dist', 'attr_index', 'attr_index_norm', 'rest_index',
       'rest_index_norm', 'lng', 'lat'],
      dtype='object')

In [5]:
# Columns kept for the rest of the notebook; the same subset is taken from both files.
selected_columns = [
    "realSum",
    "room_type",
    "room_shared",
    "person_capacity",
    "cleanliness_rating",
    "guest_satisfaction_overall",
    "bedrooms",
    "dist",
    "metro_dist",
    "lng",
    "lat",
]

paris_weekday_df = paris_weekday_data[selected_columns]
paris_weekend_df = paris_weekend_data[selected_columns]

In [6]:
# Rename "realSum" to a file-specific price name and "dist" to "city_centre_dist".
weekday_renames = {"realSum": "airbnb_weekday_price", "dist": "city_centre_dist"}
weekend_renames = {"realSum": "airbnb_weekend_price", "dist": "city_centre_dist"}

paris_weekday_df = paris_weekday_df.rename(columns=weekday_renames)
paris_weekend_df = paris_weekend_df.rename(columns=weekend_renames)

In [7]:
# Discard every row that has a missing value in any column (dropna's default behaviour, spelled out).
paris_weekday_df = paris_weekday_df.dropna(axis=0, how="any")
paris_weekend_df = paris_weekend_df.dropna(axis=0, how="any")

In [8]:
# Show row count, dtypes and non-null counts of the weekday frame after dropna.
paris_weekday_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3130 entries, 0 to 3129
Data columns (total 11 columns):
 #   Column                      Non-Null Count  Dtype  
---  ------                      --------------  -----  
 0   airbnb_weekday_price        3130 non-null   float64
 1   room_type                   3130 non-null   object 
 2   room_shared                 3130 non-null   bool   
 3   person_capacity             3130 non-null   float64
 4   cleanliness_rating          3130 non-null   float64
 5   guest_satisfaction_overall  3130 non-null   float64
 6   bedrooms                    3130 non-null   int64  
 7   city_centre_dist            3130 non-null   float64
 8   metro_dist                  3130 non-null   float64
 9   lng                         3130 non-null   float64
 10  lat                         3130 non-null   float64
dtypes: bool(1), float64(8), int64(1), object(1)
memory usage: 247.7+ KB


In [9]:
# Show row count, dtypes and non-null counts of the weekend frame after dropna.
paris_weekend_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3558 entries, 0 to 3557
Data columns (total 11 columns):
 #   Column                      Non-Null Count  Dtype  
---  ------                      --------------  -----  
 0   airbnb_weekend_price        3558 non-null   float64
 1   room_type                   3558 non-null   object 
 2   room_shared                 3558 non-null   bool   
 3   person_capacity             3558 non-null   float64
 4   cleanliness_rating          3558 non-null   float64
 5   guest_satisfaction_overall  3558 non-null   float64
 6   bedrooms                    3558 non-null   int64  
 7   city_centre_dist            3558 non-null   float64
 8   metro_dist                  3558 non-null   float64
 9   lng                         3558 non-null   float64
 10  lat                         3558 non-null   float64
dtypes: bool(1), float64(8), int64(1), object(1)
memory usage: 281.6+ KB


In [10]:
# Row counts of the two frames, weekday first, one per line.
print(len(paris_weekday_df), len(paris_weekend_df), sep="\n")

3130
3558


In [11]:
# Build 1-based positional ids, one per row of each frame.
# The upper bounds come from the frames themselves rather than hard-coded row
# counts, so the id arrays always match the data even if the input files or
# the cleaning steps change.
weekday_id = np.arange(1, len(paris_weekday_df) + 1)
weekend_id = np.arange(1, len(paris_weekend_df) + 1)

print(weekday_id)
print(weekend_id)

[   1    2    3 ... 3128 3129 3130]
[   1    2    3 ... 3556 3557 3558]


In [12]:
# Prefix every numeric id with the city name, e.g. 1 -> "paris1".
# The loop variable is called `n` so it does not reuse the name of the built-in id(),
# and the f-string yields plain Python str values.
# NOTE(review): these ids are positional within each file; the head() outputs show
# that "paris1" is a different listing in the weekday and weekend frames, so the ids
# should not be used as a join key between the two — confirm intended use.
paris_weekday_ids = [f"paris{n}" for n in weekday_id]
paris_weekend_ids = [f"paris{n}" for n in weekend_id]

In [13]:
# Append the generated ids as a new last column of each frame.
paris_weekday_df = paris_weekday_df.assign(weekday_id=paris_weekday_ids)
paris_weekend_df = paris_weekend_df.assign(weekend_id=paris_weekend_ids)

In [14]:
# List the weekday frame's columns after the id column was added.
paris_weekday_df.columns

Index(['airbnb_weekday_price', 'room_type', 'room_shared', 'person_capacity',
       'cleanliness_rating', 'guest_satisfaction_overall', 'bedrooms',
       'city_centre_dist', 'metro_dist', 'lng', 'lat', 'weekday_id'],
      dtype='object')

In [15]:
# List the weekend frame's columns after the id column was added.
paris_weekend_df.columns

Index(['airbnb_weekend_price', 'room_type', 'room_shared', 'person_capacity',
       'cleanliness_rating', 'guest_satisfaction_overall', 'bedrooms',
       'city_centre_dist', 'metro_dist', 'lng', 'lat', 'weekend_id'],
      dtype='object')

In [16]:
# Column order for the exported weekday table; the id comes first because it becomes the index.
weekday_columns = [
    'weekday_id',
    'airbnb_weekday_price',
    'room_type',
    'room_shared',
    'person_capacity',
    'cleanliness_rating',
    'guest_satisfaction_overall',
    'bedrooms',
    'city_centre_dist',
    'metro_dist',
    'lng',
    'lat',
]

# Select the columns in that order, then move the id into the row index (returns a new frame).
paris_weekday_new = paris_weekday_df.loc[:, weekday_columns].set_index('weekday_id')
paris_weekday_new.head()

Unnamed: 0_level_0,airbnb_weekday_price,room_type,room_shared,person_capacity,cleanliness_rating,guest_satisfaction_overall,bedrooms,city_centre_dist,metro_dist,lng,lat
weekday_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
paris1,296.15994,Private room,False,2.0,10.0,97.0,1,0.699821,0.193709,2.35385,48.86282
paris2,288.237487,Private room,False,2.0,10.0,97.0,1,2.100005,0.107221,2.32436,48.85902
paris3,211.343089,Private room,False,2.0,10.0,94.0,1,3.302325,0.234724,2.31714,48.87475
paris4,298.9561,Entire home/apt,False,2.0,9.0,91.0,1,0.547567,0.195997,2.356,48.861
paris5,247.926181,Entire home/apt,False,4.0,7.0,82.0,1,1.197921,0.103573,2.35915,48.86648


In [17]:
# Column order for the exported weekend table; the id comes first because it becomes the index.
weekend_columns = [
    'weekend_id',
    'airbnb_weekend_price',
    'room_type',
    'room_shared',
    'person_capacity',
    'cleanliness_rating',
    'guest_satisfaction_overall',
    'bedrooms',
    'city_centre_dist',
    'metro_dist',
    'lng',
    'lat',
]

# Select the columns in that order, then move the id into the row index (returns a new frame).
paris_weekend_new = paris_weekend_df.loc[:, weekend_columns].set_index('weekend_id')
paris_weekend_new.head()

Unnamed: 0_level_0,airbnb_weekend_price,room_type,room_shared,person_capacity,cleanliness_rating,guest_satisfaction_overall,bedrooms,city_centre_dist,metro_dist,lng,lat
weekend_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
paris1,536.396682,Entire home/apt,False,5.0,9.0,89.0,1,1.351201,0.212346,2.359,48.868
paris2,290.101594,Private room,False,2.0,10.0,97.0,1,0.699821,0.19371,2.35385,48.86282
paris3,445.754497,Entire home/apt,False,4.0,10.0,100.0,1,0.968982,0.294343,2.36023,48.86375
paris4,211.343089,Private room,False,2.0,10.0,94.0,1,3.302319,0.23474,2.31714,48.87475
paris5,266.334234,Entire home/apt,False,2.0,9.0,88.0,1,1.40243,0.055052,2.33408,48.85384


In [20]:
# Write the cleaned tables to disk, with the id index as the first CSV column.
# to_csv does not create missing folders, so make sure the output directory
# exists first; otherwise a fresh checkout fails here with an OSError.
output_dir = Path("cleaned_files")
output_dir.mkdir(parents=True, exist_ok=True)

paris_weekend_new.to_csv(output_dir / "paris_weekend.csv", encoding='utf8')
paris_weekday_new.to_csv(output_dir / "paris_weekday.csv", encoding='utf8')