In [1]:
import pandas as pd
import time
import numpy as np
from ast import literal_eval
from sklearn.preprocessing import MultiLabelBinarizer

In [2]:
# Columns kept from the raw listings export. 'id' is listed so that it can be
# used as the index below.
columns = [
        'id','host_is_superhost', 'neighbourhood_cleansed', 'property_type', 'room_type', 'accommodates',
        'bathrooms', 'bedrooms', 'beds', 'bed_type', 'amenities',
        'price', 'guests_included', 'extra_people', 'minimum_nights',
        'maximum_nights', 'number_of_reviews', 'cancellation_policy',
    ]

df = pd.read_csv('listings_details.csv', index_col='id', usecols=columns)

# Discard every listing with a missing value in any of the kept columns.
df = df.dropna()

# Remove two listings picked at random (the notebook does not state why).
# The draw uses a seeded, local generator so that a fresh "Restart & Run All"
# removes the same two rows every time and the exported file is reproducible.
rng = np.random.default_rng(42)
drop_indices = rng.choice(df.index, 2, replace=False)
df = df.drop(drop_indices)

In [7]:
# Show the working frame (pandas truncates the display of long frames).
# NOTE(review): this cell carries execution count 7 and its saved output already
# has the renamed 'superhost'/'neighbourhood' columns and numeric prices, so it
# appears to have been run after the rename and price cells further down --
# on a fresh top-to-bottom run the output here will look different.
df

Unnamed: 0_level_0,superhost,neighbourhood,property_type,room_type,accommodates,bathrooms,bedrooms,beds,bed_type,amenities,price,guests_included,extra_people,minimum_nights,maximum_nights,number_of_reviews,cancellation_policy
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
2818,1,Oostelijk Havengebied - Indische Buurt,Apartment,Private room,2,1.5,1.0,2.0,Real Bed,"{Internet,Wifi,""Paid parking off premises"",""Bu...",59.0,1,20.0,3,15,248,strict_14_with_grace_period
3209,0,Westerpark,Apartment,Entire home/apt,5,1.0,2.0,2.0,Real Bed,"{Internet,Wifi,Kitchen,""Paid parking off premi...",160.0,2,15.0,4,20,42,moderate
20168,0,Centrum-Oost,Townhouse,Entire home/apt,2,1.0,1.0,1.0,Real Bed,"{TV,Internet,Wifi,""Paid parking off premises"",...",80.0,2,0.0,1,1000,233,strict_14_with_grace_period
25428,0,Centrum-West,Apartment,Entire home/apt,3,1.0,1.0,1.0,Real Bed,"{TV,""Cable TV"",Internet,Wifi,Kitchen,Elevator,...",125.0,2,10.0,14,60,1,strict_14_with_grace_period
27886,1,Centrum-West,Houseboat,Private room,2,1.0,1.0,1.0,Real Bed,"{TV,Internet,Wifi,Breakfast,Heating,""Smoke det...",150.0,1,0.0,2,730,171,strict_14_with_grace_period
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
30576148,0,Watergraafsmeer,House,Entire home/apt,4,2.0,3.0,4.0,Real Bed,"{TV,Wifi,Kitchen,""Free parking on premises"",He...",340.0,1,0.0,7,12,0,strict_14_with_grace_period
30577727,0,Oostelijk Havengebied - Indische Buurt,Apartment,Entire home/apt,3,1.0,2.0,3.0,Real Bed,"{TV,Wifi,Kitchen,Essentials,""Hair dryer"",Iron}",150.0,1,0.0,3,14,0,flexible
30578037,0,Oud-Oost,Apartment,Entire home/apt,2,1.0,1.0,1.0,Real Bed,"{TV,""Cable TV"",Wifi,Kitchen,""Smoking allowed"",...",80.0,1,0.0,10,22,0,moderate
30579673,0,Oostelijk Havengebied - Indische Buurt,Apartment,Private room,2,1.0,1.0,1.0,Real Bed,"{TV,Wifi,Kitchen,Essentials,""Hair dryer"",Iron}",55.0,1,0.0,2,15,0,flexible


In [5]:
# Give the two verbosely named source columns shorter names.
short_names = {
    'host_is_superhost': 'superhost',
    'neighbourhood_cleansed': 'neighbourhood',
}
df = df.rename(columns=short_names)

# Recode the 't' / 'f' superhost flag as 1 / 0; any other value is left as is.
flag_codes = {'t': 1, 'f': 0}
df['superhost'] = df['superhost'].replace(flag_codes)

In [6]:
# Both money columns are handled the same way: strip '$' from the ends of each
# string, remove every ',' and parse what is left as a number, downcast to a
# smaller float dtype where possible.
for money_col in ("price", "extra_people"):
    cleaned_text = df[money_col].str.strip("$").str.replace(",", "")
    df[money_col] = pd.to_numeric(cleaned_text, downcast="float")

In [150]:
# make sure amenities are split properly
def get_info(x):
    """Split a raw amenities string into a list of amenity names.

    The input looks like ``{TV,"Cable TV",Wifi}``. Leading/trailing ``{``,
    ``|`` and ``}`` characters are stripped (``str.strip`` treats its argument
    as a set of characters), every double quote is removed, and the remainder
    is split on commas.

    Empty tokens are dropped, so an empty set ``"{}"`` yields ``[]`` instead of
    ``['']``. Without this, a listing with no amenities would be counted as
    having one and would produce a nameless amenity downstream.

    Parameters
    ----------
    x : str
        Raw amenities string for one listing.

    Returns
    -------
    list of str
        Amenity names in their original order. Names are not whitespace-trimmed.
    """
    tokens = x.strip("{|}").replace('"', '').split(",")
    # "".split(",") returns [""], so filter out empty names explicitly.
    return [token for token in tokens if token]

# Parse each raw amenities string once, then reuse the result for both the
# per-listing amenity count and the list-valued 'amenities' column.
amenity_lists = df['amenities'].apply(get_info)
df['len_amenities'] = amenity_lists.apply(len)
df['amenities'] = amenity_lists

In [151]:
# Take the list-valued 'amenities' column out of df (pop removes it from the
# frame) and expand it into one 0/1 indicator column per distinct amenity.
mlb = MultiLabelBinarizer()
amenity_indicators = mlb.fit_transform(df.pop('amenities'))

# Keep the listing ids as the index so the indicators line up with df's rows.
new_df = pd.DataFrame(
    amenity_indicators,
    index=df.index,
    columns=mlb.classes_,
)
new_df = new_df.add_prefix('amenities_')

In [152]:
# Columns that are cast to the pandas 'category' dtype before one-hot encoding.
categorical_columns = [
    'neighbourhood',
    'property_type',
    'room_type',
    'bed_type',
    'cancellation_policy',
]

In [153]:
# Cast every listed column to 'category' in a single astype call, then let
# get_dummies expand the categorical columns into indicator columns.
category_dtypes = {name: 'category' for name in categorical_columns}
df = pd.get_dummies(df.astype(category_dtypes))

In [155]:
# Attach the amenity indicator columns to the encoded listings; rows are
# matched on the index and every row of df is kept (left join, the default).
final_df = df.join(new_df, how='left')

In [157]:
# Write the cleaned, fully encoded table to disk; the index is written as the
# first column (to_csv's default).
final_df.to_csv(path_or_buf="cleaned_listings.csv", index=True)