In [31]:
#Import Dependencies
from pathlib import Path

import numpy as np
import pandas as pd

In [32]:
# Load the listings export, keep only the modelling columns, and clean them
# into `data_df`.
file = "Resources/listings.csv"
columns_to_keep = ["zipcode", "room_type", "accommodates", "bedrooms", "bathrooms", "beds", "minimum_nights", "number_of_reviews", "price"]

# we only want to keep prices below $800.  Everything above $800 is treated as an outlier
MAX_PRICE = 800

# Read zipcode as text so every value goes through the same string clean-up
# below, regardless of how pandas would otherwise infer the column's type.
# Rows with a null in any kept column are dropped; dropna returns a new frame
# instead of mutating a column slice in place.
df = pd.read_csv(file, dtype={"zipcode": str})[columns_to_keep].dropna(how="any")

# price is a string.  Remove $ and thousands separators, then convert to float.
# https://stackoverflow.com/questions/32464280/converting-currency-with-to-numbers-in-python-pandas
# The pattern is a raw string: "\$" is not a valid escape in a plain string literal.
data_df = df.assign(price=df["price"].replace(r"[\$,]", "", regex=True).astype(float))

# there is one row where they have "CA" for zipcode.  Drop that row,
# and drop hotel rooms from room_type.
keep_row = (data_df["zipcode"] != "CA") & (data_df["room_type"] != "Hotel room")
data_df = data_df[keep_row].copy()

# some zipcodes start with "CA ".  Strip that prefix, then convert to integers.
# https://stackoverflow.com/questions/13682044/remove-unwanted-parts-from-strings-in-a-column
# TODO: confirm whether zipcode really needs to be an integer downstream.
data_df["zipcode"] = data_df["zipcode"].replace("CA ", "", regex=True).astype(int)

# Apply the price cap.  .copy() makes data_df an independent frame so later
# cells can add columns to it without a SettingWithCopyWarning.
data_df = data_df[data_df["price"] < MAX_PRICE].copy()

data_df.head()

Unnamed: 0,zipcode,room_type,accommodates,bedrooms,bathrooms,beds,minimum_nights,number_of_reviews,price
0,94117,Entire home/apt,3,1.0,1.0,2.0,1,240,170.0
1,94110,Entire home/apt,5,2.0,1.0,3.0,30,111,235.0
2,94117,Private room,2,1.0,4.0,1.0,32,19,65.0
3,94117,Private room,2,1.0,4.0,1.0,32,8,65.0
4,94117,Entire home/apt,4,2.0,1.5,2.0,5,28,703.0


In [33]:
# Display the dtype of each column in data_df.
data_df.dtypes

zipcode                int32
room_type             object
accommodates           int64
bedrooms             float64
bathrooms            float64
beds                 float64
minimum_nights         int64
number_of_reviews      int64
price                float64
dtype: object

In [34]:
# Build the edges and labels for 80 price bins, each 10 wide, from 0 up to 799.99.
lower_bounds = range(0, 800, 10)
upper_bounds = [round(lower + 9.99, 3) for lower in lower_bounds]

# The edge list starts at 0 and is followed by the upper edge of every bin.
bins = [0, *upper_bounds]

# One label per bin, e.g. "bin 10-19.99".
group_names = [
    "bin " + str(lower) + "-" + str(upper)
    for lower, upper in zip(lower_bounds, upper_bounds)
]

print(bins, group_names)
print(len(bins), len(group_names))

[0, 9.99, 19.99, 29.99, 39.99, 49.99, 59.99, 69.99, 79.99, 89.99, 99.99, 109.99, 119.99, 129.99, 139.99, 149.99, 159.99, 169.99, 179.99, 189.99, 199.99, 209.99, 219.99, 229.99, 239.99, 249.99, 259.99, 269.99, 279.99, 289.99, 299.99, 309.99, 319.99, 329.99, 339.99, 349.99, 359.99, 369.99, 379.99, 389.99, 399.99, 409.99, 419.99, 429.99, 439.99, 449.99, 459.99, 469.99, 479.99, 489.99, 499.99, 509.99, 519.99, 529.99, 539.99, 549.99, 559.99, 569.99, 579.99, 589.99, 599.99, 609.99, 619.99, 629.99, 639.99, 649.99, 659.99, 669.99, 679.99, 689.99, 699.99, 709.99, 719.99, 729.99, 739.99, 749.99, 759.99, 769.99, 779.99, 789.99, 799.99] ['bin 0-9.99', 'bin 10-19.99', 'bin 20-29.99', 'bin 30-39.99', 'bin 40-49.99', 'bin 50-59.99', 'bin 60-69.99', 'bin 70-79.99', 'bin 80-89.99', 'bin 90-99.99', 'bin 100-109.99', 'bin 110-119.99', 'bin 120-129.99', 'bin 130-139.99', 'bin 140-149.99', 'bin 150-159.99', 'bin 160-169.99', 'bin 170-179.99', 'bin 180-189.99', 'bin 190-199.99', 'bin 200-209.99', 'bin 210-2

In [35]:
# Assign each listing to a price bin using the `bins` edges and `group_names` labels.
# pd.cut builds intervals that are closed on the right only, so without
# include_lowest=True a price exactly equal to the first edge (0) would fall
# outside every bin and end up as NaN in price_bin.
data_df["price_bin"] = pd.cut(
    data_df["price"],
    bins,
    labels=group_names,
    include_lowest=True,
)
data_df.head()

Unnamed: 0,zipcode,room_type,accommodates,bedrooms,bathrooms,beds,minimum_nights,number_of_reviews,price,price_bin
0,94117,Entire home/apt,3,1.0,1.0,2.0,1,240,170.0,bin 170-179.99
1,94110,Entire home/apt,5,2.0,1.0,3.0,30,111,235.0,bin 230-239.99
2,94117,Private room,2,1.0,4.0,1.0,32,19,65.0,bin 60-69.99
3,94117,Private room,2,1.0,4.0,1.0,32,8,65.0,bin 60-69.99
4,94117,Entire home/apt,4,2.0,1.5,2.0,5,28,703.0,bin 700-709.99


In [36]:
# Write the cleaned frame, without its index, to Output/cleaned_data.csv.
# The output folder is created first so the export also works when the
# directory does not exist yet (e.g. on a fresh checkout).
output_path = Path("Output/cleaned_data.csv")
output_path.parent.mkdir(parents=True, exist_ok=True)
data_df.to_csv(output_path, index=False)