In [28]:
import pandas as pd
import numpy as np
import matplotlib
import seaborn as sb
import matplotlib.pyplot as plt
import matplotlib.cm as cmx
import matplotlib.colors as colors
import math
import pylab
import scipy.stats as stats
%matplotlib inline

In [29]:
cols = [
    'id',
    'listing_url',
    'name',
    'latitude',
    'longitude',
    'neighbourhood_cleansed',
    'property_type',
    'room_type',
    'accommodates',
    'bedrooms',
    'bathrooms',
    'beds',
    'bed_type',
    'amenities',
    'price',
    'minimum_nights',
    'availability_365',
    'reviews_per_month',
    'review_scores_rating',
]

data = pd.read_csv('paris.csv', usecols=cols)

In [30]:
data.head()

Unnamed: 0,id,listing_url,name,neighbourhood_cleansed,latitude,longitude,property_type,room_type,accommodates,bathrooms,bedrooms,beds,bed_type,amenities,price,minimum_nights,availability_365,review_scores_rating,reviews_per_month
0,3109,https://www.airbnb.com/rooms/3109,zen and calm,Observatoire,48.83349,2.31852,Apartment,Entire home/apt,2,1.0,0.0,1.0,Real Bed,"{Internet,Wifi,Kitchen,""Paid parking off premi...",$60.00,2,246,100.0,0.24
1,5396,https://www.airbnb.com/rooms/5396,Explore the heart of old Paris,Hôtel-de-Ville,48.851,2.35869,Apartment,Entire home/apt,2,1.0,0.0,1.0,Pull-out Sofa,"{TV,""Cable TV"",Internet,Wifi,Kitchen,Heating,W...",$115.00,1,70,90.0,1.51
2,7397,https://www.airbnb.com/rooms/7397,MARAIS - 2ROOMS APT - 2/4 PEOPLE,Hôtel-de-Ville,48.85758,2.35275,Apartment,Entire home/apt,4,1.0,2.0,2.0,Real Bed,"{TV,""Cable TV"",Internet,Wifi,Kitchen,""Paid par...",$119.00,10,257,94.0,2.45
3,7964,https://www.airbnb.com/rooms/7964,Large & sunny flat with balcony !,Opéra,48.87464,2.34341,Apartment,Entire home/apt,2,1.0,1.0,1.0,Real Bed,"{TV,""Cable TV"",Internet,Wifi,Kitchen,""Buzzer/w...",$130.00,6,352,96.0,0.05
4,8522,https://www.airbnb.com/rooms/8522,GREAT FLAT w/ CITY VIEW,Ménilmontant,48.86528,2.39326,Apartment,Entire home/apt,3,1.0,1.0,1.0,Real Bed,"{Internet,Wifi,Kitchen,""Washer / Dryer"",Elevat...",$90.00,3,255,100.0,0.01


In [31]:
total = data.isnull().sum().sort_values(ascending=False)
percent = (data.isnull().sum())/data.isnull().count().sort_values(ascending=False)
missing_data = pd.concat([total, percent], axis=1, keys=['Total','Percent'], sort=False).sort_values('Total', ascending=False)
missing_data.head(40)

Unnamed: 0,Total,Percent
review_scores_rating,14386,0.221425
reviews_per_month,13285,0.204479
beds,443,0.006819
bedrooms,80,0.001231
name,66,0.001016
bathrooms,57,0.000877
property_type,0,0.0
listing_url,0,0.0
availability_365,0,0.0
neighbourhood_cleansed,0,0.0


In [32]:
# remove NaN values from dataframe
original = len(data)

data = data.dropna(how='any', subset=['name', 'bathrooms', 'bedrooms', 'beds'])

# convert formatting for price
data['price'] = (data['price'].str.replace(r'[^-+\d.]', '').astype(float))

# drop any inconsistent values
print('Number of Accommodates 0:', len(data[data['accommodates'] == 0]))
print('Number of Bedrooms 0:', len(data[data['bedrooms'] == 0]))
print('Number of Beds 0:', len(data[data['beds'] == 0]))
print('Number of Listings with Price $0.00:', len(data[data['price'] == 0.00]))

data = data[data['accommodates'] != 0]
data = data[data['bedrooms'] != 0]
data = data[data['beds'] != 0]
data = data[data['price'] != 0.00]

# convert amenities
# data['amenities'] = data['amenities'].apply(lambda x: len(x.split(',')))

print('Number of original values:', original)
print('Number of NaN values removed:', original - len(data))
print('Number of listings:', len(data))

Number of Accommodates 0: 0
Number of Bedrooms 0: 13360
Number of Beds 0: 1346
Number of Listings with Price $0.00: 8
Number of original values: 64970
Number of NaN values removed: 14778
Number of listings: 50192


In [33]:
# turn NaN scores with 0 reviews into 'No Reviews'
idx_vals = data['review_scores_rating'][data['reviews_per_month'] == 0].index.values.tolist()
data.loc[idx_vals, ('review_scores_rating')] = data['review_scores_rating'].fillna("No Reviews", inplace=True)
data.loc[idx_vals, ('reviews_per_month')] = data['reviews_per_month'].fillna(0, inplace=True)

In [34]:
data.head()

Unnamed: 0,id,listing_url,name,neighbourhood_cleansed,latitude,longitude,property_type,room_type,accommodates,bathrooms,bedrooms,beds,bed_type,amenities,price,minimum_nights,availability_365,review_scores_rating,reviews_per_month
2,7397,https://www.airbnb.com/rooms/7397,MARAIS - 2ROOMS APT - 2/4 PEOPLE,Hôtel-de-Ville,48.85758,2.35275,Apartment,Entire home/apt,4,1.0,2.0,2.0,Real Bed,"{TV,""Cable TV"",Internet,Wifi,Kitchen,""Paid par...",119.0,10,257,94,2.45
3,7964,https://www.airbnb.com/rooms/7964,Large & sunny flat with balcony !,Opéra,48.87464,2.34341,Apartment,Entire home/apt,2,1.0,1.0,1.0,Real Bed,"{TV,""Cable TV"",Internet,Wifi,Kitchen,""Buzzer/w...",130.0,6,352,96,0.05
4,8522,https://www.airbnb.com/rooms/8522,GREAT FLAT w/ CITY VIEW,Ménilmontant,48.86528,2.39326,Apartment,Entire home/apt,3,1.0,1.0,1.0,Real Bed,"{Internet,Wifi,Kitchen,""Washer / Dryer"",Elevat...",90.0,3,255,100,0.01
5,9359,https://www.airbnb.com/rooms/9359,"Cozy, Central Paris: WALK or VELIB EVERYWHERE !",Louvre,48.85899,2.34735,Apartment,Entire home/apt,2,1.0,1.0,1.0,Real Bed,"{Internet,Wifi,Kitchen,Elevator,Heating,Essent...",75.0,180,169,No Reviews,0.0
6,9952,https://www.airbnb.com/rooms/9952,Paris petit coin douillet,Popincourt,48.86227,2.37134,Apartment,Entire home/apt,2,1.0,1.0,1.0,Real Bed,"{TV,Internet,Wifi,Kitchen,""Paid parking off pr...",75.0,5,336,98,0.25


In [35]:
# read to csv
data.to_csv('paris_pre.csv', index=False)