In [6]:
import csv
import os
import pandas as pd
import jenkspy
import folium
from folium import plugins

In [7]:
os.getcwd()

'/home/jupyter/Airbnb'

In [8]:
data = pd.read_csv('/home/jupyter/Airbnb/listings.csv')
data.head()

Unnamed: 0,id,name,host_id,host_name,neighbourhood_group,neighbourhood,latitude,longitude,room_type,price,minimum_nights,number_of_reviews,last_review,reviews_per_month,calculated_host_listings_count,availability_365
0,2318,Casa Madrona - Urban Oasis 1 block from the park!,2536,Megan,Central Area,Madrona,47.61082,-122.29082,Entire home/apt,296,30,28,2019-08-30,0.21,2,84
1,5682,"Cozy Studio, min. to downtown -WiFi",8993,Maddy,Delridge,South Delridge,47.52398,-122.35989,Entire home/apt,48,3,462,2018-11-24,3.99,1,0
2,6606,"Fab, private seattle urban cottage!",14942,Joyce,Other neighborhoods,Wallingford,47.65411,-122.33761,Entire home/apt,90,2,147,2019-09-07,1.19,3,85
3,9419,Glorious sun room w/ memory foambed,30559,Angielena,Other neighborhoods,Georgetown,47.55062,-122.32014,Private room,62,2,144,2019-09-02,1.29,8,365
4,9460,Downtown Convention Center B&B -- Free Minibar,30832,Siena,Downtown,First Hill,47.61265,-122.32936,Private room,99,3,443,2019-09-02,3.62,4,150


## **Data Understanding Phase**

In [9]:
data.shape

(9040, 16)

In [10]:
data.columns

Index(['id', 'name', 'host_id', 'host_name', 'neighbourhood_group',
       'neighbourhood', 'latitude', 'longitude', 'room_type', 'price',
       'minimum_nights', 'number_of_reviews', 'last_review',
       'reviews_per_month', 'calculated_host_listings_count',
       'availability_365'],
      dtype='object')

In [11]:
data.columns[data.isnull().mean()==0] #Data With Nulls: name, reviews_per_month, last_review, number_of_reviews

Index(['id', 'host_id', 'neighbourhood_group', 'neighbourhood', 'latitude',
       'longitude', 'room_type', 'price', 'minimum_nights',
       'number_of_reviews', 'calculated_host_listings_count',
       'availability_365'],
      dtype='object')

# **Data Preparation Phase**

In [12]:
data.room_type.unique()

array(['Entire home/apt', 'Private room', 'Shared room', 'Hotel room'],
      dtype=object)

##### **Creating frequency count of prices**

In [13]:
data.price.describe()

count    9040.000000
mean      169.111283
std       191.104568
min         0.000000
25%        82.000000
50%       120.000000
75%       189.000000
max      5400.000000
Name: price, dtype: float64

#### **Getting natural breaks in the pricing data by using Jenks natural breaks algorithm**
##### *- Jenks helps to create frequency distribution based on smallest variance within a group*

In [14]:
breaks = jenkspy.jenks_breaks(data.price, nb_class=5)
breaks

[0.0, 137.0, 280.0, 650.0, 2000.0, 5400.0]

In [15]:
df = data.loc[:,['id','neighbourhood_group','neighbourhood','latitude','longitude','room_type','price','minimum_nights']]

#### **Splitting price data into VeryLow, Low, Medium, High, VeryHigh**

In [16]:
filter_method = lambda x: '0_VeryLow' if (0 < x <= 137) else '1_Low' if (137 < x <= 280) else '2_Medium' if (280 < x <= 650) else '3_High' if (650 < x <= 2000) else '4_VeryHigh' if (2000 < x <= 5400) else None
'''The lambda X takes each row and creates the price_bracket column
based on price value. Price_Bracket segregates price into categories
based on ranges identified using Jenks Algorithm'''

'The lambda X takes each row and creates the price_bracket column\nbased on price value. Price_Bracket segregates price into categories\nbased on ranges identified using Jenks Algorithm'

In [17]:
df['price_bracket'] = df['price'].apply(filter_method)
df.head()

Unnamed: 0,id,neighbourhood_group,neighbourhood,latitude,longitude,room_type,price,minimum_nights,price_bracket
0,2318,Central Area,Madrona,47.61082,-122.29082,Entire home/apt,296,30,2_Medium
1,5682,Delridge,South Delridge,47.52398,-122.35989,Entire home/apt,48,3,0_VeryLow
2,6606,Other neighborhoods,Wallingford,47.65411,-122.33761,Entire home/apt,90,2,0_VeryLow
3,9419,Other neighborhoods,Georgetown,47.55062,-122.32014,Private room,62,2,0_VeryLow
4,9460,Downtown,First Hill,47.61265,-122.32936,Private room,99,3,0_VeryLow


##### *- Since the columns (name, reviews_per_month, last_review, number_of_reviews) will not be required for our analysis we will be dropping it. It contains null values while the other columns do not have nulls*

In [18]:
df.dropna().shape

(9038, 9)

In [19]:
df1 = df[['neighbourhood','price']].groupby(['neighbourhood']).agg(['mean']).reset_index()

In [20]:
df1.columns = df1.columns.get_level_values(1)

In [21]:
df1.columns = ['neighbourhood', 'mean']

In [22]:
df1['Price_bracket'] = df1['mean'].apply(filter_method)

In [23]:
df1.Price_bracket.unique()

array(['1_Low', '0_VeryLow', '2_Medium'], dtype=object)

In [24]:
df1.sort_values(by='Price_bracket', ascending = False)

Unnamed: 0,neighbourhood,mean,Price_bracket
29,Harrison/Denny-Blaine,352.189189,2_Medium
35,International District,423.935484,2_Medium
59,Pike-Market,478.786477,2_Medium
12,Central Business District,332.891697,2_Medium
62,Portage Bay,198.551724,1_Low
...,...,...,...
47,Mid-Beacon Hill,98.875000,0_VeryLow
53,North Beacon Hill,120.016043,0_VeryLow
54,North College Park,79.643836,0_VeryLow
55,North Delridge,119.265625,0_VeryLow


# **Modeling / Evaluation and visualization Phase** 

## **Is there any relation between minimum night stay and room prices?**

In [25]:
df['price'].corr(df['minimum_nights'])

-0.02366752376848446

##### **RESULT: Since the correlation is very minimal, no significant relationship exist between price and minimum nughts stay**

## **Location suggestion for an affluent traveller**

In [26]:
list = ['3_High','4_VeryHigh']
df2 = df[df.price_bracket.isin(list)].loc[:,['latitude','longitude']]

In [27]:
m = folium.Map([47.6062,-122.3321], zoom_start=11)
m

In [28]:
for index, row in df2.iterrows():
    folium.CircleMarker([row['latitude'],row['longitude']]
                        , radius=2
                        , fill_color="#3db7e4", # divvy color
                       ).add_to(m)
m

### **Observation: From the picture it can be noticed that most of the high priced airbnbs are located near downtown** 

## **What are popular stay option for affluent traveller**

In [42]:
df.head(5)

Unnamed: 0,id,neighbourhood_group,neighbourhood,latitude,longitude,room_type,price,minimum_nights,price_bracket
0,2318,Central Area,Madrona,47.61082,-122.29082,Entire home/apt,296,30,2_Medium
1,5682,Delridge,South Delridge,47.52398,-122.35989,Entire home/apt,48,3,0_VeryLow
2,6606,Other neighborhoods,Wallingford,47.65411,-122.33761,Entire home/apt,90,2,0_VeryLow
3,9419,Other neighborhoods,Georgetown,47.55062,-122.32014,Private room,62,2,0_VeryLow
4,9460,Downtown,First Hill,47.61265,-122.32936,Private room,99,3,0_VeryLow


In [39]:
list=['3_High','4_VeryHigh']
Type=df[df.price_bracket.isin(list)].loc[:,['neighbourhood','room_type']]

In [40]:
Type.describe()

Unnamed: 0,neighbourhood,room_type
count,262,262
unique,35,2
top,Pike-Market,Entire home/apt
freq,93,259


In [44]:
df[df.price_bracket.isin(list)].loc[:,['price']].describe()

Unnamed: 0,price
count,262.0
mean,1027.992366
std,446.441606
min,690.0
25%,999.0
50%,999.0
75%,1002.0
max,5400.0


####  **The mean average price of high cost airbnb is Dollar 1024.99 per night. There is wide standard deviation of 446, giving a oveview on the range of data. The actual range is 5400-690 = 4710.** 

In [33]:
#### **Observation: Entire home/apartment a

In [34]:
n=Type.groupby('neighbourhood')['neighbourhood'].value_counts()

In [35]:
n #Neighbourhood with expensive airbnbs

neighbourhood              neighbourhood            
Alki                       Alki                          1
Belltown                   Belltown                     30
Briarcliff                 Briarcliff                    1
Broadway                   Broadway                      2
Cedar Park                 Cedar Park                    2
Central Business District  Central Business District    37
Columbia City              Columbia City                 1
East Queen Anne            East Queen Anne               3
Georgetown                 Georgetown                    1
Greenwood                  Greenwood                     3
Haller Lake                Haller Lake                   1
Harrison/Denny-Blaine      Harrison/Denny-Blaine         2
International District     International District       40
Lawton Park                Lawton Park                   1
Leschi                     Leschi                        2
Lower Queen Anne           Lower Queen Anne              4
Loy

In [76]:
n.to_frame('Count').sort_values(by=['Count'], ascending = False)

Unnamed: 0_level_0,Unnamed: 1_level_0,Count
neighbourhood,neighbourhood,Unnamed: 2_level_1
Pike-Market,Pike-Market,93
International District,International District,40
Central Business District,Central Business District,37
Belltown,Belltown,30
Pioneer Square,Pioneer Square,6
Lower Queen Anne,Lower Queen Anne,4
Minor,Minor,4
West Queen Anne,West Queen Anne,4
East Queen Anne,East Queen Anne,3
Greenwood,Greenwood,3
