## Istanbul Airbnb Listings

In [None]:
import pandas as pd
import missingno as msno
import numpy as np
import matplotlib.pyplot as plt
import math
from tqdm import tqdm
import seaborn as sns
%matplotlib inline

In [None]:
import folium
from folium import *
from folium.plugins import *

In [None]:
import os
# Walk the Kaggle input directory and print the full path of every file found.
for root, _, files in os.walk('/kaggle/input'):
    for fname in files:
        full_path = os.path.join(root, fname)
        print(full_path)

In [None]:
df = pd.read_csv('/kaggle/input/airbnb-istanbul-dataset/AirbnbIstanbul.csv')

In [None]:
df.head(15)

In [None]:
df.info()

In [None]:
msno.matrix(df,sparkline=False)

In [None]:
msno.bar(df)

In [None]:
fig,ax = plt.subplots(figsize=(10,6))
msno.heatmap(df,ax=ax)
plt.show()
#x.set_ylim([0,5])

In [None]:
df = df.drop(['neighbourhood_group'],axis=1)

In [None]:
df.head()

In [None]:
df.last_review = pd.to_datetime(df.last_review)

In [None]:
df.reviews_per_month.fillna(0.0,inplace=True)

In [None]:
df.host_name.fillna('Not Stated',inplace = True)

In [None]:
df.name.fillna('No Description',inplace = True)

In [None]:
df.last_review.fillna(df.last_review.min(),inplace=True)

In [None]:
df.info()

In [None]:
df[df.duplicated(['host_id','neighbourhood'],keep=False)].sort_values(['host_id'])

In [None]:
dup_df = df[df.duplicated(['host_id','neighbourhood'],keep=False)].sort_values(['host_id'])

In [None]:
dup_dict = dup_df.host_id.value_counts()

In [None]:
import operator
sorted_x = sorted(dup_dict.items(), key=operator.itemgetter(1))

In [None]:
top10 = [i[0] for i in sorted_x[::-1][:10]]

In [None]:
top10_hosts = df[df.host_id.isin(top10)]

In [None]:
top10_hosts.host_name.value_counts()

In [None]:
top10_hosts.groupby('host_name')['number_of_reviews'].mean().reset_index()

In [None]:
top10_host_numrv = top10_hosts.groupby('host_name')['number_of_reviews'].mean().reset_index()

In [None]:
top10_hosts.groupby('host_name')['price'].mean().reset_index()

In [None]:
top10_host_price = top10_hosts.groupby('host_name')['price'].mean().reset_index()

In [None]:
names,n = top10_host_numrv['host_name'],top10_host_numrv['number_of_reviews']

In [None]:
names,price = top10_host_price['host_name'],top10_host_price['price']

In [None]:
f,ax = plt.subplots(figsize=(12,8))
ax.barh(names,n)
ax.axvline(df.number_of_reviews.mean(),color='r')
ax.set_title('Mean # of Reviews of Top10 Multiple Property Owners',fontsize=20,pad=20)
plt.show()

In [None]:
f,ax = plt.subplots(figsize=(12,8))
ax.barh(names,price)
ax.axvline(df.price.mean(),color='r')
ax.set_title('Mean # Prices set by Top10 Multiple Property Owners',fontsize=20,pad=20)
plt.show()

In [None]:
# Map centre: mean latitude and mean longitude over all listings.
city_pos = [df[axis].mean() for axis in ('latitude', 'longitude')]

In [None]:
def embed_map(m, file_name):
    """Save map *m* to *file_name* and return an IFrame pointing at that file.

    Args:
        m: Object exposing ``save(path)``; this notebook passes a folium map.
        file_name: Path of the HTML file to write and then embed.

    Returns:
        An ``IPython.display.IFrame`` of full width and 500px height.
    """
    from IPython.display import IFrame

    m.save(file_name)
    frame = IFrame(file_name, width='100%', height='500px')
    return frame

In [None]:
# %-format templates for the map markers: `tip` takes a number and appends
# " TL", `pop` takes any text. The marker loop uses `pop` for the tooltip and
# `tip` for the popup.
tip, pop = '<i>%d TL</i>', '<i>%s</i>'

### Let's See the Crazy Prices

In [None]:
#map_df = df.sample(1000)
map_df = df[df.price>10000] 

In [None]:
# Create the map
istanbul = folium.Map(location=city_pos, tiles='cartodbpositron', zoom_start=10)
# Add points to the map
mc = MarkerCluster()
for idx, row in map_df.iterrows():
    name = row['name']
    price = row['price']
    mc.add_child(Marker([row['latitude'], row['longitude']], tooltip=pop%name, popup=tip%price))
istanbul.add_child(mc)

# Display the map
embed_map(istanbul, 'istanbul.html')


In [None]:
df['days_since_last_rv'] = (df.last_review.max() - df.last_review).dt.days

In [None]:
corr = df.corr()

In [None]:
# Generate a mask for the upper triangle (diagonal included) so each pair of
# columns is drawn once. The builtin `bool` replaces the `np.bool` alias,
# which was deprecated and later removed from NumPy.
mask = np.zeros_like(corr, dtype=bool)
mask[np.triu_indices_from(mask)] = True

# Set up the matplotlib figure
f, ax = plt.subplots(figsize=(11, 9))

# Generate a custom diverging colormap
cmap = sns.diverging_palette(220, 10, as_cmap=True)

# Draw the heatmap with the mask and correct aspect ratio
sns.heatmap(corr, mask=mask, cmap=cmap, vmax=.3, center=0,
            square=True, linewidths=.5, cbar_kws={"shrink": .5})

plt.show()

In [None]:
# One price box per neighbourhood, with the x tick labels rotated 90 degrees.
f, ax = plt.subplots(figsize=(12, 8))
df.boxplot(column='price', by='neighbourhood', rot=90, ax=ax)
#ax.set_ylim([0,10000])
# Blank out the figure-level title before setting the axes title.
f.suptitle('')
ax.set_xlabel('', fontsize=15)
ax.set_ylabel('Price (TL)', fontsize=15)
ax.set_title('Price Grouped by Neighbourhood', fontsize=15)
ax.title.set_y(1.05)
plt.show()

In [None]:
nb_price = df.groupby('neighbourhood')['price'].agg([np.mean,np.std]).reset_index()
n = nb_price['neighbourhood'].values
m = nb_price['mean'].values
std = nb_price['std'].values

In [None]:
nb_price

In [None]:
f,ax = plt.subplots(figsize=(12,8))
ax.barh(n,m,xerr=std)
ax.set_xlabel('Price (TL)',fontsize=15,x=0.42)

plt.show()

In [None]:
nb_reviews = df.groupby('neighbourhood')['reviews_per_month'].agg([np.mean,np.std]).reset_index()
n = nb_reviews['neighbourhood'].values
r = nb_reviews['mean'].values
std = nb_reviews['std'].values
mean = r.mean()

In [None]:
f,ax = plt.subplots(figsize=(12,8))
ax.barh(n,r,xerr=std)
ax.set_xlabel('Reviews per Month',fontsize=15,x=0.42)
ax.axvline(x=mean,ls='-.',color='r')

plt.show()

In [None]:
# Review count against days since the last review, restricted to listings
# priced below 1000.
f,ax = plt.subplots(figsize=(12,8))
df[df.price<1000].plot.scatter('days_since_last_rv','number_of_reviews',ax=ax)
# The y axis is the number_of_reviews column, so label it "Reviews"; the
# previous label said "Reviewers" and disagreed with the title.
ax.set_ylabel('Number of Reviews',fontsize = 15,labelpad=10)
ax.set_xlabel('Days Since Last Review',fontsize = 15,labelpad=10)
ax.set_title('Number of Reviews on a Listing vs. Days Past Since Last Review',fontsize=20,pad=20)
plt.show()

In [None]:
#POSSIBLE OTHER THINGS TO LOOK AT
#Limit to Top Neighbourhoods
#Limit to Top Hosts
#Availability Distribution in Top NB
#Availability overall on heatmap
#Availability vs Reviews per month and num of reviews

### Top Neighbourhoods

In [None]:
nb_group = df.groupby(['neighbourhood'])

In [None]:
nb_df = nb_group.agg({'number_of_reviews':['sum','max','mean'],
              'reviews_per_month':['sum','max','mean'],
              'price':['mean','median']}).reset_index()

#### 1. By Median Price

In [None]:
# Neighbourhoods sorted by median price in ascending order, so the largest
# value is the last bar drawn.
fig, ax = plt.subplots(figsize=(12, 10))
top20nb_price = nb_df.sort_values(by=('price', 'median'), ascending=True)[['neighbourhood', 'price']]
ax.barh(
    top20nb_price['neighbourhood'].values,
    top20nb_price[('price', 'median')].values,
)


#### 2. By Sum of Reviews

In [None]:
# Neighbourhoods sorted by total number of reviews in ascending order, so the
# largest value is the last bar drawn.
fig, ax = plt.subplots(figsize=(12, 10))
top20nb_review_sum = nb_df.sort_values(by=('number_of_reviews', 'sum'), ascending=True)[['neighbourhood', 'number_of_reviews']]
ax.barh(
    top20nb_review_sum['neighbourhood'].values,
    top20nb_review_sum[('number_of_reviews', 'sum')].values,
)

#### 3. By Mean Monthly Review

In [None]:
# Neighbourhoods sorted by mean reviews per month in ascending order.
fig,ax = plt.subplots(figsize=(12,10))
top20nb_review_monthly_mean = nb_df.sort_values(('reviews_per_month','mean'),ascending=True)[['neighbourhood','reviews_per_month']]
# Draw on the axes created above, as the median-price and review-sum charts
# do; the earlier `ax = plt.barh(...)` rebound `ax` to the returned bar
# container and lost the Axes handle.
ax.barh(top20nb_review_monthly_mean.neighbourhood.values,top20nb_review_monthly_mean.reviews_per_month['mean'].values)

### Availability of Least and Most Reviewed Neighbourhoods

In [None]:
# top20nb_review_sum is sorted ascending by total reviews: the final five
# rows, reversed, are the most reviewed neighbourhoods (largest first) and
# the first five rows are the least reviewed.
top5 = top20nb_review_sum['neighbourhood'].values[-5:][::-1]
last5 = top20nb_review_sum['neighbourhood'].values[:5]

In [None]:
# Two stacked panels of availability_365 distributions: the neighbourhoods in
# `top5` on the upper axes, those in `last5` on the lower one.
# NOTE(review): sns.distplot is deprecated in recent seaborn releases -
# confirm the installed version still provides it before re-running.
fig, (ax1, ax2) = plt.subplots(nrows=2, ncols=1, figsize=(12, 12))

for panel, hoods in ((ax1, top5), (ax2, last5)):
    for n in hoods:
        values = df.loc[df.neighbourhood == n, 'availability_365'].values
        sns.distplot(values, ax=panel, label=n)

ax1.legend()
ax2.legend()


### TODO: Introduce a property count for scoring the importance of each neighbourhood

### Star Neighbourhood: Beyoğlu!

In [None]:
# Listings located in the 'Beyoglu' neighbourhood.
# NOTE(review): assumes the data uses the ASCII spelling 'Beyoglu' - confirm.
bey_df = df.loc[df['neighbourhood'] == 'Beyoglu']