<img src="./assets/airbnb_logo.png"
    style="width:300px; float: right; margin: 0 40px 40px 40px;"></img>
# Group Project AirBnB
**Useful links**

<a href="https://github.com/ShimantoRahman/aCRM-Group-Project-AirBnB"><img src="./assets/github_logo.png" style="width:120px; margin: 0 0 40px 40px;"></a>

> [Inside AirBnB: New York](http://insideairbnb.com/new-york-city/?fbclid=IwAR3lvDyNFboZqns1jNJ8v4OzqzG8sLFsqeSlRjqb_-tyvk4iM_XRSYdwmEQ)

> [Airbnb Rental Listings Dataset Mining](https://towardsdatascience.com/airbnb-rental-listings-dataset-mining-f972ed08ddec)

## 1 Setup
### 1.1 Import modules

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import nltk
import os
import folium
import json
from sklearn.preprocessing import LabelEncoder
from sklearn.linear_model import LinearRegression
from nltk.corpus import stopwords
from collections import Counter
# nltk.download('punkt')
# nltk.download('stopwords')

In [None]:
nltk.download('punkt')
nltk.download('stopwords')
!pip install -U textblob
!pip install WordCloud
!pip install folium

![green-divider](./assets/green_divider.png)

### 1.2 Read data

In [None]:
calendar_detail = pd.read_csv("./data/calendar_detail.csv")
listings_summary = pd.read_csv("./data/listings_summary.csv")
reviews_summary = pd.read_csv("./data/reviews_summary.csv")
neighbourhoods = pd.read_csv("./data/neighbourhoods.csv")

In [None]:
listings_detail = pd.read_csv("./data/listings_detail.csv")

In [None]:
reviews_detail = pd.read_csv("./data/reviews_detail.csv")

In [None]:
!pip install folium
!pip install -U textblob
!pip install WordCloud

In [None]:
calendar_detail.head()

In [None]:
listings_summary.head()

In [None]:
reviews_summary.head()

In [None]:
neighbourhoods.head()

In [None]:
listings_detail.head()

In [None]:
reviews_detail.head()

![green-divider](./assets/green_divider.png)

### 1.3 Data preparation
#### 1.3.1 Detecting NaN values

In [None]:
print("listings_summary\n")
print(listings_summary.isnull().sum())

In [None]:
print("listings_detail\n")
print(listings_detail.isnull().sum())

In [None]:
print("reviews_summary\n")
print(reviews_summary.isnull().sum())

In [None]:
print("reviews_detail\n")
print(reviews_detail.isnull().sum())

In [None]:
print("neighbourhoods\n")
print(neighbourhoods.isnull().sum())

In [None]:
print("\ncalendar_detail\n")
print(calendar_detail.isnull().sum())

![green-divider](./assets/green_divider.png)

#### 1.3.2 Cleansing data

In [None]:
# removing listings where first and last review do not match:
# a listing must have both dates or neither, so rows where exactly one
# of the two is missing are dropped (in either direction)
first_missing = listings_detail["first_review"].isnull()
last_missing = listings_detail["last_review"].isnull()
listings_detail = listings_detail[first_missing == last_missing]

# removing reviews without a comment
reviews_detail = reviews_detail.dropna(subset=['comments'])

# removing listings without a superhost value
listings_detail = listings_detail.dropna(subset=['host_is_superhost'])

In [None]:
# sanity check: displays the reviews whose comment is still missing
# (nothing is removed here; the expression only selects rows for display)
reviews_detail[reviews_detail["comments"].isna()]

In [None]:
# a listing with NaN in reviews_per_month has no first or last review,
# i.e. it has 0 reviews per month, so the gap is filled with 0
listings_detail = listings_detail.fillna({"reviews_per_month": 0})

![green-divider](./assets/green_divider.png)

#### 1.3.3 Correcting data types

In [None]:
# dates
def column_to_date(df, column):
    """Parse ``df[column]`` as ``YYYY-MM-DD`` dates and store the result back in place.

    Nothing is returned; the parsed values overwrite the original column.
    """
    parsed = pd.to_datetime(df[column], format="%Y-%m-%d")
    df[column] = parsed
    
# every date column that gets converted, grouped by the frame it lives in
date_columns = (
    # listings_summary
    (listings_summary, ["last_review"]),
    # reviews_summary
    (reviews_summary, ["date"]),
    # calendar_detail
    (calendar_detail, ["date"]),
    # listings_detail
    (
        listings_detail,
        ["first_review", "last_review", "last_scraped", "calendar_last_scraped", "host_since"],
    ),
)
for frame, columns in date_columns:
    for date_column in columns:
        column_to_date(frame, date_column)

# reviews_detail
# NOTE(review): no column of reviews_detail is converted here - confirm whether one should be

In [None]:
# change t/f columns to 1/0
label_encoder = LabelEncoder()

def column_to_bool(df, column):
    """Add ``<column>_num`` to ``df`` holding 1 for ``"t"`` and 0 for ``"f"``.

    A fixed mapping is used instead of fitting a label encoder: an encoder
    numbers the values it happens to see, so a column containing only ``"t"``
    would be encoded as 0. Values other than ``"t"``/``"f"`` (including
    missing values) become NaN in the new column.
    """
    df[column + "_num"] = df[column].map({"t": 1, "f": 0})

# t/f flags of listings_detail, each encoded exactly once
# ("instant_bookable" used to be listed twice)
bool_columns = [
    "instant_bookable",
    "requires_license",
    "is_business_travel_ready",
    "require_guest_profile_picture",
    "require_guest_phone_verification",
    # "host_is_superhost",  # currently disabled
    "has_availability",
]
for bool_column in bool_columns:
    column_to_bool(listings_detail, bool_column)

In [None]:
# change categorical columns to numerical labels
def column_to_numeric_labels(df, column):
    """Add ``<column>_num`` to ``df`` with the labels a fresh LabelEncoder assigns to ``df[column]``."""
    encoder = LabelEncoder()
    df[column + "_num"] = encoder.fit_transform(df[column])


for categorical_column in ("neighbourhood_group", "neighbourhood", "room_type"):
    column_to_numeric_labels(listings_summary, categorical_column)

In [None]:
# changing price from "$1,234.00" [string] to 1234.0 [float]
def _price_to_float(prices):
    """Strip ``$`` and thousands separators from a price column and return it as numbers.

    ``regex=False`` makes both replacements literal: read as a regular
    expression, "$" is the end-of-string anchor, and the default of ``regex``
    differs between pandas versions. A column that is already numeric (for
    example when this cell is run a second time) is returned unchanged instead
    of failing on the ``.str`` accessor.
    """
    if pd.api.types.is_numeric_dtype(prices):
        return prices
    cleaned = prices.str.replace("$", "", regex=False).str.replace(",", "", regex=False)
    return pd.to_numeric(cleaned)


calendar_detail["price"] = _price_to_float(calendar_detail["price"])
listings_detail["price"] = _price_to_float(listings_detail["price"])

![green-divider](./assets/green_divider.png)

## 2 Analysis
### 2.1 Calculate the average listing price per neighbourhood

In [None]:
# select the price column before aggregating: averaging the whole frame also
# tries to average its text columns, which recent pandas versions reject
print(listings_summary.groupby("neighbourhood")["price"].mean())

### 2.2 Plot how the average price evolves through the year across New York.
<span style="color:red"> **TODO:** investigate peaks and prices weekdays vs prices weekends</span> 

In [None]:
# line plot of the daily mean price, drawn with DataFrame.plot()
daily_mean_price = calendar_detail.groupby("date")[["price"]].mean()
ax = daily_mean_price.plot()
ax.set_xlabel("Date")
ax.set_ylabel("Average price")
ax.set_title("Average price across the year")
plt.show()

In [None]:
# same plot drawn with an explicit seaborn function
# reset_index() turns the "date" group key into a regular column, so the frame
# does not carry "date" both as index name and as column
avg_price_day = calendar_detail[["date", "price"]].groupby("date").mean().reset_index()

sns.lineplot(x="date", y="price", data=avg_price_day)
plt.xlabel("Date")
plt.ylabel("Average price")
plt.title("Average price across the year")
plt.show()

In [None]:
# median instead of mean (axis label and title name the statistic that is plotted)
calendar_detail[["date", "price"]].groupby("date").median().plot()
plt.xlabel("Date")
plt.ylabel("Median price")
plt.title("Median price across the year")
plt.show()

### 2.3	Identify which neighborhood has the largest price fluctuations across the year. Plot the fluctuations for this neighborhood.

In [None]:
# attach the neighbourhood of each listing to its calendar rows
listing_neighbourhoods = listings_summary[["id", "neighbourhood"]]
cal_listing = calendar_detail.merge(
    listing_neighbourhoods, how="inner", left_on="listing_id", right_on="id"
)
cal_listing.head()

In [None]:
# price variance per neighbourhood; the neighbourhood at the top of the
# descending ranking is kept
price_by_neighbourhood = cal_listing.groupby("neighbourhood")[["price"]].var().reset_index()
ranked_by_variance = price_by_neighbourhood.sort_values("price", ascending=False)
nb_with_largest_var = ranked_by_variance["neighbourhood"].iloc[0]

In [None]:
# plotting the prices of the neighbourhood with the largest price fluctuations
# only date and price are kept: the text column "neighbourhood" cannot be
# averaged and is not needed once the rows are filtered
in_neighbourhood = cal_listing["neighbourhood"] == nb_with_largest_var
cal_listing.loc[in_neighbourhood, ["date", "price"]].groupby("date").mean().plot()
plt.xlabel("Date")
plt.ylabel("Average price")
plt.title("Average price in neighbourhood with largest price fluctuations")
plt.show()

### 2.4 In marketing, there is a phenomenon known as ‘the long tail’ (Hint: look it up). This also translates to the number of reviews. Plot this on an intuitive graph.

#### Definition
**The long tail** is a business strategy that allows companies to realize significant profits 
by selling low volumes of hard-to-find items to many customers, 
instead of only selling large volumes of a reduced number of popular items.

In [None]:
# plot reviews per listing VS listings per host

# listing count per host: listings_summary has one row per listing, so the
# host rows are de-duplicated to get exactly one row per host
host_listing_counts = listings_summary[["host_id", "calculated_host_listings_count"]].drop_duplicates("host_id")

# total number of reviews per host
id_reviews = (
    listings_summary.groupby("host_id")["number_of_reviews"]
    .sum()
    .rename("total_number_of_reviews")
    .reset_index()
)
host_count_reviews = host_listing_counts.merge(id_reviews, how="right", on="host_id")

# average number of reviews a host receives per listing
host_count_reviews["reviews_per_listing"] = (
    host_count_reviews["total_number_of_reviews"] / host_count_reviews["calculated_host_listings_count"]
)
# sort_values returns a new frame, so the result has to be assigned to take effect
host_count_reviews = host_count_reviews.sort_values(by="calculated_host_listings_count", ascending=False)
host_count_reviews.head()

In [None]:
# scatter of reviews per listing against the number of listings of the host
ax = sns.scatterplot(
    data=host_count_reviews,
    x="calculated_host_listings_count",
    y="reviews_per_listing",
    alpha=0.2,
)
ax.set_xlabel("Listings per host")
ax.set_ylabel("Reviews per listing")
ax.set_title("'The long tail'")
plt.show()

### 2.5	Run a regression to explain the price per listing. (Hint: location, reviews, etc. may all explain this).

In [None]:
from sklearn.linear_model import LinearRegression

# explanatory variables, listed once and reused for fitting and predicting
feature_columns = [
    "neighbourhood_group_num",
    "neighbourhood_num",
    "room_type_num",
    "minimum_nights",
    "number_of_reviews",
    "reviews_per_month",
    "availability_365",
]
# reviews_per_month was only imputed for listings_detail; LinearRegression
# cannot be fitted on NaN, so a missing value is treated as 0 reviews per
# month here as well
features = listings_summary[feature_columns].fillna({"reviews_per_month": 0})

lr_model = LinearRegression()
lr_model.fit(X=features, y=listings_summary["price"])
price_est = lr_model.predict(features)

print(lr_model.intercept_)
print(lr_model.coef_)

In [None]:
# model evaluation: root mean squared error of the predictions
from sklearn.metrics import mean_squared_error

price_true = listings_summary["price"]
rmse = np.sqrt(mean_squared_error(price_true, price_est))
print(rmse)

### 2.6 Find additional data sources to explain the average listing price per neighbourhood. (Hint : think demographics)

### 2.7 Plot how the average prices differ across New York using a color-coded heat map of New York neighborhoods.

In [None]:
# average listing price for every neighbourhood, as a two-column frame
price_data = listings_summary.groupby("neighbourhood")[["price"]].mean().reset_index()

In [None]:
# read geojson
# the encoding is given explicitly so the file is not decoded with a
# platform-dependent default
with open('./data/neighbourhoods.geojson', encoding="utf-8") as json_file:
    geo_json_data = json.load(json_file)

In [None]:
# plot map
m = folium.Map([40.6976637, -74.1197643], tiles='cartodbpositron', zoom_start=10)

# folium.Choropleth is the supported replacement for the legacy Map.choropleth
# method; key_on is written relative to a GeoJSON feature
folium.Choropleth(
    geo_data=geo_json_data,
    name='choropleth',
    data=price_data,
    columns=['neighbourhood', 'price'],
    key_on='feature.properties.neighbourhood',
    fill_color='YlGn',
    fill_opacity=0.7,
    line_opacity=0.2,
    legend_name='Average price',
).add_to(m)

m.save("./assets/graphs/prices.html")

m

### 2.8 The latitude of Statue of Liberty National Monument, New York, USA is 40.68927, and the longitude is -74.044502. This monument is one of the most popular tourist places in New York. Statistically test whether a distance smaller than 2 miles to the monument increases average listing price.
[Stackoverflow](https://stackoverflow.com/questions/19412462/getting-distance-between-two-points-based-on-latitude-longitude): calculate distance between 2 points based on longitude and latitude

In [None]:
from math import sin, cos, sqrt, atan2, radians

# approximate radius of earth in km
R = 6373.0


def haversine_km(lat1, lon1, lat2, lon2, radius=R):
    """Return the great-circle distance between two points (haversine formula).

    Parameters
    ----------
    lat1, lon1, lat2, lon2 : float
        Latitude and longitude of the two points, in degrees.
    radius : float
        Radius of the sphere; the result is in the same unit (km by default).
    """
    lat1, lon1, lat2, lon2 = (radians(value) for value in (lat1, lon1, lat2, lon2))

    dlon = lon2 - lon1
    dlat = lat2 - lat1

    a = sin(dlat / 2)**2 + cos(lat1) * cos(lat2) * sin(dlon / 2)**2
    # rounding can push a marginally above 1, which would make sqrt(1 - a) fail
    a = min(1.0, a)
    c = 2 * atan2(sqrt(a), sqrt(1 - a))

    return radius * c


# check against the reference value of the Stackoverflow example
distance = haversine_km(52.2296756, 21.0122287, 52.406374, 16.9251681)

print("Result:", distance)
print("Should be:", 278.546, "km")

### 2.9	Create a timeline and plot for each month the highest, Q1, the median, Q3 and lowest price on one graph. Do this for each neighborhood group as well as for the entire city. Determine which neighborhood group stands out the most and create a comparative graph of this neighborhood with all other groups.

### 2.10 Plot the number of rooms per host as a function of the number of reviews per host.
#### Method 1 (manually)

In [None]:
# compute number of listings and total number of reviews per host
# the aggregations are named by string: recent pandas versions warn when the
# builtin sum is passed, and "size" counts the rows of a group like len did
agg_dict = {"id": "size", "number_of_reviews": "sum"}
id_reviews = listings_summary.groupby("host_id").agg(agg_dict).reset_index()

sns.scatterplot(x = "id", y = "number_of_reviews", alpha = 0.4, data = id_reviews) # add jitter?
plt.xlabel("Total number of rooms per host")
plt.ylabel("Total number of reviews per host")
plt.show()

#### Method 2 (with calculated_host_listings_count)

In [None]:
# total number of reviews per host ("sum" by name instead of the builtin sum)
agg_dict = {"number_of_reviews": "sum"}
temp = listings_summary.groupby("host_id").agg(agg_dict).reset_index()

# listings_summary has one row per listing; keep one row per host so every
# host is drawn once, as in method 1
host_listing_counts = listings_summary[["host_id", "calculated_host_listings_count"]].drop_duplicates("host_id")
id_reviews_2 = host_listing_counts.merge(temp, how = "right", on = "host_id")

sns.scatterplot(x = "calculated_host_listings_count", y = "number_of_reviews", alpha = 0.4, data = id_reviews_2) # add jitter?
plt.xlabel("Total number of rooms per host")
plt.ylabel("Total number of reviews per host")
plt.show()

> CODE DOES NOT WORK YET

In [None]:
# try to add jitter to scatterplot but does not work
def rand_jitter(arr):
    """Return ``arr`` plus Gaussian noise whose standard deviation is 1% of the range of ``arr``."""
    highest, lowest = max(arr), min(arr)
    noise_scale = .01 * (highest - lowest)
    noise = np.random.randn(len(arr))
    return arr + noise * noise_scale

def jitter(x, y, s=20, c='b', marker='o', cmap=None, norm=None, vmin=None, vmax=None, alpha=None, linewidths=None, verts=None, hold=None, **kwargs):
    """Draw a seaborn scatter plot of ``x`` against ``y`` with random jitter on both.

    ``x`` and ``y`` are passed by keyword, because seaborn's first positional
    parameter is ``data``. ``c`` is forwarded as ``color`` (seaborn supplies
    its own ``color`` to matplotlib, which rejects ``c`` and ``color``
    together). Optional styling arguments are only forwarded when they were
    actually given. ``verts`` and ``hold`` are accepted for backward
    compatibility and ignored; current matplotlib no longer takes them.

    Returns the Axes returned by ``sns.scatterplot``.
    """
    optional = {
        "cmap": cmap,
        "norm": norm,
        "vmin": vmin,
        "vmax": vmax,
        "alpha": alpha,
        "linewidths": linewidths,
    }
    kwargs.update({key: value for key, value in optional.items() if value is not None})
    kwargs.setdefault("color", c)
    return sns.scatterplot(x=rand_jitter(x), y=rand_jitter(y), s=s, marker=marker, **kwargs)

jitter(x = id_reviews["id"], y = id_reviews["number_of_reviews"], alpha = 0.4)


### 2.11 Are there a lot of hosts having multiple locations? Do most people just rent their own place? Is there a ‘host long tail’? Make a comprehensive plot.

In [None]:
# histogram

### 2.12 Do hosts with multiple locations stay within the same neighbourhood? (hint: use subset)

In [None]:
# calculating:
# number of hosts with multiple locations
# number of hosts with multiple locations in different neighbourhoods
# number of hosts with multiple locations in the same neighbourhood

listings_per_host = listings_summary.groupby("host_id").size()
mult_loc = (listings_per_host > 1).sum()

# dropna=False counts a missing neighbourhood as its own value, like unique() does
neighbourhoods_per_host = listings_summary.groupby("host_id")["neighbourhood"].nunique(dropna=False)
mult_loc_diff_nb = (neighbourhoods_per_host > 1).sum()

mult_loc_same_nb = mult_loc - mult_loc_diff_nb

In [None]:
print(mult_loc)
print(mult_loc_diff_nb)
print(mult_loc_same_nb)

In [None]:
# barplot
plt.bar(["Same neighbourhood", "Different neighbourhood"], [mult_loc_same_nb, mult_loc_diff_nb])
plt.show()

### 2.13 What are the 5 most used words in reviews that are no stop words? (e.g. the, or, etc. Python can filter these automatically using packages such as NLTK).
#### NLTK data prep
> read text_analysis.csv instead, because some steps like tokenizing take very long

In [None]:
reviews_detail["comments_length"]= reviews_detail["comments"].str.len()

In [None]:
reviews_detail["comments_length"].describe()

In [None]:
reviews_detail[reviews_detail["comments_length"]==1].head(10)

In [None]:
# removing comments with just a space or a dot
# .copy() gives text_analysis its own data, so the column assignments made on
# it afterwards do not trigger pandas' SettingWithCopyWarning
is_placeholder = reviews_detail['comments'].isin([' ', '.'])
text_analysis = reviews_detail[~is_placeholder].copy()

In [None]:
# turning every comment to lower case
text_analysis["comments"] = text_analysis["comments"].str.lower()

In [None]:
# counting how many times a review has been posted
review_counts = text_analysis.comments.value_counts()

In [None]:
# request top 20 most common comments
# A lot of the reviews are automated posts
review_counts[0:20]

In [None]:
# removing automated posts (literal substring match, no regular expression)
# na=True marks a missing comment as a match, so it is dropped as well
is_automated = text_analysis["comments"].str.contains("this is an automated posting.", regex=False, na=True)
text_analysis = text_analysis[~is_automated]

In [None]:
# recounting how many times a review has been posted
review_counts = text_analysis.comments.value_counts()

In [None]:
# request top 20 most common comments
review_counts[0:20]

In [None]:
# splits comments into tokens (takes a while)
# the tokenizer is applied to the comments column directly instead of building
# a row object for every review with DataFrame.apply(axis=1)
text_analysis['tokenized_comments'] = text_analysis['comments'].apply(nltk.word_tokenize)

In [None]:
# number of tokens in every comment
text_analysis['word_count'] = text_analysis['tokenized_comments'].apply(len)

In [None]:
# tokens that are dropped: English stop words, punctuation and a few other
# words that are not useful
punctuation_tokens = {".", "?", "!", ",", ";", ":", "(", ")", "{", "}", "[", "]"}
other_tokens = {"'s", "would", "de", "n't", "us"}
filter_tokens = set(stopwords.words('english')) | punctuation_tokens | other_tokens


def _drop_filtered(tokens):
    """Return the tokens that are not in filter_tokens, in their original order."""
    return [token for token in tokens if token not in filter_tokens]


text_analysis['tokenized_filtered'] = text_analysis['tokenized_comments'].apply(_drop_filtered)

In [None]:
# write text_analysis to csv
text_analysis[["id", "comments_length", "tokenized_filtered", "word_count"]].to_csv("./data/text_analysis.csv", index = False)

#### NLTK analysis

In [None]:
# read csv
from ast import literal_eval

text_analysis = pd.read_csv("./data/text_analysis.csv")
# the token lists were written to the CSV as the text of a Python list;
# literal_eval parses that text back into a list, which also works for tokens
# that contain a quote character and for empty lists (slicing and splitting on
# "', '" does not)
text_analysis["tokenized_filtered"] = text_analysis["tokenized_filtered"].apply(literal_eval)

In [None]:
# stores 100 most common words
word_counter = Counter()
for tokens in text_analysis["tokenized_filtered"]:
    word_counter.update(tokens)
most_common_words = word_counter.most_common(100)

In [None]:
# requests 5 most common words
most_common_words[:5]

In [None]:
# word cloud of the most common words
from wordcloud import WordCloud

# the (word, count) pairs are handed over as frequencies; building a long text
# of repeated words is slow and makes generate() tokenise the words again
word_frequencies = dict(most_common_words)

wordcloud = WordCloud(width=480, height=480, margin=0).generate_from_frequencies(word_frequencies)

plt.imshow(wordcloud, interpolation='bilinear')
plt.axis("off")
plt.margins(x=0, y=0)
plt.show()

In [None]:
from wordcloud import WordCloud

# demo input: a fixed string of space-separated words
text = "Python Python Python Matplotlib Matplotlib Seaborn Network Plot Violin Chart Pandas Datascience Wordcloud Spider Radar Parrallel Alpha Color Brewer Density Scatter Barplot Barplot Boxplot Violinplot Treemap Stacked Area Chart Chart Visualization Dataviz Donut Pie Time-Series Wordcloud Wordcloud Sankey Bubble"

# build the cloud from the text
cloud_builder = WordCloud(width=480, height=480, margin=0)
wordcloud = cloud_builder.generate(text)

# show the rendered image without axes or margins
plt.imshow(wordcloud, interpolation='bilinear')
plt.axis("off")
plt.margins(x=0, y=0)
plt.show()


#### Sentiment analysis

In [None]:
# calculating sentiment for every review
# TextBlob is installed in the setup cell but was never imported
from textblob import TextBlob

reviews_detail["sentiment"] = reviews_detail["comments"].apply(lambda comment: TextBlob(comment).sentiment.polarity)

In [None]:
reviews_detail.head()

In [None]:
# average sentiment of the reviews of each listing,
# joined onto listings_summary (listings without a match keep NaN)
sentiments = reviews_detail.groupby("listing_id")[["sentiment"]].mean().reset_index()
listings_summary = listings_summary.merge(sentiments, how="left", left_on="id", right_on="listing_id")
listings_summary.head()

### 2.14 Do these most frequent words differ across neighborhoods? What are the ‘most different’ areas? What distinguishes them? Interpret.

In [None]:
# perhaps areas near monuments have those monuments in their 'most frequent words' list


### 2.15 Plot the amount of reviews across time. 
#### Daily

In [None]:
# how many reviews were written on each date
reviews_by_date = reviews_summary.groupby("date").size()
rev_per_date = reviews_by_date.rename("counts").reset_index()

In [None]:
# daily number of reviews on a wide figure
fig, ax = plt.subplots(figsize=(17, 6))
ax.plot(rev_per_date["date"], rev_per_date["counts"])
ax.set_title("Amount of reviews across time")
ax.set_xlabel("Date")
ax.set_ylabel("Amount of reviews")
plt.show()

#### Yearly

In [None]:
# aggregating results by year
rev_per_date['year'] = rev_per_date["date"].dt.year
# only the counts are summed: the frame also holds the datetime column "date",
# which cannot be summed
rev_per_year = rev_per_date.groupby("year")["counts"].sum().reset_index()
# turn the year number back into a date so the x axis is a time axis
rev_per_year["year"] = pd.to_datetime(rev_per_year["year"], format="%Y")

In [None]:
# yearly number of reviews on a wide figure
fig, ax = plt.subplots(figsize=(17, 6))
ax.plot(rev_per_year["year"], rev_per_year["counts"])
ax.set_title("Amount of reviews across time")
ax.set_xlabel("Date")
ax.set_ylabel("Amount of reviews")
plt.show()

### 2.16 Is there a link between availability (days per year) with the price? Determine both graphically and statistically. 
#### Correlation

In [None]:
np.corrcoef(listings_detail["price"], listings_detail["availability_365"])[0, 1]

#### Regression
${Price_i} = \alpha + \beta \thinspace Availability.365_{i}$

In [None]:
# simple linear regression of price on availability_365
availability = listings_detail[["availability_365"]]
lr_model = LinearRegression().fit(X=availability, y=listings_detail["price"])
price_est = lr_model.predict(availability)

#### Graphically

In [None]:
# observed prices with the fitted regression line on top
fig, ax = plt.subplots()
ax.scatter(listings_detail["availability_365"], listings_detail["price"])
ax.plot(listings_detail["availability_365"], price_est, "-", color="red", label="predicted y")
ax.set_xlabel("Days listing is available per year")
ax.set_ylabel("Price")
plt.show()

### 2.17 Is there a link between how many times the word ‘great’ appears in a review and the listing price? Determine both graphically and statistically. 

In [None]:
# count how many times the word 'great' is mentioned in a review
# \b restricts the match to the whole word, so "greatly" or "greatest" are not counted
reviews_detail["times_great_in_comments"] = reviews_detail["comments"].str.lower().str.count(r"\bgreat\b")

# group every review with the same listing id together and add up how many times 'great' is mentioned
times_great = reviews_detail[["listing_id", "times_great_in_comments"]].groupby(by = "listing_id").sum().reset_index()

# join with listings_summary to get price of the listing
times_great_listings = pd.merge(times_great, listings_summary[["id", "price"]], how="inner", left_on="listing_id", right_on = "id")

#### Correlation

In [None]:
# linear correlation
np.corrcoef(times_great_listings["price"], times_great_listings["times_great_in_comments"])[0,1]

#### Regression
${Price_i} = \alpha + \beta \thinspace Times.Great_{i}$

In [None]:
# simple linear regression of price on the number of 'great' mentions
great_counts = times_great_listings[["times_great_in_comments"]]
lr_model = LinearRegression().fit(X=great_counts, y=times_great_listings["price"])
price_est = lr_model.predict(great_counts)

#### Graphically

In [None]:
# observed prices with the fitted regression line on top
fig, ax = plt.subplots()
ax.scatter(x=times_great_listings["times_great_in_comments"], y=times_great_listings["price"], alpha=0.2)
ax.plot(times_great_listings["times_great_in_comments"], price_est, "-", color="red", label="predicted y")

ax.set_xlabel("amount that great is mentioned in review")
ax.set_ylabel("price")
plt.show()

### 2.18 Plot how the number of Airbnb locations are distributed across the city on a map. Plot the number of locations per neighborhood and color code according to neighborhood group.

In [None]:
# number of listings per (neighbourhood, neighbourhood group) pair
listings_per_nb = (
    listings_summary.groupby(["neighbourhood", "neighbourhood_group"])
    .size()
    .reset_index(name="counts")
)
# mean longitude/latitude of the listings of each neighbourhood
long_lat_per_nb = listings_summary.groupby("neighbourhood")[["longitude", "latitude"]].mean()

In [None]:
# geomap: one circle per neighbourhood, sized by its number of listings and
# coloured by neighbourhood group
m = folium.Map([40.6976637, -74.1197643], tiles='cartodbpositron', zoom_start=10)

group_colours = {
    "Queens": "#F23D4C",
    "Bronx": "#735571",
    "Brooklyn": "#04BFBF",
    "Staten Island": "#C6D93B",
}
# every other group gets this colour; the legend labels it as Manhattan
default_colour = "#F2B705"

for row in listings_per_nb.itertuples(index=False):
    # the centre is looked up by neighbourhood name instead of by row position,
    # so the two frames do not have to contain the same rows in the same order
    centre = long_lat_per_nb.loc[row.neighbourhood]
    col = group_colours.get(row.neighbourhood_group, default_colour)

    folium.Circle(
      location=[float(centre['latitude']), float(centre['longitude'])],
      popup=row.neighbourhood,
      radius=int(row.counts),
      color=col,
      fill=True,
      fill_color=col
    ).add_to(m)

# legend
legend_html =   '''
                <div style="position: fixed; 
                            bottom: 50px; left: 50px; width: 130px; height: 160px; 
                            border:2px solid grey; z-index:9999; font-size:14px;
                            background-color: rgba(242, 243, 245, 0.5)
                            ">
                              &nbsp; Queens &nbsp; <i class="fa fa-map-marker fa-2x" style="color:#F23D4C"></i><br>
                              &nbsp; Bronx &nbsp; <i class="fa fa-map-marker fa-2x" style="color:#735571"></i><br>
                              &nbsp; Brooklyn &nbsp; <i class="fa fa-map-marker fa-2x" style="color:#04BFBF"></i><br>
                              &nbsp; Staten Island &nbsp; <i class="fa fa-map-marker fa-2x" style="color:#C6D93B"></i><br>
                              &nbsp; Manhattan &nbsp; <i class="fa fa-map-marker fa-2x" style="color:#F2B705"></i>
                </div>
                ''' 

m.get_root().html.add_child(folium.Element(legend_html))

m.save("./assets/graphs/listings.html")

m

### 2.19 Williamsburg is a ‘hip’ area in Brooklyn with a lot of Airbnb locations on offer. Explore how this area differs from other locations and visualize. You may also use external data sources.

In [None]:
# boxplot prices between different neighbourhood groups + williamsburg

### 2.20  Create a stacked bar chart of the distribution of room type per neighborhood group. Statistically test whether these differences are significant.

In [None]:
# number of listings for every (neighbourhood group, room type) combination
room_type_groups = listings_summary.groupby(["neighbourhood_group", "room_type"])
df = room_type_groups.size().reset_index(name="counts")

In [None]:
# total number of listings per neighbourhood group, as a plain array
a = np.array(df.groupby("neighbourhood_group")["counts"].sum())

In [None]:
# turn the counts into shares within each neighbourhood group
# this is done for every group, not just the Bronx, so the bars are comparable,
# and with a single column assignment instead of chained df["counts"].iloc[i] = ...
group_totals = df.groupby("neighbourhood_group")["counts"].transform("sum")
df["counts"] = df["counts"] / group_totals
df

In [None]:
# stacked bar chart

# one row per neighbourhood group and one column per room type; a room type
# that does not occur in a group becomes 0 (this replaces the manual padding
# with Series.append, which recent pandas no longer provides)
room_type_order = ["Entire home/apt", "Private room", "Shared room"]
counts_wide = (
    df.pivot(index="neighbourhood_group", columns="room_type", values="counts")
    .reindex(columns=room_type_order)
    .fillna(0)
)

# x
nbg = counts_wide.index.to_numpy()

# y (plain arrays, so the additions below are element-wise per group and are
# not re-aligned on the differing row labels of the long-format frame)
bars1 = counts_wide["Entire home/apt"].to_numpy()
bars2 = counts_wide["Private room"].to_numpy()
bars3 = counts_wide["Shared room"].to_numpy()

# bottoms
bottom2 = bars1
bottom3 = bars1 + bars2

# draw bars
plt.bar(nbg, bars1, label = "Entire home")
plt.bar(nbg, bars2, bottom = bottom2, label = "Private room")
plt.bar(nbg, bars3, bottom = bottom3, label = "Shared room")

# legend
plt.legend(loc="upper right")

plt.show()

In [None]:
type(bars1)

### 2.21 Color-coded plot the most popular room type per neighborhood on a city map.

In [None]:
# share (in percent) of every room type within each neighbourhood
room_type_share = listings_summary.groupby("neighbourhood")["room_type"].value_counts(normalize = True)
distr_room_type = room_type_share.rename("percentage").mul(100).reset_index()

# highest share per neighbourhood
temp = distr_room_type.groupby("neighbourhood").agg({"percentage": np.max})

# keep the room type rows whose share equals that highest share
popular_room_type = distr_room_type.merge(temp, how = "right", on = ["neighbourhood", "percentage"])
print(popular_room_type.head())

# merge together with listings_summary to get latitude and longitude
# listings_room_type = listings_summary[["neighbourhood", "latitude", "longitude"]].merge(popular_room_type, how = "inner", on = "neighbourhood")
# listings_room_type.head()

In [None]:
# listing counts per neighbourhood and room type (first 20 rows)
# computed inline so this cell does not depend on room_types_per_nb, which is
# only defined in a later cell
listings_summary.groupby(["neighbourhood", "room_type"]).size().reset_index(name="counts").head(20)

In [None]:
# number of listings per (neighbourhood, room type) combination
room_types_per_nb = (
    listings_summary.groupby(["neighbourhood", "room_type"])
    .size()
    .reset_index(name="counts")
)
# largest count within every neighbourhood
room_types_per_nb.groupby("neighbourhood")["counts"].nlargest(1)

In [None]:
# the encoding is given explicitly so the file is not decoded with a
# platform-dependent default
with open('./data/neighbourhoods.geojson', encoding="utf-8") as json_file:
    geo_json_data = json.load(json_file)

In [None]:
# shared room: red, private room: yellow, entire home: green
def my_color_function(feature):
    """Return the fill colour for one GeoJSON neighbourhood feature.

    The neighbourhood name is read from ``feature['properties']['neighbourhood']``
    and looked up in ``popular_room_type``. A neighbourhood without an entry
    there is drawn grey. When several room types are listed for a
    neighbourhood, the first listed one decides the colour.
    """
    name = feature['properties']['neighbourhood']
    matches = popular_room_type.loc[popular_room_type["neighbourhood"] == name, "room_type"]
    if matches.empty:
        return "#d3d3d3"
    # read the value itself instead of parsing the printed form of the Series
    room = matches.iloc[0]
    if room == "Shared room":
        return '#ff0000'
    if room == "Private room":
        return '#ffff00'
    return '#008000'

In [None]:
def _neighbourhood_style(feature):
    """Return the folium style of one neighbourhood polygon: fill colour from my_color_function, dashed black outline."""
    return {
        'fillColor': my_color_function(feature),
        'color': 'black',
        'weight': 1,
        'dashArray': '5, 5',
    }


m = folium.Map([40.6976637, -74.1197643], tiles='cartodbpositron', zoom_start=10)

neighbourhood_layer = folium.GeoJson(geo_json_data, style_function=_neighbourhood_style)
neighbourhood_layer.add_to(m)

# legend
legend_html =   '''
                <div style="position: fixed; 
                            bottom: 50px; left: 50px; width: 200px; height: 100px; 
                            border:2px solid grey; z-index:9999; font-size:14px;
                            background-color: rgba(242, 243, 245, 0.5)
                            ">
                              &nbsp; Entire home/apartment &nbsp; <i class="fa fa-map-marker fa-2x" style="color:green"></i><br>
                              &nbsp; Private room &nbsp; <i class="fa fa-map-marker fa-2x" style="color:yellow"></i><br>
                              &nbsp; Shared room &nbsp; <i class="fa fa-map-marker fa-2x" style="color:red"></i>
                </div>
                ''' 

legend = folium.Element(legend_html)
m.get_root().html.add_child(legend)

m.save("./assets/graphs/room_type.html")

m