In [None]:
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats

%matplotlib inline

# Build the path to the raw listings CSV (relative to this notebook).
data_dir, filename = '../data', 'AB_NYC_2019.csv'
data_path = os.path.join(data_dir, filename)

# Read the dataset and display its (rows, columns) dimensions.
df = pd.read_csv(filepath_or_buffer=data_path)
df.shape

In [None]:
# Preview the first rows of the freshly loaded frame.
df.head()

In [None]:
# Convert the review date column to datetime values.
df['last_review'] = pd.to_datetime(df['last_review'])

# Columns that contain at least one missing value.
has_missing = df.isnull().any()
unfilled_cols = df.columns[has_missing]
unfilled_cols

In [None]:
# Column dtypes and non-null counts.
df.info()

In [None]:
# Descriptive statistics; object (string) columns are not part of this
# default summary and are described separately with include='object'.
df.describe()

In [None]:
# Summary restricted to the object-dtype columns.
df.describe(include='object')

In [None]:
# Remove the listings whose host name is missing.
no_hostname = df.loc[df['host_name'].isnull()]
df.drop(no_hostname.index, axis=0, inplace=True)

In [None]:
# Remove the listings whose name is missing.
no_name = df.loc[df['name'].isnull()]
df.drop(no_name.index, axis=0, inplace=True)

In [None]:
# Roughly 10k observations (almost a quarter of the full dataset!)
# carry no usable review information: zero reviews, no last review date
# and no reviews-per-month value.
# The comparison is parenthesised because `&` binds tighter than `==`;
# without the parentheses Python evaluates
# `number_of_reviews == (0 & isnull & isnull)` instead of the intended
# conjunction of three conditions.
no_info_cond = (
    (df['number_of_reviews'] == 0)
    & df['last_review'].isnull()
    & df['reviews_per_month'].isnull()
)
no_info_sample = df[no_info_cond]

df.drop(index=no_info_sample.index, inplace=True)

# Re-number the remaining rows 0..n-1 so positional and label indexing agree.
df.reset_index(drop=True, inplace=True)

In [None]:
# Relative frequency of every neighbourhood.
neighbourhood_vc = df['neighbourhood'].value_counts(normalize=True)
n_neighbourhoods = len(neighbourhood_vc)
print(f'Count of neighbourhoods {n_neighbourhoods}')
neighbourhood_vc.head()

In [None]:
# Relative frequency of every host name.
hostname_vc = df['host_name'].value_counts(normalize=True)
n_host_names = len(hostname_vc)
print(f'Count of host names {n_host_names}')
hostname_vc.head()

In [None]:
# Grouping by neighbourhood group; reused by later cells.
by_nhgroup = df.groupby(by='neighbourhood_group')

# Mean and standard deviation of price and monthly review rate per group.
# String aggregation names are used instead of the NumPy callables: pandas
# deprecates passing np.mean / np.std here, and the strings keep the
# groupby implementations (sample std) that the callables were mapped to.
price_rev_stats = by_nhgroup[
    ['price', 'reviews_per_month']
].agg(['mean', 'std'])

price_rev_stats

In [None]:
# Median review count and monthly review rate per neighbourhood group.
by_nhgroup[['number_of_reviews', 'reviews_per_month']].median()

In [None]:
# Grouping by room type; reused by the statistics and transform cells below.
by_room = df.groupby(by='room_type')

def get_stats(group):
    """Summarise a group with its count, mean and standard deviation.

    Returns a DataFrame with the columns 'count', 'mean' and 'std', built
    from the corresponding pandas reductions of ``group``.
    """
    summary = {}
    summary['count'] = group.count()
    summary['mean'] = group.mean()
    summary['std'] = group.std()
    return pd.DataFrame(summary)

# Count / mean / std of reviews and minimum nights for every room type.
room_stat_cols = ['number_of_reviews', 'minimum_nights']
by_room_stats = by_room[room_stat_cols].apply(get_stats)

# Move the inner index level (the column names) into the columns.
by_room_stats.unstack()

In [None]:
def avg_per_sample(group):
    """Return the sum of ``group`` divided by its number of non-null values."""
    total = group.sum()
    n_filled = group.count()
    return total / n_filled

# Broadcast the per-room-type average back onto every row of its group.
avg_cols = ['number_of_reviews', 'minimum_nights', 'reviews_per_month']
transf = by_room[avg_cols].transform(avg_per_sample)

transf.head()

In [None]:
# Absolute deviation of every numeric value from its column mean.
num_df = df.select_dtypes(include=np.number)
column_means = num_df.mean()
diff_mean_df = (num_df - column_means).abs()
diff_mean_df.iloc[:, 4:].head()

In [None]:
def plot_hbar(frame: pd.DataFrame, column: str):
    """Draw a horizontal bar chart of the value counts of ``column``.

    Parameters
    ----------
    frame : pd.DataFrame
        Data containing ``column``.
    column : str
        Name of the column whose distinct values are counted.

    A new 8x4 inch figure is created on every call; nothing is returned.
    """
    vc = frame[column].value_counts()
    cat_values = vc.index
    y_pos = np.arange(len(cat_values))

    plt.figure(figsize=(8, 4))
    # Bars are placed at 0..n-1 and labelled explicitly, so ticks and bars
    # stay aligned for numeric categories as well as for string ones.
    plt.barh(y_pos, vc.to_numpy(), height=0.7)
    plt.yticks(y_pos, cat_values)
    plt.title(f'{column} distribution')
    plt.grid()

In [None]:
# Number of listings per room type.
plot_hbar(df, column='room_type')

In [None]:
# Number of listings per neighbourhood group.
plot_hbar(df, column='neighbourhood_group')

In [None]:
# Listing counts per neighbourhood; chart the first 30 entries.
nh_vc = df['neighbourhood'].value_counts()
top_nh = nh_vc.iloc[:30]
values = top_nh.index
pos = np.arange(len(values))

plt.figure(figsize=(20, 8))
plt.xticks(ticks=pos, labels=values, rotation=60)
plt.bar(values, top_nh, width=0.7)
plt.title('neighbourhood distribution')
plt.grid()

In [None]:
# Histograms (30 bins, shared y axis) of the review-related columns.
cols_to_hist = ['number_of_reviews', 'minimum_nights', 'reviews_per_month']

fig, ax = plt.subplots(nrows=1, ncols=3, figsize=(20, 6), sharey=True)
for axis, col in zip(ax, cols_to_hist):
    axis.hist(df[col], bins=30)
    axis.set_title(col)
    axis.grid()

fig.subplots_adjust(wspace=0.05)

In [None]:
# Distribution plots of price, availability and host listing counts.
cols_to_dist = ['price', 'availability_365', 'calculated_host_listings_count']

# NOTE(review): newer seaborn releases mark `distplot` as deprecated —
# confirm against the installed version before upgrading.
fig, ax = plt.subplots(nrows=1, ncols=3, figsize=(20, 6))
for axis, col in zip(ax, cols_to_dist):
    sns.distplot(df[col], bins=30, color='red', label=col, ax=axis)
    axis.legend()
    axis.grid()

fig.subplots_adjust(wspace=0.15)

In [None]:
# Review volume vs. review rate, and price vs. minimum nights,
# coloured by room type.
# x / y are passed by keyword: recent seaborn versions accept only `data`
# positionally, so the positional Series form raises a TypeError there.
fig, ax = plt.subplots(1, 2, figsize=(20, 8))
sns.scatterplot(data=df, x='number_of_reviews', y='reviews_per_month',
                hue='room_type', ax=ax[0])
sns.scatterplot(data=df, x='price', y='minimum_nights',
                hue='room_type', ax=ax[1])

ax[0].grid()
ax[1].grid()
# Clip the y range of the left panel to [-1, 30].
ax[0].set_ylim([-1, 30])
plt.subplots_adjust(wspace=0.1)

In [None]:
# Price against review count and against yearly availability,
# coloured by neighbourhood group.
# x / y are passed by keyword: recent seaborn versions accept only `data`
# positionally, so the positional Series form raises a TypeError there.
fig, ax = plt.subplots(1, 2, figsize=(20, 8))
sns.scatterplot(data=df, x='number_of_reviews', y='price',
                hue='neighbourhood_group', ax=ax[0])
sns.scatterplot(data=df, x='availability_365', y='price',
                hue='neighbourhood_group', ax=ax[1])

ax[0].grid()
ax[1].grid()
plt.subplots_adjust(wspace=0.1)

In [None]:
def extract_date(data):
    """Split a datetime Series into its (year, month, day) component Series."""
    parts = data.dt
    return parts.year, parts.month, parts.day

# Replace the raw review date by three separate date-part columns.
year, month, day = extract_date(df['last_review'])

df['review_year'] = year
df['review_month'] = month
df['review_day'] = day

df.drop(columns='last_review', inplace=True)

In [None]:
# FE (part 1)
# Per-room-type averages computed earlier with `transform`.
df[['avg_reviews', 'avg_min_nights',
    'avg_reviews_per_month']] = transf

def diff_avg(group):
    """Absolute deviation of each value from the mean of ``group``."""
    return abs(group - group.mean())

# `transform` (not `apply`) is used so the result is always indexed like
# df: on pandas >= 2.0 a transform-like `apply` gets the group keys
# prepended to its index, which breaks the column assignment below.
# The result rebinds the name `diff_avg`, exactly as the original cell did.
diff_avg = by_nhgroup[
    ['number_of_reviews', 'availability_365']
].transform(diff_avg)
# NOTE(review): the column names say "med", but the deviation is taken
# from the group mean — confirm which one is intended.
df[['diff_med_reviews', 'diff_med_availability']] = diff_avg

# True for listings available on all 365 days.
df['year_available'] = df['availability_365'] == 365

# Absolute deviation of the host listing count from its overall mean.
df['host_count_mean'] = diff_mean_df['calculated_host_listings_count']

df['review_period'] = df['number_of_reviews'] / df['reviews_per_month']

In [None]:
# FE (part 2)
by_hostname = df.groupby(by='host_name')

def diff_median(group):
    """Absolute deviation of each value from the median of ``group``."""
    return abs(group - group.median())

# `transform` (not `apply`) is used so the result is always indexed like
# df: on pandas >= 2.0 a transform-like `apply` gets the group keys
# prepended to its index, which breaks the column assignment below.
diff_med_reviews = by_hostname[
    ['number_of_reviews', 'reviews_per_month']
].transform(diff_median)
df[['reviews_by_hostname',
    'reviews_per_month_by_hostname']] = diff_med_reviews

# NOTE(review): the flag is named "rare" but tests for 'Private room',
# while the notebook's conclusions call the shared type the small one —
# confirm the intended room type.
df['is_rare_type'] = df['room_type'] == 'Private room'

df['reviews_per_host'] = df['number_of_reviews'] / df['calculated_host_listings_count']

df['min_available'] = df['minimum_nights'] * df['availability_365']

In [None]:
# Inspect the frame after the feature-engineering cells.
df.head()

In [None]:
# Price of every listing, one horizontal strip per neighbourhood group.
fg = sns.catplot(data=df, x='price', y='neighbourhood_group',
                 orient='h', legend_out=True)
fg.ax.set_xlim([-100, 2000])
fg.fig.set_size_inches(16, 6)

In [None]:
# Same strip plot, restricted to the first five neighbourhoods of `nh_vc`.
top_nh = nh_vc.iloc[:5]
in_top_nh = df['neighbourhood'].isin(top_nh.index)
top_nh_df = df[in_top_nh]

fg = sns.catplot(data=top_nh_df, x='price', y='neighbourhood',
                 orient='h', legend_out=True)
fg.ax.set_xlim([-100, 2000])
fg.fig.set_size_inches(16, 6)

In [None]:
# Total price per neighbourhood, split over two bar charts for legibility.
by_nh = df.groupby(by='neighbourhood')

length = len(by_nh)
half = length // 2
price_by_nh = by_nh['price'].sum()
first_half_price = price_by_nh.iloc[:half]
second_half_price = price_by_nh.iloc[half:]

fig, ax = plt.subplots(nrows=2, ncols=1, figsize=(20, 18))
for axis, part in zip(ax, (first_half_price, second_half_price)):
    part.plot(kind='bar', rot=90, ax=axis, width=0.6)
plt.tight_layout()

In [None]:
# Marker / line styling for the outliers, the median and the mean.
flier_props = dict(marker='o', markerfacecolor='red', markersize=5)
median_props = {'linestyle': '-', 'linewidth': 3, 'color': 'firebrick'}
meanpoint_props = {'marker': 'D', 'markerfacecolor': 'blue',
                   'linestyle': '-', 'linewidth': 2, 'markersize': 8}

plt.figure(figsize=(20, 3))
ax = plt.gca()
# The x range is fixed before drawing the box plot.
ax.set_xlim([-100, 2000])

box = ax.boxplot(df['price'], labels=['price'], vert=False, widths=0.6,
                 showmeans=True, meanline=True,
                 flierprops=flier_props, medianprops=median_props,
                 meanprops=meanpoint_props)

In [None]:
# Separate the prediction target from the features.
# NOTE(review): 'price' is removed from df here; any later cell that needs
# it has to take it from `target`.
target = df.pop('price')

In [None]:
# What to do with zero price ?
# Current answer: report how many there are, then substitute the mean price.
zero_price_mask = target == 0.0
print(f'There isn"t price in {len(target[zero_price_mask])} observations')
target.replace(to_replace=0, value=target.mean(), inplace=True)

In [None]:
# Shape of the raw target distribution.
skew, kurt = target.skew(), target.kurtosis()
print(f'Target skewness: {skew:.4f}\nkurtosis: {kurt:.4f}')

# Log-transformation for target normalization
target = np.log(target)

# Probability plot of the transformed target.
plt.figure(figsize=(4, 4))
stats.probplot(target, plot=plt)
text = plt.title('Price')

In [None]:
# 'price' was moved out of df into `target` and log-transformed by the
# preceding cells, so `df.pivot_table(values='price')` raises a KeyError on
# a top-to-bottom run. Re-attach it on a temporary frame, undoing the log
# with exp (zero prices were already replaced by the mean at that point).
priced_df = df.assign(price=np.exp(target))

# Total price per (room type, neighbourhood group); empty cells become 0.
room_nhgroup_pivot = priced_df.pivot_table(
    index='room_type', columns='neighbourhood_group',
    values='price', aggfunc='sum',
).fillna(0).astype(float)

ax = sns.heatmap(room_nhgroup_pivot, annot=True,
                 fmt=".1f", linewidths=.5)

In [None]:
# 'price' was moved out of df into `target` and log-transformed by the
# preceding cells, so plotting y='price' from df raises on a top-to-bottom
# run. Re-attach it on a temporary frame, undoing the log with exp
# (zero prices were already replaced by the mean at that point).
priced_df = df.assign(price=np.exp(target))

# Bar plot of price per host listing count, split by room type.
fg = sns.catplot(x='calculated_host_listings_count',
                 y='price', hue='room_type',
                 kind="bar", data=priced_df)
fg.fig.set_figwidth(17)
fg.fig.set_figheight(7)
fg.ax.tick_params(axis='x', rotation=70)
fg.ax.legend(loc='upper right')
plt.tight_layout()

In [None]:
top_nh = nh_vc[:10]

# 'price' was moved out of df into `target` and log-transformed by the
# preceding cells, so a pivot on 'price' raises a KeyError on a
# top-to-bottom run. Re-attach it, undoing the log with exp
# (zero prices were already replaced by the mean at that point).
priced_df = df.assign(price=np.exp(target))
top_nh_df = priced_df[priced_df['neighbourhood'].isin(top_nh.index)]

# Total price per (room type, neighbourhood); empty cells become 0.
nh_pivot = top_nh_df.pivot_table(
    index='room_type', columns='neighbourhood',
    values='price', aggfunc='sum',
).fillna(0).astype(float)

plt.figure(figsize=(12, 4))
ax = sns.heatmap(nh_pivot, annot=True,
                 fmt=".1f", linewidths=.5)

In [None]:
# 'price' was moved out of df into `target` and log-transformed by the
# preceding cells, so frames derived from df after that point lack the
# column. (Re-)attach it here, undoing the log with exp; `assign` aligns
# the values on the row index.
fg = sns.catplot(x='price', y='neighbourhood',
                 kind='boxen', orient='h',
                 data=top_nh_df.assign(price=np.exp(target)))
fg.fig.set_figwidth(16)
fg.fig.set_figheight(6)

#### Hypotheses and conclusions

1. Entire homes/apartments are more expensive than other room types.
   Unsurprisingly, shared rooms are the cheapest type. The price difference varies considerably depending on the region.
2. Manhattan is a far more luxurious neighbourhood group than the others.
   Brooklyn takes second place.
3. Williamsburg looks like the most expensive neighbourhood.
4. The majority of neighbourhoods have strongly marked price values, but Williamsburg does not.
   This is especially noticeable in the luxury neighbourhood groups.
5. Despite the small number of 'shared' rooms in the dataset,
   hosts are more willing to leave reviews for this room type. This behaviour may be a consequence
   of their almost permanent availability throughout the year.
6. The minimum number of paid nights does not have a strong influence on price.
   Interestingly, though, people very often pay for 99 or 100 nights.