# 0 - Set-Up

### Environment Set Up 

In [None]:
# Install required packages
!pip install -qq -r ../requirements.txt

import sys

REL_PATH_TO_ROOT = "../"

sys.path.insert(0,REL_PATH_TO_ROOT)

from src.utils import get_root_dir, test_root_dir
from local_variables import ROOT_DIR

test_root_dir(REL_PATH_TO_ROOT)

### Import Packages

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

### Load Data

In [None]:
# File names expected inside each yearly data folder.
# NOTE(review): files_of_interest is not referenced by the load cells below,
# which spell the file names out directly - confirm whether it is still needed.
files_of_interest = ["calendar.csv","listings.csv"]
# Snapshot years; the load cells read from data/{year}_tokyo/ for each entry.
years = ["2023","2024"]

In [None]:
# Read every yearly calendar file and stack them into one frame with a fresh index.
calendar_paths = [f"{get_root_dir()}/data/{year}_tokyo/calendar.csv" for year in years]
cal_df = pd.concat(
    [pd.read_csv(path) for path in calendar_paths],
    ignore_index=True,
)

In [None]:
# Read every yearly listings file and stack them into one frame with a fresh index.
listing_paths = [f"{get_root_dir()}/data/{year}_tokyo/listings.csv" for year in years]
list_df = pd.concat(
    [pd.read_csv(path) for path in listing_paths],
    ignore_index=True,
)

# 1 - Initial EDA

## 1.1 - High-Level View - Calendar

### 1.1.1 - Data Structure

In [None]:
cal_df.head()

In [None]:
# Finding: granularity of the calendar table (one row per date per listing).
print("Each row corresponds to one date and one property based on the listing id")

In [None]:
#Number of unique properties
cal_df["listing_id"].nunique()

### 1.1.2 - Date Range

In [None]:
# Time range: parse the raw date column once and keep it as a datetime column.
cal_df["date_dt"] = pd.to_datetime(cal_df["date"])

# Number of calendar rows per month, in chronological order, drawn as a line.
rows_per_month = cal_df["date_dt"].dt.to_period('M').value_counts().sort_index()
rows_per_month.plot(kind='line')

In [None]:
cal_df.shape[0]

In [None]:
# Jump in counts for 2024-07, are there duplicates?
# Drop rows that are identical across every column; comparing the resulting
# row count with cal_df.shape[0] shows whether any exact duplicates exist.
dropped_dups = cal_df.drop_duplicates()

In [None]:
dropped_dups.shape[0]

In [None]:
print("No duplicates found")

### 1.1.3 - Price Distribution

In [None]:
# Turn the price strings into floats by stripping the "$" sign and the ","
# separators before casting.
# regex=False is passed explicitly: "$" is a regex metacharacter and the
# default of `regex` differs between pandas 1.x and 2.x, so spelling it out
# keeps the replacement literal (and warning-free) on every version.
cal_df["price_num"] = (
    cal_df["price"]
    .str.replace("$", "", regex=False)
    .str.replace(",", "", regex=False)
    .astype(float)
)

In [None]:
# Distribution of price_num in 15 bins on a log-scaled axis.
sns.histplot(data=cal_df,x="price_num",log_scale=True,bins=15)

### 1.1.4 - Average Prices Over Time

In [None]:
def average_x_time_plot(df,datefield,x,show_percentiles=False,percentile=0.95,group_var=None,log_scale=True):
    """Plot the per-date mean of column ``x``, optionally with a percentile band.

    Args:
        df: Source frame. It is only read; a small working frame is built from
            the columns that are needed.
        datefield: Name of the column holding dates (anything accepted by
            ``pd.to_datetime``).
        x: Name of the numeric column to average.
        show_percentiles: When True, shade the central ``percentile`` range of
            ``x`` for each date (and each group, if grouped).
        percentile: Width of the central range as a fraction, e.g. 0.95 shades
            from the 2.5th to the 97.5th percentile.
        group_var: Optional column name; when given, one line (and band) is
            drawn per distinct value of that column.
        log_scale: When True, the natural log of the mean and of the bounds is
            plotted instead of the raw values.

    Returns:
        None. The figure is displayed with ``plt.show()``.
    """
    # Work on just the needed columns so the caller's frame is never modified.
    columns = {datefield: pd.to_datetime(df[datefield]), x: df[x]}
    if group_var is not None:
        columns[group_var] = df[group_var]
    work_df = pd.DataFrame(columns)

    group_fields = datefield if group_var is None else [datefield, group_var]
    grouped = work_df.groupby(group_fields)[x]

    if show_percentiles:
        # Split the excluded mass evenly between the two tails.
        tail = (1 - percentile) / 2
        summary_stats = grouped.agg(
            avg='mean',
            lower_bound=lambda values: values.quantile(tail),
            upper_bound=lambda values: values.quantile(1 - tail),
        ).reset_index()
    else:
        summary_stats = grouped.agg(avg='mean').reset_index()

    # Names of the columns that are actually drawn (raw or log-transformed).
    y_col, lower_col, upper_col = 'avg', 'lower_bound', 'upper_bound'
    if log_scale:
        y_col = f"log_{x}"
        summary_stats[y_col] = np.log(summary_stats['avg'])
        if show_percentiles:
            summary_stats['log_lower_bound'] = np.log(summary_stats['lower_bound'])
            summary_stats['log_upper_bound'] = np.log(summary_stats['upper_bound'])
            lower_col, upper_col = 'log_lower_bound', 'log_upper_bound'

    _, ax = plt.subplots(figsize=(12, 6))

    # ":g" avoids labels such as "95.0%" or "7.000000000000001%".
    pct_label = f"{100 * percentile:g}"

    if group_var is None:
        # A palette without hue is ignored by seaborn, so pass a single color.
        line_color = sns.color_palette(n_colors=1)[0]
        sns.lineplot(data=summary_stats, x=datefield, y=y_col, linewidth=2, color=line_color, ax=ax)
        if show_percentiles:
            ax.fill_between(
                summary_stats[datefield],
                summary_stats[lower_col],
                summary_stats[upper_col],
                alpha=0.3,
                color=line_color,
                label=f'{pct_label}% Percentile Range'
            )
    else:
        # Fix the level order once and hand it to seaborn so that every band
        # gets the same color as the line of its group.
        levels = sorted(summary_stats[group_var].unique())
        palette = sns.color_palette(n_colors=len(levels))
        sns.lineplot(data=summary_stats, x=datefield, y=y_col, hue=group_var, hue_order=levels,
                     linewidth=2, palette=palette, ax=ax)
        if show_percentiles:
            for color, level in zip(palette, levels):
                group_data = summary_stats[summary_stats[group_var] == level]
                ax.fill_between(
                    group_data[datefield],
                    group_data[lower_col],
                    group_data[upper_col],
                    alpha=0.2,
                    color=color,
                    label=f'{level} {pct_label}% Percentile Range'
                )

    # Labels describe what was actually drawn (log or raw, with or without bounds).
    value_label = f'Log {x.capitalize()}' if log_scale else x.capitalize()
    title = f'Average {value_label} Over Time'
    if show_percentiles:
        title += f' with {pct_label}% Percentile Bounds'

    ax.set_xlabel('Date')
    ax.set_ylabel(value_label)
    ax.set_title(title)

    # Only build a legend when there is something to list, which avoids the
    # "no artists with labels" warning for a single unlabelled line.
    handles, _ = ax.get_legend_handles_labels()
    if handles:
        ax.legend()
    plt.show()

In [None]:
average_x_time_plot(cal_df,"date","price_num",show_percentiles=True)

In [None]:
print("Huge variability in the price at a given time, what determines the price?")

In [None]:
average_x_time_plot(cal_df,"date","price_num",show_percentiles=True,group_var="available")

In [None]:
print("Doesn't seem to make a huge difference, although unoccupied properties tend to be priced lower")

## 1.2 - Listings

In [None]:
list_df.columns

In [None]:
list_df.head()

In [None]:
print("Unique to id and last_scraped level")

### 1.2.1 - Time Structure in Dataframe

In [None]:
test_ids = list_df["id"].head().values

In [None]:
list_df[list_df["id"].isin(test_ids)].sort_values(by="id")

In [None]:
# How many listings have data for both years?
# First count rows per listing id, then take the share of listings having each
# row count. Chaining value_counts() avoids depending on the "count" column
# name that reset_index() only produces on pandas >= 2.0.
list_df["id"].value_counts().value_counts(normalize=True)

In [None]:
print("Around 50% only appear for one year")

### 1.2.2 - Review Data

In [None]:
list_df["review_scores_rating"].hist(bins=5)

In [None]:
list_df["review_scores_rating"].isna().mean()

### 1.2.3 - Number of Rooms

In [None]:
# Mean price_num per listing over all of its calendar rows.
average_prices = cal_df.groupby("listing_id", as_index=False)["price_num"].mean()

In [None]:
# Attach the bedroom count to each listing's average price.
# list_df stacks one snapshot per year, so a listing present in both years
# appears twice; drop repeated (id, bedrooms) pairs so those listings are not
# double-weighted in the per-bedroom statistics computed from this frame.
listing_bedrooms = (
    list_df[["id", "bedrooms"]]
    .drop_duplicates()
    .rename(columns={"id": "listing_id"})
)
price_room_df = pd.merge(left=average_prices, right=listing_bedrooms, how="inner", on="listing_id")

In [None]:
# Share of listings left outside the band, split evenly between both tails.
perc = 0.05


def _lower_quantile(prices):
    # Lower edge of the central band.
    return prices.quantile(perc / 2)


def _upper_quantile(prices):
    # Upper edge of the central band.
    return prices.quantile(1 - perc / 2)


# Mean price and band edges for every bedroom count.
price_by_room_df = (
    price_room_df.groupby("bedrooms")["price_num"]
    .agg(avg_price="mean", lower_bound=_lower_quantile, upper_bound=_upper_quantile)
    .reset_index()
)

In [None]:
price_by_room_df.head()

In [None]:
# Sort once by bedroom count and reuse the frame for the three overlaid bar charts:
# the mean first, then the translucent lower (red) and upper (green) bounds.
rooms_sorted = price_by_room_df.sort_values(by="bedrooms", ascending=True)
sns.barplot(data=rooms_sorted, x="bedrooms", y="avg_price")
sns.barplot(data=rooms_sorted, x="bedrooms", y="lower_bound", color='r', alpha=0.3)
sns.barplot(data=rooms_sorted, x="bedrooms", y="upper_bound", color='g', alpha=0.3)

In [None]:
print("Bedrooms seems to have some correlation to price for less than 5 bedrooms, perhaps 6 bedrooms represent hostels etc.")

In [None]:
list_df["bedrooms"].value_counts().reset_index().sort_values(by="bedrooms",ascending=True)

### 1.2.4 - Neighbourhood

In [None]:
list_df["neighbourhood"].nunique()

In [None]:
list_df["neighbourhood_cleansed"].nunique()

In [None]:
# Attach the cleansed neighbourhood to each listing's average price.
# list_df stacks one snapshot per year, so a listing present in both years
# appears twice; drop repeated (id, neighbourhood) pairs so those listings are
# not double-weighted in per-neighbourhood statistics computed from this frame.
listing_neighbourhoods = (
    list_df[["id", "neighbourhood_cleansed"]]
    .drop_duplicates()
    .rename(columns={"id": "listing_id"})
)
neighbourhood_price_df = pd.merge(left=average_prices, right=listing_neighbourhoods, how="inner", on="listing_id")

In [None]:
# Mean listing-level price per neighbourhood.
# Only price_num is aggregated: averaging the whole frame would also average
# listing_id, which is an identifier and has no meaningful mean.
price_by_neigh_df = neighbourhood_price_df.groupby("neighbourhood_cleansed", as_index=False)["price_num"].mean()

In [None]:
sns.barplot(data=price_by_neigh_df.sort_values(by="price_num",ascending=False),x="neighbourhood_cleansed",y="price_num",hue="neighbourhood_cleansed")

In [None]:
print("Neighbourhood seems to have a big impact on price")