In [None]:
import import_ipynb
from util import *

def generate_pricing_model_data(price_bucket_size):
    """
    Build the pricing model training file from the listings file.

    Reads '../data/listings.csv', runs the price column through parse_price,
    keeps five input columns plus the price, drops every row that has a
    missing value in any of them, divides each input column by its maximum,
    turns each price into a bucket index (price / price_bucket_size,
    truncated by int()) divided by the largest bucket index, and writes the
    result to '../data/pricing_model_train.csv' without the index column.
    :param price_bucket_size: The price bucket size
    """

    feature_columns = (
        'accommodates',
        'bathrooms',
        'bedrooms',
        'beds',
        'guests_included',
    )

    def to_bucket(price):
        # int() truncates toward zero, so each price maps to a bucket index.
        return int(price / price_bucket_size)

    source = read_csv('../data/listings.csv')
    source['price'] = source['price'].apply(parse_price)

    # Copy over only the columns the model uses, inputs first and price last,
    # so the output file keeps that column order.
    model_df = pd.DataFrame()
    for column in feature_columns + ('price',):
        model_df[column] = source[column]
    model_df = model_df.dropna()

    # Scale every input column by its own largest value.
    for column in feature_columns:
        model_df[column] = model_df[column] / max(model_df[column])

    # Bucket the price, then scale the bucket indices by the largest one.
    bucketed = model_df['price'].apply(to_bucket)
    model_df['price'] = bucketed / max(bucketed)

    model_df.to_csv('../data/pricing_model_train.csv', index=False)

    print('Done generating pricing model data')

def plot_box(data_frame, x, y, x_label, y_label):
    """
    Given a data frame, plot a box plot where the x values are discrete.

    One box is drawn for every integer from the smallest to the largest x
    value (inclusive); each box is built from the y values of the rows whose
    x value equals that integer. Rows with a missing x or y value are
    ignored. The plot is titled '<y_label> vs <x_label>' and shown with
    plt.show().
    :param data_frame: The data frame to use
    :param x: The x-axis dataframe column (discrete)
    :param y: The y-axis dataframe column
    :param x_label: The x-axis label
    :param y_label: The y-axis label
    """

    inter_df = pd.DataFrame()
    inter_df[x] = data_frame[x]
    inter_df[y] = data_frame[y]

    # dropna() returns a new frame rather than modifying in place, so the
    # result has to be kept; otherwise a missing x value reaches the int()
    # conversions below and the row count includes unusable rows.
    inter_df = inter_df.dropna()

    plot_df = pd.DataFrame()

    num_rows = row_count(inter_df)

    from_val = int(inter_df[x].min())
    to_val = int(inter_df[x].max())
    for i in range(from_val, to_val + 1):
        # NOTE(review): pad presumably extends each group to num_rows entries
        # so all columns have the same length - confirm against util.
        plot_df[str(i)] = pad(inter_df[inter_df[x] == i][y], num_rows)

    standardize_plot_fonts()

    plot_to_show = plot_df.plot.box()

    plot_to_show.set_title(y_label + ' vs ' + x_label)
    plot_to_show.set_xlabel(x_label)
    plot_to_show.set_ylabel(y_label)

    plt.show()