In [None]:
import import_ipynb
from util import *

from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import *
from sklearn.metrics import confusion_matrix

import seaborn as sns

def seattle_boston_compare(y):
    """
    Draw a split violin plot of one listing feature, Seattle against Boston
    :param y: Name of the column of the joined listings file to put on the y-axis
    """

    # Load the joined listings and discard every row that has a missing value
    joined = read_csv('../data/listings_joined.csv').dropna()

    # The violin plot needs an x column; give every row the same dummy value
    # so that all rows land in one violin, split by the 'City' hue
    joined[''] = 0

    axes = sns.violinplot(data=joined, x='', y=y, hue='City', split=True)

    # Blank out the tick label of the dummy column
    axes.set_xticklabels([''])

    plt.show()

def seattle_boston_compare_price_of(x, yticklabels, max_price=20):
    """
    Compare Seattle vs. Boston prices of the given feature
    :param x: The feature of which we want to compare the price
    :param yticklabels: The labels for the y-axis (the price axis)
    :param max_price: Upper bound (inclusive) on the 'Price' column; rows above it
                      are left out of the plot. Defaults to 20, the cut-off that
                      used to be hard-coded here
    """

    listings = read_csv('../data/listings_joined.csv')
    listings = listings.dropna()

    # Keep only the rows whose price does not exceed the cut-off
    listings = listings[listings['Price'] <= max_price]

    # Report the largest price that survived the filter
    print(max(listings['Price']))

    # Create a violin plot
    plot = sns.violinplot(x=x, y='Price', hue='City', split=True, data=listings)
    plot.set_yticklabels(yticklabels)

    plt.show()

def _city_apt_data(listings, city, price_bucket_size):
    """
    Pick the columns used for the city comparison out of one city's listings
    :param listings: The listings data frame of a single city
    :param city: The value to store in the 'City' column of every row
    :param price_bucket_size: The size of the buckets for the price column
    :return: A data frame with the renamed feature columns, 'Price' and 'City'
    """

    # (output column, source column) pairs, in output order
    column_names = (
        ('Type of Room', 'room_type'),
        ('People Accommodates', 'accommodates'),
        ('Number of Bathrooms', 'bathrooms'),
        ('Number of Bedrooms', 'bedrooms'),
        ('Number of Beds', 'beds'),
    )

    city_data = pd.DataFrame({label: listings[source] for label, source in column_names})

    # Parse the raw price and truncate it to a whole bucket index
    city_data['Price'] = listings['price'].apply(lambda price: int(parse_price(price) / price_bucket_size))
    city_data['City'] = city

    return city_data


def join_seattle_boston_apt_data(price_bucket_size):
    """
    Join the data for Seattle and Boston into a single csv file
    Writes the result to '../data/listings_joined.csv'
    :param price_bucket_size: The size of the buckets for the price column
    """

    listings_seattle = read_csv('../data/listings.csv')
    listings_boston = read_csv('../data/boston/listings.csv')

    seattle_data = _city_apt_data(listings_seattle, 'Seattle', price_bucket_size)
    boston_data = _city_apt_data(listings_boston, 'Boston', price_bucket_size)

    # DataFrame.append was removed in pandas 2.0; pd.concat stacks the rows the same way
    joined = pd.concat([seattle_data, boston_data])
    joined.to_csv('../data/listings_joined.csv', index=False)

def train_pricing_model_on(x, price_multiple):
    """
    Train a model based on the normalized data
    Reads '../data/pricing_model_train.csv', fits an MLP classifier on 70% of the
    rows, prints the confusion matrix for the remaining 30%, writes it to
    '../data/confusion_matrix.csv' and shows it as a heatmap
    :param x: The input axis: a single column name, or a list of column names to
              train on several input columns at once
    :param price_multiple: The price multiple is necessary to convert the price into an integer for classification
    """

    # Get training data and turn price into an integer
    model_df = read_csv('../data/pricing_model_train.csv')
    # NOTE(review): int() truncates, so a product such as 28.999999999999996
    # becomes 28 - confirm truncation (rather than rounding) is intended
    model_df['price'] = model_df['price'].apply(lambda price: int(price * price_multiple))

    # Build one row per sample and one entry per input column. Selecting with a
    # list keeps the values; list() on a multi-column selection would only
    # yield the column labels
    feature_columns = [x] if isinstance(x, str) else list(x)
    X = model_df[feature_columns].values.tolist()

    # The y values are the price
    Y = list(model_df['price'])

    # Create an MLP Classifier
    clf = MLPClassifier(solver='sgd', activation='relu', alpha=1e-7, hidden_layer_sizes=(9, 7), random_state=15, max_iter=1000000)

    # Split into train and test data
    x_train, x_test, y_train, y_test = train_test_split(X, Y, test_size=0.3, random_state=1)

    # Fit the model
    clf.fit(x_train, y_train)

    # Make predictions on test input
    y_pred = clf.predict(x_test)

    # Build a confusion matrix
    matrix = confusion_matrix(y_test, y_pred)

    # Show the matrix
    print(matrix)

    # Persist the confusion matrix
    pd.DataFrame(matrix).to_csv('../data/confusion_matrix.csv',index=False)

    # Show a heatmap of the normalized confusion matrix (To visualize correctness as opposed to quantity)
    sns.heatmap(pd.DataFrame(matrix).pipe(normalize_confusion_matrix))
    plt.show()

def generate_pricing_model_data(price_bucket_size):
    """
    Build the training file for the pricing model from '../data/listings.csv'
    Keeps five input columns plus the price, drops incomplete rows, divides each
    input column by its maximum, buckets the price and divides the buckets by
    their maximum, then writes '../data/pricing_model_train.csv'
    :param price_bucket_size: The price bucket size
    """

    input_columns = ['accommodates', 'bathrooms', 'bedrooms', 'beds', 'guests_included']

    raw = read_csv('../data/listings.csv')
    raw['price'] = raw['price'].apply(parse_price)

    # Copy over the inputs followed by the price, then drop incomplete rows
    training = pd.DataFrame()
    for column in input_columns + ['price']:
        training[column] = raw[column]
    training = training.dropna()

    # Scale every input column by its own maximum
    for column in input_columns:
        training[column] = training[column] / max(training[column])

    # Truncate the price to a bucket index, then scale by the largest bucket
    bucketed = training['price'].apply(lambda price: int(price / price_bucket_size))
    training['price'] = bucketed / max(bucketed)

    training.to_csv('../data/pricing_model_train.csv', index=False)

    print('Done generating pricing model data')

def plot_box(data_frame, x, y, x_label, y_label):
    """
    Given a data frame, plot a box plot where the x values are discrete
    One box is drawn for every integer between the smallest and the largest x value
    :param data_frame: The data frame to use
    :param x: The x-axis dataframe column (discrete)
    :param y: The y-axis dataframe column
    :param x_label: The x-axis label
    :param y_label: The y-axis label
    """

    inter_df = pd.DataFrame()
    inter_df[x] = data_frame[x]
    inter_df[y] = data_frame[y]

    # dropna() returns a new frame, so the result has to be kept; otherwise
    # missing values still reach the min/max/int calls and the row count below
    inter_df = inter_df.dropna()

    plot_df = pd.DataFrame()

    num_rows = row_count(inter_df)

    from_val = int(min(inter_df[x]))
    to_val = int(max(inter_df[x]))
    for i in range(from_val, to_val + 1):
        # One column per x value; pad is handed the overall row count,
        # presumably so every column ends up the same length - TODO confirm
        plot_df[str(i)] = pad(inter_df[inter_df[x] == i][y], num_rows)

    standardize_plot_fonts()

    plot_to_show = plot_df.plot.box()

    plot_to_show.set_title(y_label + ' vs ' + x_label)
    plot_to_show.set_xlabel(x_label)
    plot_to_show.set_ylabel(y_label)

    plt.show()