# Application

In [1]:
import pandas as pd
import numpy as np

In [None]:
# Load the per-hotel table and derive the mean price for each property type.
property_type = pd.read_csv("final_df.csv")

# Take an explicit copy of columns 1..32 so the cleaning below modifies an
# independent frame rather than a slice of `property_type` (writing into a
# slice triggers pandas' SettingWithCopyWarning).
property_type1 = property_type.iloc[:, 1:33].copy()


def _strip_commas(value):
    """Return `value` without commas when it is a string; otherwise return it unchanged."""
    return value.replace(",", "") if isinstance(value, str) else value


# From the third column onwards, text values have their commas removed
# (presumably thousands separators in prices such as "1,234" -- TODO confirm).
# Non-string cells (numbers, NaN) are left untouched and cleaned values stay
# strings. Unlike a split-and-concatenate on the first comma, this cannot
# raise IndexError on a long string without a comma and it does not truncate
# values containing more than one comma.
value_cols = property_type1.columns[2:]
property_type1[value_cols] = property_type1[value_cols].apply(
    lambda column: column.map(_strip_commas)
)

# Average "Mean Price" per property type; the result is indexed by property type.
property_type2 = property_type1.loc[:, ["Property Type", "Mean Price"]]
property_type2 = property_type2.groupby(["Property Type"]).mean()



In [None]:
# Bar chart of the mean price per property type (one bar per row of
# `property_type2`, which is indexed by property type).

# No earlier cell imports matplotlib, so on a fresh kernel `plt` would be
# undefined here; import it in this cell so Restart & Run All works.
import matplotlib.pyplot as plt

plt.figure(figsize=(7, 4))

plt.bar(
    property_type2.index,
    property_type2["Mean Price"],
    color=('red', 'yellow', 'orange', 'blue', 'green', 'purple', 'black', 'grey'),
)

plt.title("Mean Price of Different Property Types")

plt.xlabel("Property Type")
# Rotate the category labels so longer property-type names do not overlap.
plt.xticks(rotation=90)

plt.ylabel("Mean Price")

plt.show()

In [None]:
import numpy as np
import pandas as pd 
import dash
import dash_core_components as dcc
import dash_html_components as html
from dash.dependencies import Input, Output
import webbrowser
from threading import Timer
import dash_table
import dash_table.FormatTemplate as FormatTemplate
import plotly.express as px

# Import datasets
df_details = pd.read_csv('dfclean_1adult.csv')
df_details = df_details.rename(columns={'Unnamed: 0': 'Name',
                                        'reviews': 'no. of reviews'})

# Use the `columns=` keyword: passing the axis positionally
# (`.drop('Unnamed: 0', 1)`) is rejected by pandas >= 2.0.
df_dates = pd.read_csv('final_df.csv').drop(columns='Unnamed: 0')

# Merge datasets on the hotel name and encode Y/N flags as 1/0.
df = df_details.merge(df_dates, on='Name')
df = df.replace(to_replace=['Y', 'N'], value=[1, 0])

# Columns 7..36 of the merged frame are treated as the per-date price columns.
# Their values may be text with commas (e.g. "1,234"), so go through str,
# drop the commas and convert to float; missing values round-trip via 'nan'
# back to NaN. Assigning by column label replaces each column outright, so
# the result is float dtype regardless of the original dtype.
date_cols = df.columns[7:37]
df[date_cols] = df[date_cols].apply(
    lambda column: column.astype(str).str.replace(',', '', regex=False).astype(float)
)

# Lookup values used to build the layout and by the callbacks.
user_df = df.copy()
hotel_types = user_df['Property Type'].unique()
features = ['Price'] + list(user_df.columns[2:5]) + list(user_df.columns[37:])
# The first nine features are ranked before scoring; the rest are used as-is.
continuous_features = features[:9]
# Features for which a larger value is worse (cheaper / closer is better).
continuous_features_A = ['Price', 'Distance to Mall', 'Distance to MRT']

external_stylesheets = ['https://codepen.io/chriddyp/pen/bWLwgP.css']

app = dash.Dash(__name__, external_stylesheets=external_stylesheets)
app.title = 'Hotel Booking'

def generate_table(dataframe, max_rows=5):
    """Render the first rows of `dataframe` as an HTML table.

    The 'link' column is not shown as a column of its own; instead each
    'Name' cell becomes a hyperlink to that row's 'link', opened in a new tab.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Must contain a 'link' column (dropping it raises KeyError otherwise).
    max_rows : int, default 5
        Maximum number of rows to render.

    Returns
    -------
    html.Table
        A table with one header row and up to `max_rows` body rows.
    """
    visible_cols = dataframe.drop(columns='link').columns
    row_count = min(len(dataframe), max_rows)

    def render_cell(record, col):
        # The hotel name doubles as the link to its booking page.
        if col == 'Name':
            anchor = html.A(href=record['link'], children=record[col], target='_blank')
            return html.Td(anchor)
        return html.Td(record[col])

    header = html.Thead(
        html.Tr([html.Th(col) for col in visible_cols])
    )

    body_rows = []
    for position in range(row_count):
        record = dataframe.iloc[position]
        body_rows.append(html.Tr([render_cell(record, col) for col in visible_cols]))

    return html.Table([header, html.Tbody(body_rows)])

colors = {'background': '#111111', 'text': '#7FDBFF'}


def _feature_weight_row(position, default_feature, default_weight):
    """Build one Step 3 row: a feature dropdown next to its weight slider.

    `position` (1, 2 or 3) determines the component ids 'feature<position>'
    and 'weight<position>' that the scoring callback reads.
    """
    return html.Div([
        dcc.Dropdown(
            id='feature{}'.format(position),
            options=[{'label': i, 'value': i} for i in features],
            value=default_feature
        ), html.Br(),
        dcc.Slider(
            id='weight{}'.format(position),
            min=10, max=90, step=10,
            # Label every selectable step, including the maximum (90%).
            marks={i: '{}%'.format(i) for i in range(10, 100, 10)},
            value=default_weight)
    ], style={"display": "grid", "grid-template-columns": "20% 10% 70%", "grid-template-rows": "50px"}
    )


app.layout = html.Div([

    #introduction
    html.Div([
        html.H2(children='Hello!',
                style={'color': colors['text']}),

        #inputs for date and hotel type
        html.Div([html.H4("Step 1: Input Date (eg. 4Nov): "),
                  dcc.Input(id='date-input', value='4Nov', type='text')],
                 style={'width': '30%', 'float': 'left'}),

        html.Div(id='date-output-hotel'),

        html.Div([
            html.H4('Step 2: Select Your Preferred Hotel Types:'),
            dcc.Dropdown(id='hotel-input',
                         options=[{'label': i, 'value': i} for i in hotel_types],
                         value=hotel_types,
                         multi=True)],
            style={'width': '70%', 'float': 'right'}),
        html.Br(), html.Br()
    ]),

    #return available hotels for given date
    html.Div([
        html.Br(), html.Br(), html.Hr(),
        dcc.Graph(id='output-submit'),
        html.Hr(),
    ]),

    #input top 3 features
    html.Div([
        html.H4(children='Step 3: Select Your Top 3 Features:'),
    ]),

    # Default selection: the first three features weighted 50% / 30% / 20%.
    _feature_weight_row(1, features[0], 50),
    _feature_weight_row(2, features[1], 30),
    _feature_weight_row(3, features[2], 20),

    #return top 5 hotels recommended
    html.Div([
        html.Hr(),
        html.H2(children='Top 5 Hotels Recommended For You',
                style={'color': colors['text']}),
        html.Div(id='output-feature'),
        html.Hr()
    ])
])

#update available hotels for given date
@app.callback(Output('output-submit', 'figure'),
                [Input('hotel-input', 'value'), Input('date-input', 'value')])
def update_hotels(hotel_input, date_input):
    """Bar chart of how many hotels of each selected type have a price on `date_input`.

    Parameters
    ----------
    hotel_input : list or None
        Property types chosen in the 'hotel-input' dropdown.
    date_input : str
        Text typed into the 'date-input' box; it must match one of the
        date column labels in `date_cols` (e.g. '4Nov').

    Returns
    -------
    plotly figure with one bar per property type.

    Raises
    ------
    dash.exceptions.PreventUpdate
        When `date_input` is not a known date column. The callback fires
        while the user is still typing, so incomplete text is expected;
        the previous figure is kept instead of failing with a KeyError.
    """
    if date_input not in date_cols:
        raise dash.exceptions.PreventUpdate
    # Treat a missing dropdown value as "nothing selected".
    if hotel_input is None:
        hotel_input = []

    # A hotel counts as available when it has a (non-null) price for the date.
    is_available = df[date_input].notnull() & df['Property Type'].isin(hotel_input)
    plot_df = df[is_available].groupby('Property Type')['Name'].count().reset_index()

    fig = px.bar(plot_df, x='Property Type', y='Name', color="Property Type", title="Hotel Types available on {}:".format(date_input))
    fig.update_layout(transition_duration=500)
    return fig

#update top 5 hotels recommended
@app.callback(Output('output-feature', 'children'),
                [Input('hotel-input', 'value'), Input('date-input', 'value'), 
                 Input('feature1', 'value'),  Input('feature2', 'value'), Input('feature3', 'value'),
                 Input('weight1', 'value'), Input('weight2', 'value'), Input('weight3', 'value')])
def update_features(hotel_input, date_input, feature1, feature2, feature3, weight1, weight2, weight3):
    """Rank the hotels available on `date_input` and return the top five as a table.

    Each selected feature contributes `weight/100 * value` to a hotel's score.
    Features listed in `continuous_features` are first replaced by their rank
    among the available hotels (for those in `continuous_features_A` a larger
    raw value gets a smaller rank); other features are used as-is. Hotels are
    ordered by score divided by price, highest first.

    NOTE(review): ranks range from 1 to the number of hotels while the
    remaining features appear to be 0/1 flags, so the two kinds of feature
    are not on a common scale -- confirm this is intended.

    Parameters
    ----------
    hotel_input : list or None
        Property types chosen in the 'hotel-input' dropdown.
    date_input : str
        Label of the date column whose values are used as the price.
    feature1, feature2, feature3 : str
        Selected feature names.
    weight1, weight2, weight3 : number
        Percent weights of the corresponding features.

    Returns
    -------
    html.Table built by `generate_table` for the five best hotels.

    Raises
    ------
    dash.exceptions.PreventUpdate
        When `date_input` is not a known date column (e.g. while the user is
        still typing) or a feature dropdown has been cleared; the previous
        table is kept instead of failing with a KeyError.
    """
    chosen_features = [feature1, feature2, feature3]
    if date_input not in date_cols or any(feature is None for feature in chosen_features):
        raise dash.exceptions.PreventUpdate
    # Treat a missing dropdown value as "nothing selected".
    if hotel_input is None:
        hotel_input = []

    # Keep hotels that have a price on the date and match the chosen types.
    # The explicit copy lets the columns below be added without
    # SettingWithCopyWarning.
    is_candidate = df[date_input].notnull() & df['Property Type'].isin(hotel_input)
    user_df = df[is_candidate].copy()
    user_df['Price'] = user_df[date_input]

    # Map every selected feature to the column that is actually scored.
    # Doing this per position (rather than replacing the first match in the
    # list) keeps the result correct when the same feature is picked twice.
    score_columns = []
    for feature in chosen_features:
        if feature in continuous_features:
            rank_col = feature + str(' rank')
            # ascending=False gives the largest raw value rank 1, i.e. the
            # lowest score -- used where a higher value is worse.
            higher_is_better = feature not in continuous_features_A
            user_df[rank_col] = user_df[feature].rank(ascending=higher_is_better)
            score_columns.append(rank_col)
        else:
            score_columns.append(feature)

    #Scoring: weight * feature's score
    user_df['Score'] = (((weight1/100) * user_df[score_columns[0]])
                      + ((weight2/100) * user_df[score_columns[1]])
                      + ((weight3/100) * user_df[score_columns[2]])).round(1)

    #Score-to-Price ratio
    user_df['Value_to_Price ratio'] = (user_df['Score'] / user_df['Price']).round(1)
    user_df = user_df.sort_values(by=['Value_to_Price ratio'], ascending=False).reset_index()

    # Show the raw selected features (Price is already a fixed column)
    # followed by the rank columns that went into the score. Duplicates are
    # removed, order preserved, so the table never gets repeated columns.
    raw_features = [feature for feature in chosen_features if feature != 'Price']
    derived_columns = [column for column in score_columns if column not in raw_features]
    result_columns = list(dict.fromkeys(
        ['Name', 'Property Type', 'Price', 'Score', 'Value_to_Price ratio'] + ['link'] + raw_features + derived_columns
    ))
    user_df_results = user_df[result_columns]

    return generate_table(user_df_results.head(5))

# Local address at which the Dash development server is served.
port = 8050
url = "http://127.0.0.1:" + str(port)


def open_browser():
    """Open the app's URL in a new browser window."""
    webbrowser.open_new(url)


if __name__ == '__main__':
    # Open the browser from a timer thread half a second from now, because
    # the run_server call below blocks this thread.
    browser_timer = Timer(0.5, open_browser)
    browser_timer.start()
    app.run_server(debug=False, port=port)

# Price Prediction

In [None]:
import glob
import pandas as pd
import numpy as np
import statsmodels.formula.api as smf
import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
%matplotlib inline

from sklearn.metrics import mean_squared_error, r2_score
from sklearn import datasets, linear_model
from sklearn.tree import DecisionTreeRegressor
from sklearn.linear_model import LogisticRegression
import random
import xgboost as xgb


# All per-date snapshot files. NOTE(review): `dfs` is not used below in this
# cell -- only the 10 Nov snapshot is loaded.
dfs = glob.glob("*Novhotels.csv")

train_features = pd.read_csv("10Novhotels.csv")

#Preliminary data cleaning
# Collect the columns that are almost entirely missing.
# NOTE(review): the ratio is "nulls in the column" divided by the number of
# non-null "Laundry Service" values, not by the number of rows. It is kept
# as-is because the hard-coded `title` list below relies on exactly this
# selection of dropped columns -- confirm before changing.
col_names = train_features.columns
list1 = []
for i in col_names:
    prop_na = sum(train_features.loc[:,i].isnull())/train_features.loc[:,"Laundry Service"].count()
    if prop_na >= .9:
        list1.append(i)

# Column names applied after the drops below; the first column (the price of
# the loaded date) becomes 'Price'.
title = ['Price', 'Property Type', 'Number of Stars', 'Review Score',
       'Cleanliness', 'Distance to Mall', 'Distance to MRT',
       'Early Check-in (Before 3pm)', 'Late Check-out (After 12pm)',
       'Pay Later', 'Free Cancellation', 'Gym', 'Swimming Pool', 'Car Park',
       'Airport Transfer', 'Breakfast', 'Hygiene+ (Covid-19)',
       '24h Front Desk', 'Laundry Service', 'Bathtub', 'Balcony', 'Kitchen',
       'TV', 'Internet', 'Air Conditioning', 'Ironing', 'Non-Smoking']

train_features = train_features.drop(columns = list1)
train_features = train_features.drop(['Unnamed: 0', 'Name'], axis = 1)
train_features.columns = title

pd.options.display.max_columns = None
pd.options.display.max_rows = None

# Encode Y/N flags as 1/0 and keep only rows that have a price.
# .copy() so the column assignments below act on an independent frame
# (avoids SettingWithCopyWarning on the filtered result).
train_features = train_features.replace(['Y', 'N'], [1, 0])
train_features = train_features[train_features["Price"].notna()].copy()

# Prices may be text with commas (e.g. "1,234"): strip them, then convert.
train_features["Price"] = train_features["Price"].astype(str).str.replace(',', '', regex=False)
train_features["Price"] = pd.to_numeric(train_features["Price"])

#Change stars to categorical
train_features["Number of Stars"] = train_features["Number of Stars"].astype(str)

#One hot encoding
train_features = pd.get_dummies(train_features)

# Fill missing distances (and any remaining missing price) with the rounded
# column mean.
mean_val_distmall = round(train_features['Distance to Mall'].mean(),0)
train_features['Distance to Mall']=train_features['Distance to Mall'].fillna(mean_val_distmall)
mean_val_distmrt = round(train_features['Distance to MRT'].mean(),0)
train_features['Distance to MRT']=train_features['Distance to MRT'].fillna(mean_val_distmrt)
mean_val_price = round(train_features['Price'].mean(),0)
train_features['Price']=train_features['Price'].fillna(mean_val_price)

# Create correlation matrix
corr_matrix = train_features.corr().abs()

# Select upper triangle of correlation matrix.
# Use the builtin `bool`: the `np.bool` alias was removed in NumPy 1.24 and
# raises AttributeError on the 1.24-1.26 releases.
upper = corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k=1).astype(bool))

# Find features with correlation greater than 0.95
to_drop = [column for column in upper.columns if any(upper[column] > 0.95)]

# Drop features (reassign rather than mutate in place so re-running is safe).
train_features = train_features.drop(columns=to_drop)

# Every remaining column except the target is a model input.
labels = list(train_features.columns)
labels.remove('Price')

training_features = labels
target = 'Price'

random.seed(5)
#Perform train-test split
#creating 90% training data and 10% test data
# `random.seed` does not affect scikit-learn's shuffling, so the split is
# pinned with `random_state`; otherwise the metrics change on every run.
X_train, X_test, Y_train, Y_test = train_test_split(
    train_features[training_features], train_features[target],
    train_size = 0.9, random_state = 5)

# Candidate hyper-parameter values. NOTE(review): none of these grids is
# used below in this cell; the model is fitted with the fixed values that follow.
colsample = np.arange(0.0, 1.1, 0.1)
learningrate = np.arange(0.0, 1.1, 0.1)
maxdepth = list(range(1, 1000))
alpha_val = list(range(1, 1000))
n_estimators_val = list(range(1, 1000))

# 'reg:squarederror' is the current name of the squared-error objective;
# the old alias 'reg:linear' is deprecated.
xg_reg = xgb.XGBRegressor(objective ='reg:squarederror', colsample_bytree = 0.3, learning_rate = 0.1,
                max_depth = 5, alpha = 1, n_estimators = 20)

xg_reg.fit(X_train,Y_train)

predicted = xg_reg.predict(X_test)

#the mean squared error
print('Mean squared error: %.2f' % mean_squared_error(Y_test, predicted))

#explained variance score: 1 is perfect prediction
print('R square score: %.2f' % r2_score(Y_test,predicted))


# Actual prices: read the wide table (one column per date), treat the
# literal "[]" as missing, and reshape to one row per (hotel, date).
df = (
    pd.read_csv("prices_1adult.csv")
    .replace(to_replace="[]", value=np.nan)
    .melt(id_vars='Unnamed: 0')
)
df.columns = ["Name", "Date", "Price"]

# Predicted prices, with exact duplicate rows removed.
df_second = pd.read_csv("Predicted_Price.csv").drop_duplicates()

# Pair each actual price with its prediction for the same hotel and date.
df_merge_col = df.merge(df_second, on=['Name', 'Date'])
# df_merge_col.to_csv("Predicted_Price.csv")
# df_merge_col.to_csv("Predicted_Price.csv")
