In [54]:
import pandas as pd
import numpy as np

import warnings
warnings.filterwarnings('ignore')

In [55]:
# Raise pandas' display limits so wide/long frames are shown without truncation.
for option_name in ('display.max_rows', 'display.max_columns'):
    pd.set_option(option_name, 200)

In [56]:
# Load the raw listings table.
listings_path = 'data/listings_summary.csv'
df = pd.read_csv(listings_path)

# Explaining the goal:

My goal is to use NLP to, given a range of suggested prices for a property, determine if the host should select a value on the higher or lower end of that range.

The range of suggested prices is determined by taking the prediction of the XGBoost model used in the project, then adding and subtracting 20% of that value to get the upper and lower limits of the suggested price. (The 20% figure is a little arbitrary: I computed the MAE of the predictions grouped by number of bedrooms, and the errors were a little above 20%, but a much wider margin gave too extreme a range.)

The suggestion for roughly where on that range the host should aim is determined by: 
- Labeling each listing by whether its price is above or below the median market value for a unit with that number of bedrooms (number of bedrooms was selected because it had the highest feature importance in price prediction).
- Tokenizing and vectorizing the description of each unit, limited to descriptions I'm fairly confident are mostly in English.
- Using logistic regression to predict the probability that a description is associated with a price above the median market value.
- Using that probability to give a general suggestion for where a user should aim within their price range.


## Restricting to columns to use NLP with

In [57]:
# Keep only the columns the NLP step needs. The explicit copy makes df1 an
# independent frame, so assigning to its columns later neither raises
# SettingWithCopyWarning nor risks writing through to `df`.
df1 = df[['description','bedrooms','price']].copy()

In [192]:
# Preview the first rows of the selected columns.
df1.head()

Unnamed: 0,description,bedrooms,price
0,Great location! 30 of 75 sq meters. This wood...,1.0,60.0
1,In the summertime we are spending most of our ...,1.0,17.0
2,This beautiful first floor apartment is situa...,1.0,90.0
3,First of all: I prefer short-notice bookings. ...,1.0,26.0
4,Cozy and large room in the beautiful district ...,1.0,42.0


#### 'description' is the column we'll do NLP on. 'bedrooms' was found in the exploratory notebook to have the strongest association with price. 'price' will be used to separate descriptions into 'high' and 'low' for NLP classification.

## Cleaning Data

In [58]:
# Strip the currency formatting ('$' and thousands separators) and parse the
# price as a float. regex=False forces literal matching: '$' is otherwise a
# regex end-of-string anchor. `assign` returns a new frame instead of writing
# into what may be a slice of `df`.
df1 = df1.assign(
    price=df1['price']
    .str.replace('$', '', regex=False)
    .str.replace(',', '', regex=False)
    .astype(float)
)

In [59]:
# Discard every row that has a missing description, bedroom count or price.
df1 = df1.dropna(axis=0, how='any')

In [60]:
# Keep prices strictly between 0 and 900 (the author notes these limits are
# deliberately more relaxed than the ones used elsewhere).
plausible_price = df1['price'].gt(0) & df1['price'].lt(900)
df1 = df1[plausible_price]

In [61]:
# Row count left after dropping missing values and filtering on price.
len(df1)

22281

In [62]:
# Heuristic English filter: keep a description when it contains at least one
# of these space-delimited common English words.
english_markers = (' the ', ' this ', ' and ')
looks_english = df1['description'].str.contains(english_markers[0])
for marker in english_markers[1:]:
    looks_english = looks_english | df1['description'].str.contains(marker)
df2 = df1[looks_english]

In [63]:
# Row count left after the English-language heuristic.
len(df2)

16169

In [67]:
# Median price per bedroom count. 'price' is selected explicitly so the text
# 'description' column is never aggregated: current pandas raises on
# non-numeric columns instead of silently dropping them.
df2.groupby(['bedrooms'])[['price']].median()

Unnamed: 0_level_0,price
bedrooms,Unnamed: 1_level_1
0.0,50.0
1.0,40.0
2.0,80.0
3.0,120.0
4.0,150.0
5.0,240.0
6.0,349.0
7.0,460.0
8.0,584.0


In [70]:
# One sub-frame per bedroom count, 0 through 8. Rows with any other bedroom
# count end up in none of these frames.
(df2_0, df2_1, df2_2, df2_3, df2_4,
 df2_5, df2_6, df2_7, df2_8) = (
    df2[df2['bedrooms'] == bedroom_count] for bedroom_count in range(9)
)

In [81]:
def _flag_above_median(frame):
    """Return a copy of `frame` with a boolean `is_above_median` column.

    A row is True when its price is at or above the median price of `frame`
    (ties count as "above").
    """
    # Compute the median once per frame rather than once per row.
    median_price = frame['price'].median()
    # `assign` builds a new frame, so nothing is written onto a slice of df2.
    return frame.assign(is_above_median=frame['price'] >= median_price)


(df2_0, df2_1, df2_2, df2_3, df2_4,
 df2_5, df2_6, df2_7, df2_8) = (
    _flag_above_median(frame)
    for frame in (df2_0, df2_1, df2_2, df2_3, df2_4,
                  df2_5, df2_6, df2_7, df2_8)
)

In [82]:
# Labeled per-bedroom frames, in ascending bedroom order.
separated_frames = [
    df2_0, df2_1, df2_2,
    df2_3, df2_4, df2_5,
    df2_6, df2_7, df2_8,
]

In [101]:
# Stack the labeled per-bedroom frames back into one frame (row-wise).
df3 = pd.concat(separated_frames, axis=0)

In [216]:
# Number of listings per label (True = at/above median, False = below).
above_median_flags = df3['is_above_median']
tf_counts = above_median_flags.value_counts()

### Baseline to beat

In [219]:
# Share of listings labeled True (at/above median): the accuracy to beat.
# The label is looked up explicitly as True. Integer keys (1 / 0) on a boolean
# index are ambiguous: depending on the pandas version they resolve by label
# or by position, and the positional reading returns the other class's share.
tf_counts.get(True, 0) / tf_counts.sum()

0.5367060424268663

In [120]:
import re
# Remove every character that is not an ASCII letter, digit or space.
# `^` negates a character class only in first position; anywhere else it is a
# literal caret, so it must not be repeated inside the brackets.
# NOTE(review): the vectorizer cell further down is fit on the raw
# `description` column, not on these derived columns.
df3['description_cleaned'] = df3['description'].apply(lambda x: re.sub(r'[^a-zA-Z0-9 ]', '', x))
# Lowercase, then split on whitespace to create tokens.
df3['tokens'] = df3['description_cleaned'].apply(lambda x: x.lower().split())

## Using tfidfVectorizer


In [135]:
# Order listings by label: below-median rows (False) first, at/above-median (True) last.
df3 = df3.sort_values('is_above_median', ascending=True)

In [123]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [313]:
# min_df=2 ignores terms that occur in fewer than two descriptions;
# strip_accents='ascii' folds accented characters to their ASCII form.
tfidf_settings = {
    'min_df': 2,
    'strip_accents': 'ascii',
}
vectorizer = TfidfVectorizer(**tfidf_settings)

In [314]:
# Learn the vocabulary / IDF weights from the raw descriptions and vectorize them.
raw_descriptions = df3['description']
tfidf = vectorizer.fit_transform(raw_descriptions)

In [315]:
# Pickling the fitted TF-IDF vectorizer for the app.
# New descriptions can only be vectorized with the fitted vectorizer (its
# vocabulary and IDF weights), so that is what gets persisted, not the
# transformed training matrix. The `with` block closes the file handle.
# NOTE(review): confirm the app loads 'tfidf.pkl' as a vectorizer.
import pickle
with open('tfidf.pkl', 'wb') as tfidf_file:
    pickle.dump(vectorizer, tfidf_file)

In [316]:
# Convert the sparse TF-IDF matrix to a dense ndarray (rebinds `tfidf`).
# NOTE(review): every zero is materialized, so memory grows with
# rows x vocabulary size; consider keeping the sparse form if memory is tight.
tfidf = tfidf.toarray()

In [317]:
# (number of descriptions, vocabulary size)
tfidf.shape

(16169, 16692)

In [237]:
# Vocabulary terms, in the same order as the TF-IDF columns.
# `get_feature_names` was removed in scikit-learn 1.2; use its replacement
# `get_feature_names_out` when available and fall back on older releases.
if hasattr(vectorizer, 'get_feature_names_out'):
    words = vectorizer.get_feature_names_out()
else:
    words = vectorizer.get_feature_names()

In [238]:
# Target: 1 = at/above the bedroom-group median price, 0 = below.
y = df3['is_above_median'].astype(int)
# Features: the TF-IDF matrix of the descriptions.
X = tfidf

In [239]:
from sklearn.model_selection import train_test_split

In [240]:
# Hold out a test split; the fixed seed keeps the split reproducible.
split_parts = train_test_split(X, y, random_state=333)
X_train, X_test, y_train, y_test = split_parts

In [241]:
from sklearn.linear_model import LogisticRegression

In [242]:
# Logistic regression with default settings, trained on the training split.
model = LogisticRegression()
model.fit(X_train, y_train)

In [318]:
# Pickling the fitted logistic regression model for the app.
# The object written must be `model`; writing `tfidf` here would store the
# training matrix under the model's file name. The `with` block closes the
# file handle.
import pickle
with open('nlp_lr_model.pkl', 'wb') as model_file:
    pickle.dump(model, model_file)

In [243]:
# Per-row class probabilities on the held-out split: column 0 is class 0
# (below median), column 1 is class 1 (at/above median).
model.predict_proba(X_test)

array([[0.56343433, 0.43656567],
       [0.6339595 , 0.3660405 ],
       [0.91804421, 0.08195579],
       ...,
       [0.57899581, 0.42100419],
       [0.38214468, 0.61785532],
       [0.20434294, 0.79565706]])

In [312]:
# Hard 0/1 class predictions for the held-out split.
model.predict(X_test)

array([0, 0, 0, ..., 0, 1, 1])

In [245]:
# Accuracy on the held-out split, to compare against the baseline share above.
model.score(X_test, y_test)

0.6950284442245857

## Exploring tokens associated with units above and below median price

In [159]:
import numpy as np

In [160]:
# Flatten the coefficient matrix to one weight per vocabulary term.
coef = model.coef_.ravel()

In [161]:
# Vocabulary index of the term with the largest (most positive) coefficient.
np.argmax(coef)

9369

In [163]:
# Term with the largest coefficient. The index is derived from `coef` rather
# than typed in from an earlier output, so it stays correct when the
# vocabulary or the model changes.
words[np.argmax(coef)]

'living'

In [173]:
# Indices of the ten largest coefficients, in ascending coefficient order.
ascending_coef_order = np.argsort(coef)
high_idx = ascending_coef_order[-10:]

In [174]:
# Show the vocabulary indices of the ten largest coefficients.
high_idx

array([ 9423,  4643,  8240, 10137, 14748,  2527, 10103, 12481,  1648,
        9369], dtype=int64)

In [181]:
# Indices of the ten smallest (most negative) coefficients, smallest first.
ascending_coef_order = np.argsort(coef)
low_idx = ascending_coef_order[:10]

In [182]:
# Show the vocabulary indices of the ten smallest coefficients.
low_idx

array([12656, 13305, 13304,  4659, 16092, 16671,  6145, 14444, 16213,
        2367], dtype=int64)

In [183]:
# Convert to an ndarray so the vocabulary can be indexed with the index
# arrays computed above (a plain list does not support that).
words = np.array(words)

In [184]:
# Terms with the ten largest coefficients (pushing towards "above median").
words[high_idx]

array(['loft', 'design', 'interior', 'modern', 'terrace', 'berlin',
       'mitte', 'restaurants', 'apartment', 'living'], dtype='<U36')

In [185]:
# Terms with the ten smallest coefficients (pushing towards "below median").
words[low_idx]

array(['room', 'shared', 'share', 'desk', 'wedding', 'zimmer', 'flatmate',
       'supermarkets', 'wg', 'bedrooms'], dtype='<U36')

## Testing model with user-generated description

In [246]:
# A hand-written description, wrapped in a list because the vectorizer
# expects an iterable of documents. The text is left exactly as typed.
example = ["This is a great living envorinment. We're located in the center of Berlin. Fully redisigned interior. Local restaurants exist right outside our door"]

In [247]:
# Vectorize with the already-fitted vectorizer (transform only, no re-fit).
example_transformed = vectorizer.transform(example)

In [248]:
# [P(below median), P(at/above median)] for the hand-written description.
model.predict_proba(example_transformed)

array([[0.19393825, 0.80606175]])

## Cleaning example of dataframe to test predictions

In [249]:
# Use the first five raw listings as a worked example.
df_example = df.iloc[:5]

In [253]:
# Keep the description plus the columns this notebook goes on to clean.
# The explicit copy detaches the frame from `df`, so later column writes
# neither raise SettingWithCopyWarning nor touch the raw data.
df_example = df_example[['description','neighbourhood_group_cleansed','room_type','accommodates','bathrooms','bedrooms','beds',
          'bed_type','amenities','security_deposit','cleaning_fee', 'minimum_nights', 'price']].copy()

In [254]:
# Show the selected example rows before any cleaning.
df_example

Unnamed: 0,description,neighbourhood_group_cleansed,room_type,accommodates,bathrooms,bedrooms,beds,bed_type,amenities,security_deposit,cleaning_fee,minimum_nights,price
0,Great location! 30 of 75 sq meters. This wood...,Mitte,Entire home/apt,3,1.0,1.0,2.0,Real Bed,"{TV,""Cable TV"",Wifi,Kitchen,Gym,Heating,""Famil...",$200.00,$30.00,4,$60.00
1,In the summertime we are spending most of our ...,Pankow,Private room,2,1.0,1.0,1.0,Real Bed,"{Wifi,Kitchen,Elevator,Heating,Washer,Essentia...",$0.00,$0.00,2,$17.00
2,This beautiful first floor apartment is situa...,Pankow,Entire home/apt,4,1.0,1.0,2.0,Real Bed,"{Internet,Wifi,Kitchen,""Buzzer/wireless interc...",$200.00,$50.00,62,$90.00
3,First of all: I prefer short-notice bookings. ...,Tempelhof - Schöneberg,Private room,2,1.0,1.0,1.0,Pull-out Sofa,"{Internet,Wifi,""Pets allowed"",""Pets live on th...",$250.00,$30.00,5,$26.00
4,Cozy and large room in the beautiful district ...,Pankow,Private room,2,1.0,1.0,2.0,Real Bed,"{Wifi,Heating,""Family/kid friendly"",Essentials...",$0.00,$0.00,2,$42.00


In [255]:
# cleaning dollar-amount columns
monetary_columns = ['security_deposit','cleaning_fee','price']
# regex=False forces literal matching ('$' is otherwise a regex anchor).
# `assign` replaces the columns in place of position and returns a new frame,
# so nothing is written onto a slice of `df`.
df_example = df_example.assign(**{
    column: (
        df_example[column]
        .str.replace('$', '', regex=False)
        .str.replace(',', '', regex=False)
        .astype(float)
    )
    for column in monetary_columns
})

In [256]:
used_amenities = ['Washer', 'Hair dryer', 'Laptop friendly workspace', 'Hangers',
       'Iron', 'Shampoo', 'TV', 'Hot water', 'Family/kid friendly', 'Internet',
       'Host greets you', 'Smoke detector', 'Buzzer/wireless intercom',
       'Lock on bedroom door', 'Free street parking', 'Elevator', 'Bed linens',
       'Smoking allowed', 'First aid kit', 'Cable TV']
# One 0/1 column per amenity, added in list order. regex=False treats each
# name as a literal substring. Being a substring test, 'TV' is also set by
# an entry such as 'Cable TV'.
amenity_flags = {
    amenity: df_example['amenities'].str.contains(amenity, regex=False).astype(int)
    for amenity in used_amenities
}
# `assign` returns a new frame, so no column is written onto a slice of `df`.
df_example = df_example.assign(**amenity_flags).drop(columns='amenities')

In [257]:
# Show the example rows after cleaning and amenity expansion.
df_example

Unnamed: 0,description,neighbourhood_group_cleansed,room_type,accommodates,bathrooms,bedrooms,beds,bed_type,security_deposit,cleaning_fee,minimum_nights,price,Washer,Hair dryer,Laptop friendly workspace,Hangers,Iron,Shampoo,TV,Hot water,Family/kid friendly,Internet,Host greets you,Smoke detector,Buzzer/wireless intercom,Lock on bedroom door,Free street parking,Elevator,Bed linens,Smoking allowed,First aid kit,Cable TV
0,Great location! 30 of 75 sq meters. This wood...,Mitte,Entire home/apt,3,1.0,1.0,2.0,Real Bed,200.0,30.0,4,60.0,0,1,1,1,1,1,1,1,1,0,0,1,0,1,0,0,1,0,0,1
1,In the summertime we are spending most of our ...,Pankow,Private room,2,1.0,1.0,1.0,Real Bed,0.0,0.0,2,17.0,1,1,1,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0
2,This beautiful first floor apartment is situa...,Pankow,Entire home/apt,4,1.0,1.0,2.0,Real Bed,200.0,50.0,62,90.0,1,1,1,1,1,1,0,0,1,1,0,1,1,0,0,0,0,0,1,0
3,First of all: I prefer short-notice bookings. ...,Tempelhof - Schöneberg,Private room,2,1.0,1.0,1.0,Pull-out Sofa,250.0,30.0,5,26.0,1,1,1,1,1,1,0,1,0,1,1,1,1,0,1,0,1,0,0,0
4,Cozy and large room in the beautiful district ...,Pankow,Private room,2,1.0,1.0,2.0,Real Bed,0.0,0.0,2,42.0,0,1,1,1,0,1,0,1,1,0,0,0,0,0,0,0,1,0,0,0


# First predicting the price range, then offering a recommendation on where to select in the range based on the NLP prediction

In [259]:
import pickle

In [261]:
# Load the previously trained price model. The `with` block closes the file
# handle once loading is done.
# NOTE: unpickling executes arbitrary code; only load files you produced yourself.
with open('model.pkl', 'rb') as price_model_file:
    price_model = pickle.load(price_model_file)

In [262]:
# The price model takes neither the free-text description nor the target
# price, so both are removed from its input frame.
columns_not_fed_to_price_model = ['description', 'price']
df_price_prediction = df_example.drop(columns_not_fed_to_price_model, axis=1)

In [275]:
# Predicted price for each example row, from the loaded price model.
price_predictions = price_model.predict(df_price_prediction)

In [276]:
# Show the predicted prices.
price_predictions

array([76.14437 , 36.75857 , 74.26539 , 37.7499  , 38.892452],
      dtype=float32)

In [266]:
# Description and bedroom count paired with the predicted price. `assign`
# builds a new frame instead of adding a column onto a slice of df_example.
# NOTE(review): no later cell in this notebook reads df_above_median.
df_above_median = df_example[['description','bedrooms']].assign(price=price_predictions)

In [277]:
# Vectorize the example descriptions with the fitted vectorizer, then score
# them with the logistic regression model.
example_descriptions = df_example['description']
df_example_description_transformed = vectorizer.transform(example_descriptions)
predict_probas = model.predict_proba(df_example_description_transformed)

In [290]:
# Both class probabilities per example row: [below median, at/above median].
predict_probas

array([[0.3407018 , 0.6592982 ],
       [0.70646548, 0.29353452],
       [0.21735763, 0.78264237],
       [0.62607432, 0.37392568],
       [0.6621181 , 0.3378819 ]])

In [289]:
# Second column only: probability of class 1 (at/above median).
predict_probas[:, 1]

array([0.6592982 , 0.29353452, 0.78264237, 0.37392568, 0.3378819 ])

In [279]:
# NOTE(review): `d` is not read by any later cell in this notebook, and it
# stores the full two-column probability array rather than the class-1 column.
d = {'predicted_prices': price_predictions, 'probability_above_median': predict_probas}

In [284]:
# Independent (deep) copy of the model-input frame to hang the predictions on.
predictions_df = df_price_prediction.copy(deep=True)

In [286]:
# Attach the price prediction and the class-1 (at/above median) probability.
predictions_df = predictions_df.assign(
    predicted_prices=price_predictions,
    probability_above_median=predict_probas[:, 1],
)

In [296]:
# Keep only the two prediction columns. The explicit copy makes preds_df an
# independent frame, so adding columns to it later does not raise
# SettingWithCopyWarning.
preds_df = predictions_df[['predicted_prices','probability_above_median']].copy()

In [297]:
# Suggested price range: 20% below and 20% above the predicted price.
# Vectorised arithmetic replaces the per-row lambdas. The upcast to float64
# keeps the bounds in double precision even when the predictions are float32.
# `assign` returns a new frame, so no column is written onto a slice.
predicted_price = preds_df['predicted_prices'].astype('float64')
preds_df = preds_df.assign(
    price_range_low=predicted_price * .8,
    price_range_high=predicted_price * 1.2,
)

In [299]:
# Predictions together with the suggested low/high price bounds.
preds_df

Unnamed: 0,predicted_prices,probability_above_median,price_range_low,price_range_high
0,76.144371,0.659298,60.915497,91.373245
1,36.758572,0.293535,29.406857,44.110286
2,74.265388,0.782642,59.412311,89.118466
3,37.749901,0.373926,30.199921,45.299881
4,38.892452,0.337882,31.113962,46.670943


In [309]:
def price_range_recommendation(df):
    """Print a pricing recommendation for one listing.

    Parameters
    ----------
    df : mapping-like (for example a single DataFrame row)
        Must provide 'price_range_low', 'price_range_high' and
        'probability_above_median'.

    Returns
    -------
    None
        Two sentences are printed: the suggested price range (bounds
        truncated to whole dollars) and where in that range to aim.

    Raises
    ------
    ValueError
        If 'probability_above_median' is NaN or lies outside [0, 1].
    """
    upper_range = int(df['price_range_high'])
    lower_range = int(df['price_range_low'])
    proba = df['probability_above_median']
    # NaN fails every comparison, so this guard rejects it as well; without
    # the guard no branch below would match and `message` would be unbound.
    if not 0 <= proba <= 1:
        raise ValueError(
            f"probability_above_median must be between 0 and 1, got {proba!r}"
        )
    # Each branch is only reached when the previous bounds failed, so the
    # lower bound of every band is implied.
    if proba < 0.20:
        message = "the low end of that range."
    elif proba < 0.4:
        message = "the medium-low end of that range."
    elif proba < 0.6:
        message = "about the middle of that range."
    elif proba < 0.8:
        message = "the medium-high end of that range."
    else:
        message = "the high end of that range."
    print(f'Based on your inputs, the model recommends your unit cost between ${lower_range} and ${upper_range}.')
    print(f'Based on your text description, the model recommends you select a price at {message}')
    

In [311]:
# Print the recommendation for the first example listing.
price_range_recommendation(preds_df.iloc[0])

Based on your inputs, the model recommends your unit cost between $60 and $91.
Based on your text discription, the model recommends you select a price at the medium-high end of that range.
