## Introduction

Notebook ini digunakan untuk membuat recommender system berdasarkan preferensi pelanggan AirBnB

## Import Libraries

In [1]:
import re
import ast
import pickle
import numpy as np
import pandas as pd
import nltk

from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem.snowball import SnowballStemmer

from scipy.sparse import hstack
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import RobustScaler
from feature_engine.outliers import Winsorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import TfidfVectorizer

## Exploratory Data Analysis

In [2]:
df = pd.read_csv('listings_clustered.csv')
df.head()

Unnamed: 0,id,listing_url,picture_url,name,description,property_type,room_type,accommodates,number_of_reviews,bedrooms,beds,price,review_scores_rating,city,latitude,longitude,bathrooms,cluster
0,52438122,https://www.airbnb.com/rooms/52438122,https://a0.muscache.com/pictures/miso/Hosting-...,Stunning New Cottage minutes to downtown Ashev...,Come relax in this brand new cottage with tast...,Entire cottage,Entire home/apt,4,63,2.0,2.0,225.0,4.98,Asheville,35.56967,-82.63193,1.5,1
1,22119778,https://www.airbnb.com/rooms/22119778,https://a0.muscache.com/pictures/bac6ce5d-d2ff...,Large king suite with private balcony and soak...,Whether you're looking for a romantic Ashevill...,Private room in bed and breakfast,Private room,3,1,2.0,2.0,306.0,5.0,Asheville,35.60284,-82.56727,1.0,1
2,47812966,https://www.airbnb.com/rooms/47812966,https://a0.muscache.com/pictures/324713f3-ea1c...,Blue Ridge Magic: Artist's Hideaway (Brand New),New listing: Blue Ridge Magic named for our pa...,Entire rental unit,Entire home/apt,4,25,1.0,2.0,108.0,5.0,Asheville,35.58475,-82.57182,1.0,1
3,46284932,https://www.airbnb.com/rooms/46284932,https://a0.muscache.com/pictures/miso/Hosting-...,Simple and Cozy Bedroom in Central Area,Looking for a simple bedroom and bathroom to r...,Private room in bungalow,Private room,2,3,1.0,1.0,70.0,5.0,Asheville,35.5787,-82.61582,1.0,1
4,48366092,https://www.airbnb.com/rooms/48366092,https://a0.muscache.com/pictures/prohost-api/H...,"Hot tub, Fire pit, 5 miles to downtown Asheville","Beautiful 2 bedroom oasis. Custom built, fully...",Entire cottage,Entire home/apt,4,143,2.0,2.0,114.0,4.92,Asheville,35.55106,-82.51424,1.0,0


In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 28923 entries, 0 to 28922
Data columns (total 18 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   id                    28923 non-null  int64  
 1   listing_url           28923 non-null  object 
 2   picture_url           28923 non-null  object 
 3   name                  28923 non-null  object 
 4   description           28923 non-null  object 
 5   property_type         28923 non-null  object 
 6   room_type             28923 non-null  object 
 7   accommodates          28923 non-null  int64  
 8   number_of_reviews     28923 non-null  int64  
 9   bedrooms              28923 non-null  float64
 10  beds                  28923 non-null  float64
 11  price                 28923 non-null  float64
 12  review_scores_rating  28923 non-null  float64
 13  city                  28923 non-null  object 
 14  latitude              28923 non-null  float64
 15  longitude          

In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 28923 entries, 0 to 28922
Data columns (total 18 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   id                    28923 non-null  int64  
 1   listing_url           28923 non-null  object 
 2   picture_url           28923 non-null  object 
 3   name                  28923 non-null  object 
 4   description           28923 non-null  object 
 5   property_type         28923 non-null  object 
 6   room_type             28923 non-null  object 
 7   accommodates          28923 non-null  int64  
 8   number_of_reviews     28923 non-null  int64  
 9   bedrooms              28923 non-null  float64
 10  beds                  28923 non-null  float64
 11  price                 28923 non-null  float64
 12  review_scores_rating  28923 non-null  float64
 13  city                  28923 non-null  object 
 14  latitude              28923 non-null  float64
 15  longitude          

In [5]:
df.nunique()

id                      28923
listing_url             28923
picture_url             28767
name                    28567
description             28184
property_type              97
room_type                   4
accommodates               16
number_of_reviews         571
bedrooms                   13
beds                       25
price                    1272
review_scores_rating      160
city                       31
latitude                27200
longitude               27093
bathrooms                  22
cluster                     3
dtype: int64

In [6]:
tfidf_vectorizer = TfidfVectorizer(stop_words='english')
tfidf_matrix = tfidf_vectorizer.fit_transform(df['description'])

In [8]:
# Initialize NLTK resources
stop_words = set(stopwords.words('english'))
stemmer = SnowballStemmer(language='english')

def text_preprocessing(text):
    # Case folding
    text = text.lower()

    # Mention, hashtags, URL, and newline removal
    text = re.sub(r"@[A-Za-z0-9_]+|#[A-Za-z0-9_]+|http\S+|www.\S+|\\n", " ", text)

    # Non-letter removal
    text = re.sub("[^A-Za-z\s']", " ", text)

    # Tokenization and stopwords removal
    tokens = [word for word in word_tokenize(text) if word not in stop_words]

    # Stemming
    tokens = [stemmer.stem(word) for word in tokens]

    # Combining Tokens
    text = ' '.join(tokens)

    return text

def safe_text_preprocessing(x):
    try:
        return text_preprocessing(x)
    except Exception as e:
        print(f"Error occurred: {e}")
        return None


In [9]:
X = df[['description','city','accommodates','number_of_reviews','bedrooms','beds','price','review_scores_rating','bathrooms']].copy()

# X_rec = X.drop(['description'], axis=1).copy()

In [10]:
# Applying Text Preprocessing to the Dataset

X['description'] = X['description'].apply(safe_text_preprocessing)
X

Unnamed: 0,description,city,accommodates,number_of_reviews,bedrooms,beds,price,review_scores_rating,bathrooms
0,come relax brand new cottag tast mid centuri m...,Asheville,4,63,2.0,2.0,225.0,4.98,1.5
1,whether re look romant ashevill weekend getawa...,Asheville,3,1,2.0,2.0,306.0,5.00,1.0
2,new list blue ridg magic name paint local arti...,Asheville,4,25,1.0,2.0,108.0,5.00,1.0
3,look simpl bedroom bathroom rest day mountain ...,Asheville,2,3,1.0,1.0,70.0,5.00,1.0
4,beauti bedroom oasi custom built fulli stock k...,Asheville,4,143,2.0,2.0,114.0,4.92,1.0
...,...,...,...,...,...,...,...,...,...
28918,unbeliev dc oldest hous chevi chase dc state t...,Washington DC,3,112,1.0,3.0,49.0,4.91,1.0
28919,back rent renov neighbor bit br br love br ba ...,Washington DC,4,42,2.0,2.0,112.0,5.00,1.0
28920,look explor dc live comfort budget place right...,Washington DC,2,97,1.0,1.0,141.0,4.37,2.0
28921,locat judiciari squar spectacular view washing...,Washington DC,8,124,2.0,3.0,310.0,4.89,2.0


In [11]:
num_col = ['accommodates','number_of_reviews','bedrooms','beds','price','review_scores_rating','bathrooms']

pipeline = Pipeline([
    ('winsorizer', Winsorizer(capping_method='iqr', tail='both', fold=3, variables=num_col)),
    ('scaler', RobustScaler())
    ])

num_scaled = pipeline.fit_transform(X[num_col])

# Add to the feature matrix
feature_matrix = hstack([tfidf_matrix, num_scaled])

# X_rec[num_col] = pipeline.fit_transform(X_rec[num_col])

In [9]:
# X_rec = pd.get_dummies(X_rec)

In [12]:
cosine_sim = cosine_similarity(feature_matrix, feature_matrix)

## Model Saving

In [13]:
# Save model menggunakan pickle
filename = 'rec_sys.pkl'
pickle.dump(cosine_sim, open(filename, 'wb'))

In [14]:
df_rec = pd.DataFrame(cosine_sim, index=df['name'], columns=df['name'])

def sorting(name):
  tmp = df_rec[name].drop(index=name).sort_values(ascending=False).iloc[:5]
  print(f'You like {name}, so based on our recommender system, We recommend you to stay in:')
  for i,name in enumerate(tmp.index):
    print(f'{i+1}. {name}')

In [15]:
df_rec

name,Stunning New Cottage minutes to downtown Asheville,Large king suite with private balcony and soaking tub,Blue Ridge Magic: Artist's Hideaway (Brand New),Simple and Cozy Bedroom in Central Area,"Hot tub, Fire pit, 5 miles to downtown Asheville",Montford Urban Treehouse,Perfect South Asheville Townhome Getaway W/ Hot Tub,Red Gate-Main/Top Floors - No Shared Space,Traveler's Home. Monthly rates.,River Cottage,...,Brand new modern Luxury 1Bedroom,2BD/1.5BTH - Two Story Apt Home 14th/Ust NW,Bright & Cozy 2x2,Newly renovated attic bedroom,Private Room in U St / Columbia Heights,All Bunked Up! New bathroom & washer/dryer!,2BR/1BA Capitol Hill Apt (Potomac Ave Metro)!,203【Private Room - Queen bed in CoHi DC!】,Ultra Chic & Modern Garden Condo by Union Station,"Fantastic 2BR in DC, Furnished + Pet-Friendly"
name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Stunning New Cottage minutes to downtown Asheville,1.000000,0.414192,0.176683,-0.060545,0.612327,-0.035699,0.592438,0.478748,-0.039397,0.532598,...,-0.074950,0.688548,0.536844,0.017763,-0.122097,0.342017,0.547962,0.256441,0.668640,0.480960
Large king suite with private balcony and soaking tub,0.414192,1.000000,-0.029817,0.026328,0.055459,0.040016,0.351310,-0.175769,-0.111269,0.373896,...,0.074145,0.257597,0.082563,-0.026629,-0.125773,-0.234891,0.325536,-0.193256,0.181314,0.505603
Blue Ridge Magic: Artist's Hideaway (Brand New),0.176683,-0.029817,1.000000,0.208574,0.177950,0.245800,0.084571,0.201268,0.208210,0.059279,...,0.224771,0.059883,-0.095853,0.186130,0.174037,0.209602,0.219215,-0.117298,0.033509,0.030529
Simple and Cozy Bedroom in Central Area,-0.060545,0.026328,0.208574,1.000000,-0.081273,0.393527,-0.238718,0.027797,0.506746,-0.319084,...,0.285828,-0.185259,-0.398573,0.504749,0.513767,-0.057362,0.125440,-0.098176,-0.396609,0.038637
"Hot tub, Fire pit, 5 miles to downtown Asheville",0.612327,0.055459,0.177950,-0.081273,1.000000,-0.066891,0.348437,0.769545,0.084271,0.279403,...,-0.133695,0.699871,0.598166,0.115890,-0.007384,0.735830,0.545985,0.454187,0.670737,0.067947
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
All Bunked Up! New bathroom & washer/dryer!,0.342017,-0.234891,0.209602,-0.057362,0.735830,0.010828,0.130809,0.712458,0.134498,0.137781,...,-0.105880,0.449031,0.434275,0.166476,0.064740,1.000000,0.336275,0.436493,0.472969,-0.150304
2BR/1BA Capitol Hill Apt (Potomac Ave Metro)!,0.547962,0.325536,0.219215,0.125440,0.545985,0.168948,0.506629,0.315273,0.115607,0.418338,...,0.108137,0.528080,0.290934,0.162203,0.119186,0.336275,1.000000,0.054217,0.387862,0.356351
203【Private Room - Queen bed in CoHi DC!】,0.256441,-0.193256,-0.117298,-0.098176,0.454187,-0.219860,0.039066,0.614811,0.178796,-0.063163,...,-0.319632,0.443010,0.691707,0.145040,0.116638,0.436493,0.054217,1.000000,0.396500,-0.002680
Ultra Chic & Modern Garden Condo by Union Station,0.668640,0.181314,0.033509,-0.396609,0.670737,-0.259078,0.588391,0.551212,-0.334376,0.584878,...,-0.232932,0.757810,0.761725,-0.267120,-0.362281,0.472969,0.387862,0.396500,1.000000,0.345231


In [16]:
sorting('House in West Asheville/River Arts District')

You like House in West Asheville/River Arts District, so based on our recommender system, We recommend you to stay in:
1. BEAUTIFUL  REMODELED NEAR GREEN VALLEY AREA
2. Hawaiian Mountain House ~ For the entire family
3. Spacious East Austin Home | Stocktank Pool | WFH
4. The Harriet House | Lakes, Fire Pit & Air Hockey
5. Beautiful and peaceful home in twin cities suburbs


### Work in Progress (for Future Improvements)

In [16]:
# def get_recommendations(name=None, city=None, price=150, accommodates=2, beds=1, bedrooms=1, bathrooms=1, number_of_reviews=15, review_scores_rating=2):

#     # Create a user profile
#     user_profile = {'name': name, 'city': city, 'price': price, 'accommodates':accommodates, 'beds':beds,
#                     'bedrooms':bedrooms, 'bathrooms':bathrooms, 'number_of_reviews':number_of_reviews,
#                     'review_scores_rating':review_scores_rating}

#     # Convert the user profile to a DataFrame
#     user_df = pd.DataFrame([user_profile])

#     user_df[num_col] = pipeline.transform(user_df[num_col])
#     user_df = pd.get_dummies(user_df)

#     # Ensure user_df has the same features as X_rec
#     missing_features = set(X_rec.columns) - set(user_df.columns)
#     for feature in missing_features:
#         user_df[feature] = 0

#     # Ensure X_rec has the same features as user_df
#     extra_features = set(user_df.columns) - set(X_rec.columns)
#     user_df = user_df.drop(columns=extra_features)

#     # Compute similarity scores
#     sim_scores = cosine_similarity(user_df, X_rec)
    
#     recommended_indices = sim_scores.argsort()[0][-5:][::-1]  # Reverse to get top recommendations

#     # Print the recommended items
#     tmp = df['name'].iloc[recommended_indices]

#     for i,name in enumerate(tmp):
#         print(f'{i+1}. {name}')

In [17]:
# get_recommendations(name=None, city=None, price=150, accommodates=2, beds=1, bedrooms=1, bathrooms=1, number_of_reviews=15, review_scores_rating=2)
