In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

import nltk
from nltk.corpus import stopwords
from nltk.sentiment.vader import SentimentIntensityAnalyzer
#nltk.download('vader_lexicon') #Lexicon for Sentiment Analyze
#nltk.download('words')

# Language detection
from langdetect import detect
from langdetect import DetectorFactory
from langdetect.lang_detect_exception import LangDetectException
# Reference: https://stackoverflow.com/questions/3182268/nltk-and-language-detection

import warnings
warnings.filterwarnings('ignore')

In [2]:
# Load the review texts and the listing metadata (the two reads are independent),
# then peek at a few random review rows.
reviews = pd.read_csv(filepath_or_buffer="reviews.csv")
airbnbs = pd.read_csv(filepath_or_buffer="listings.csv")
reviews.sample(n=3)

Unnamed: 0,listing_id,id,date,reviewer_id,reviewer_name,comments
72798,7324041,51953450,2015-10-25,45693003,Janice,Tim is super friendly and welcomed us warmly! ...
80816,5760902,56385722,2015-12-13,6637400,Julia,Our stay at Ryan's place was perfect. The apar...
19994,2459519,49585711,2015-10-05,8208960,Linda,Megan made us feel comfortable the house was w...


In [3]:
print(reviews.shape)
# Too big a dataset for my computer to process without waiting for a long time
# For demonstration purpose, will only use 50000 reviews
SAMPLE_SIZE = 50000
# Fixed seed so that Restart & Run All always selects the same subset of reviews
RANDOM_SEED = 42
# min(...) keeps the cell from raising when the file holds fewer rows than SAMPLE_SIZE
reviews = reviews.sample(n=min(SAMPLE_SIZE, len(reviews)), random_state=RANDOM_SEED)
print(reviews.shape)

(84849, 6)
(50000, 6)


In [4]:
# Number of reviews whose comment text is missing
reviews['comments'].isna().sum()

8

In [5]:
# Drop reviews with empty comments.
# Only the 'comments' column is checked: a missing value in some other column
# should not discard an otherwise usable review.
reviews = reviews.dropna(subset=['comments'])

# Drop reviews that have fewer than 50 characters. Will not be very helpful
MIN_COMMENT_LENGTH = 50
# .copy() gives an independent frame so that later column assignments are safe
reviews = reviews[reviews['comments'].str.len() >= MIN_COMMENT_LENGTH].copy()

In [6]:
# There are non-English reviews.
# This dictionary-lookup function is too slow in processing large datasets
# -> Use langdetect library instead. It is kept here for reference only.
_ENGLISH_VOCAB_CACHE = None


def _english_vocab():
    """Return the lower-cased NLTK English word list, building it only once."""
    global _ENGLISH_VOCAB_CACHE
    if _ENGLISH_VOCAB_CACHE is None:
        # Needs the NLTK 'words' corpus (see nltk.download('words') in the import cell)
        _ENGLISH_VOCAB_CACHE = frozenset(w.lower() for w in nltk.corpus.words.words())
    return _ENGLISH_VOCAB_CACHE


def lang_detect(text, english_vocab=None):
    """Return True when every alphabetic word in ``text`` is a known English word.

    Parameters
    ----------
    text : str
        The text to check.
    english_vocab : collection of str, optional
        Lower-cased vocabulary to check against. Defaults to the NLTK ``words``
        corpus, which is loaded once and then reused across calls.

    Returns
    -------
    bool
        True if no word of ``text`` falls outside the vocabulary (an empty text
        therefore yields True), False otherwise.

    Notes
    -----
    The check is strict: a single out-of-vocabulary word (for example a name)
    makes the result False.
    """
    if english_vocab is None:
        english_vocab = _english_vocab()
    # Turn every non-letter into a space so punctuation and digits separate
    # words; iterating the raw string would compare single characters instead.
    letters_only = ''.join(ch if ch.isalpha() else ' ' for ch in text)
    text_vocab = {word.lower() for word in letters_only.split()}
    return text_vocab.issubset(english_vocab)

# Langdetect library can be quite inaccurate when the comment is short. But we already removed short comments
# langdetect's algorithm is randomised; fixing the seed makes the detected
# language of each review stable across runs.
DetectorFactory.seed = 0


def detect_language(text):
    """Return langdetect's language code for ``text``, or 'unknown' on failure.

    ``detect`` raises LangDetectException when it cannot extract any language
    features from the text; catching it here stops a single such review from
    aborting the whole ``.map`` call.
    """
    try:
        return detect(text)
    except LangDetectException:
        return 'unknown'


# NOTE: the column is called "English" but it holds the detected language code
reviews["English"] = reviews['comments'].map(detect_language)
reviews[reviews['English'] != 'en'].sample(5)

Unnamed: 0,listing_id,id,date,reviewer_id,reviewer_name,comments,English
39109,4565370,48111514,2015-09-23,41624164,Serge,"B&B agréable et bien placé, rapidement au cent...",fr
72173,2391699,23263169,2014-11-27,3278100,Luis Gustavo,"Sandy foi incrível, o local é espetacular, Jac...",pt
40614,2489283,19519679,2014-09-14,14367966,Ana Beatriz,O período no apartamento foi absolutamente PER...,pt
9927,3626162,31984707,2015-05-12,26405720,Xiaoyi,Elizabeth一家人真是非常好，非常友善的一家人，她家地理位置各方面都非常舒服。我像回到...,zh-cn
35788,4105164,55475072,2015-12-01,48835796,Funny,THANK YOU!Harry！\n他是个非常好的房东，初到西雅图气温只有1°左右，当见到H...,zh-cn


In [8]:
# Keep English reviews only: find the rows tagged with any other language code
# and remove them from the frame in place.
not_english = reviews['English'] != 'en'
reviews.drop(index=reviews.loc[not_english].index, inplace=True)

## Sentiment Analysis using VADER
- VADER library is used for valence-based sentiment analysis where the intensity of the sentiment is taken into account. For example, the word ‘excellent’ would be treated as more positive than ‘good’
- VADER works based on its lexicon where each positive or negative word is assigned with a rating
- Given a text, it generates negative, neutral, positive and compound scores.
- Compound score is the most accurate reflection of the sentiment of a review
    - It is computed by summing the valence scores of the words in the text (adjusted by VADER's rules) and normalizing the result to a value between -1 and 1.
    - 1 being very positive
    - -1 being very negative
- Reference: http://t-redactyl.io/blog/2017/04/using-vader-to-handle-sentiment-analysis-with-social-media-text.html

In [24]:
# Show the VADER polarity scores for three randomly chosen comments
senti_analyzer = SentimentIntensityAnalyzer()
sample_comments = reviews['comments'].sample(3)
for comment_text in sample_comments:
    polarity = senti_analyzer.polarity_scores(comment_text)
    report = '{}\n{}, '.format(comment_text, str(polarity))
    print(report, end='\n\n')

This was my 1st AirBnb experience and Scooter definitely set the bar high. He was a great host and had everything ready when my fiancé and I arrived.  Scooter was helpful throughout the booking process and even when we met.  Also he has a very friendly and cute dog named Bart. 

The location was great.  A 2min walk to nearby restaurants and 10min drive to downtown.   Parking was  easy to find, especially with the parking passed provided by Scooter.  The studio itself was clean and spacious. We definitely recommend this place and would come back if we are ever in the area.

 
{'neg': 0.0, 'neu': 0.72, 'pos': 0.28, 'compound': 0.9858}, 

Amy was great to deal with - kind, prompt to respond, and very welcoming.  The apartment is in a great location, near many great restaurants and bars.  And, as mentioned, the views really are great from the living / dining area.  The only word of caution would be for someone who struggles with stairs -- the bedroom is separated from the living / dining /

- VADER did a fairly good job of capturing the sentiment of the comments, both positive and negative
- The intensity is also reflected in the score, so a really good review with stronger emotions will have a higher compound score
- So if a listing has a higher mean sentiment score, its reviews are generally more positive and reviewers were happier with their stay

In [25]:
# Shared analyzer, created on first use and then reused for every comment
_DEFAULT_SENTI_ANALYZER = None


def gen_senti_score(comment, analyzer=None):
    """Return the VADER compound sentiment score of ``comment``.

    Parameters
    ----------
    comment : str
        Review text to score.
    analyzer : object, optional
        Any object exposing ``polarity_scores(text)`` that returns a mapping
        with a ``'compound'`` key. Defaults to a single shared
        ``SentimentIntensityAnalyzer`` so it is not re-created for every
        comment when this function is mapped over a whole column.

    Returns
    -------
    The value stored under ``'compound'`` in the analyzer's result.
    """
    global _DEFAULT_SENTI_ANALYZER
    if analyzer is None:
        if _DEFAULT_SENTI_ANALYZER is None:
            _DEFAULT_SENTI_ANALYZER = SentimentIntensityAnalyzer()
        analyzer = _DEFAULT_SENTI_ANALYZER
    return analyzer.polarity_scores(comment)['compound']

# Score every remaining review and store the compound score next to its text
compound_scores = reviews['comments'].map(gen_senti_score)
reviews['review_senti_score'] = compound_scores

In [26]:
# Average the compound score over all reviews belonging to the same listing,
# then show how many listings received a score
reviews_by_listing = reviews.groupby(['listing_id'])
listing_review_score = reviews_by_listing['review_senti_score'].mean()
len(listing_review_score)

3008

In [27]:
# Merge the senti_score with Airbnb listings
# Turn the per-listing mean Series into a two-column frame keyed by `id`,
# the key column used for the merge with the listings frame.
senti_score = listing_review_score.rename_axis('id').reset_index()
# Drop any score column left over from a previous run of this cell; otherwise a
# re-run would produce review_senti_score_x / _y and the lookup below would fail.
airbnbs = (
    airbnbs
    .drop(columns='review_senti_score', errors='ignore')
    .merge(senti_score, on='id', how='left')
)
# Listings without any scored review end up with NaN after the left merge
airbnbs['review_senti_score'].isnull().sum()

810

- Although some of our listings do not have a review sentiment score, this is acceptable for demonstration purposes
- We drastically cut down the number of reviews above for faster processing. If all 84849 reviews were processed, almost all listings would very likely have a score.

In [28]:
# Fill listings with no review sentiment score with 0
# Assign the result back instead of calling fillna(inplace=True) on the column
# selection: the chained in-place form is deprecated and does not update
# `airbnbs` when pandas Copy-on-Write is enabled.
# NOTE: 0 is also VADER's neutral score, so unreviewed listings become
# indistinguishable from listings whose reviews are neutral on average.
airbnbs['review_senti_score'] = airbnbs['review_senti_score'].fillna(0)

In [29]:
# Write the listings, now carrying their mean review sentiment score, to disk
# for the recommendation step (row index is not written)
airbnbs.to_csv(path_or_buf="listings_review_sent_score.csv", index=False)

In [30]:
# Write the filtered and scored reviews to disk as well; they are used later to
# identify top bigrams in the reviews of recommended listings (row index is not written)
reviews.to_csv(path_or_buf="reviews_processed.csv", index=False)