# Import libraries

In [1]:
import numpy as np
import pandas as pd
from langdetect import DetectorFactory, LangDetectException, detect
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
# Notebook display settings: show up to 300 rows and do not cut off long
# review text when a frame is rendered.
pd.set_option('display.max_rows', 300)
pd.set_option('display.max_colwidth', 10000)

# Read data

In [2]:
# Load the raw reviews table from the working directory.
# NOTE(review): the preview of this frame shows an 'Unnamed: 0' column, which
# suggests the CSV was written with its index — consider index_col=0; confirm.
reviews_df = pd.read_csv(filepath_or_buffer='reviews.csv')

In [3]:
# Preview the first rows to check the columns that were loaded.
reviews_df.head()

Unnamed: 0,listing_id,id,date,reviewer_id,reviewer_name,comments
0,49091,8243238,2013-10-21,8557223,Jared,Fran was absolutely gracious and welcoming. Made my stay a great experience. Would definitely recommend this cozy and peaceful place to anyone.
1,50646,11909864,2014-04-18,1356099,James,A comfortable room in a smart condo development. Everything was kept very clean and I had the use of my own bathroom. Sujatha and her husband are great hosts - very friendly and accommodating. I'll be staying here again.
2,50646,13823948,2014-06-05,15222393,Welli,"Stayed over at Sujatha's house for 3 good nights with my boyfriend. Sujatha and her husband are great hosts, very welcoming and friendly. The room is comfortable and clean. I'm happy to have my own bathroom as i'm particular with shared bathroom. \r\nThe location is accessible. A few minutes walk from the house to nearest bus stop which can bring you to town.\r\nGood place, good hosts, good price.\r\nHighly recommended!"
3,50646,15117222,2014-07-02,5543172,Cyril,It's been a lovely stay at Sujatha's. The room is clean and the location is just perfect for a stop-over in Singapore. I really enjoyed relaxing at the swimming pool after spending most of the day in the city. Thank you Sujatha.
4,50646,15426462,2014-07-08,817532,Jake,"We had a great experience. A nice place, an amazing complex and easy access to public transit"


In [4]:
# Add the character count of each review, keep reviews with at least 5
# characters, and order them from shortest to longest.
# Missing comments get a NaN length, and NaN >= 5 is False, so they are
# dropped by the same filter.
# Built as one chain that returns a new frame: the original assigned a column
# and sorted with inplace=True on a boolean-filtered frame, which pandas may
# flag with SettingWithCopyWarning and which is awkward to re-run.
reviews_df = (
    reviews_df
    .assign(length=reviews_df['comments'].str.len())
    .loc[lambda frame: frame['length'] >= 5]
    .sort_values(by='length', ascending=True)
)

In [5]:
# (rows, columns) remaining after the minimum-length filter.
reviews_df.shape

(90108, 7)

In [6]:
import re
def general_cleaning(x):
    """Reduce a review to ASCII letters, digits and spaces.

    Parameters
    ----------
    x : object
        The raw comment. Non-string values are converted with ``str``;
        ``None`` and float NaN are treated as an empty comment.

    Returns
    -------
    str
        The cleaned text with leading/trailing spaces removed. The result is
        an empty string when nothing but removed characters was present.
    """
    # A missing value must not turn into the literal word "None" / "nan".
    if x is None or (isinstance(x, float) and x != x):
        return ''
    x = str(x)
    # Turn line breaks, tabs and other non-space whitespace into one space
    # first, so words on either side of a line break are not fused together
    # (e.g. "town.\r\nGood" -> "town Good", not "townGood").
    x = re.sub(r'[^\S ]+', ' ', x)
    # Raw string: the original '\ ' was an invalid escape sequence.
    x = re.sub(r'[^a-zA-Z0-9 ]', '', x)
    return x.strip()

In [7]:
# Clean every comment, then drop the reviews that end up empty (those that
# contained only characters the cleaner removes).
# Uses assign/.loc to build a new frame instead of assigning a column into a
# previously filtered frame, which pandas may flag with SettingWithCopyWarning.
reviews_df = (
    reviews_df
    .assign(comments=reviews_df['comments'].apply(general_cleaning))
    .loc[lambda frame: frame['comments'] != '']
)

In [8]:
# (rows, columns) remaining after cleaning and dropping empty comments.
reviews_df.shape

(84447, 7)

In [9]:
def truncate_comment(text, max_chars=15):
    """Return the leading part of ``text``.

    Parameters
    ----------
    text : str
        The comment to shorten.
    max_chars : int, optional
        Maximum number of characters to keep. Defaults to 15, the value that
        was previously hard-coded.

    Returns
    -------
    str
        ``text`` unchanged when it is no longer than ``max_chars``, otherwise
        its first ``max_chars`` characters.
    """
    return text[:max_chars]
# Store a shortened copy of every cleaned comment in its own column.
reviews_df['short_comment'] = [truncate_comment(comment) for comment in reviews_df['comments']]

In [None]:
def translate_lang(text, fallback='unknown'):
    """Detect the language of ``text`` (despite the name, nothing is translated).

    Parameters
    ----------
    text : str
        The text to classify.
    fallback : str, optional
        Label returned when no language can be determined. Defaults to
        ``'unknown'``.

    Returns
    -------
    str
        The language code reported by ``langdetect.detect`` (for example
        ``'en'``), or ``fallback`` when the text is not a non-blank string or
        the detector finds nothing to work with.
    """
    if not isinstance(text, str) or not text.strip():
        return fallback
    try:
        return detect(text)
    except LangDetectException:
        # Raised for input with no usable features (e.g. digits only). One
        # such row must not abort the whole DataFrame.apply.
        return fallback

# langdetect is non-deterministic by default; fix the seed before the first
# detection so re-running the notebook labels every review the same way.
DetectorFactory.seed = 0

# NOTE(review): detection runs on the truncated 'short_comment' text. Very
# short inputs make language detection less reliable — consider a longer
# prefix if too many English reviews get dropped; confirm against the data.
reviews_df['language'] = reviews_df['short_comment'].apply(translate_lang)

In [None]:
# Checkpoint: save the cleaned reviews with their detected language, for all
# languages (the English-only filter has not been applied yet).
# The frame's index is written as the first column (to_csv default).
reviews_df.to_csv(path_or_buf='reviews_cleaned.csv')

In [None]:
# Keep only the reviews detected as English. The explicit copy makes the
# result an independent frame, so adding columns to it afterwards does not
# trigger pandas' SettingWithCopyWarning.
reviews_df = reviews_df.loc[reviews_df['language'] == 'en'].copy()

In [None]:
# Score every comment with VADER and store the four scores as columns.
analyzer = SentimentIntensityAnalyzer()

# Run the analyzer once per comment and reuse the returned dict for all four
# columns; the original called polarity_scores four times per comment.
sentiment_scores = [analyzer.polarity_scores(comment) for comment in reviews_df['comments']]

# assign() returns a new frame, and the columns are added in the same order
# as before: compound, neg, neu, pos.
reviews_df = reviews_df.assign(
    compound=[scores['compound'] for scores in sentiment_scores],
    neg=[scores['neg'] for scores in sentiment_scores],
    neu=[scores['neu'] for scores in sentiment_scores],
    pos=[scores['pos'] for scores in sentiment_scores],
)

In [None]:
# Final output: the English reviews together with their sentiment scores.
# The frame's index is written as the first column (to_csv default).
reviews_df.to_csv(path_or_buf='reviews_cleaned_sentiment.csv')