### Translate all guest reviews into English with the Google Translate API

In [None]:
import pandas as pd
# Raw guest reviews; the `comments` column is the text that gets translated into English.
reviews_summary = pd.read_csv(filepath_or_buffer='reviews_summary.csv')
# Preview the first five rows.
reviews_summary.head(n=5)

Unnamed: 0,listing_id,id,date,reviewer_id,reviewer_name,comments
0,2015,69544350,2016-04-11,7178145,Rahel,Mein Freund und ich hatten gute gemütliche vie...
1,2015,69990732,2016-04-15,41944715,Hannah,Jan was very friendly and welcoming host! The ...
2,2015,71605267,2016-04-26,30048708,Victor,Un appartement tres bien situé dans un quartie...
3,2015,73819566,2016-05-10,63697857,Judy,"It is really nice area, food, park, transport ..."
4,2015,74293504,2016-05-14,10414887,Romina,"Buena ubicación, el departamento no está orden..."


In [None]:
# Translate!
from googletrans import Translator
import time

reviews_list = reviews_summary['comments'].tolist()

translator = Translator()

# Placeholder stored whenever a review cannot be translated, so that
# `english_comments` stays row-aligned with `reviews_summary`.
FAILED_TRANSLATION = ' '

english_comments = []

for count, review in enumerate(reviews_list):
    # Missing comments come back from pandas as NaN (a float); there is
    # nothing to send to the translation service for those rows.
    if not isinstance(review, str):
        english_comments.append(FAILED_TRANSLATION)
        continue

    try:
        eng_comment = translator.translate(review, dest='en')
    except Exception as err:
        # Best effort: keep going on a failed request, but report it.
        # `Exception` (rather than a bare `except`) lets KeyboardInterrupt
        # stop this long-running loop.
        print('Translation failed for review', count, repr(err))
        english_comments.append(FAILED_TRANSLATION)
    else:
        print(eng_comment.text, count)
        english_comments.append(eng_comment.text)

    # Throttle requests to the translation service.
    time.sleep(1)

# Keep the translations next to the original review columns: the sentiment
# analysis cell reads this file back and expects `listing_id`,
# `english_comments`, `index` and the original review columns to be present.
english_comments_df = reviews_summary.reset_index().assign(english_comments=english_comments)
english_comments_df.to_csv('english_reviews_summary.csv')

### Sentiment analysis with Afinn

In [None]:
# Load the translated reviews and discard every column that the sentiment
# analysis does not need.
unused_columns = ['Unnamed: 0', 'index', 'id', 'date', 'reviewer_id', 'reviewer_name', 'comments']
reviews = pd.read_csv('english_reviews_summary.csv').drop(columns=unused_columns)
print(reviews.shape)
reviews.head()

(401963, 2)


Unnamed: 0,listing_id,english_comments
0,2015,My friend and I had good cozy four nights in J...
1,2015,Jan was very friendly and welcoming host! The ...
2,2015,An apartment very well located in a very nice ...
3,2015,"It is really nice area, food, park, transport ..."
4,2015,"Good location, the department is not ordered, ..."


In [None]:
# pip install afinn
from afinn import Afinn
import numpy as np
# Shared Afinn scorer instance configured with language='en'; afinn_score() calls its .score().
afinn = Afinn(language='en')

def afinn_score(text):
    """Return the Afinn score of ``text``, or NaN when it cannot be scored.

    Parameters
    ----------
    text : object
        Value to score. Anything that is not a ``str`` (for example the
        float NaN pandas uses for a missing comment) is not passed to the
        scorer.

    Returns
    -------
    float
        Result of ``afinn.score(text)``, or ``np.nan`` for non-string input
        or when scoring raises.
    """
    # Explicit guard instead of relying on the scorer blowing up on floats/None.
    if not isinstance(text, str):
        return np.nan
    try:
        return afinn.score(text)
    except Exception:
        # Best effort: an unscorable review becomes NaN. Unlike a bare
        # `except`, this does not swallow KeyboardInterrupt/SystemExit.
        return np.nan

# Score every translated review, then keep only the rows that received a score.
reviews['comments_afinn'] = reviews['english_comments'].apply(afinn_score)
has_score = reviews['comments_afinn'].notna()
reviews = reviews[has_score]
print(reviews.shape)

#### Calculate the average Afinn score for each room

In [None]:
# Average Afinn score for each room (listing), rounded to two decimals.
# `sort=False` keeps listings in order of first appearance, as the reviews are read.
afinn_avg = (
    reviews
    .groupby('listing_id', sort=False)['comments_afinn']
    .mean()
    .map(lambda avg: round(avg, 2))
)

# Mapping listing_id -> average score, kept under its original name.
score_dict = afinn_avg.to_dict()
# Number of listings with at least one scored review (the full mapping is
# not printed: it has one entry per listing and floods the cell output).
print(len(score_dict))

score_df = afinn_avg.reset_index()
score_df.columns = ['Listing_id', 'afinn_avg']
score_df.head()

# Output
score_df.to_csv('data_reviews.csv')

### Merge the review text output with the description text output and the other feature data
* Description text data output code: TextMining_Descriptions.ipynb
* Other feature data code: Data_Preprocessing_Visualization.ipynb

In [None]:
# Show every column when a DataFrame is displayed.
pd.set_option('display.max_columns', None)

# Other (non-text) listing features.
df = pd.read_csv('listings_summary_cleaned.csv')
df.head()

# Description text features: principal components plus an Afinn score.
# Only the columns listed here are kept.
use_cols = [
    'id',
    'des_pc0', 'des_pc1', 'des_pc2', 'des_pc3',
    'des_pc4', 'des_pc5', 'des_pc6', 'des_pc7',
    'des_afinn_score',
]
description_score = pd.read_csv('data_des.csv')[use_cols]
print(description_score.shape)
description_score.head()

# Merge different data sources: other feature data + description text data.
# Left join on `id`, so every listing is kept even without description features.
df_description = df.merge(description_score, on='id', how='left')
df_description.isna().sum()[df_description.isna().sum() > 0].to_frame(name="Num of NaN")  # Check null value

# Fill null values with the column averages of the description text data.
# Assigning the filled columns back (instead of `df[col].fillna(..., inplace=True)`)
# avoids chained assignment, which is deprecated and does not update the
# parent frame when pandas Copy-on-Write is enabled.
fill_cols = ['des_pc0', 'des_pc1', 'des_pc2', 'des_pc3', 'des_pc4', 'des_pc5', 'des_pc6', 'des_pc7', 'des_afinn_score']
df_description[fill_cols] = df_description[fill_cols].fillna(description_score[fill_cols].mean())

df_description.isna().sum()[df_description.isna().sum() > 0].to_frame(name="Num of NaN")  # Check again
print(df_description.shape)
df_description.head()  # df_description: after combining description text data and other feature data

In [None]:
# Merge df_description & reviews text data (score_df).
# Left join keeps every listing; the duplicate key column from score_df is dropped.
df_description_reviews = (
    df_description
    .merge(score_df, left_on='id', right_on='Listing_id', how='left')
    .drop(columns='Listing_id')
)

# Fill null values (listings without a scored review) with the average review score.
# Assigning the result back avoids `df[col].fillna(..., inplace=True)`, a chained
# assignment that does not update the frame when pandas Copy-on-Write is enabled.
df_description_reviews['afinn_avg'] = df_description_reviews['afinn_avg'].fillna(score_df['afinn_avg'].mean())
df_description_reviews.isna().sum()[df_description_reviews.isna().sum() > 0].to_frame(name="Num of NaN")  # Check: no more null values

print(df_description_reviews.shape)
df_description_reviews.to_csv('listings_summary_cleaned_text.csv')  # Output
df_description_reviews.head()  # df_description_reviews: final data to train models (last expression, so it is displayed)