In [None]:
from enum import Enum

import numpy as np
import pandas as pd

import matplotlib as plt

from google.cloud import translate_v2 as translate

%matplotlib inline

%env GOOGLE_APPLICATION_CREDENTIALS translationKeys.json

In [None]:
# Input data: three CSV files, each with a slightly different schema.
DATASET_1 = "dataset/7282_1.csv"
DATASET_2 = "dataset/Datafiniti_Hotel_Reviews.csv"
DATASET_3 = "dataset/Datafiniti_Hotel_Reviews_Jun19.csv"

# The files that are meant to take part in the analysis.
DATASETS_IN_USE = [DATASET_1, DATASET_2, DATASET_3]

# Tunable parameters.
# NOTE(review): neither value is read in this part of the notebook; presumably
# they control review filtering and the train/test split further down - TODO confirm.
MIN_REVIEW_LENGTH = 0
TRAIN_PROPORTION = 0.1

# Translation settings.
USE_TRANSLATOR = True

# How non-English reviews are handled:
#   TRANSLATE - translate non-English text to English
#   REMOVE    - drop records with non-English text from the training set
#   THRESHOLD - not described in the original notes; presumably tied to
#               TRANSLATOR_THRESHOLD below - TODO confirm
TRANSLATOR_OPTIONS = Enum("OPTIONS", ["TRANSLATE", "REMOVE", "THRESHOLD"])
TRANSLATOR_FUNCTIONALITY = TRANSLATOR_OPTIONS.TRANSLATE
TRANSLATOR_THRESHOLD = 0.05

## 1. Pre-Preparation and Exploratory Analysis

### Dataset 1

In [None]:
# Load the first dataset from the path in DATASET_1.
df_1 = pd.read_csv(DATASET_1)

# Box plot of the "reviews.rating" column, to see the spread of ratings and
# spot values that fall outside the expected scale.
df_1.boxplot(column="reviews.rating")


In [None]:
# Print how many reviews carry each distinct rating value in dataset 1.
print(df_1["reviews.rating"].value_counts())

In [None]:

# Preview the suspicious rows of dataset 1: ratings above 5 and ratings of exactly 0.
# A notebook cell only renders its final expression automatically, so both
# previews are passed to display() explicitly; as bare expressions, the "> 5"
# preview would be evaluated and silently thrown away.
display(df_1[df_1["reviews.rating"] > 5].head())
display(df_1[df_1["reviews.rating"] == 0].head())

From the unique values in DATASET_1, it seems that, while most of the reviews are rated on an integer scale from 1 to 5, at least some are rated on a scale from 1.0 to 10.0. These reviews should be either removed or re-scaled. I have chosen to remove them: there are comparatively very few of them, and equivalent ratings on the two scales may not mean the same thing to a user - i.e. an 8/10 rating may, on average, carry different connotations from a 4/5 rating.

Note: It is impossible to tell which of the remaining reviews are actually rated out of 10 rather than 5 - however, assuming the distribution of ratings out of 10 is similar to the distribution for ratings out of 5, there should be very few of these and their effect on any statistical analysis should be minor.

Additionally, a quick glance at the 0 ratings suggests that these are not real reviews, but some artifact of the scraping process, and so should be removed as well.

In [None]:
# Keep only the rows whose rating is exactly 1, 2, 3, 4 or 5. Everything else
# (zeros, values above 5, fractional or missing ratings) is dropped.
df_1 = df_1.loc[df_1["reviews.rating"].isin([1.0, 2.0, 3.0, 4.0, 5.0])]

# Re-check the distribution after filtering.
print(df_1["reviews.rating"].value_counts())

### Dataset 2

In [None]:
# Load the second dataset, preview its first rows and plot the spread of its ratings.
df_2 = pd.read_csv(DATASET_2)

# display() is required for the preview: a bare `df_2.head()` that is not the
# cell's last expression is evaluated and then discarded without being shown.
display(df_2.head())

df_2.boxplot(column="reviews.rating")

In [None]:
# Print how many reviews carry each distinct rating value in dataset 2.
print(df_2["reviews.rating"].value_counts())

### Dataset 3

In [None]:
# Load the third dataset, preview its first rows and summarise its ratings.
df_3 = pd.read_csv(DATASET_3)

# display() is required for the preview: a bare `df_3.head()` that is not the
# cell's last expression is evaluated and then discarded without being shown.
display(df_3.head())

# Spread of the ratings, followed by the count of each distinct rating value.
df_3.boxplot(column="reviews.rating")
print(df_3["reviews.rating"].value_counts())

In [None]:
# Work out which columns all three datasets share. Printing the result makes it
# possible to check that no important column (review text, title, rating, etc.)
# is functionally identical across the files but named differently.
columns_intersection = set(df_1.columns) & set(df_2.columns) & set(df_3.columns)
print(columns_intersection)

In [None]:
# Concatenate the three data sources on the schema intersection: join="inner"
# keeps only the columns present in every frame.
# ignore_index=True gives the combined frame a fresh 0..n-1 index. Without it
# each source keeps its own row labels, so the same label appears in several
# rows of the result and label-based lookups become ambiguous.
df = pd.concat([df_1, df_2, df_3], join="inner", ignore_index=True)

# Rich preview of the combined frame (last expression of the cell).
df.head()

In [None]:
# Number of non-missing values in each column of the combined frame.
df.count()

## 2. Translation of non-English reviews 

In [None]:
def detect_language(text, client=None):
    """Detect the language of `text` with the Google Cloud Translation API.

    Args:
        text: The string whose language should be detected.
        client: Optional, already constructed translation client to reuse.
            When omitted, a new client is created for this call. Passing one
            in avoids building a client per row when this function is mapped
            over a whole column.

    Returns:
        A `(language, confidence)` tuple. `confidence` is 1 (full confidence)
        when the API response carries no "confidence" entry.
    """
    if client is None:
        # Resolve the module here, under a private alias, instead of relying on
        # the notebook-level name `translate`: a later cell defines a function
        # called `translate`, which rebinds that name and would make
        # `translate.Client()` fail from then on.
        from google.cloud import translate_v2 as _translate_v2

        client = _translate_v2.Client()

    detection = client.detect_language(text)

    # The API does not always report a confidence value; treat a missing one
    # as full confidence.
    return detection["language"], detection.get("confidence", 1)
    
# Smoke test: runs detect_language once on a sample string and prints the
# (language, confidence) tuple it returns.
print(detect_language("aaaaaaa"))

def translate_text(text, source_lang="en", target_lang="en", client=None):
    """Translate `text` with the Google Cloud Translation API.

    Args:
        text: The string to translate.
        source_lang: Language code of `text`.
        target_lang: Language code to translate into.
        client: Optional, already constructed translation client to reuse.
            When omitted, a new client is created for this call.

    Returns:
        The translated string. When `source_lang` and `target_lang` are the
        same language code, `text` is returned unchanged and no request is made.
    """
    # Nothing to translate: skip the network round trip entirely.
    if source_lang is not None and source_lang == target_lang:
        return text

    if client is None:
        # Resolve the module here, under a private alias, instead of relying on
        # the notebook-level name `translate`: a later cell defines a function
        # called `translate`, which rebinds that name and would make
        # `translate.Client()` fail from then on.
        from google.cloud import translate_v2 as _translate_v2

        client = _translate_v2.Client()

    translation = client.translate(
        text,
        source_language=source_lang,
        target_language=target_lang,
        # Reviews are plain text; asking for the "text" format keeps characters
        # such as apostrophes from coming back as HTML entities.
        format_="text",
    )

    return translation["translatedText"]
    
# Smoke test: translates a sample string, declared as Spanish ("es"), into the
# default target language ("en") and prints the result.
print(translate_text("aaaaaaa", source_lang="es"))

In [None]:
def translate(df):
    """Run `detect_language` over the "detected_language" column of `df`.

    NOTE(review): this function looks unfinished; confirm the points below.

    * Defining a function named `translate` rebinds the name that the first
      cell imports as the Google client module
      (`from google.cloud import translate_v2 as translate`). Any code that
      still calls `translate.Client()` after this cell has run will fail.
    * The result of `.map(...)` is neither stored nor returned, so calling
      this function leaves `df` unchanged and returns None.
    * No visible cell creates a "detected_language" column. Presumably the
      intent is to map over the review-text column and store the result in
      "detected_language" - TODO confirm.
    """

    # Applies detect_language to each value of the column; the mapped Series
    # is discarded.
    df["detected_language"].map(detect_language)
    
