# 1. Imports

## Libraries

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import csv
import keras
import tensorflow
import pickle
import seaborn as sns
import time
import matplotlib
import pickle
import pyltr

from datetime import datetime
from datetime import timedelta

from sklearn import svm
from sklearn.model_selection import train_test_split

from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score


# 2. Reading data

In [3]:
hotel = pd.read_csv('hotel.csv')

In [4]:
journey = pd.read_csv('journey.csv')

In [5]:
feedback = pd.read_csv('feedback.csv')

# 3. Treating the data

## 3.1 Cleaning up
Dropping **check-in dates before the booking** and **check-out dates before check-in**

In [6]:
journey = journey[journey['booking_date'] <= journey['checkin_date']] #Because they can reserve in the same day

In [7]:
journey = journey[journey['checkin_date'] < journey['checkout_date']]

In [8]:
print("Missing values for Journey:")
display(journey.isna().sum())
print("-----------------------------")
print("Missing values for Journey:")
display(hotel.isna().sum())
print("-----------------------------")
print("Missing values for Journey:")
display(feedback.isna().sum())
print("-----------------------------")

Missing values for Journey:


journey_id         0
booking_date       0
checkin_date       0
checkout_date      0
ota_id             0
language         248
is_in_app        248
item_id            0
dtype: int64

-----------------------------
Missing values for Journey:


item_id              0
city                 0
country              0
stars                0
room_count        7960
overall_liking    1393
based_on             0
hotel_type_id        0
dtype: int64

-----------------------------
Missing values for Journey:


time          0
journey_id    0
question      0
answer        0
dtype: int64

-----------------------------


Joining tables for treating the data all together. After that:
* "Room_count" is not relevant for the prediction as does not add any value neither for the hotels nor the journeys or feedback, so we can remove it safely.
* All the rows that contain NaNs are cleaned out.

In [9]:
#Join tables by common id
journey_hotel = journey.join(hotel.set_index('item_id'), on='item_id')

In [10]:
journey_hotel = journey_hotel.drop('room_count', axis = 1)
journey_hotel = journey_hotel.dropna()

In [11]:
#Checking who else has NaN values
journey_hotel.isna().sum()

journey_id        0
booking_date      0
checkin_date      0
checkout_date     0
ota_id            0
language          0
is_in_app         0
item_id           0
city              0
country           0
stars             0
overall_liking    0
based_on          0
hotel_type_id     0
dtype: int64

In [12]:
allInfo = journey_hotel.join(feedback.set_index('journey_id'), on='journey_id')

Transforming the data for classification:
* -1 means no value available (NaN)
* 0 means the answer is numeric
* 1 means the answet is a text

In [13]:
allInfo = allInfo.replace('text data', -2) #Text data
allInfo['answer'] = allInfo['answer'].fillna(-1) #NaN Data
allInfo['answer'] = allInfo['answer'].astype('int64') #Convert all to ints

In [14]:
allInfo.loc[allInfo['answer'] > 0, 'answer'] = 0
allInfo.loc[allInfo['answer'] == -2, 'answer'] = 1

## 3.2 Adding new features
ML approaches that work with DateTime types require slightly more data treatment. In this approach, we derive new relevant features from data in order to remove the dates and times. Most specifically we will have:
* "length_of_stay: Number of days the Users are staying at the hotel.
* "time_until_feedback": Time elapsed since check-out and the feedback registration. Basically, hoy much time they take to put the review/rating.
* "high_season_period: The month the Users are staying in the hotel. Serves for identifying high seasons.

Also, an additional feature is added as a result of mixing two existing ones:
* "average_liking": Based on the number of Trivago reviews and the overall score, we extract an average

In [15]:
# Lenght of Stay
length_of_stay = pd.to_datetime(allInfo['checkout_date']) - pd.to_datetime(allInfo['checkin_date'])
allInfo['length_of_stay'] = pd.to_datetime(length_of_stay).dt.day

# Time Until Feedback
time_until_feedback = pd.to_datetime(allInfo['checkout_date']) - pd.to_datetime(allInfo['time'])
allInfo['time_until_feedback'] = pd.to_datetime(time_until_feedback).dt.day

# High Season Period
allInfo['high_season_period'] = pd.to_datetime(allInfo['checkin_date']).dt.month

# Average Liking
allInfo['average_liking'] = allInfo['overall_liking']/allInfo['based_on']

## 3.3 Dropping data
All the IDs are not relevant for the prediction so they are ruled out. Same happens with the dates (reason named above).

In [16]:
allInfo = allInfo.drop(['journey_id', 'item_id', 'ota_id', 'based_on', 'overall_liking', 'booking_date', 'checkin_date', 'checkout_date', 'time'], axis = 1)

## 3.4 Encoding
All the categorical data is now transformed by a Label Encoder, in order to help the predictor work better.

In [17]:
def encode(df, key):
    le = preprocessing.LabelEncoder()
    le.fit(df[key])
    df[key] = le.transform(df[key])
    return df

In [18]:
encode(allInfo, 'country')
encode(allInfo, 'city')
encode(allInfo, 'language')
encode(allInfo, 'is_in_app')
encode(allInfo, 'question')

Unnamed: 0,ota_id,language,is_in_app,city,country,stars,hotel_type_id,question,answer,length_of_stay,time_until_feedback,high_season_period,average_liking
2,1806,5,0,1719,58,4.0,2.0,0,-1,2,,2,61.712121
3,1232,5,0,1719,58,5.0,2.0,0,-1,2,,2,20.409611
4,1806,5,0,3728,17,0.0,2.0,0,-1,3,,2,15.885375
5,38,5,0,102,58,3.0,2.0,0,-1,2,,2,5.537821
6,2024,5,0,3669,106,5.0,2.0,0,-1,2,,2,11.085025
7,1512,5,0,2200,30,4.0,2.0,0,-1,2,,2,10.249337
8,38,5,0,1003,58,4.0,2.0,0,-1,2,,2,1.309415
9,1304,5,0,4295,14,3.0,2.0,0,-1,2,,2,8.620422
10,2024,5,0,2244,8,3.0,2.0,0,-1,2,,2,14.056466
11,277,5,0,2407,58,4.0,2.0,0,-1,2,,2,5.227926


In [19]:
allInfo['time_until_feedback'] = allInfo['time_until_feedback'].fillna(-1)

# 4. Model
A Support Vector Machine technique is used for prediction. The problem to solve boils down to classify a sample as "text feedback" and "not text feedback", which is why a classification approach is used.

The data is split in 80% training ('X_train' and 'y_train') and 20% test sets ('X_test' and 'y_test').

In [47]:
X = allInfo.drop('answer', axis = 1)
y = allInfo['answer']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = np.random)

# Scaling and centering of the data due to the heterogeneous type of info.
X_train = pd.DataFrame(preprocessing.scale(X_train))

  """


For this SVC model, the following parameters have been used:
* *rbf* kernel: to set a non linear hyper-plane
* *gamma* = 1: to set a small trade-off between generalization and the effort of "exactly fitting the training data set". The higher, the more exact. But we don't want a so-strict boundary.
* *C* = 10: for a middle level smooth decision boundary

In [74]:
model = svm.SVC(kernel = 'rbf', gamma = 1, C = 10) 
model.fit(X_train, y_train)
print(model.score(X_test, y_test))

#Predict Output
predicted_svm = model.predict(X_test)

0.8102262443438915


To check the predicted results, we use a function which compares if the predicted result (using the training data) matches the real result (the test data held out), giving a percentage of accuracy. This is used by both 'accuracy_score' and '.score' functions given.

In [62]:
accuracy_score(y_test, predicted_svm)

0.8102262443438915

#### Feature Importance
Building a SVC that uses a linear kernel, and then retreiving the most significant features shows that the **type of question** is the most determinant factor for the User leaving a text feedback. This is no surprise since only one type of question allows text.

The second influent factor is unexpectedly the **length of the stay** (derived from the dropped dates, which mean they were relevant, or at least partially). This makes sense since the more time the cliend spends in the hotel, the more experiences has, so a simple rating might not include his/her satisfaction.

Unfortunately, the features could not be retreived from the model that uses a *rbf* kernel, since the spece is reshaped and has no relation with the input anymore.

#### Further Discussion

Extra information could be gathered in order to help the model discern better if a feedback could be left by the Users. For example, the price they have paid for the room would give us a sneak peak into the deal the User got, other conclusions related to high season periods, relation with stars.

Another nice feature would be the services available in the hotel. This would help us predict also the rating given, but that is not the point now. Having different services gives us a higher probability of having a User that might require it, and therefore a higher probability of complaint/compliment (although this variable is prone to missing values as not everyone would use all the services).