PREPARING DATASET

PREPROCESSING

In [None]:
try:
    import pandas as pd
except:
    !pip install pandas
    import pandas as pd
# Read the raw TripAdvisor export and show a quick summary of it.
df = pd.read_csv("Datasets/TripAdvisor.csv")
print("################ ORGINAL DATASET LOADED ######################.")
print(df.shape, df.iloc[0, 3])
print(df.columns)

print("##############################################################.")


# recommend_list holds the per-aspect scores next to the overall rating
# (Value, Location, Sleep Quality, Rooms, Cleanliness, Service).
# Reviews that additionally score check-in or business service are dropped so
# that every remaining review carries the same set of aspects.
for unwanted_keyword in ("check", "business"):
    df = df[~df["recommend_list"].str.contains(unwanted_keyword, case=False)]


# Keep a single review per (member, hotel) pair: the first occurrence wins.
df = df.drop_duplicates(subset=["member_id", "hotel_id"])


# adding extra attributes from recommend_list column as columns in df

# Function to extract attributes from 'recommend_list' string
def extract_attributes(row):
    """Turn one row's 'recommend_list' string into a Series of aspect scores.

    The string is a ';'-separated list of 'score:name' entries. Each entry
    becomes one element of the result, indexed by the aspect name and holding
    the score as a float.

    Blank entries (for example the one produced by a trailing ';') are skipped,
    and whitespace around the score and the name is ignored, so such input no
    longer aborts the whole DataFrame.apply.

    params:
        row => mapping-like object (DataFrame row) with a 'recommend_list' string

    returns:
        pd.Series of floats indexed by aspect name (empty if nothing was rated)

    raises:
        ValueError if a non-blank entry is not exactly 'score:name' or the
        score is not numeric
    """
    attributes = {}
    for rec in row['recommend_list'].split(';'):
        rec = rec.strip()
        if not rec:
            # nothing between two separators: ignore instead of failing to unpack
            continue
        key, value = rec.split(':')
        attributes[value.strip()] = float(key)
    # explicit dtype keeps an empty result numeric instead of object-typed
    return pd.Series(attributes, dtype="float64")

# Expand each review's recommend_list into one column per aspect and attach
# those columns to the original table.
df_attributes = df.apply(extract_attributes, axis=1)
df = pd.concat([df, df_attributes], axis=1)


# Remove the columns that are not needed any more, then save the cleaned table.
df = df.drop(columns=["review_id", "review_text", "recommend_list"])
df.to_csv("./Datasets/TripAdvisorClean.csv", index=False)

print("################ CLEANED DATASET LOADED ######################.")
df = pd.read_csv("./Datasets/TripAdvisorClean.csv")
print(df.shape)
print(df.head(5))
print("##############################################################.")


# Replace every remaining NaN (in any column) with 0.
df = df.fillna(0)

In [None]:
def stats(df):
    """Print summary statistics for a review table.

    Prints the number of distinct users and hotels, the number of reviews,
    the max / min / average number of distinct hotels rated per user, and the
    first five rows of the per-user and per-hotel mean ratings.

    params:
        df => DataFrame with 'member_id', 'hotel_id' and the seven rating
              columns listed in rating_columns below

    returns:
        None (everything is printed)
    """
    rating_columns = ['rating', 'Cleanliness', 'Location', 'Rooms', 'Service', 'Sleep Quality', 'Value']

    print("users:", df["member_id"].nunique())
    print("Hotels:", df["hotel_id"].nunique())
    print("Reviews:", len(df), '\n')

    # distinct hotels per user, computed once and reused for all three figures
    hotels_per_user = df.groupby('member_id')['hotel_id'].nunique()
    print(f"Max hotels rated by one user: {hotels_per_user.max()}")
    print(f"Min hotels rated by one user: {hotels_per_user.min()}")
    print(f"Avg hotels rated by one user: {hotels_per_user.mean():.2f}")

    # User x avg rating Mat
    user_avg_rating = df.groupby('member_id')[rating_columns].mean()
    print("\nUser x avg rating Mat:")
    print(user_avg_rating.head(5))

    # Hotel x avg rating
    hotel_avg_rating = df.groupby('hotel_id')[rating_columns].mean()
    print("Hotel x avg rating:")
    print(hotel_avg_rating.head(5))
# Print the summary statistics for the current review table.
stats(df)

DEFINING AUTOENCODER

In [None]:
try:
    import torch
    import torch.nn as nn
except ImportError as err:
    # Only a missing package is handled here (a bare `except:` would also hide
    # unrelated failures such as KeyboardInterrupt). Raising keeps the failure
    # as an ordinary traceback in this cell instead of calling exit().
    raise ImportError(
        "PyTorch not found. Install from: https://pytorch.org/get-started/locally/"
    ) from err

# Use the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

class AutoEncoder(nn.Module):
    def __init__(self, L, activation_fn_type_inner = 'sigmoid', activation_fn_type_outer = 'sigmoid'): 
        """
            Class for Symmetrical AutoEncoder Network
            params:
                L => List of layer sizes (at least two entries)
                
                    for ex: [500, 20, 10] will result in: 
                    2 Layer Encoder: 500x20, 20x10 -> final representation size = 10
                    2 Layer Decoder: 10x20, 20x500 -> input - like

                activation_fn_type_inner => name of the activation applied after
                    every layer except the last one
                activation_fn_type_outer => name of the activation applied after
                    the last layer

            raises:
                ValueError if L has fewer than two entries or an activation
                name is not one of those known to getActivationFn
        """
        # parent constructor
        super().__init__()

        # a single size yields no layers at all, and forward() would then fail
        # on self.linears[-1]; reject it up front with a clear message
        if len(L) < 2:
            raise ValueError('L must contain at least an input size and one hidden size')

        # gets layer dimensions 
        # example -> [ [500, 20], [20, 10], [10, 20], [20, 500] ]
        self.layers = self.createNnStructure(L)

        # getting and saving activation functions
        self.inner_activation_fn = self.getActivationFn(activation_fn_type_inner)
        self.outer_activation_fn = self.getActivationFn(activation_fn_type_outer)

        # create a list of affine forward layers:
        # Xavier-normal weights, biases filled with 0s
        self.linears = nn.ModuleList()
        for in_features, out_features in self.layers:
            linear_layer = nn.Linear(in_features, out_features)
            nn.init.xavier_normal_(linear_layer.weight)
            nn.init.constant_(linear_layer.bias, 0.0)
            self.linears.append(linear_layer)

    def getActivationFn(self, activation_fn_type):
        """
            Input: str, activation function name
                   ('relu', 'sigmoid', 'elu', 'tanh', 'softmax' or 'linear')
            Output: respective activation function from torch.nn
            Raises: ValueError for any other name
        """
        factories = {
            'relu': nn.ReLU,
            'sigmoid': nn.Sigmoid,
            'elu': nn.ELU,
            'tanh': nn.Tanh,
            # explicit feature axis: the implicit-dim form of Softmax is
            # deprecated and warns on every call; for the 1-D / 2-D inputs of
            # the Linear layers used here the last axis is what it picked anyway
            'softmax': lambda: nn.Softmax(dim=-1),
            'linear': nn.Identity,
        }
        try:
            factory = factories[activation_fn_type]
        except (KeyError, TypeError):
            raise ValueError('Undefined activation function used') from None
        return factory()

    def forward(self, x):
        """
            Forward pass for the auto encoder.
            Every layer except the last is followed by the inner activation,
            the last layer by the outer activation. x is not modified.
        """
        *hidden_layers, output_layer = self.linears

        out = x
        for layer in hidden_layers:
            out = self.inner_activation_fn( layer(out) )

        return self.outer_activation_fn( output_layer(out) )

    def createNnStructure(self, L):
        """
            Creates the nn structure as a list of [in, out] pairs.
            for ex: [500, 20, 10] will result in: 
              2 Layer Encoder: 500x20, 20x10 -> final representation size = 10
              2 Layer Decoder: 10x20, 20x500 -> input - like
            output: [ [500, 20], [20, 10], [10, 20], [20, 500] ]
        """
        # encoder layers: consecutive pairs of sizes
        encoder_layers = [[n_in, n_out] for n_in, n_out in zip(L[:-1], L[1:])]

        # decoder layers: the encoder mirrored, both in order and in shape
        decoder_layers = [pair[::-1] for pair in reversed(encoder_layers)]

        return encoder_layers + decoder_layers

Methodology: we train one autoencoder for each rated parameter, starting with the overall rating.

In [None]:
# splitting train test 
try:
    from sklearn.model_selection import train_test_split
except:
    !pip install scikit-learn
    from sklearn.model_selection import train_test_split

try:
    import numpy as np
except:
    !pip install numpy
    import numpy as np

# train test split
First, we hold out a random sample of records from our dataset (the runs below use `test_split = 0.05`, i.e. 5% of the fully rated records).
## train set (overall_rating_train)
The basic rule: every user who has at least one record in the test set is removed from the train set entirely.
## test set (overall_rating_test, overall_rating_test_labels)
There will be two test sets.

### overall_rating_test 
One will be the matrix which will be passed to auto encoder, i.e., it will have ratings, but our target ( to test ) ratings will be hidden.
<code>

ex: [1, 3, 4, 0, 0, 0, 4, 5, 2, 1]

                  ^  ^  ^
              
                to be tested ratings hidden 
</code>

### overall_rating_test_labels
The other is the label matrix: it contains only the held-out ( to test ) ratings, with zeros everywhere else. The autoencoder's predictions are compared against these entries.

<code>
ex: [0, 0, 0, 1, 2, 3, 0, 0, 0, 0]

                  ^  ^  ^

                to be tested ratings 
</code>


In [None]:
from typing import Any

class AEMC:
    """
        Wrapper around one AutoEncoder: builds it, trains it on a user x hotel
        rating matrix and evaluates it on held-out ratings.
        A value of 0 in any rating matrix means "not rated".
    """
    def __init__(self, L, activation_fn_type_inner = 'sigmoid', activation_fn_type_outer = 'sigmoid', metric = 'rating'):
        """
            params:
                L, activation_fn_type_inner, activation_fn_type_outer
                    => passed straight to AutoEncoder
                metric => name of the rating this model is meant for
                          (only stored on the instance)
        """
        self.model = AutoEncoder(L, activation_fn_type_inner, activation_fn_type_outer).to(device)
        self.metric = metric

    def __call__(self, x):
        return self.forward(x)

    def forward(self, x):
        """ Runs the wrapped autoencoder on tensor x. """
        return self.model(x)

    def train(self, overall_rating_train, batch_size = 100, num_epochs = 200, weight_decay = 0.1, learning_rate = 0.001, optimiser = 'rmsprop', verbose = False):
        """
            Trains the autoencoder to reconstruct the rated entries of its input.
            params:
                overall_rating_train => 2-D numpy array (one row per user);
                                        it is read, never modified
                batch_size => rows per mini-batch (last batch may be smaller)
                num_epochs => number of passes over all batches
                weight_decay, learning_rate => passed to the optimiser
                optimiser => 'rmsprop', 'adam', 'adagrad' or 'sgd'
                verbose => print the mean batch loss after every epoch
            raises:
                ValueError for an unknown optimiser name
        """
        optimiser_classes = {
            'rmsprop': torch.optim.RMSprop,
            'adam': torch.optim.Adam,
            'adagrad': torch.optim.Adagrad,
            'sgd': torch.optim.SGD,
        }
        if optimiser not in optimiser_classes:
            # fail here with a clear message instead of later on str.zero_grad()
            raise ValueError(f"UNDEFINED OPTIMISER: {optimiser!r}")
        optimiser = optimiser_classes[optimiser](self.model.parameters(), lr= learning_rate, weight_decay= weight_decay)

        # consecutive row slices; the final one holds the remaining rows
        batches = [overall_rating_train[start:start + batch_size] for start in range(0, len(overall_rating_train), batch_size)]

        criterion = nn.MSELoss()

        for epoch in range(num_epochs):
            train_loss = 0.0
            for batch in batches:
                batch = torch.from_numpy(batch).to(device)
                rated = batch != 0

                optimiser.zero_grad()

                # forward pass
                output = self.model(batch)

                # don't consider unrated hotels in loss calculation.
                # Done out of place: writing into `output` in place invalidates
                # the tensor that activations such as sigmoid keep for their
                # backward pass, which makes loss.backward() raise.
                output = torch.where(rated, output, torch.zeros_like(output))
                loss = criterion(output, batch)
                train_loss += loss.item()

                # backpropagation and gradient descent
                loss.backward()
                optimiser.step()
            
            if verbose:
                print(f"EPOCH: {epoch} loss: {train_loss/max(len(batches), 1)}")


    def test(self, overall_rating_test, overall_rating_test_labels):
        """
            Evaluates the model on held-out ratings.
            params:
                overall_rating_test => 2-D numpy array fed to the model
                overall_rating_test_labels => array of the same shape; non-zero
                                              entries are the ratings to score
            Prints the first row's labels and predictions, the RMSE of the model
            and the RMSE of always predicting 2.5.
            returns:
                percentage improvement of the model RMSE over the 2.5 baseline
                (NaN when there is nothing to score)
        """
        with torch.no_grad():
            infer_rating_test = self.model( torch.from_numpy(overall_rating_test).to(device) ).to('cpu').numpy()

        held_out = overall_rating_test_labels != 0

        if len(overall_rating_test_labels) > 0:
            print(f"EX: ")
            print(overall_rating_test_labels[0][held_out[0]])
            print(infer_rating_test[0][held_out[0]])

        # accumulate in float64 so the sums do not lose precision
        truth = overall_rating_test_labels[held_out].astype(np.float64)
        predicted = infer_rating_test[held_out].astype(np.float64)
        count = len(truth)
        print(f"test point count: {count}")
        if count == 0:
            # nothing to score: avoid dividing by zero
            return float('nan')

        error = np.sqrt(np.sum(np.square(truth - predicted)) / count)
        # baseline: always predict the constant 2.5
        error_w_avg = np.sqrt(np.sum(np.square(truth - 2.5)) / count)
        print(f"RMSE [w 2.5]: {error_w_avg} | RMSE AE: {error} | difference: {np.abs(error_w_avg - error)} | % improvement {100 * (error_w_avg - error)/error_w_avg}%")
        return 100 * (error_w_avg - error)/error_w_avg

In [None]:
# Builds, for every rating criterion, the train matrix, the test-input matrix
# and the matrix of held-out test labels (all user x hotel).
def _cell_positions(records, row_index, col_index):
    """Return (row, column) integer positions of each record's member/hotel."""
    rows = row_index.get_indexer(records['member_id'])
    cols = col_index.get_indexer(records['hotel_id'])
    return rows, cols


def get_total_test_data(df:pd.DataFrame, test_split = 0.05, verbose = False, random_state = 42, metrics = None):
    """
        Splits the review table into train / test rating matrices.

        Only reviews where every metric is non-zero are eligible for the random
        split. Every user with at least one held-out review is removed from the
        train set; the test input holds all reviews of those users with the
        held-out entries set to 0, and the labels hold only the held-out entries.

        params:
            df => DataFrame with 'member_id', 'hotel_id' and one column per metric
            test_split, random_state => passed to train_test_split
            verbose => print the head of the train / test frames and matrix shapes
            metrics => rating columns to build matrices for
                       (default: the overall rating and the six aspect ratings)

        returns:
            (train, test, test_labels): three dicts keyed by metric name, each
            value a float32 array with one column per distinct hotel in df.
            Rows are train users for `train` and test users for the other two.
    """
    if metrics is None:
        metrics = ['rating', 'Cleanliness', 'Location', 'Rooms', 'Service', 'Sleep Quality', 'Value']
    metrics = list(metrics)
    pair = ['member_id', 'hotel_id']

    # Filter rows where any of the ratings is 0
    # splitting by sklearn split
    filtered_df = df[(df[metrics] != 0).all(axis=1)]
    df_train, df_test_records = train_test_split(filtered_df, test_size= test_split, random_state= random_state)

    # removing all records of users who are in test set, from train set
    df_train = df_train[~df_train['member_id'].isin(df_test_records['member_id'])]
    df_test_users = df[df['member_id'].isin(df_test_records['member_id'])]

    # If a (member, hotel) pair occurs more than once, the last record decides
    # the cell value; making that explicit keeps the vectorised writes below
    # independent of assignment order.
    df_train = df_train.drop_duplicates(subset=pair, keep='last')
    df_test_users = df_test_users.drop_duplicates(subset=pair, keep='last')
    df_test_records = df_test_records.drop_duplicates(subset=pair, keep='last')

    # assigning indexes instead of member_id and hotel_id (order of first appearance)
    train_users = pd.Index(df_train['member_id'].unique())
    test_users = pd.Index(df_test_users['member_id'].unique())
    hotels = pd.Index(df['hotel_id'].unique())

    # cell positions do not depend on the metric, so compute them once
    train_rows, train_cols = _cell_positions(df_train, train_users, hotels)
    test_rows, test_cols = _cell_positions(df_test_users, test_users, hotels)
    held_rows, held_cols = _cell_positions(df_test_records, test_users, hotels)

    train_shape = (len(train_users), len(hotels))
    test_shape = (len(test_users), len(hotels))

    if verbose:
        print('####################### MAPPED TRAIN DATA TO INDEXES ###########################.')
        print(df_train.head())
        print('################################################################################.')

    overall_rating_train = {}
    for metric in metrics:
        overall_rating_train_ = np.zeros(train_shape, dtype=np.float32)
        overall_rating_train_[train_rows, train_cols] = df_train[metric].to_numpy()
        overall_rating_train[metric] = overall_rating_train_

    if verbose:
        # overall_rating_train is a dict of matrices, all of this shape
        print(f"train_data shape: {train_shape}")

    # testing 
    if verbose:
        print('####################### MAPPED TEST DATA TO INDEXES ###########################.')
        print(df_test_users.head())
        print('###############################################################################.')

    overall_rating_test = {}
    overall_rating_test_labels = {}

    for metric in metrics:
        # creating test set: every rating of the test users ...
        overall_rating_test_ = np.zeros(test_shape, dtype=np.float32)
        overall_rating_test_[test_rows, test_cols] = df_test_users[metric].to_numpy()

        # ... with the to-be-tested entries removed from it
        overall_rating_test_[held_rows, held_cols] = 0

        # these are actual to be tested entries
        overall_rating_test_labels_ = np.zeros(test_shape, dtype=np.float32)
        overall_rating_test_labels_[held_rows, held_cols] = df_test_records[metric].to_numpy()

        overall_rating_test[metric] = overall_rating_test_
        overall_rating_test_labels[metric] = overall_rating_test_labels_

        if verbose:
            print(f"Test data shape: {overall_rating_test_.shape}")

    return (overall_rating_train, overall_rating_test, overall_rating_test_labels)


RATING MODEL

In [None]:
# Other activation combinations that can be swapped into the zip(...) below:
# acts_in = ['sigmoid', 'tanh', 'softmax']
# acts_out = ['elu', 'relu','linear']

classes = ['rating', 'Cleanliness', 'Location', 'Rooms', 'Service', 'Sleep Quality', 'Value']

overall_rating_train, overall_rating_test, overall_rating_test_labels = get_total_test_data(df, 0.05)


# best_improve / best track the best-scoring configuration over every run below;
# best stays None when no run beats the baseline.
best_improve = 0
best = None
# one trained model per rating criterion, keyed by criterion name
models = {}
for param_class in classes:
    # The autoencoder input width must equal the number of hotel columns of the
    # rating matrix; read it from the data instead of hard-coding it.
    n_hotels = overall_rating_train[param_class].shape[1]
    for sz in [512]:
        for optim in ['rmsprop']:
            for in_act, out_act in zip(['sigmoid'], ['elu']):
                print(f"{param_class}\n X_TRAIN DIM: {overall_rating_train[param_class].shape} X_TRAIN NON ZERO VALUES: {(overall_rating_train[param_class]!=0).sum()}\n X_TEST DIM: {overall_rating_test[param_class].shape} X_TEST NON ZERO VALUES: {(overall_rating_test[param_class]!=0).sum()}\n Y_TEST DIM: {overall_rating_test_labels[param_class].shape} Y_TEST NON ZERO VALUES: {(overall_rating_test_labels[param_class]!=0).sum()}")
                print(f"################################# Param Class: {param_class} sz: {sz} actin: {in_act} actout: {out_act} ################################")

                models[param_class] = AEMC(L=[n_hotels, sz], activation_fn_type_inner= in_act, activation_fn_type_outer=out_act, metric=param_class)
                models[param_class].train(overall_rating_train[param_class], batch_size= 100, num_epochs= 200, weight_decay= 0.1, learning_rate= 0.001, optimiser= optim, verbose = False)
                improvement = models[param_class].test(overall_rating_test[param_class], overall_rating_test_labels[param_class])

                if(best_improve < improvement):
                    best_improve = improvement
                    best = (sz, optim, in_act, out_act)
                
                print(f"#############################################################################################")

In [None]:
metrics = ['rating', 'Cleanliness', 'Location', 'Rooms', 'Service', 'Sleep Quality', 'Value']

# Running sum of the predictions of the per-criterion models.
infer_rating_test = np.zeros_like(overall_rating_test['rating'], dtype= np.float32)

for metric in metrics:
    model_input = torch.from_numpy(overall_rating_test[metric]).to(device)
    infer_rating_test_ = models[metric](model_input).to('cpu').detach().numpy()
    # keep predictions only at the held-out positions of this criterion
    infer_rating_test_[overall_rating_test_labels[metric] == 0] = 0
    infer_rating_test += infer_rating_test_

# turn the sum into the mean over all criteria
infer_rating_test /= len(metrics)



# RMSE of the averaged prediction against the held-out overall ratings,
# next to the RMSE of always predicting 2.5.
rating_labels = overall_rating_test_labels['rating']
error = 0
error_w_avg = 0
count = 0
for i, label_row in enumerate(rating_labels):
    held_out = label_row != 0
    a = label_row[held_out]
    b = infer_rating_test[i][held_out]
    if i == 0:
        # show the first user's held-out labels and predictions as an example
        print("EX: ")
        print(a)
        print(b)
    error = error + np.sum(np.square(a - b))
    error_w_avg = error_w_avg + np.sum(np.square(a - 2.5))
    count += len(a)

error = np.sqrt(error / count)
error_w_avg = np.sqrt(error_w_avg / count)
print(f"test point count: {count}")
print(f"RMSE [w 2.5]: {error_w_avg} | RMSE AE: {error} | difference: {np.abs(error_w_avg - error)} | % improvement {100 * (error_w_avg - error)/error_w_avg}%")