# Dataset Preprocessing and Split

In [1]:
from glove import *
from myLayers import CustomAttention

import pandas as pd
from pandas.io.json._normalize import nested_to_record
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split

## Load GloVe model and dataset

In [2]:
# Location of the pretrained embeddings file.
# NOTE(review): the file name suggests 300-dimensional Greek ('el') GloVe vectors — confirm.
glove_path = 'trained_models/el.glove.300.txt'

# Build the embedding model used later to turn review text into ids.
glove_model = GloveModel.from_pretrained(glove_path)

In [3]:
# Product categories kept for the experiments (the four with the most reviews,
# per the original notebook comment); every other category is discarded.
kept_product_types = [
    'Κινητά Τηλέφωνα',
    'Gaming Headsets',
    'PS4 Games',
    'Activity Trackers',
]

# Read the raw review records.
# NOTE(review): `myUtils` is not imported explicitly in this notebook; it is
# presumably exposed by `from glove import *` — confirm.
review_dict = myUtils.read_json_v2('data/reviews_revision.json')

# Flatten nested JSON keys into '_'-separated column names, build the frame,
# and discard the columns that are not used downstream. Chaining (instead of
# `inplace=True`) keeps the cell free of hidden in-place mutation.
reviews = (
    pd.DataFrame.from_dict(nested_to_record(review_dict, sep='_'))
    .drop(columns=['_id_$oid', 'meta_review_sentiment', 'meta_product_name'])
)

# Keep only rows whose product type is one of the retained categories.
# `.copy()` yields an independent frame so that the column assignments below
# (and in later cells) do not operate on a slice of the unfiltered frame.
keep_mask = reviews['meta_product_type'].isin(kept_product_types)
reviews = reviews[keep_mask].copy()

# Combine the three per-sentiment columns into a single labels column.
# Assumes each cell holds a Python list, so `+` concatenates them in the
# order pros, so-so, cons — TODO confirm against the JSON schema.
reviews['meta_review_labels'] = (
    reviews['meta_review_pros']
    + reviews['meta_review_so-so']
    + reviews['meta_review_cons']
)

## Get ids from words using GloVe model

In [4]:
# Convert every review text into its id sequence via the GloVe model.
# NOTE(review): `string_to_ids` presumably tokenises the text and looks each
# token up in the GloVe vocabulary — confirm in the `glove` module.
review_ids = reviews['text'].apply(glove_model.string_to_ids)

# Store the sequences together with their lengths; the length column is used
# afterwards to filter out overly long reviews.
reviews['ids'] = review_ids
reviews['ids_length'] = review_ids.apply(len)

## Remove outliers (reviews whose id sequence is longer than 256)

In [5]:
# Keep only reviews whose id sequence has at most 256 entries, then renumber
# the rows from 0 so the index is contiguous again.
within_limit = reviews['ids_length'].le(256)
reviews = reviews.loc[within_limit].reset_index(drop=True)

## Split dataset into train and test sets

In [6]:
# Fixed seed so that the shuffled 80/20 partition is identical on every
# Restart & Run All; without it each run would write different train/test
# files and downstream results would not be comparable.
split_seed = 42

reviews_train, reviews_test = train_test_split(
    reviews,
    test_size=0.2,
    random_state=split_seed,
)

# Persist both partitions. The DataFrame index is written as the first
# (unnamed) CSV column, which records each row's position in `reviews`.
# NOTE(review): list-valued columns such as 'ids' and 'meta_review_labels'
# are stored as their string representation in CSV — confirm that the
# downstream loader parses them back.
reviews_train.to_csv('data/reviews_revision_train.csv')
reviews_test.to_csv('data/reviews_revision_test.csv')