In [116]:
import pandas as pd
import numpy as np
import re, sys, json
import nltk
from nltk.tokenize import word_tokenize
from collections import Counter

import matplotlib.pyplot as plt
from collections import defaultdict
from nltk.corpus import stopwords
from sklearn.model_selection import train_test_split

In [122]:
nltk.download('punkt')
nltk.download('punkt_tab')
stop_words = stopwords.words('english')

[nltk_data] Downloading package punkt to /home/vagner/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to /home/vagner/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


In [123]:
def pre_process(txt):
    txt = txt.lower() #All to lower
    txt = re.sub(r'\W+', ' ', txt) #Remove special chars
    tokens = word_tokenize(txt) #Tokenizing text
    tokens = [w for w in tokens if w not in stop_words] # Removing stopwords    
    tokens = [w for w in tokens if re.match(r'[a-z]+$', w) != None]
    
    txt = ' '.join(tokens)
    if len(txt) == 0:
        return None

    return txt

def train_test_val_split(df):
    train_df, temp_df = train_test_split(df, test_size=0.4, random_state=666)
    val_df, test_df = train_test_split(temp_df, test_size=0.5, random_state=666)
    return train_df, val_df, test_df

class NpEncoder(json.JSONEncoder):
    def default(self, obj):
        if isinstance(obj, np.integer):
            return int(obj)
        if isinstance(obj, np.floating):
            return float(obj)
        if isinstance(obj, np.ndarray):
            return obj.tolist()
        return super(NpEncoder, self).default(obj)

In [124]:
df = pd.read_csv('Hotel_Reviews.csv')
#df = df.iloc[:1000]
df.tail(3)

Unnamed: 0,address,categories,city,country,latitude,longitude,name,postalCode,province,reviews.date,reviews.dateAdded,reviews.doRecommend,reviews.id,reviews.rating,reviews.text,reviews.title,reviews.userCity,reviews.username,reviews.userProvince
35909,1088 Powdermill Rd,"Cabins,Resorts & Vacation Cottages,Wedding Cha...",Gatlinburg,US,35.754868,-83.457295,Elk Springs Resort,37738,TN,2015-11-03T00:00:00Z,2016-06-23T16:50:58Z,,,5.0,My husband and I stayed at Tree Tops which is ...,Amazing time (both times!!),Charlotte,Carol W.,NC
35910,3747 29th St S E,"Hotels-Apartment,Corporate Lodging,Hotels,Lodging",Grand Rapids,US,42.90948,-85.57359,Extended Stay America Grand Rapids - Kentwood,49512,MI,,2015-12-01T19:40:54Z,,,0.0,to share your opinion of this businesswith YP ...,,,write a review,
35911,3747 29th St S E,"Hotels-Apartment,Corporate Lodging,Hotels,Lodging",Grand Rapids,US,42.90948,-85.57359,Extended Stay America Grand Rapids - Kentwood,49512,MI,,2015-12-01T19:40:54Z,,,0.0,xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx,,,xxxxxxxxxxxxxx,


In [125]:
df_train, df_val, df_test = train_test_val_split(df)
df_train['partition'] = 'train'
df_val['partition'] = 'val'
df_test['partition'] = 'test'
combined_df = pd.concat([df_train, df_val, df_test])[['reviews.text', 'partition', 'reviews.rating']].dropna()

In [126]:
combined_df['txt'] = combined_df['reviews.text'].apply(pre_process)
combined_df['label'] = combined_df['reviews.rating'].apply(int)
corpus_df = combined_df[['txt', 'partition', 'label']].dropna().reset_index(drop=True)
labels = sorted(combined_df['label'].unique())
corpus_df

Unnamed: 0,txt,partition,label
0,nice residence inn rural area stayed uofa fami...,train,4
1,valet full park adjacent city owned parking ga...,train,3
2,bed bugs back,train,1
3,stayed wife got married staff great especially...,train,5
4,dont wash blanket bedsheet well hair found bed...,train,1
...,...,...,...
34945,great stay unique home good location close eve...,test,5
34946,hope services better time lol,test,4
34947,excellent hotel excellent location great style...,test,4
34948,stayed cabin elk springs resort cajun hideaway...,test,5


In [127]:
corpus_df.to_csv('corpus.tsv', sep='\t', index=False, header=False)

vocab = sorted({w for s in corpus_df.txt for w in s.split(' ')})
open('vocabulary.txt', 'w+').write('\n'.join(vocab))

metadata = {
    "total_documents": len(corpus_df),
    "vocabulary_length": len(vocab),
    "preprocessing-info": [],
    "labels": labels,
    "total_labels": len(labels),
    "last-training-doc": np.where(corpus_df.partition == 'train')[0][-1],
    "last-validation-doc": np.where(corpus_df.partition == 'val')[0][-1]
}
meta_raw = json.dumps(metadata, cls=NpEncoder, indent=4)
open('metadata.json', 'w+').write(meta_raw)

328