In [1]:
import numpy as np
import pandas as pd
from IPython.display import display
import seaborn as sns
import matplotlib.pyplot as plt
import missingno as msno
from nltk import word_tokenize
from nltk.stem import WordNetLemmatizer
import nltk
nltk.download('wordnet')

from sklearn.utils import shuffle
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.externals import joblib

import warnings
# Suppress every warning for the rest of the session.
# NOTE(review): this also hides deprecation warnings raised by the libraries used below.
warnings.filterwarnings('ignore')

# Render floats with two decimal places in pandas output.
pd.options.display.float_format = '{:.2f}'.format

# Draw matplotlib figures inline in the notebook output.
%matplotlib inline

[nltk_data] Downloading package wordnet to /Users/tomlin/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [3]:
# Function to print bold text in print().
from IPython.display import Markdown, display

def printmd(string):
    """Render ``string`` as Markdown in the cell output.

    This allows formatted text such as ``**bold**`` to be shown, which a plain
    ``print()`` call cannot do.

    Ref: https://discuss.analyticsvidhya.com/t/how-to-make-a-text-bold-within-print-statement-in-ipython-notebook/14552
    """
    rendered = Markdown(string)
    display(rendered)

In [36]:
# Load the pickled review frame.
# NOTE: unpickling can execute arbitrary code, so only load files from a trusted source.
dataset = pd.read_pickle('../Playground-dataset/01-Classification-Modeling-on-Hotel-Scoring/part1_dataset.pickle')

# Normalise every column name to lower case.
cols = list(map(str.lower, dataset.columns))
dataset.columns = np.array(cols)

printmd('**Columns in the dataframe:**\n')
dataset.info()

**Columns in the dataframe:**


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 515738 entries, 0 to 515737
Data columns (total 17 columns):
hotel_address                                 515738 non-null object
additional_number_of_scoring                  515738 non-null int64
review_date                                   515738 non-null datetime64[ns]
average_score                                 515738 non-null float64
hotel_name                                    515738 non-null object
reviewer_nationality                          515738 non-null object
negative_review                               515738 non-null object
review_total_negative_word_counts             515738 non-null int64
total_number_of_reviews                       515738 non-null int64
positive_review                               515738 non-null object
review_total_positive_word_counts             515738 non-null int64
total_number_of_reviews_reviewer_has_given    515738 non-null int64
reviewer_score                                515738 non-

## Create Binned Target Variable

In [5]:
# create customized bin width
def bin_cut(df, col, bin_range):
    """Discretize ``df[col]`` into custom bins and derive an integer class label.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the column to discretize.
    col : str
        Name of the numeric column to bin.
    bin_range : sequence of scalars
        Bin edges handed to ``pandas.cut``. Intervals are right-closed and
        duplicated edges are dropped, which yields fewer bins.

    Returns
    -------
    pandas.DataFrame
        Two columns indexed like ``df``: ``'<col>_bin'`` (the interval) and
        ``'<col>_class'`` (0-based rank of the interval among the bins that
        actually occur). Values that fall outside every bin are NaN in both.

    Side effects
    ------------
    Prints/displays the size and share of each bin and the bin -> class mapping.
    """
    target_bin = pd.cut(df[col], bin_range, duplicates='drop')

    print('Share of Each Bin:')
    # value_counts replaces groupby(...).agg({name: func}): renaming through a
    # dict on a Series groupby raises SpecificationError in pandas >= 1.0.
    bin_size = target_bin.value_counts(sort=False)
    # The share is taken over all rows, including rows that matched no bin.
    bin_share = pd.DataFrame({'size': bin_size,
                              'share': bin_size / len(target_bin)})
    display(bin_share)

    # Class labels rank only the bins that occur, in ascending interval order.
    observed = target_bin.cat.remove_unused_categories()
    map_class = {interval: label
                 for label, interval in enumerate(observed.cat.categories)}
    print('Bin and Class Label Correspondence:')
    display(map_class)

    # Categorical codes equal the rank above; -1 marks "no bin" and becomes NaN.
    codes = observed.cat.codes
    target_class = codes.where(codes >= 0)

    target_bin = pd.concat([target_bin, target_class], axis=1)
    target_bin.columns = ['{}_bin'.format(col), '{}_class'.format(col)]

    return target_bin

In [37]:
# Bin reviewer_score on the edges 0, 8, 9.5 and 10, giving three right-closed bins.
discrete_target = bin_cut(df=dataset, col='reviewer_score', bin_range=[0,8,9.5,10])

Share of Each Bin:


Unnamed: 0_level_0,size,share
reviewer_score,Unnamed: 1_level_1,Unnamed: 2_level_1
"(0.0, 8.0]",180597,0.35
"(8.0, 9.5]",148121,0.29
"(9.5, 10.0]",187020,0.36


Bin and Class Label Correspondence:


{Interval(0.0, 8.0, closed='right'): 0,
 Interval(8.0, 9.5, closed='right'): 1,
 Interval(9.5, 10.0, closed='right'): 2}

In [38]:
# Retain only the integer class label; the interval column is not needed further.
discrete_target = discrete_target.loc[:, ['reviewer_score_class']]
display(discrete_target.head())

Unnamed: 0,reviewer_score_class
0,0
1,0
2,0
3,0
4,0


In [39]:
# Show the (rows, columns) shape of the class-label frame.
discrete_target.shape

(515738, 1)

In [40]:
# Show the (rows, columns) shape of the source frame, for comparison with discrete_target.
dataset.shape

(515738, 17)

In [41]:
# Attach the class label to the original columns (column-wise concatenation on the shared index).
dataset_2 = pd.concat(objs=[dataset, discrete_target], axis='columns')

In [42]:
# Show the shape after concatenation.
dataset_2.shape

(515738, 18)

In [43]:
# Preview the first rows, including the new reviewer_score_class column.
dataset_2.head()

Unnamed: 0,hotel_address,additional_number_of_scoring,review_date,average_score,hotel_name,reviewer_nationality,negative_review,review_total_negative_word_counts,total_number_of_reviews,positive_review,review_total_positive_word_counts,total_number_of_reviews_reviewer_has_given,reviewer_score,tags,days_since_review,lat,lng,reviewer_score_class
0,s Gravesandestraat 55 Oost 1092 AA Amsterdam ...,194,2017-08-03,7.7,Hotel Arena,Russia,I am so angry that i made this post available...,397,1403,Only the park outside of the hotel was beauti...,11,7,2.9,"[' Leisure trip ', ' Couple ', ' Duplex Double...",0 days,52.36,4.92,0
1,s Gravesandestraat 55 Oost 1092 AA Amsterdam ...,194,2017-08-03,7.7,Hotel Arena,Ireland,No Negative,0,1403,No real complaints the hotel was great great ...,105,7,7.5,"[' Leisure trip ', ' Couple ', ' Duplex Double...",0 days,52.36,4.92,0
2,s Gravesandestraat 55 Oost 1092 AA Amsterdam ...,194,2017-07-31,7.7,Hotel Arena,Australia,Rooms are nice but for elderly a bit difficul...,42,1403,Location was good and staff were ok It is cut...,21,9,7.1,"[' Leisure trip ', ' Family with young childre...",3 days,52.36,4.92,0
3,s Gravesandestraat 55 Oost 1092 AA Amsterdam ...,194,2017-07-31,7.7,Hotel Arena,United Kingdom,My room was dirty and I was afraid to walk ba...,210,1403,Great location in nice surroundings the bar a...,26,1,3.8,"[' Leisure trip ', ' Solo traveler ', ' Duplex...",3 days,52.36,4.92,0
4,s Gravesandestraat 55 Oost 1092 AA Amsterdam ...,194,2017-07-24,7.7,Hotel Arena,New Zealand,You When I booked with your company on line y...,140,1403,Amazing location and building Romantic setting,8,3,6.7,"[' Leisure trip ', ' Couple ', ' Suite ', ' St...",10 days,52.36,4.92,0


In [52]:
# Concatenate the reviews
# Vectorised string operations are used instead of per-element / per-row Python
# lambdas: same result for string columns, without a Python-level call per row.
dataset_2['negative_review'] = dataset_2['negative_review'].str.strip()
dataset_2['positive_review'] = dataset_2['positive_review'].str.strip()
# Negative text first, then positive text, separated by a single space.
dataset_2["combined_review"] = (
    dataset_2["negative_review"] + " " + dataset_2["positive_review"]
)


In [53]:
# Preview the frame again to check the stripped reviews and the new combined_review column.
dataset_2.head()

Unnamed: 0,hotel_address,additional_number_of_scoring,review_date,average_score,hotel_name,reviewer_nationality,negative_review,review_total_negative_word_counts,total_number_of_reviews,positive_review,review_total_positive_word_counts,total_number_of_reviews_reviewer_has_given,reviewer_score,tags,days_since_review,lat,lng,reviewer_score_class,combined_review
0,s Gravesandestraat 55 Oost 1092 AA Amsterdam ...,194,2017-08-03,7.7,Hotel Arena,Russia,I am so angry that i made this post available ...,397,1403,Only the park outside of the hotel was beautiful,11,7,2.9,"[' Leisure trip ', ' Couple ', ' Duplex Double...",0 days,52.36,4.92,0,I am so angry that i made this post available ...
1,s Gravesandestraat 55 Oost 1092 AA Amsterdam ...,194,2017-08-03,7.7,Hotel Arena,Ireland,No Negative,0,1403,No real complaints the hotel was great great l...,105,7,7.5,"[' Leisure trip ', ' Couple ', ' Duplex Double...",0 days,52.36,4.92,0,No Negative No real complaints the hotel was g...
2,s Gravesandestraat 55 Oost 1092 AA Amsterdam ...,194,2017-07-31,7.7,Hotel Arena,Australia,Rooms are nice but for elderly a bit difficult...,42,1403,Location was good and staff were ok It is cute...,21,9,7.1,"[' Leisure trip ', ' Family with young childre...",3 days,52.36,4.92,0,Rooms are nice but for elderly a bit difficult...
3,s Gravesandestraat 55 Oost 1092 AA Amsterdam ...,194,2017-07-31,7.7,Hotel Arena,United Kingdom,My room was dirty and I was afraid to walk bar...,210,1403,Great location in nice surroundings the bar an...,26,1,3.8,"[' Leisure trip ', ' Solo traveler ', ' Duplex...",3 days,52.36,4.92,0,My room was dirty and I was afraid to walk bar...
4,s Gravesandestraat 55 Oost 1092 AA Amsterdam ...,194,2017-07-24,7.7,Hotel Arena,New Zealand,You When I booked with your company on line yo...,140,1403,Amazing location and building Romantic setting,8,3,6.7,"[' Leisure trip ', ' Couple ', ' Suite ', ' St...",10 days,52.36,4.92,0,You When I booked with your company on line yo...


In [54]:
# Sample the dataset
# Reproducible draw: take the first 1000 entries of a seed-1 permutation of all row positions.
idx = np.random.RandomState(seed=1).permutation(len(dataset_2))[:1000]
print(len(idx))
print(idx[:5])

1000
[356054 395957 468352 281462 498978]


In [55]:
# Select the sampled rows by position, keeping every column.
dataset_3 = dataset_2.iloc[idx]
dataset_3.shape

(1000, 19)

## Prepare the Training Dataset in the Format spaCy Requires

In [79]:
# Review text fed to the classifier, one string per sampled row.
train_df = dataset_3['combined_review']

# One-hot encode the class label as 0/1 integers (dtype=int keeps the values
# numeric on pandas versions whose get_dummies defaults to booleans).
class_names = dict(enumerate(['low', 'medium', 'high']))
classes = pd.get_dummies(dataset_3['reviewer_score_class'], dtype=int)
# Name columns by class value, not by position, so a class that is absent from
# the sample cannot shift the labels; an unexpected class value raises KeyError.
classes.columns = [class_names[value] for value in classes.columns]
# Guarantee all three label columns exist, in a fixed order.
classes = classes.reindex(columns=list(class_names.values()), fill_value=0)

In [117]:
# Pair each review text with its annotation dict: (text, {'cats': {label: value}}).
train_ls = [
    (review, {'cats': onehot.to_dict()})
    for review, (_, onehot) in zip(train_df, classes.iterrows())
]
train_ls[:5]

[('No Negative Nothing was too much trouble The staff were attentive at all times and the hotel staff made the stay very comfortable',
  {'cats': {'low': 0, 'medium': 0, 'high': 1}}),
 ('I had a Junior suite The bed was only a queen size My brother got a king in his I loved there shower It felt like you were under a fire hose Great hotel',
  {'cats': {'low': 0, 'medium': 0, 'high': 1}}),
 ('staff could be less rude the pool area is horrible beds really comfy and the location is great also maid was such a sweetheart',
  {'cats': {'low': 1, 'medium': 0, 'high': 0}}),
 ('No Negative Really nice hotel good facilities great staff and very clean Only minor issue was temperature of main pool Would definitely recommend though',
  {'cats': {'low': 0, 'medium': 0, 'high': 1}}),
 ('No Negative Everything is super And room and design Very clean Super',
  {'cats': {'low': 0, 'medium': 0, 'high': 1}})]

In [118]:
def load_data(list_, sample_limit = 0, split = 0.7, seed = None):
    """Shuffle examples and split them into training and validation lists.

    Parameters
    ----------
    list_ : sequence
        Examples to split. The input is copied and never reordered in place.
    sample_limit : int, optional
        Keep at most this many examples after shuffling. ``0`` (the default)
        or ``None`` keeps everything.
    split : float, optional
        Fraction of the kept examples that goes to the training list.
    seed : int, optional
        Seed for a private random generator. When ``None``, NumPy's global
        random state is used.

    Returns
    -------
    tuple of (list, list)
        ``(train_ls, valid_ls)``.
    """
    train_data = list(list_)  # copy so the caller's list keeps its order
    rng = np.random if seed is None else np.random.RandomState(seed)
    rng.shuffle(train_data)
    # A slice of [-0:] would keep everything by accident; make "no limit" explicit.
    if sample_limit and sample_limit > 0:
        train_data = train_data[-sample_limit:]
    split_idx = int(len(train_data) * split)
    return train_data[:split_idx], train_data[split_idx:]

In [119]:
# Split with load_data's defaults (split = 0.7).
# NOTE: this rebinds train_ls, so re-running the cell splits the already-split training list again.
train_ls, valid_ls = load_data(train_ls)

In [None]:
def evaluate(tokenizer, textcat, texts, cats):
    """Compute precision, recall and F-score over all (text, label) pairs.

    Each text is tokenized, piped through ``textcat``, and every predicted
    label score is compared against the gold value at a 0.5 threshold.

    Parameters
    ----------
    tokenizer : callable
        Applied to each element of ``texts``.
    textcat : object
        Must expose ``pipe(docs)`` yielding objects with a ``cats`` mapping
        of label -> score.
    texts : iterable
        Raw inputs, in the same order as ``cats``.
    cats : sequence of mappings
        ``cats[i]`` maps label -> gold value for the i-th text. Labels absent
        from the gold mapping are skipped.

    Returns
    -------
    dict
        Keys ``'textcat_p'``, ``'textcat_r'`` and ``'textcat_f'``.
    """
    # Counters start at a tiny epsilon so the ratios below never divide by zero.
    eps = 1e-8
    true_pos = eps
    false_pos = eps
    false_neg = eps
    true_neg = eps  # tallied, but not part of the returned metrics

    tokenized = map(tokenizer, texts)
    for idx, doc in enumerate(textcat.pipe(tokenized)):
        truth = cats[idx]
        for label, score in doc.cats.items():
            if label in truth:
                if score >= 0.5:
                    if truth[label] >= 0.5:
                        true_pos += 1.
                    elif truth[label] < 0.5:
                        false_pos += 1.
                elif score < 0.5:
                    if truth[label] < 0.5:
                        true_neg += 1
                    elif truth[label] >= 0.5:
                        false_neg += 1

    precision = true_pos / (true_pos + false_pos)
    recall = true_pos / (true_pos + false_neg)
    f_score = 2 * (precision * recall) / (precision + recall)
    return {'textcat_p': precision, 'textcat_r': recall, 'textcat_f': f_score}

## Build the spaCy Text Classifier

In [229]:
import spacy
# Load the 'en_core_web_sm' pipeline; the model package must already be installed.
nlp = spacy.load('en_core_web_sm')

In [230]:
# Reuse the pipeline's text classifier when it already has one, so labels can
# be added to it; otherwise create the component and append it as the last pipe.
if 'textcat' in nlp.pipe_names:
    textcat = nlp.get_pipe('textcat')
else:
    textcat = nlp.create_pipe('textcat')
    nlp.add_pipe(textcat, last=True)

In [231]:
# add label to text classifier
# One label per target class; the names match the keys of the 'cats' dicts in the training examples.
textcat.add_label('low')
textcat.add_label('medium')
textcat.add_label('high')

1

In [232]:
# Confirm which labels are registered on the classifier.
textcat.labels

['low', 'medium', 'high']

In [233]:
from spacy.util import minibatch, compounding
from sklearn.metrics import f1_score

In [234]:
n_iter = 10

# Held-out texts and their annotation dicts, derived here from valid_ls so the
# cell does not depend on variables left over in the kernel.
valid_text, valid_label = zip(*valid_ls)


def top_label(cats):
    """Return the label with the highest value in a {label: value} mapping."""
    return max(cats, key=cats.get)


# Train model
other_pipes = [pipe for pipe in nlp.pipe_names if pipe != 'textcat']
with nlp.disable_pipes(*other_pipes):  # only train textcat
    optimizer = nlp.begin_training()
    print("Training the model...")
    for epoch in range(n_iter):
        losses = {}

        # batch up the examples using spaCy's minibatch
        batches = minibatch(train_ls, size=compounding(4., 32., 1.001))
        for batch in batches:
            texts, annotations = zip(*batch)
            nlp.update(texts, annotations, sgd=optimizer, drop=0.2,
                       losses=losses)

        # Score the held-out split with the averaged parameters.
        with textcat.model.use_params(optimizer.averages):
            docs = [nlp.tokenizer(text) for text in valid_text]
            # Compare the highest-valued gold label with the highest-scoring
            # predicted label; label names are used directly, so no separate
            # label -> id lookup table is required.
            true_labels = [top_label(gold['cats']) for gold in valid_label]
            pdt_labels = [top_label(doc.cats) for doc in textcat.pipe(docs)]
            score = f1_score(true_labels, pdt_labels, average='weighted')
            print('textcat loss: {:.3f}\tf1 score: {:.3f}'.format(losses['textcat'],score))


Training the model...
textcat loss: 149.624	f1 score: 0.489
textcat loss: 122.105	f1 score: 0.545
textcat loss: 97.492	f1 score: 0.555
textcat loss: 75.275	f1 score: 0.556
textcat loss: 62.243	f1 score: 0.593
textcat loss: 48.667	f1 score: 0.583
textcat loss: 37.764	f1 score: 0.569
textcat loss: 33.416	f1 score: 0.576
textcat loss: 28.159	f1 score: 0.579
textcat loss: 20.856	f1 score: 0.572


## Use Pretrained Vectors

In [244]:
import srsly

In [239]:
# Unzip the training examples into parallel tuples of texts and annotation dicts.
text, lab = zip(*train_ls)

In [241]:
# Wrap every raw text in a {'text': ...} record.
final_ls = [{'text': doc} for doc in text]

In [243]:
# Preview the first two records.
final_ls[:2]

[{'text': 'Our room did not have a window so no natural light was very dark Wifi was limited to an hour for the price of the hotel should have been free wifi Drinks at the bar very expensive The location was excellent The facilities are very good the gym was open 24 hours the staff very friendly and polite'},
 {'text': 'Everything Couldn t fault it in anyway I would definitely stay there again and recommend it to friends and family'}]

In [245]:
# data = [{"text": "Some text"}, {"text": "More..."}]
# Write one {"text": ...} JSON object per line. `final_ls` holds those records;
# passing the bare `text` tuple would emit plain JSON strings instead.
# NOTE(review): absolute, user-specific output path — consider a configurable directory.
srsly.write_jsonl("/Users/tomlin/Desktop/text.jsonl", final_ls)