## Outline of steps
* [step0](#step0): import necessary packages
* [step1](#step1): import dataset part4_dataset.pickle as part5_dataset
* [step2](#step2): combine Positive_Review and Negative_Review into one text column
* [step3](#step3): remove punctuation from the `combined_review` column
* [step4](#step4): save the output as `part5_dataset.pickle`

<a id="step0"></a>
## step0: import necessary packages

In [1]:
# import necessary packages
import numpy as np
import pandas as pd
from IPython.display import display # Allows the use of display() for DataFrames
import seaborn as sns
import matplotlib.pyplot as plt
import missingno as msno # module for missing value visualization
from scipy import stats # implement box-cox transformation
from math import ceil
from string import strip # Return a copy of the string with leading and trailing characters removed
from sklearn.utils import shuffle # shuffling the dataset
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation

import nltk
from nltk import word_tokenize
from nltk.stem import WordNetLemmatizer


# Render matplotlib figures inline in the notebook output.
# (IPython magic command: only valid inside IPython/Jupyter, not in a plain .py script.)
%matplotlib inline

<a id="step1"></a>
## step1: import dataset part4_dataset.pickle as part5_dataset

In [2]:
# Load the pickled DataFrame "part4_dataset.pickle" (presumably the output of the
# part-4 notebook -- confirm) and bind it to `part5_dataset`.
# NOTE: unpickling can execute arbitrary code; only load pickle files from a trusted source.
part5_dataset = pd.read_pickle("part4_dataset.pickle")

<a id="step2"></a>
## step2: combine Positive_Review and Negative_Review into one text column

In [3]:
# Build a single free-text column from the negative and positive reviews.
#
# Trim leading/trailing whitespace with pandas' vectorized string accessor.
# The `string.strip` function exists only in Python 2, and calling it per
# element raises on missing values; `.str.strip()` works on Python 2 and 3
# and passes missing values through unchanged.
part5_dataset["Negative_Review"] = part5_dataset["Negative_Review"].str.strip()
part5_dataset["Positive_Review"] = part5_dataset["Positive_Review"].str.strip()

# Join the negative and positive text with a single space (negative first).
# Element-wise string concatenation gives the same result as a row-by-row
# `apply(" ".join, axis=1)` for string data, without the per-row Python overhead.
part5_dataset["combined_review"] = (
    part5_dataset["Negative_Review"] + " " + part5_dataset["Positive_Review"]
)

# Spot-check: show the combined text of the third row.
display(part5_dataset["combined_review"].iloc[2])

'Rooms are nice but for elderly a bit difficult as most rooms are two story with narrow steps So ask for single level Inside the rooms are very very basic just tea coffee and boiler and no bar empty fridge Location was good and staff were ok It is cute hotel the breakfast range is nice Will go back'

<a id="step3"></a>
## step3: remove punctuation from the `combined_review` column

In [4]:
# Strip punctuation from "combined_review": delete every character that is
# neither a word character (letter, digit, underscore) nor whitespace.
#
# regex=True is passed explicitly because, since pandas 2.0, `Series.str.replace`
# treats the pattern as a literal string by default, which would silently leave
# the text unchanged. The raw string keeps `\w` and `\s` from being interpreted
# as (invalid) Python string escape sequences.
part5_dataset["combined_review"] = part5_dataset["combined_review"].str.replace(
    r"[^\w\s]", "", regex=True
)

<a id="step4"></a>
## step4: save the output as `part5_dataset.pickle`

In [5]:
# Persist the DataFrame, including the punctuation-free `combined_review` column,
# to "part5_dataset.pickle" in the current working directory.
part5_dataset.to_pickle("part5_dataset.pickle")