In [45]:
import pandas as pd
import re
from tqdm import tqdm
from googletrans import Translator
import string
import nltk
nltk.download('punkt')
nltk.download('wordnet')
from nltk.tokenize import word_tokenize
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.stem import PorterStemmer
from nltk.corpus import stopwords
nltk.download('stopwords')
pd.set_option('display.max_rows', None)

[nltk_data] Downloading package punkt to /home/rayhanadi/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /home/rayhanadi/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /home/rayhanadi/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [2]:
df_train = pd.read_csv('./dataset/train.csv')
df_test = pd.read_csv('./dataset/test.csv')


In [3]:
df_train.review_text[1200]

' Both the food and the location were wonderful The staff was very helpful and quick to comply all of our requests We ll definitely be coming back again '

In [4]:
df_train.head(10)

Unnamed: 0,hotel_name,hotel_location,review_id,review_date,review_text,review_language,review_rating
0,Cobblestone Inn Suites-eads,Eads US,3mcTZLLqVE5ztNyE,28-11-2015,This is a new hotel. Great staff. Loved intera...,en,8.0
1,DoubleTree by Hilton London Islington,London 9LA United Kingdom,7HCg4Hk7ZbpQY60X,29-12-2015,Lovely attentive and welcoming staff Clean an...,en,10.0
2,citizenM Tower of London,London United Kingdom,yABlhfdJX4UlnNqA,08-09-2016,Cozy space to chill \n Not much variety of fo...,en,7.1
3,The Savoy,London United Kingdom,hS252WXcgeRVdch6,15-10-2011,The hotel has certainly benefited from the inv...,en,10.0
4,Millennium Gloucester Hotel London,,Fo44un6pn2M7XoVJ,23-10-2015,Excellent location for the Albert Hall and pl...,en,10.0
5,Radisson Blu Champs Elys es Paris,Paris France,u5GIYVDPKzyelPUE,14-11-2015,No Positive\n The sink was not practical The g...,en,7.9
6,The Cumberland A Guoman Hotel,London 7DL United Kingdom,KMQ7WwisR6N9ifT9,05-06-2017,Friendly staff particularly in the bar\n Room...,en,9.2
7,Hotel Bristol,Milan Italy,abRtDdSsOHIf2j0S,09-07-2016,No Positive\n The kitchen staff was non cooper...,en,7.1
8,Grange Tower Bridge Hotel,London United Kingdom,tYLpCYokKtyShCCV,01-03-2017,No Positive\n The fact food was not included i...,en,10.0
9,Park Plaza Victoria London,London United Kingdom,EvMXR91LcGbAUbdr,01-01-2017,Didn t have breakfast,da,8.8


# Text Data Preprocessing

## Fillna

In [5]:
# Fill NaN values in 'hotel_location' column with an empty string

def fill_null_value(df):
    df['review_text'].fillna('', inplace=True)
    df['review_language'].fillna('en', inplace=True)
    return df

df_train = fill_null_value(df_train)
df_test = fill_null_value(df_test)

## Translate to English

Disini kita import preprocessed data saja

In [6]:
non_english_df_train = pd.read_csv('./dataset/train_to_english.csv')
non_english_df_train.set_index('Unnamed: 0', inplace=True)
non_english_df_test = pd.read_csv('./dataset/test_to_english.csv')
non_english_df_test.set_index('Unnamed: 0', inplace=True)

In [7]:
def translate_to_english_preprocessed(df, non_english_df):
    for index_value in non_english_df.index:
        df.loc[df.index == index_value, 'review_text'] = non_english_df.loc[non_english_df.index == index_value, 'review_text'].values


translate_to_english_preprocessed(df_train, non_english_df_train)
translate_to_english_preprocessed(df_test, non_english_df_test)

## Handle Contractions

In [8]:
contractions_dict = {
    "ain't": "are not",
    "'s": " is",
    "aren't": "are not",
    "can't": "cannot",
    "can't've": "cannot have",
    "'cause": "because",
    "could've": "could have",
    "couldn't": "could not",
    "couldn't've": "could not have",
    "didn't": "did not",
    "doesn't": "does not",
    "don't": "do not",
    "hadn't": "had not",
    "hadn't've": "had not have",
    "hasn't": "has not",
    "haven't": "have not",
    "he'd": "he would",
    "he'd've": "he would have",
    "he'll": "he will",
    "he'll've": "he will have",
    "how'd": "how did",
    "how'd'y": "how do you",
    "how'll": "how will",
    "I'd": "I would",
    "I'd've": "I would have",
    "I'll": "I will",
    "I'll've": "I will have",
    "I'm": "I am",
    "I've": "I have",
    "isn't": "is not",
    "it'd": "it would",
    "it'd've": "it would have",
    "it'll": "it will",
    "it'll've": "it will have",
    "let's": "let us",
    "ma'am": "madam",
    "mayn't": "may not",
    "might've": "might have",
    "mightn't": "might not",
    "mightn't've": "might not have",
    "must've": "must have",
    "mustn't": "must not",
    "mustn't've": "must not have",
    "needn't": "need not",
    "needn't've": "need not have",
    "o'clock": "of the clock",
    "oughtn't": "ought not",
    "oughtn't've": "ought not have",
    "shan't": "shall not",
    "sha'n't": "shall not",
    "shan't've": "shall not have",
    "she'd": "she would",
    "she'd've": "she would have",
    "she'll": "she will",
    "she'll've": "she will have",
    "should've": "should have",
    "shouldn't": "should not",
    "shouldn't've": "should not have",
    "so've": "so have",
    "that'd": "that would",
    "that'd've": "that would have",
    "there'd": "there would",
    "there'd've": "there would have",
    "they'd": "they would",
    "they'd've": "they would have",
    "they'll": "they will",
    "they'll've": "they will have",
    "they're": "they are",
    "they've": "they have",
    "to've": "to have",
    "wasn't": "was not",
    "we'd": "we would",
    "we'd've": "we would have",
    "we'll": "we will",
    "we'll've": "we will have",
    "we're": "we are",
    "we've": "we have",
    "weren't": "were not",
    "what'll": "what will",
    "what'll've": "what will have",
    "what're": "what are",
    "what've": "what have",
    "when've": "when have",
    "where'd": "where did",
    "where've": "where have",
    "who'll": "who will",
    "who'll've": "who will have",
    "who've": "who have",
    "why've": "why have",
    "will've": "will have",
    "won't": "will not",
    "won't've": "will not have",
    "would've": "would have",
    "wouldn't": "would not",
    "wouldn't've": "would not have",
    "y'all": "you all",
    "y'all'd": "you all would",
    "y'all'd've": "you all would have",
    "y'all're": "you all are",
    "y'all've": "you all have",
    "you'd": "you would",
    "you'd've": "you would have",
    "you'll": "you will",
    "you'll've": "you will have",
    "you're": "you are",
    "you've": "you have"
}


In [9]:
contractions_re = re.compile('(%s)'%'|'.join(contractions_dict.keys()))

def expand_contractions(s, contractions_dict=contractions_dict):
    def replace(match):
        return contractions_dict[match.group(0)]
    return contractions_re.sub(replace, s)

def handle_contractions(df):
    for i in tqdm(range(len(df['review_text']))):
        value = str(df['review_text'][i])
        value_handled_contractions = expand_contractions(value)
        df.loc[i, "review_text"] = value_handled_contractions

# Assuming df_train and df_test are DataFrames with a 'review_text' column
handle_contractions(df_train)
handle_contractions(df_test)

100%|██████████████████████████████████████████████████████████████████████████| 278435/278435 [20:34<00:00, 225.52it/s]
100%|█████████████████████████████████████████████████████████████████████████| 278436/278436 [00:31<00:00, 8774.09it/s]


## Drop Link

Sementara tidak dipakai

In [10]:
link_pattern = r'\s*https?://\S+$'

def drop_link(df):
    for i in tqdm(range(len(df['review_text']))):
        # Perform the desired manipulation
        value = str(df['review_text'][i])
        
        output = re.sub(link_pattern, '', value)
        
        #print(output)
        # Change the value in place
        df.loc[i,"review_text"]= output

# drop_link(df_train)
# drop_link(df_test)

## Drop Escape Character

In [11]:
def remove_escape_sequences(text):
    # Remove escape sequences
    text = re.sub(r'\\\\', '', text)     # \\
    text = re.sub(r"\\'", "'", text)     # \'
    text = re.sub(r'\\"', '"', text)     # \"
    text = re.sub(r'\\n', '', text)      # \n
    text = re.sub(r'\n\n', '', text)
    text = re.sub(r'\\t', '', text)      # \t
    text = re.sub(r'\\b', '', text)      # \b
    text = re.sub(r'\\r', '', text)      # \r
    text = re.sub(r'\\f', '', text)      # \f
    text = re.sub(r'\\012', '', text)    # \012
    text = re.sub(r'\\x0A', '', text)    # \x0A
    
    return text

def drop_escape_sequences(df):
    for i in tqdm(range(len(df['review_text']))):
        # Perform the desired manipulation
        value = str(df['review_text'][i])
        value = remove_escape_sequences(value)
        df.loc[i,"review_text"]= value

drop_escape_sequences(df_train)
drop_escape_sequences(df_test)

100%|██████████████████████████████████████████████████████████████████████████| 278435/278435 [20:09<00:00, 230.12it/s]
100%|████████████████████████████████████████████████████████████████████████| 278436/278436 [00:22<00:00, 12496.20it/s]


In [12]:
df_train.head()

Unnamed: 0,hotel_name,hotel_location,review_id,review_date,review_text,review_language,review_rating
0,Cobblestone Inn Suites-eads,Eads US,3mcTZLLqVE5ztNyE,28-11-2015,This is a new hotel. Great staff. Loved intera...,en,8.0
1,DoubleTree by Hilton London Islington,London 9LA United Kingdom,7HCg4Hk7ZbpQY60X,29-12-2015,Lovely attentive and welcoming staff Clean an...,en,10.0
2,citizenM Tower of London,London United Kingdom,yABlhfdJX4UlnNqA,08-09-2016,Cozy space to chill \n Not much variety of fo...,en,7.1
3,The Savoy,London United Kingdom,hS252WXcgeRVdch6,15-10-2011,The hotel has certainly benefited from the inv...,en,10.0
4,Millennium Gloucester Hotel London,,Fo44un6pn2M7XoVJ,23-10-2015,Excellent location for the Albert Hall and pl...,en,10.0


## Drop Punctuation

In [13]:
def remove_punctuation(sentence):
    # Create a translation table mapping punctuation characters to None
    translator = str.maketrans("", "", string.punctuation)
    
    # Remove punctuation using the translation table
    sentence_without_punctuation = sentence.translate(translator)
    
    return sentence_without_punctuation

def drop_punctuation(df):
    for i in tqdm(range(len(df['review_text']))):
        value = str(df['review_text'][i])
        output =  remove_punctuation(value)
        # Change the value in place
        df.loc[i,"review_text"]= output

drop_punctuation(df_train)
drop_punctuation(df_test)

100%|██████████████████████████████████████████████████████████████████████████| 278435/278435 [20:41<00:00, 224.34it/s]
100%|████████████████████████████████████████████████████████████████████████| 278436/278436 [00:21<00:00, 12986.54it/s]


## Lowercasing

In [14]:
def lowercasing(df):
    for i in tqdm(range(len(df['review_text']))):
        # Perform the desired manipulation
        
        value = str(df['review_text'][i])
        
        output =  value.lower()

        # Change the value in place
        df.loc[i,"review_text"]= output
        
lowercasing(df_train)
lowercasing(df_test)


100%|██████████████████████████████████████████████████████████████████████████| 278435/278435 [21:07<00:00, 219.76it/s]
100%|████████████████████████████████████████████████████████████████████████| 278436/278436 [00:19<00:00, 14170.15it/s]


In [17]:
# Checkpoint

df_train.to_csv('train_lowercased.csv', index = True)
df_test.to_csv('test_lowercased.csv', index = True)

df_train.head()

Unnamed: 0,hotel_name,hotel_location,review_id,review_date,review_text,review_language,review_rating
0,Cobblestone Inn Suites-eads,Eads US,3mcTZLLqVE5ztNyE,28-11-2015,this is a new hotel great staff loved interact...,en,8.0
1,DoubleTree by Hilton London Islington,London 9LA United Kingdom,7HCg4Hk7ZbpQY60X,29-12-2015,lovely attentive and welcoming staff clean an...,en,10.0
2,citizenM Tower of London,London United Kingdom,yABlhfdJX4UlnNqA,08-09-2016,cozy space to chill \n not much variety of fo...,en,7.1
3,The Savoy,London United Kingdom,hS252WXcgeRVdch6,15-10-2011,the hotel has certainly benefited from the inv...,en,10.0
4,Millennium Gloucester Hotel London,,Fo44un6pn2M7XoVJ,23-10-2015,excellent location for the albert hall and pl...,en,10.0


## Tokenization

In [54]:
df_train = pd.read_csv('./train_lowercased.csv')
df_test = pd.read_csv('./test_lowercased.csv')

In [55]:
def tokenization(df):
    df['review_text'] = df['review_text'].apply(lambda x: nltk.word_tokenize(str(x)))

tokenization(df_train)
tokenization(df_test)

In [56]:
df_train.head()

Unnamed: 0.1,Unnamed: 0,hotel_name,hotel_location,review_id,review_date,review_text,review_language,review_rating
0,0,Cobblestone Inn Suites-eads,Eads US,3mcTZLLqVE5ztNyE,28-11-2015,"[this, is, a, new, hotel, great, staff, loved,...",en,8.0
1,1,DoubleTree by Hilton London Islington,London 9LA United Kingdom,7HCg4Hk7ZbpQY60X,29-12-2015,"[lovely, attentive, and, welcoming, staff, cle...",en,10.0
2,2,citizenM Tower of London,London United Kingdom,yABlhfdJX4UlnNqA,08-09-2016,"[cozy, space, to, chill, not, much, variety, o...",en,7.1
3,3,The Savoy,London United Kingdom,hS252WXcgeRVdch6,15-10-2011,"[the, hotel, has, certainly, benefited, from, ...",en,10.0
4,4,Millennium Gloucester Hotel London,,Fo44un6pn2M7XoVJ,23-10-2015,"[excellent, location, for, the, albert, hall, ...",en,10.0


## Stemming

In [57]:
ps = PorterStemmer()

def stemming(df):
    for i in tqdm(range(len(df['review_text']))):
        # Perform the desired manipulation
        value = df.at[i, 'review_text']  # Assuming review_text is a list of words

        # Perform stemming on each word in the list
        stemmed_text = [ps.stem(word) for word in value]

        # Change the value in place
        df.at[i, 'review_text'] = stemmed_text

stemming(df_train)
stemming(df_test)

100%|█████████████████████████████████████████████████████████████████████████| 278435/278435 [02:04<00:00, 2235.77it/s]
100%|█████████████████████████████████████████████████████████████████████████| 278436/278436 [02:04<00:00, 2238.08it/s]


## Lemmatization

Note: ada kemungkinan lemmatization jelek, jd nanti coba gapake dulu aja

In [58]:
lemmatizer = WordNetLemmatizer()

def lemmatization(df):
    for i in tqdm(range(len(df['review_text']))):
        value = df['review_text'][i]  # No need for str() here
        
        # Perform lemmatization on each word in the list
        output = [lemmatizer.lemmatize(word) for word in value]
        
        # Replace the list in the DataFrame with a string representation of the list
        df.loc[i, "review_text"] = str(output)

# lemmatization(df_train)
# lemmatization(df_test)

In [59]:
df_train.head()

Unnamed: 0.1,Unnamed: 0,hotel_name,hotel_location,review_id,review_date,review_text,review_language,review_rating
0,0,Cobblestone Inn Suites-eads,Eads US,3mcTZLLqVE5ztNyE,28-11-2015,"[thi, is, a, new, hotel, great, staff, love, i...",en,8.0
1,1,DoubleTree by Hilton London Islington,London 9LA United Kingdom,7HCg4Hk7ZbpQY60X,29-12-2015,"[love, attent, and, welcom, staff, clean, and,...",en,10.0
2,2,citizenM Tower of London,London United Kingdom,yABlhfdJX4UlnNqA,08-09-2016,"[cozi, space, to, chill, not, much, varieti, o...",en,7.1
3,3,The Savoy,London United Kingdom,hS252WXcgeRVdch6,15-10-2011,"[the, hotel, ha, certainli, benefit, from, the...",en,10.0
4,4,Millennium Gloucester Hotel London,,Fo44un6pn2M7XoVJ,23-10-2015,"[excel, locat, for, the, albert, hall, and, pl...",en,10.0


In [60]:
df_test.head()

Unnamed: 0.1,Unnamed: 0,hotel_name,hotel_location,review_id,review_date,review_text,review_language
0,0,Ohla Barcelona,Barcelona Spain,9p62eIN5NkEadRe7,18-03-2017,"[no, posit, the, shower, wa, open, plan, to, t...",en
1,1,Radisson Blu Edwardian Mercer Street,London United Kingdom,WfQozacJ1lzXMFQg,12-02-2017,"[the, staff, made, us, feel, welcom, as, soon,...",en
2,2,The May Fair Hotel,London United Kingdom,oahzyhSR3ZNHJBuI,06-01-2017,"[great, locat, be, just, off, piccadilli, but,...",en
3,3,Park Plaza County Hall London,United Kingdom,lcO2pUMVQ09RmbHZ,07-02-2017,"[good, personel, and, smile, face, addit, bed,...",en
4,4,Best Western Premier Hotel Couture,Delflandlaan Amsterdam Netherlands,DmMu1z2SozPxykUS,26-11-2015,"[we, had, a, wonder, corner, room, veri, good,...",en


## Remove Stopwords

In [61]:
# Assuming 'review_text' is the column containing your text data
stop_words = set(stopwords.words('english'))
def remove_stopwords(text):
    filtered_text = [word for word in text if word.lower() not in stop_words]
    return filtered_text

# Function to remove stopwords from the 'review_text' column
def drop_stopwords(df):
    for i in tqdm(range(len(df['review_text']))):
        value = df['review_text'][i]  # No need for str() here
        
        # Perform lemmatization on each word in the list
        output = remove_stopwords(value)
        
        # Replace the list in the DataFrame with a string representation of the list
        df.loc[i, "review_text"] = str(output)

drop_stopwords(df_train)
drop_stopwords(df_test)

100%|████████████████████████████████████████████████████████████████████████| 278435/278435 [00:24<00:00, 11151.73it/s]
100%|████████████████████████████████████████████████████████████████████████| 278436/278436 [00:27<00:00, 10040.29it/s]


In [62]:
df_train.head()

Unnamed: 0.1,Unnamed: 0,hotel_name,hotel_location,review_id,review_date,review_text,review_language,review_rating
0,0,Cobblestone Inn Suites-eads,Eads US,3mcTZLLqVE5ztNyE,28-11-2015,"['thi', 'new', 'hotel', 'great', 'staff', 'lov...",en,8.0
1,1,DoubleTree by Hilton London Islington,London 9LA United Kingdom,7HCg4Hk7ZbpQY60X,29-12-2015,"['love', 'attent', 'welcom', 'staff', 'clean',...",en,10.0
2,2,citizenM Tower of London,London United Kingdom,yABlhfdJX4UlnNqA,08-09-2016,"['cozi', 'space', 'chill', 'much', 'varieti', ...",en,7.1
3,3,The Savoy,London United Kingdom,hS252WXcgeRVdch6,15-10-2011,"['hotel', 'ha', 'certainli', 'benefit', 'inves...",en,10.0
4,4,Millennium Gloucester Hotel London,,Fo44un6pn2M7XoVJ,23-10-2015,"['excel', 'locat', 'albert', 'hall', 'place', ...",en,10.0


In [63]:
df_test.head()

Unnamed: 0.1,Unnamed: 0,hotel_name,hotel_location,review_id,review_date,review_text,review_language
0,0,Ohla Barcelona,Barcelona Spain,9p62eIN5NkEadRe7,18-03-2017,"['posit', 'shower', 'wa', 'open', 'plan', 'roo...",en
1,1,Radisson Blu Edwardian Mercer Street,London United Kingdom,WfQozacJ1lzXMFQg,12-02-2017,"['staff', 'made', 'us', 'feel', 'welcom', 'soo...",en
2,2,The May Fair Hotel,London United Kingdom,oahzyhSR3ZNHJBuI,06-01-2017,"['great', 'locat', 'piccadilli', 'also', 'rel'...",en
3,3,Park Plaza County Hall London,United Kingdom,lcO2pUMVQ09RmbHZ,07-02-2017,"['good', 'personel', 'smile', 'face', 'addit',...",en
4,4,Best Western Premier Hotel Couture,Delflandlaan Amsterdam Netherlands,DmMu1z2SozPxykUS,26-11-2015,"['wonder', 'corner', 'room', 'veri', 'good', '...",en


## Save to file

In [64]:
df_train.to_csv('train_preprocessed.csv', index = True)
df_test.to_csv('test_preprocessed.csv', index = True)