In [1]:
import re
import string
import time

import pandas as pd
from googletrans import Translator
from tqdm import tqdm
pd.set_option('display.max_rows', None)

In [2]:
# Load the train and test splits from the local dataset folder.
df_train, df_test = (
    pd.read_csv(csv_path)
    for csv_path in ('./dataset/train.csv', './dataset/test.csv')
)


In [3]:
# Spot-check one raw review (row label 1200) before any cleaning.
df_train['review_text'][1200]

' Both the food and the location were wonderful The staff was very helpful and quick to comply all of our requests We ll definitely be coming back again '

In [6]:
# Preview the first rows of the training frame to see the available columns.
df_train.head()

Unnamed: 0,hotel_name,hotel_location,review_id,review_date,review_text,review_language,review_rating
0,Cobblestone Inn Suites-eads,Eads US,3mcTZLLqVE5ztNyE,28-11-2015,This is a new hotel. Great staff. Loved intera...,en,8.0
1,DoubleTree by Hilton London Islington,London 9LA United Kingdom,7HCg4Hk7ZbpQY60X,29-12-2015,Lovely attentive and welcoming staff Clean an...,en,10.0
2,citizenM Tower of London,London United Kingdom,yABlhfdJX4UlnNqA,08-09-2016,Cozy space to chill \n Not much variety of fo...,en,7.1
3,The Savoy,London United Kingdom,hS252WXcgeRVdch6,15-10-2011,The hotel has certainly benefited from the inv...,en,10.0
4,Millennium Gloucester Hotel London,,Fo44un6pn2M7XoVJ,23-10-2015,Excellent location for the Albert Hall and pl...,en,10.0


# Text Data Preprocessing

## Fillna

In [4]:
# Fill NaN values: empty string for 'review_text', 'en' for 'review_language'.

def fill_null_value(df):
    """Replace missing values in the review columns of ``df``.

    Missing ``review_text`` becomes ``''`` and missing ``review_language``
    becomes ``'en'``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with ``review_text`` and ``review_language`` columns.
        It is modified in place.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, for convenient re-assignment by callers.
    """
    # Assign the filled column back instead of `df[col].fillna(inplace=True)`:
    # the chained in-place form is deprecated in pandas 2.x and silently does
    # nothing once Copy-on-Write is enabled.
    df['review_text'] = df['review_text'].fillna('')
    df['review_language'] = df['review_language'].fillna('en')
    return df

# Apply the same null handling to both splits.
df_train, df_test = (fill_null_value(frame) for frame in (df_train, df_test))

## Translate to English

In [7]:
# Work on explicit copies of the non-English rows. Assigning into a filtered
# slice raises SettingWithCopyWarning and never reaches df_train / df_test.
non_english_df_train = df_train[df_train['review_language'] != 'en'].copy()
non_english_df_test = df_test[df_test['review_language'] != 'en'].copy()

def translate_to_english(df, max_retries=3):
    """Translate every row of ``df['review_text']`` to English, in place.

    Each translation is written into ``df`` as soon as it is obtained, so
    rows already translated are kept if a later request fails.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a ``review_text`` column. All rows are translated; the
        caller is expected to pass only the rows that need translation.
    max_retries : int, default 3
        Attempts per row before the last exception is re-raised. A pause
        that grows with each attempt is inserted between attempts.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object.

    Raises
    ------
    Exception
        Whatever the translator raised on the final failed attempt.
    """
    translator = Translator()  # optionally: service_urls=['translate.googleapis.com']
    attempts = max(1, max_retries)
    text_col = df.columns.get_loc('review_text')
    texts = df['review_text'].tolist()

    for position, text in enumerate(tqdm(texts, total=len(texts))):
        # Nothing to translate for empty / non-string cells; skip the request.
        if not isinstance(text, str) or not text.strip():
            continue

        for attempt in range(1, attempts + 1):
            try:
                translated = translator.translate(text, dest='en').text
                break
            except Exception:
                # Transient network failures (e.g. DNS resolution) are retried;
                # the error is re-raised once the attempts are exhausted.
                if attempt == attempts:
                    raise
                time.sleep(2 ** attempt)

        # Positional write: independent of the (filtered) index labels.
        df.iat[position, text_col] = translated

    return df

translate_to_english(non_english_df_train)
translate_to_english(non_english_df_test)

# Write the translations back so the frames saved below actually contain them.
df_train.loc[non_english_df_train.index, 'review_text'] = non_english_df_train['review_text']
df_test.loc[non_english_df_test.index, 'review_text'] = non_english_df_test['review_text']

 75%|███████████████████████████████████████████████████████████▎                   | 5573/7423 [16:46<05:34,  5.54it/s]


ConnectError: [Errno -3] Temporary failure in name resolution

In [None]:
# Persist both splits without the index column.
df_train.to_csv(path_or_buf='train_english.csv', index=False)
df_test.to_csv(path_or_buf='test_english.csv', index=False)

## Drop Link

Temporarily not used (the calls at the end of the cell below are commented out).

In [None]:
# Matches an http(s) URL together with the whitespace in front of it.
# No `$` anchor: URLs are removed wherever they occur, not only at the very
# end of the review.
link_pattern = r'\s*https?://\S+'

def drop_link(df):
    """Strip http(s) links from ``df['review_text']`` in place.

    Values are converted to ``str`` first, then every match of
    ``link_pattern`` is replaced with an empty string.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a ``review_text`` column; modified in place.
    """
    # Vectorised replacement: works for any index (the previous
    # `df['review_text'][i]` lookup was label-based and failed on filtered or
    # re-indexed frames) and avoids a per-row `.loc` write.
    df['review_text'] = (
        df['review_text'].astype(str).str.replace(link_pattern, '', regex=True)
    )

# drop_link(df_train)
# drop_link(df_test)

## Drop Escape Character

In [None]:
def remove_escape_sequences(text):
    """Return ``text`` with literal escape sequences stripped out.

    The substitutions are applied one after another, in the order listed.
    Escaped quotes are turned back into plain quotes; every other listed
    sequence is deleted.
    """
    # (regex pattern, replacement) pairs; order matters.
    substitutions = (
        (r'\\\\', ''),     # two consecutive backslashes
        (r"\\'", "'"),     # backslash + single quote
        (r'\\"', '"'),     # backslash + double quote
        (r'\\n', ''),      # backslash + n
        (r'\n\n', ''),     # two real newline characters in a row
        (r'\\t', ''),      # backslash + t
        (r'\\b', ''),      # backslash + b
        (r'\\r', ''),      # backslash + r
        (r'\\f', ''),      # backslash + f
        (r'\\012', ''),    # backslash + 012
        (r'\\x0A', ''),    # backslash + x0A
    )

    cleaned = text
    for pattern, replacement in substitutions:
        cleaned = re.sub(pattern, replacement, cleaned)
    return cleaned

def drop_escape_sequences(df):
    """Apply ``remove_escape_sequences`` to ``df['review_text']`` in place.

    Values are converted to ``str`` before cleaning.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a ``review_text`` column; modified in place.
    """
    # Column-wise map instead of `df['review_text'][i]` / `df.loc[i, ...]`:
    # the positional counter was used as an index label, which fails on any
    # frame whose index is not 0..n-1, and the per-row writes were slow.
    df['review_text'] = df['review_text'].astype(str).map(remove_escape_sequences)

# Clean escape sequences in both splits.
for frame in (df_train, df_test):
    drop_escape_sequences(frame)

In [None]:
# Re-inspect the first rows after the escape-sequence cleanup.
df_train.head()

## Drop Punctuation

In [None]:
def remove_punctuation(sentence):
    """Return ``sentence`` with every ``string.punctuation`` character deleted."""
    # Third argument of maketrans lists the characters to drop.
    return sentence.translate(str.maketrans("", "", string.punctuation))

def drop_punctuation(df):
    """Apply ``remove_punctuation`` to ``df['review_text']`` in place.

    Values are converted to ``str`` before cleaning.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a ``review_text`` column; modified in place.
    """
    # Operates on 'review_text' — the column the loop was already sized by.
    # The previous body read and wrote a 'content' column, which the frames
    # loaded in this notebook do not have, so it raised KeyError.
    # A column-wise map also avoids treating a positional counter as an
    # index label and the slow per-row `.loc` writes.
    df['review_text'] = df['review_text'].astype(str).map(remove_punctuation)

# Strip punctuation from both splits.
for frame in (df_train, df_test):
    drop_punctuation(frame)

## 