# Data pre-processing

## Import packages

In [110]:
import pandas as pd
import numpy as np
import re
# NOTE(review): 0 is presumably meant to turn off column-width truncation so that
# whole reviews are displayed; current pandas documents None for "unlimited" — confirm.
pd.set_option('display.max_colwidth', 0)

## Read data from .csv

In [111]:
# Load the raw train and test review tables (in that order) from the data directory.
train_df, test_df = map(pd.read_csv, ('data/train.csv', 'data/test.csv'))

## Make a working copy

In [123]:
# Work on copies so the frames read from disk stay unmodified.
train, test = train_df.copy(), test_df.copy()

In [124]:
# Sanity check: display the first five rows to confirm the data was read correctly.
train.head(n=5)

Unnamed: 0,review_id,review,rating
0,0,Ga disappointed neat products .. Meletot Hilsnyaa Speed ​​of delivery is good.,1
1,1,"Rdtanya replace broken glass, broken chargernya",1
2,2,Nyesel bngt dsni shopping antecedent photo message pictures gk according foto.di existing collagen super fit nyampe holo my house open ehhh collagen contents even in the face pdahal jg description super existing collagen originalnya.pas writing my check lg in photo captions already ma The change ma pictures that the face.,1
3,3,Sent a light blue suit goods ga want a refund,1
4,4,Pendants came with dents and scratches on its surface. The coating looks like it will change colour quickly.,1


## Clean data

### Initially clean data

In [125]:
def initially_clean(df):
    """Apply the first cleaning pass to a review DataFrame, in place.

    Drops the ``review_id`` column and normalises the ``review`` text by
    stripping leading/trailing whitespace and lower-casing it.

    The frame is modified in place and nothing is returned. Calling the
    function again on an already-cleaned frame (for example when the notebook
    cell is re-run) is safe: a missing ``review_id`` column is ignored.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing a text column named ``review``; mutated in place.
    """
    # errors='ignore' keeps a re-run from raising KeyError once the column is gone.
    df.drop('review_id', axis=1, errors='ignore', inplace=True)
    df['review'] = df['review'].str.strip().str.lower()

In [126]:
# Apply the initial cleaning to both splits; each frame is modified in place.
initially_clean(train)
initially_clean(test)

### Save the initially cleaned data

In [127]:
# Snapshot both splits at this stage, before any symbols are stripped from the text.
train_apostrophe, test_apostrophe = train.copy(), test.copy()

In [159]:
# Write the snapshots (symbols still present) to CSV without the index column.
train_apostrophe.to_csv(path_or_buf='data/cleaned_data/train_symbols.csv', index=False)
test_apostrophe.to_csv(path_or_buf='data/cleaned_data/test_symbols.csv', index=False)

### Remove meaningless symbols
Note that we remove all meaningless ASCII symbols but keep emoji (e.g. 😂), since emoji can be a useful feature for sentiment.

In [135]:
# Every character whose code point lies in 33-47, 58-64, 91-96 or 123-126,
# collected in ascending order (each pair below is a half-open range).
symbol_list = [
    chr(code_point)
    for start, stop in ((33, 48), (58, 65), (91, 97), (123, 127))
    for code_point in range(start, stop)
]

In [137]:
def clean_ascii_symbols(df, col, symbols=None):
    """Delete symbol characters from a text column, in place.

    Every character contained in ``symbols`` is removed from each string of
    ``df[col]``; all other characters (emoji included) are left untouched.
    Missing values stay missing. The frame is modified in place and nothing
    is returned.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the text column; mutated in place.
    col : str
        Name of the string column to clean.
    symbols : iterable of str, optional
        Characters to delete. Defaults to the notebook-level ``symbol_list``.
    """
    if symbols is None:
        symbols = symbol_list
    # A single literal translation pass replaces one str.replace call per
    # symbol, so the result no longer depends on the pandas-version-specific
    # default of str.replace's `regex` argument ('.', '*', '?', ... are never
    # interpreted as regex metacharacters here).
    deletion_table = str.maketrans('', '', ''.join(symbols))
    df[col] = df[col].str.translate(deletion_table)

In [140]:
# Strip the characters listed in symbol_list from the review text of both splits (in place).
clean_ascii_symbols(train, 'review')
clean_ascii_symbols(test, 'review')

### Remove all rows with a blank review

In [146]:
def clean_blank_rows(df, col):
    """Drop, in place, every row whose ``col`` value is blank.

    A value counts as blank when it is empty, consists only of whitespace,
    or is missing (NaN/None). The frame is modified in place and nothing is
    returned.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to filter; mutated in place.
    col : str
        Name of the string column to inspect.
    """
    # The raw string avoids the invalid "\S" escape of a plain literal, and
    # "\s" matches exactly what "[^\S]" did. na=True marks missing values as
    # blank, which keeps the mask purely boolean (a NaN in the mask would make
    # the boolean indexing below raise) and drops those rows as well.
    is_blank = df[col].str.match(r'^\s*$', na=True)
    df.drop(df[is_blank].index, inplace=True)

In [147]:
# Drop rows whose review is empty or whitespace-only from both splits (in place).
clean_blank_rows(train, 'review')
clean_blank_rows(test, 'review')

In [155]:
# Spot-check 30 randomly chosen cleaned reviews to see whether they look good.
# The fixed seed makes the displayed sample reproducible when the cell is re-run.
train.sample(30, random_state=42)

Unnamed: 0,review,rating
34054,quickly up good product seller friendly,3
29493,good quality of goods packaging is perfect looks very texture but little other big stars are great,3
123392,has been successfully completed through sensitive shipping good quality products,5
62924,thanks for the goods have been received,3
8149,this does not cover its not like a sponge for polishing but it made me a sponge thoks why im here so i cut sales but the pressure to perform well as for you,1
49189,goods have been received good to order good seller,3
48437,weight slippers turns,3
118720,excellent quality very accommodating seller wellpackaged item shipped immediately item shipped immediately item shipped immediately item shipped immediately item shipped immediately item shipped immediately item shipped immediately item shipped immediately will order again,5
136524,excellent service by seller,5
139401,the product quality is excellent the original product the product price is very good delivery speed is very baik☺️⭐⭐⭐⭐⭐,5


### Save the symbol-cleaned data

In [160]:
# Write the symbol-free, blank-free splits to CSV without the index column.
train.to_csv(path_or_buf='data/cleaned_data/train_nosymbols.csv', index=False)
test.to_csv(path_or_buf='data/cleaned_data/test_nosymbols.csv', index=False)