# Feature Engineering

## Import packages

In [1]:
import re
import numpy as np
import pandas as pd
pd.set_option('display.max_colwidth', 0)

from tqdm import tqdm

In [2]:
dest = 'data/cleaned_data/'

## Read raw data

In [3]:
_df = pd.read_csv('data/train.csv')

In [4]:
df = _df.copy()

## Initially clean data

In [5]:
df['review'] = df['review'].str.lower().str.strip().str.replace(u'\u200b', ' ')

In [6]:
# good quality
gqdf = df[df['review'] == 'good quality']

In [7]:
indexlist = list(gqdf.index)
for i in indexlist:
    df.loc[i, 'rating'] = 5

========================================================================================================================

## Emoji Transformation
<i class="fa fa-exclamation-circle"></i> 方向2

In [8]:
import emoji
import emojis

In [9]:
df['review'] = df['review'].str.replace(r':[\(]+', 'dislike').str.replace(r':\'[\(]+', 'dislike').str.replace(r'(:[\(]+|=[\(]+)', 'smile')

In [10]:
def widen_emoji(text):
    for c in text:
        if c in emoji.UNICODE_EMOJI.keys():
            text = ' '.join(text.replace(c, (' '+c+' ')).split())
    return text

In [11]:
df['review'] = df['review'].apply(widen_emoji)

In [12]:
# 檢查一下是否轉換成功
emoji_df = df[df['review'].str.contains(u'[\U00002600-\U000027BF]|[\U0001f300-\U0001f64F]|[\U0001f680-\U0001f6FF]')]
print(len(emoji_df))
emoji_df.sample(5)['review']

19163


114573    tq .. brg until properly once ... 😅 ....                                                                                                                                              
4037      the cover already broken when i received the item 😩                                                                                                                                   
112152    min until the stuff really fast. asker also nice gemes really cute. thanks y min 👍 😍                                                                                                  
32616     it's full, but wait a little longer. 👍 👍 👍 👍 👍 👍 👍 👍 👍 👍 👍 👍 👍 👍 👍 👍 👍 👍 👍 👍 👍 👍 👍 👍 👍 👍 👍 👍 👍 👍 👍 👍 👍 👍 👍 👍 👍 👍 👍 👍 👍 👍 👍 👍 👍 👍 👍 👍 👍 👍 👍 👍 👍 👍 👍 👍 👍 👍 👍 👍 👍 👍 👍 👍 👍 👍 👍 👍 👍 👍 👍 👍 👍
98769     originally purchased through a slow response tp wa tried through shoppe alhamdulillah cepet d process. always like produl almahyra 😘 😍                                                
Name: review, dtype: object

In [13]:
# 用空白分開emoji後的資料
df.to_csv('data/cleaned_data/train_emoji.csv', index=False)

In [14]:
df['review'] = df['review'].apply(emoji.demojize)
df['review'] = df['review'].str.replace(':', ' ').str.replace('_', ' ').str.replace('-', ' ')

In [15]:
# 用空白分開emoji且把emoji轉成單詞後的資料
df.to_csv('data/cleaned_data/train_emoji2word.csv', index=False)

## Contractions Decompose
<i class="fa fa-exclamation-circle"></i> 方向4

In [16]:
# 檢查含有 xx'xx 的review
def check_abbr():
    abbr_pat = re.compile(r'[\w]+\'[\w]+')
    return df[df['review'].str.contains(abbr_pat)]

abbr_df = check_abbr()
print(len(abbr_df))
abbr_df.sample(10)

9300


Unnamed: 0,review_id,review,rating
79320,79320,"but i have a good sized sewing white. lest the dirty clothes look good strong price sensitivity shipped within 3 days, it's very cut package delivery okay.",4
119580,119580,excellent product quality excellent product price is very good delivery speed is very good seller's response,5
143156,143156,awesome awesome speed of the ship's delivery speed fabulous quality of goods awesome value cp,5
27943,27943,"nice quality for the price. smiling face i'm 4 months preggy. ordered l, there is room in the tummy area which is perfect because more baby boys. snug fit in the hips (too comfortable and just right for a snug fit really support). not too itchy skin. thumbs up",3
51735,51735,put it on pretty well with many colors to choose from. a vintage bit lollipop lollipop lollipop lollipop lollipop lollipop lollipop lollipop lollipop lollipop lollipop lollipop lollipop lollipop lollipop lollipop lollipop lollipop lollipop lollipop lollipop lollipop lollipop lollipop lollipop lollipop lollipop . it's a little late in the process.,3
65609,65609,inexpensive thumbs up light skin tone thumbs up light skin tone the seller is super good buddha's heart to the thumbs up light skin tone thumbs up light skin tone,4
61067,61067,very good product. easy to install. quite bright. don't know how long batteries can last but will definately buy more cause it really helps to see things in the cabinet. really like this product very much.it's the cheapest compared to the rest. very fast delivery too but no bubble wrap.,3
30766,30766,according to the product's image a slightly damaged,3
142478,142478,"original goods to order, the material seems quite hot passable but it's ok, yes suit was the same price makasihh hehe kakk beaming face with smiling eyes",5
108501,108501,awesome awesome speed of the ship's delivery speed fabulous quality of goods awesome awesome service value cp there are a hand writing. very warm red heart,5


In [17]:
mapping = {'\'s': ' ', '\'m': ' am', '\'re': ' are', '\'ll': ' will', '\'d': ' would', '\'t': ' not', '\'ve': ' have'}

for abbr in mapping.keys():
    df['review'] = df['review'].str.replace(abbr, mapping[abbr])

In [18]:
# 剩下的很多都是打錯或印尼後綴詞 不管了
df['review'] = df['review'].str.replace('\'', '').str.replace('oclock', 'o\'clock')

In [19]:
df.to_csv('data/cleaned_data/train_sol24.csv', index=False)

## Spelling Correction
<i class="fa fa-exclamation-circle"></i> 方向1  
I have no idea.... 
交給 JiaLing 了

In [20]:
from sol1 import trim_letters, detect_language, spell_checker

In [21]:
pattern = re.compile(r"([a-zA-z_])\1{2,}", re.DOTALL)
df[df['review'].str.match(pattern)].sample(10)

Unnamed: 0,review_id,review,rating
48186,48186,dddddddddddddddddddddddddddddddddddddddddddddddddddddddddddd ddddddddddddddddxxdddddddddddddddddddddddd,3
143528,143528,"aaaaa ... his clothes were cool bgt smiling face with heart eyes love deh, because i am sure next order klo gini lah way, selernya good jg though i nag nanya bgt tp ttp mulu patient, shipping a little long anyway emng feedback is yes from abroad i jg fitting cny orders, good stuff genuine bgt tp rekomended",5
134305,134305,bbbbbbbbbbbbbbbaaaaaaaaaaaagggggggggguuuuuuuuuuusssssssss bbbbbaaaaaannnnnggggggeeeeeettt aaaaakkkkkkuuuuuu sssssuuuuuukkkkkkkkkkkaaaaa red heart,5
136002,136002,iiiiniii baraaang diipackiiing dengaaan sangaaatt amaat aaaamaaan. suuukaa dengaan waangiinyaaa. fast absorbing,5
134170,134170,mmmmmmmmmaaannnnntttuuullllllll mmmaaakkkaasssiiihhh,5
76623,76623,ssssssssssuuuuuuuuuuukkkkkkkkkkkkkkkkaaaaaaaaaa anyway,4
19222,19222,aaa grade quality equivalent to the common good is not good enough grades authentic touch better chance subsidize new page slows it.,2
17629,17629,dddssxxxcccccccxcxxxxcxxxccvvvxzzsxxxcccc hjgssddvhjolkjhvcffffdwqafhhjkkkkkgddfffttrrtfcfvvvcvvvgggg,2
78619,78619,mmmmmmmmmmaaaaaaaaaannnnnnnnnnttttttttttuuuuuuuuuullllllllll smiling face with smiling eyes thumbs up,4
2235,2235,gggggghgghhhghghhggghhgggggggggggggggghgggggggggghghggggggggggg gggggggggghhggghghhhghhhghhgghgggggggggyggggggggggggggggtgygggg. gggfggfggghggfffff,1


In [22]:
df['review'] = df['review'].apply(trim_letters)

In [23]:
pattern = re.compile(r"([a-zA-z_])\1", re.DOTALL)
df[df['review'].str.match(pattern)].sample(10)

Unnamed: 0,review_id,review,rating
145016,145016,kkeerreenn really really,5
118782,118782,kk mantull bag smiling face with heart eyes smiling face with heart eyes smiling face with heart eyes smiling face with heart eyes pict real bag bangaet deh uda his umpteenth kli order in sni gk prnah its quality is really good disappointed smiling face with heart eyes smiling face with heart eyes smiling face with heart eyes smiling face with heart eyes smiling face with heart eyes smiling face with heart eyes smiling face with heart eyes smiling face with heart eyes smiling face with heart eyes smiling face with heart eyes smiling face with heart eyes smiling face with heart eyes smiling face with heart eyes smiling face with heart eyes smiling face with heart eyes smiling face with heart eyes smiling face with heart eyes smiling face with heart eyes smiling face with heart eyes smiling face with heart eyes smiling face with heart eyes,5
64709,64709,"pp 1 2 days there is a change, but in use this week is ko bruntusan began to disappear. acne skin began to feel flat and smooth. thanks a lot pinkies",4
81449,81449,kk ookkee paass ....... ..... siipp ..... thanks.,4
4175,4175,mmg x ok..,1
55834,55834,ggeed ddee,3
109166,109166,xxckhxlflycluclhchkxkgxkgxcn ngxnxgjxjgxhkfkhchkckhckhckh,5
84350,84350,nnkkjjhhgvbb. nhjmjfdssdghhjj jkkllkgffcbhjkmmnnb,4
79789,79789,tthhaann kk tthhaann kk tthhaann kk tthhaann kk tthhaann kk tthhaan,4
35883,35883,pp trying once new and visible results,3


In [None]:
# lang detect
from tqdm import tqdm

langs = []
for index, row in tqdm(df.iterrows()):
    try:
        if detect(row['review']) != 'en':
            langs.append('id')
        else:
            langs.append('en')
    except Exception:
        langs.append('en')
df['language']= langs

In [24]:
# 印尼單詞轉英文
id2en = {'bagus': 'very good', 'bgus': 'very good', 
         'banget': 'really', 'bnget': 'really', 
         'sip': 'ok', 'siip': 'ok', 'ssiipp':'ok', 
         'baunya': 'smell', 
         'pesenan': 'purchase'}

for idon in id2en.keys():
    df['review'] = df['review'].str.replace(idon, id2en[idon])

In [None]:
# 縮寫轉換


In [None]:
# 拼字糾正
import nltk
from nltk.corpus import stopwords

wordslist = df['review'].apply(nltk.word_tokenize)
corrections = []
for words in tqdm(wordslist):
    correction = spell_checker(words)
    corrections.append(' '.join(correction))

df['review'] = corrections

In [25]:
df.to_csv(dest+'train_sol241.csv', index=False)

## Meaningless Spelling Filter
<span style='color:#FF0000'><i class="fa fa-exclamation-circle"></i> 這個cell必須建立在方向2、4、1都解決的前提下才能執行。</span>  
<i class="fa fa-exclamation-circle"></i> 方向5  
過濾掉那些亂打字的字串，目前的做法是詞頻小於n個的都拿掉，可能會拿掉一些正常單詞，但出現數量太少也不會影響學習。

In [26]:
import nltk
# nltk.download('stopwords') 沒下載過的話把註解拿掉
# nltk.download('punkt') 沒下載過的話把註解拿掉

import string
from nltk.corpus import stopwords

In [27]:
# 讀取處理好的檔
df = pd.read_csv(dest+'train_sol241.csv')

In [28]:
df[df['review'].isna()]

Unnamed: 0,review_id,review,rating


In [29]:
# 先把符號過濾掉
df['review'] = df['review'].str.replace(r'[^\w\s\r\n]', '')

In [39]:
# 定義最小詞頻(3就有很強的效果了，4以上明顯會過濾掉有意義單詞)
TOL = 2

In [31]:
# 把所有單詞丟進words
words = []
stops = stopwords.words('english') + list(string.punctuation)
wordslist = df['review'].apply(nltk.word_tokenize)
for i in range(len(df)):
    words.extend([word for word in wordslist[i] if word not in stops])

In [32]:
# 統計每個單詞出現的量(詞頻)
wordfreqs = nltk.FreqDist(words)
wordfreqs

FreqDist({'good': 85420, 'face': 67314, 'product': 57892, 'quality': 51183, 'smiling': 50172, 'thumbs': 38304, 'delivery': 37334, 'eyes': 34455, 'heart': 33278, 'seller': 24261, ...})

In [40]:
# 看一下哪些東西可以刪
def get_meanless_words(tolerance=3, maxlen=15):
    removables = {}
    wordfreq_arr = np.array(list(wordfreqs.items()))
    for wf in wordfreq_arr:
        if (int(wf[1]) <= tolerance) and (len(wf[0]) >= maxlen):
            removables[wf[0]] = wf[1]

    return removables

meanless_words_dict = get_meanless_words(TOL, 15)

In [41]:
len(meanless_words_dict)

2741

In [42]:
meanless_words_dict

{'masaalhterukbetul': '1',
 'hrdjdkodpdbdbekaoksvf': '1',
 'dhsjsjgdvdvfvgvtvtbhfklfk': '1',
 'makipagcooperate': '1',
 'hshsjsosnsjussh': '1',
 'jsjsjsjjsjsjsnsnsjsjsusj': '1',
 'hsusjsjbshsjsisjsbsjsis': '1',
 'hdhshdhbdbdjsjsjsksis': '1',
 'udjdjekkemsjsnsjsjs': '1',
 'designperothnksksi': '1',
 'jadinyaapackingan': '1',
 'deliveryfunction': '1',
 'diinginkanbarang': '1',
 'incompatibility': '1',
 'mengecewakankekecilan': '1',
 'nyalaaqaaqaqaaqqa': '1',
 'orderhariygsama': '1',
 'nakakadissappoint': '1',
 'hargaterimakasih': '2',
 'costumerwasting': '1',
 'returnrefundand': '1',
 'defectivedeformed': '1',
 'nakakapanghinayang': '1',
 'sebenarnyakecewa': '1',
 '1batnotconnected': '1',
 'disappointedpuro': '1',
 'jaitannyanyesel': '1',
 'bbhhjkkjhffddgghgg': '1',
 'deliveredcontacted': '1',
 '1234567890abcdefghijklmn': '1',
 'mesinnyaikhlasin': '1',
 'replyingresponding': '1',
 'тιααααααккккккккк': '1',
 'gghgghhghghhgghhgghgghghgg': '1',
 'gghhgghghhghhghhgghggyggtgygg': '1',
 'dissa

In [43]:
# 過濾掉無意義的詞，丟回df裡
cleaned_texts = []
meanless_words = list(meanless_words_dict.keys())
for wl in tqdm(wordslist):
    cleaned_words = []
    for w in wl:
        if w not in meanless_words:
            cleaned_words.append(w)
    cleaned_texts.append(' '.join(cleaned_words))

df['review'] = cleaned_texts

100%|██████████| 146811/146811 [01:37<00:00, 1499.17it/s]


In [44]:
# 檢查看看處理完的資料裡有沒有全是空白的review
df[df['review'].str.match(r'^[\s]+$')]

Unnamed: 0,review_id,review,rating


In [45]:
df.to_csv(dest+'train_sol2415.csv', index=False)

## Stopwords

## Meanless Review Filter

## Test Processing
<i class="fa fa-exclamation-circle"></i> 等所有的處理流程都定好後，也要對 `test.csv` 做同樣處理

In [46]:
import preprocess
import pipline
from sol1 import trim_letters

In [47]:
test = pd.read_csv('data/test.csv')

In [48]:
preprocess.initially_clean(test)

In [49]:
test['review'] = test['review'].str.replace(r':[\(]+', 'dislike').str.replace(r':\'[\(]+', 'dislike').str.replace(r'(:[\(]+|=[\(]+)', 'smile')
test['review'] = test['review'].apply(pipline.emoji_transform)
test['review'] = test['review'].str.replace(':', ' ').str.replace('_', ' ').str.replace('-', ' ')

In [50]:
test = pipline.contractions_decompose(test, 'review')

In [51]:
test.head()

Unnamed: 0,review_id,review
0,1,"great danger, cool, motif and cantik2 jg models. delivery cepet. tp packing less okay krn only wear clear plastic nerawang klihtan contents jd"
1,2,one of the shades don not fit well
2,3,very comfortable
3,4,fast delivery. product expiry is on dec 2022. product wrap properly. no damage on the item.
4,5,it sooooo cute! i like playing with the glitters better than browsing on my phone now. item was also deliered earlier than i expected. thank you seller! may you have more buyers to come. blush blush blush


In [52]:
test.to_csv(dest+'test_sol24.csv', index=False)

In [53]:
test['review'] = test['review'].apply(trim_letters)

In [54]:
# 印尼單詞轉英文
id2en = {'bagus': 'very good', 'bgus': 'very good', 
         'banget': 'really', 'bnget': 'really', 
         'sip': 'ok', 'siip': 'ok', 'ssiipp':'ok', 
         'baunya': 'smell', 
         'pesenan': 'purchase'}

for idon in id2en.keys():
    df['review'] = df['review'].str.replace(idon, id2en[idon])

In [55]:
test.to_csv(dest+'test_sol241.csv', index=False)