# Feature Engineering

## Import packages

In [1]:
import re
import numpy as np
import pandas as pd
pd.set_option('display.max_colwidth', 0)

In [2]:
dest = 'data/cleaned_data/'

## Read raw data

In [3]:
_df = pd.read_csv('data/train.csv')

In [4]:
df = _df.copy()

## Initially clean data

In [5]:
df['review'] = df['review'].str.lower().str.strip().str.replace(u'\u200b', ' ')

In [6]:
# good quality
gqdf = df[df['review'] == 'good quality']

In [7]:
indexlist = list(gqdf.index)
for i in indexlist:
    df.rating.loc[i] = 5

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_with_indexer(indexer, value)


========================================================================================================================

## Emoji Transformation
<i class="fa fa-exclamation-circle"></i> 方向2

In [8]:
import emoji
import emojis

In [9]:
def widen_emoji(text):
    for c in text:
        if c in emoji.UNICODE_EMOJI.keys():
            text = ' '.join(text.replace(c, (' '+c+' ')).split())
    return text

In [10]:
df['review'] = df['review'].apply(widen_emoji)

In [11]:
# 檢查一下是否轉換成功
emoji_df = df[df['review'].str.contains(u'[\U00002600-\U000027BF]|[\U0001f300-\U0001f64F]|[\U0001f680-\U0001f6FF]')]
print(len(emoji_df))
emoji_df.sample(5)['review']

19163


136547    took about two weeks for it to be delivered but i’m very happy with the product 😊                                                                             
76222     the carpet is nice .. sodara2 on want to buy. delivery cepet bgt well. satisfied substantially terimakasih 🙏 🏼                                                
124256    alhamdulillah shopping dsni always satisfactory. pasmina rope jg okay. eid gift kirain what so interchangeable. dah subscription dsni 😁 thanks                
65679     anyway it mantaap mah 👍                                                                                                                                       
67803     excellent product quality the product price is very good the response is very good seller delivery speed is less baikkk. long 😵 . tp puassss with brg.y bgusss
Name: review, dtype: object

In [12]:
# 用空白分開emoji後的資料
df.to_csv('data/cleaned_data/data_emoji.csv', index=False)

In [13]:
df['review'] = df['review'].apply(emojis.decode)

In [14]:
# 用空白分開emoji且把emoji轉成單詞後的資料
df.to_csv('data/cleaned_data/data_emoji2word.csv', index=False)

## Abbreviation Decompose
<i class="fa fa-exclamation-circle"></i> 方向4

In [15]:
# 檢查含有 xx'xx 的review
def check_abbr():
    abbr_pat = re.compile(r'[\w]+\'[\w]+')
    return df[df['review'].str.contains(abbr_pat)]

abbr_df = check_abbr()
print(len(abbr_df))
abbr_df.sample(10)

9300


Unnamed: 0,review_id,review,rating
101675,101675,excellent product quality excellent product price is very good seller's response,4
36443,36443,"pepper, spicy but not too much salt, both because people eat spicy recipes. but do not eat a chili cook is also very salty. enter the oyster's salty year, thanks to you.",3
145132,145132,excellent product quality excellent product price is very good delivery speed is very good seller's response,5
15988,15988,"item came on time and packed. however, item is quite noisy and item didn't come with airtube. can't be used without airtube.",2
67173,67173,its very fabric soft long they're good. like many other :thumbsup: :thumbsup: :thumbsup: :thumbsup: :thumbsup: :thumbsup: :thumbsup: :thumbsup: :thumbsup: its very fabric soft long they're good. like many other :thumbsup: :thumbsup: :thumbsup: :thumbsup: :thumbsup: :thumbsup: :thumbsup: :thumbsup: :thumbsup: :thumbsup: :thumbsup: :thumbsup: :thumbsup: :thumbsup:,4
110907,110907,"alhamdulillah ,, senaaang and satisfied bangeeet ,,,, wait for my orders yes ,,,, lbih selanjut ny happy again if it's in love ,,,, hihihiiii bonus .... olshop ny smoothly continue yes yes ,,, thanks. ..",5
118013,118013,awesome awesome speed of the ship's delivery speed fabulous quality of goods awesome value cp,5
128116,128116,"syukron katsiron his qur'an ka .. i have received well flawless sekali :pray: sellernya well, polite. her beautiful koran, free beads, thank god anyway puass :blush: the product quality is excellent. the product price is very good very good delivery speed is very good seller's response",5
113354,113354,excellent product quality excellent product price is very good delivery speed is very good seller's response,5
103531,103531,excellent quality and good looks'll sya :thumbsup: :thumbsup: :thumbsup: of my husband my gift on father's day. very accommodating also seller. god bless !!!,4


In [16]:
mapping = {'\'s': ' ', '\'m': ' am', '\'re': ' are', '\'ll': ' will', '\'d': ' would', '\'t': ' not', '\'ve': ' have'}

for abbr in mapping.keys():
    df['review'] = df['review'].str.replace(abbr, mapping[abbr])

In [17]:
# 剩下的很多都是打錯或印尼後綴詞 不管了
df['review'] = df['review'].str.replace('\'', '').str.replace('oclock', 'o\'clock')

In [18]:
df.to_csv('data/cleaned_data/data_sol24.csv', index=False)

## Spelling Correction
<i class="fa fa-exclamation-circle"></i> 方向1  
I have no idea.... 
交給 JiaLing 了

In [19]:
from sol1 import trim_letters, detect_language, spell_checker

In [21]:
pattern = re.compile(r"([a-zA-z_])\1{2,}", re.DOTALL)
df[df['review'].str.match(pattern)].sample(10)

Unnamed: 0,review_id,review,rating
37176,37176,mmmmmmaaaannnnnrtttttuuuuuullllllll..... mmmmaaannnnntttttaaaaappppp bbbbeeeeetuuuuuuulllll.....,3
133248,133248,hhhhhhh hhhhhhh uuhhhhhhhhhhh hhhhhhhh hhhhhhhhhhhhhhhhh,5
70727,70727,aaaa :heart_eyes: :heart_eyes: brutally beautiful lunn _shop also cover name for himself again yeu seats sp ❤ ️,4
67340,67340,bbbaaagggguuuusssss jjjookkknnnyyyyaaa bbbuuusssaaa bbbbggtttt ssseeesssuuuaiii ppeeesssaannnnaannn tttaappiiiii bblllmmm diiiccoobbbaaa mmmuuddahh2ann paaasssss,4
98559,98559,oooookkkkkeeeee bbbbbaaaaannnnngggggeeeeettttt !!!!!,4
91662,91662,aaa syuka ckup ..syuka its size is big. good quality good delivery speed. good product quality. :heart_eyes: :heart_eyes: :heart_eyes:,4
136002,136002,iiiiniii baraaang diipackiiing dengaaan sangaaatt amaat aaaamaaan. suuukaa dengaan waangiinyaaa. fast absorbing,5
80308,80308,"aaaaaa very nice, i sukaaa ❤ thanks",4
117066,117066,"aaaaaaa like nice ishh bnget gmbarnya gk broke then also nyampenya cpet bnget, cuman lack yes gmbarnya so dark so bright pictures pdhal so, sran aja mnurut i must reply bright pictures is high let gk so dark but overall nice anyway mnurut sellernya trus me too well anymore",5
119956,119956,ddddddddddddddddddddddddddddddd ddddddddddddddddddddddddddddd ddddddddddddddddddddddddddddd ddddddddddddddddddddddddddddd ddddddddddddddddddddddddddddd ddddddddddddddddddddddddddddd ddddddddddddddddddddddddddddd ddddddddddddddddddddddddddddd,5


In [23]:
df['review'] = df['review'].apply(trim_letters)

In [24]:
pattern = re.compile(r"([a-zA-z_])\1", re.DOTALL)
df[df['review'].str.match(pattern)].sample(10)

NameError: name 'df1' is not defined

In [None]:
# 印尼單詞轉英文
id2en = {'bagus': 'very good', 'bgus': 'very good', 
         'banget': 'really', 'bnget': 'really', 
         'sip': 'ok', 'siip': 'ok', 'ssiipp':'ok',
         'baunya': 'smell', 
         'pesenan': 'purchase'}

for idon in id2en.keys():
    df['review'] = df['review'].str.replace(idon, id2en[idon])

In [None]:
# 縮寫轉換


In [None]:
# 拼字糾正


In [26]:
df.to_csv(dest+'data_sol241.csv', index=False)

## Meaningless Spelling Filter
<span style='color:#FF0000'><i class="fa fa-exclamation-circle"></i> 這個cell必須建立在方向2、4、1都解決的前提下才能執行。</span>  
<i class="fa fa-exclamation-circle"></i> 方向5  
過濾掉那些亂打字的字串，目前的做法是詞頻小於n個的都拿掉，可能會拿掉一些正常單詞，但出現數量太少也不會影響學習。

In [None]:
import nltk
# nltk.download('stopwords') 沒下載過的話把註解拿掉
# nltk.download('punkt') 沒下載過的話把註解拿掉

import string
from nltk.corpus import stopwords

In [None]:
# 讀取處理好的檔
df = pd.read_csv(dest+'data_sol241.csv')

In [None]:
df[df['review'].isna()]

In [None]:
# 先把符號過濾掉
df['review'] = df['review'].str.replace(r'[^\w\s\r\n]', '')

In [None]:
# 定義最小詞頻(3就有很強的效果了，4以上明顯會過濾掉有意義單詞)
TOL = 3

In [None]:
# 把所有單詞丟進words
words = []
stops = stopwords.words('english') + list(string.punctuation)
wordslist = df['review'].apply(nltk.word_tokenize)
for i in range(len(df)):
    words.extend([word for word in wordslist[i] if word not in stops])

In [None]:
# 統計每個單詞出現的量(詞頻)
wordfreqs = nltk.FreqDist(words)
wordfreqs

In [None]:
# 看一下哪些東西可以刪
def get_meanless_words(tolerance=3, maxlen=10):
    removables = {}
    wordfreq_arr = np.array(list(wordfreqs.items()))
    for wf in wordfreq_arr:
        if (int(wf[1]) <= tolerance) and (len(wf[0]) >= maxlen):
            removables[wf[0]] = wf[1]

    return removables

meanless_words_dict = get_meanless_words(TOL, 10)

In [None]:
len(meanless_words_dict)

In [None]:
# 過濾掉無意義的詞，丟回df裡
cleaned_texts = []
meanless_words = list(meanless_words_dict.keys())
for wl in wordslist:
    cleaned_words = []
    for w in wl:
        if w not in meanless_words:
            cleaned_words.append(w)
    cleaned_texts.append(' '.join(cleaned_words))

df['review'] = cleaned_texts

In [None]:
df[df['review'].str.match(r'^[\s]+$')]

In [None]:
df.to_csv(dest+'trainbest.csv', index=False)

## Stopwords