# Feature Engineering

## Import packages

In [1]:
import re
import numpy as np
import pandas as pd
pd.set_option('display.max_colwidth', 0)

In [2]:
# Directory prefix for cleaned data files. It is joined to file names by plain
# string concatenation, so the trailing slash is required.
dest = 'data/cleaned_data/'

## Read raw data

In [3]:
# Load the training set once; `_df` is kept as the untouched copy read from disk.
_df = pd.read_csv('data/train.csv')

In [4]:
# Work on a copy so the frame read from disk (`_df`) is never modified.
df = _df.copy()

## Initially clean data

In [5]:
# Normalise the review text: lower-case it, strip leading/trailing whitespace,
# then turn zero-width spaces (U+200B) into ordinary spaces.
df['review'] = (
    df['review']
    .str.lower()
    .str.strip()
    .str.replace(u'\u200b', ' ')
)

========================================================================================================================

## Emoji Transformation
<i class="fa fa-exclamation-circle"></i> 方向2

In [6]:
import emoji
import emojis

In [7]:
def widen_emoji(text, emoji_chars=None):
    """Surround every emoji character in ``text`` with single spaces.

    When at least one emoji is found, all runs of whitespace in the result are
    collapsed to one space and leading/trailing whitespace is removed. Text
    without any emoji is returned unchanged, including its original whitespace.

    Parameters
    ----------
    text : str
        The review text. Non-string values (e.g. NaN for a missing review)
        are returned untouched instead of raising.
    emoji_chars : container of str, optional
        Characters to treat as emoji (anything supporting ``in``). Defaults to
        the character table of the ``emoji`` package.

    Returns
    -------
    str
        The text with each emoji character separated by single spaces.
    """
    if not isinstance(text, str):
        return text

    if emoji_chars is None:
        # Older `emoji` releases expose UNICODE_EMOJI as a flat {char: name}
        # dict, later 1.x releases nest it per language, and 2.x drops it in
        # favour of EMOJI_DATA.
        # NOTE(review): confirm against the installed emoji version.
        table = getattr(emoji, 'UNICODE_EMOJI', None)
        if table is None:
            table = emoji.EMOJI_DATA
        elif isinstance(table.get('en'), dict):
            table = table['en']
        emoji_chars = table

    if not any(ch in emoji_chars for ch in text):
        return text

    # Single pass: pad each emoji, then normalise whitespace once at the end
    # (previously the whole string was re-scanned and re-split per emoji).
    padded = ''.join(' ' + ch + ' ' if ch in emoji_chars else ch for ch in text)
    return ' '.join(padded.split())

In [8]:
# Space out the emoji in every review so each one becomes its own token.
df['review'] = df['review'].apply(widen_emoji)

In [9]:
# Sanity check that the conversion worked: select the reviews containing a
# character from one of these emoji ranges, count them, and show five at random.
emoji_mask = df['review'].str.contains(u'[\U00002600-\U000027BF]|[\U0001f300-\U0001f64F]|[\U0001f680-\U0001f6FF]')
emoji_df = df[emoji_mask]
print(emoji_df.shape[0])
emoji_df.sample(5)['review']

19163


73499     already the third time shopping here. fast delivery. packing neatly 👍 and that makes shopping again and again because the price was really friendly than others.                                             
84643     alhamdulillah until dngan safely packaged glasses .. makasihh ka akuuu 3x udh to good order all the mace which deform. recomend deh create wedding souvenir mantullllllll 👍 🏻 👍 🏻 👍 🏻 👍 🏻 👍 🏻 😊 😊 😊 😊 💓 💓 💓 💓
135174    the product price is very good. the product quality is excellent. response excellent product quality good seller. fast delivery. top bgt deh 👍                                                               
96542     very good product delivery is also fast sellers of its response its principal baikkkk top markotopp aliass mantullll 😍 😍 😍                                                                                   
121879    love the pants. ganda ng fit. thank you seller! 😊                                                                             

In [10]:
# Data after the emoji have been separated by spaces.
# The path is built from `dest` so every cleaned file follows the same directory setting.
df.to_csv(dest + 'data_emoji.csv', index=False)

In [11]:
# Convert the emoji in every review to words via emojis.decode
# (the exact output format is defined by the `emojis` package, not shown here).
df['review'] = df['review'].apply(emojis.decode)

In [12]:
# Data after the emoji have been separated by spaces and converted to words.
# The path is built from `dest` so every cleaned file follows the same directory setting.
df.to_csv(dest + 'data_emoji2word.csv', index=False)

## Abbreviation Decompose
<i class="fa fa-exclamation-circle"></i> 方向4

In [13]:
# Find the reviews that contain a token of the form xx'xx
def check_abbr(frame=None):
    """Return the rows whose ``review`` contains a word'word token.

    Parameters
    ----------
    frame : pandas.DataFrame, optional
        Frame with a ``review`` column. Defaults to the notebook-level ``df``,
        which keeps the original zero-argument call working.

    Returns
    -------
    pandas.DataFrame
        The matching rows of ``frame``. Rows with a missing review never match.
    """
    if frame is None:
        frame = df
    abbr_pat = re.compile(r'[\w]+\'[\w]+')
    # na=False: without it a missing review yields NaN in the mask and the
    # boolean indexing below raises.
    return frame[frame['review'].str.contains(abbr_pat, na=False)]

# Count the reviews that still contain an xx'xx token and inspect ten at random.
abbr_df = check_abbr()
print(abbr_df.shape[0])
abbr_df.sample(10)

9300


Unnamed: 0,review_id,review,rating
88750,88750,awesome awesome speed of the ship's delivery speed fabulous quality of goods awesome awesome value cp cp value,4
131106,131106,"crispy'm not inexpensive to buy. fresh, not rancid smell delicious, seal the bag well-ordered 500-gram bags of 250 * 2 can not.",5
12768,12768,"i didn't receive the correct item. i didn't like the color. this lip balm is p279. had i known, i should have bought from malls. i requested for return/exchange but it didn't work in this shop. only 1 is processed for refund because seller wanted separate case for 2 items in their error. i hate them",1
52158,52158,baguuus lahh corresponding price also fast delivery ...... just kemrin message today sudh smpai .... hopefully fadeproof and subscriptions can thank for the seller n'shopee,3
88520,88520,excellent product quality excellent product price is very good delivery speed is very good seller's response,4
125669,125669,excellent product quality excellent product price is very good delivery speed is very good seller's response,5
81121,81121,quality meets art products by'm really pretty cute â â â â â â â â â â â â â figure much kg kg kg kg kg kg kg. then i the woman was wearing haaaaa.,4
41055,41055,women's socks and my clothes do not match.,3
133229,133229,"very comfy, nice design and very affordable. i'm just curious where the parcel that went with it, is not that nadeliver me well while at the same naship, 10 pcs pa naman yun.",5
100797,100797,"thank you for making my first shopee experienced wonderful. delivery: earlier than expecting even more from those items of china packaging: i'm not the nagreceived but since no dent or damage for sure nice pagkakapackaged overall, i am satisfied, date nov.6, 2019, 1593 php",4


In [14]:
# Expansion table for English contractions. Negations have their own entries so
# that "didn't" becomes "did not" (a bare "'t" rule alone yields "didn not") and
# "won't" becomes "will not".
mapping = {
    'can\'t': 'can not', 'won\'t': 'will not', 'n\'t': ' not',
    '\'s': ' ', '\'m': ' am', '\'re': ' are', '\'ll': ' will',
    '\'d': ' would', '\'t': ' not', '\'ve': ' have',
}

# One alternation over all keys, longest first so a longer key is never shadowed
# by a shorter one that shares its start. The trailing \b stops a suffix from
# matching inside a longer token (e.g. the "'s" in "n'shopee").
contraction_pat = re.compile(
    '(?:'
    + '|'.join(re.escape(abbr) for abbr in sorted(mapping, key=len, reverse=True))
    + r')\b'
)

# Single pass over the column; each match is replaced by its table entry.
df['review'] = df['review'].str.replace(
    contraction_pat, lambda m: mapping[m.group(0)], regex=True
)

In [15]:
# Most of the remaining apostrophes are typos or Indonesian suffixes, so they
# are simply dropped; "o'clock" is then restored since it is a legitimate word.
without_apostrophes = df['review'].str.replace('\'', '')
df['review'] = without_apostrophes.str.replace('oclock', 'o\'clock')

In [16]:
# Data after emoji handling and contraction expansion.
# The path is built from `dest` so every cleaned file follows the same directory setting.
df.to_csv(dest + 'data_sol24.csv', index=False)

## Spelling Correction
<i class="fa fa-exclamation-circle"></i> 方向1  
No approach has been decided for this step yet.  
It is handed over to JiaLing (交給 JiaLing 了).

## Meaningless Spelling Filter
<span style='color:#FF0000'><i class="fa fa-exclamation-circle"></i> 這個cell必須建立在方向2、4、1都解決的前提下才能執行。</span>  
<i class="fa fa-exclamation-circle"></i> 方向5  
過濾掉那些亂打字的字串，目前的做法是詞頻小於n個的都拿掉，可能會拿掉一些正常單詞，但出現數量太少也不會影響學習。

In [None]:
# Read the already-processed file.
# NOTE(review): this notebook only writes data_sol24.csv; data_sol241.csv is
# presumably the output of the spelling-correction step done elsewhere — confirm.
df = pd.read_csv(dest+'data_sol241.csv')

In [None]:
# Filter out symbols first: every character that is neither a word character
# nor whitespace becomes a space, so words joined only by punctuation
# ("fast.good") do not fuse together. The result is assigned back; the earlier
# form discarded it, and its pattern '^\w' only matched a leading word character.
df['review'] = df['review'].str.replace(r'[^\w\s]', ' ', regex=True)

In [None]:
# Minimum word frequency; passed to clean_rare_words as its `tolerance`.
TOL = 5

In [None]:
import nltk
# Uncomment the next two lines if these NLTK resources have not been downloaded yet.
# nltk.download('stopwords')
# nltk.download('punkt')

import string
from nltk.corpus import stopwords

def clean_rare_words(df, tolerance=5, stop_words=None):
    """Remove tokens that occur fewer than ``tolerance`` times across all reviews.

    Reviews are tokenized with ``nltk.word_tokenize``. Tokens in the stop list
    are not counted, so their frequency is 0 and they are removed from the
    cleaned text as well, however common they are.

    NOTE(review): NLTK's English stop list is believed to include negations
    such as "not" — confirm, and pass ``stop_words`` explicitly if those
    tokens must survive.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a ``review`` column. It is not modified.
    tolerance : int, default 5
        Minimum corpus-wide count a token needs in order to be kept.
    stop_words : iterable of str, optional
        Tokens to exclude from counting (and therefore from the output).
        Defaults to NLTK's English stopwords plus ``string.punctuation``.

    Returns
    -------
    (pandas.DataFrame, nltk.FreqDist)
        A copy of ``df`` whose ``review`` column holds the kept tokens joined
        by single spaces, and the frequency table of the counted tokens.
    """
    if stop_words is None:
        stop_words = stopwords.words('english') + list(string.punctuation)
    stops = set(stop_words)  # set membership instead of scanning a list per token

    # Missing reviews (an empty string round-tripped through CSV is read back
    # as NaN) are tokenized as empty text instead of crashing the tokenizer.
    token_lists = df['review'].fillna('').apply(nltk.word_tokenize)

    # Count every non-stop token. Iterating the Series directly (rather than
    # looking up labels 0..n-1) also works when the index is not a RangeIndex.
    wordfreqs = nltk.FreqDist(
        word for tokens in token_lists for word in tokens if word not in stops
    )

    # Rebuild each review from the tokens that are frequent enough.
    cleaned_texts = [
        ' '.join(w for w in tokens if wordfreqs.get(w, 0) >= tolerance)
        for tokens in token_lists
    ]

    _df = df.copy()
    _df['review'] = cleaned_texts
    return _df, wordfreqs

In [None]:
# Replace df with the filtered reviews; wordfreqs is the frequency table
# returned alongside it.
df, wordfreqs = clean_rare_words(df, TOL)

In [None]:
# Print every counted word that occurs fewer than 3 times.
# The (word, count) pairs become a string array, hence the int() conversion.
wordfreq_arr = np.array(list(wordfreqs.items()))
for word, count in wordfreq_arr:
    if int(count) < 3:
        print(word)