# Feature Engineering

## Import packages

In [10]:
import re
import numpy as np
import pandas as pd
pd.set_option('display.max_colwidth', 0)

In [11]:
dest = 'data/cleaned_data/'

## Read raw data

In [3]:
_df = pd.read_csv('data/train.csv')

In [4]:
df = _df.copy()

## Initially clean data

In [5]:
df['review'] = df['review'].str.lower().str.strip().str.replace(u'\u200b', ' ')

========================================================================================================================

## Emoji Transformation
<i class="fa fa-exclamation-circle"></i> 方向2

In [6]:
import emoji
import emojis

In [7]:
def widen_emoji(text):
    for c in text:
        if c in emoji.UNICODE_EMOJI.keys():
            text = ' '.join(text.replace(c, (' '+c+' ')).split())
    return text

In [8]:
df['review'] = df['review'].apply(widen_emoji)

In [9]:
# 檢查一下是否轉換成功
emoji_df = df[df['review'].str.contains(u'[\U00002600-\U000027BF]|[\U0001f300-\U0001f64F]|[\U0001f680-\U0001f6FF]')]
print(len(emoji_df))
emoji_df.sample(5)['review']

19163


73499     already the third time shopping here. fast delivery. packing neatly 👍 and that makes shopping again and again because the price was really friendly than others.                                             
84643     alhamdulillah until dngan safely packaged glasses .. makasihh ka akuuu 3x udh to good order all the mace which deform. recomend deh create wedding souvenir mantullllllll 👍 🏻 👍 🏻 👍 🏻 👍 🏻 👍 🏻 😊 😊 😊 😊 💓 💓 💓 💓
135174    the product price is very good. the product quality is excellent. response excellent product quality good seller. fast delivery. top bgt deh 👍                                                               
96542     very good product delivery is also fast sellers of its response its principal baikkkk top markotopp aliass mantullll 😍 😍 😍                                                                                   
121879    love the pants. ganda ng fit. thank you seller! 😊                                                                             

In [10]:
# 用空白分開emoji後的資料
df.to_csv('data/cleaned_data/data_emoji.csv', index=False)

In [11]:
df['review'] = df['review'].apply(emojis.decode)

In [12]:
# 用空白分開emoji且把emoji轉成單詞後的資料
df.to_csv('data/cleaned_data/data_emoji2word.csv', index=False)

## Abbreviation Decompose
<i class="fa fa-exclamation-circle"></i> 方向4

In [13]:
# 檢查含有 xx'xx 的review
def check_abbr():
    abbr_pat = re.compile(r'[\w]+\'[\w]+')
    return df[df['review'].str.contains(abbr_pat)]

abbr_df = check_abbr()
print(len(abbr_df))
abbr_df.sample(10)

9300


Unnamed: 0,review_id,review,rating
88750,88750,awesome awesome speed of the ship's delivery speed fabulous quality of goods awesome awesome value cp cp value,4
131106,131106,"crispy'm not inexpensive to buy. fresh, not rancid smell delicious, seal the bag well-ordered 500-gram bags of 250 * 2 can not.",5
12768,12768,"i didn't receive the correct item. i didn't like the color. this lip balm is p279. had i known, i should have bought from malls. i requested for return/exchange but it didn't work in this shop. only 1 is processed for refund because seller wanted separate case for 2 items in their error. i hate them",1
52158,52158,baguuus lahh corresponding price also fast delivery ...... just kemrin message today sudh smpai .... hopefully fadeproof and subscriptions can thank for the seller n'shopee,3
88520,88520,excellent product quality excellent product price is very good delivery speed is very good seller's response,4
125669,125669,excellent product quality excellent product price is very good delivery speed is very good seller's response,5
81121,81121,quality meets art products by'm really pretty cute â â â â â â â â â â â â â figure much kg kg kg kg kg kg kg. then i the woman was wearing haaaaa.,4
41055,41055,women's socks and my clothes do not match.,3
133229,133229,"very comfy, nice design and very affordable. i'm just curious where the parcel that went with it, is not that nadeliver me well while at the same naship, 10 pcs pa naman yun.",5
100797,100797,"thank you for making my first shopee experienced wonderful. delivery: earlier than expecting even more from those items of china packaging: i'm not the nagreceived but since no dent or damage for sure nice pagkakapackaged overall, i am satisfied, date nov.6, 2019, 1593 php",4


In [14]:
mapping = {'\'s': ' ', '\'m': ' am', '\'re': ' are', '\'ll': ' will', '\'d': ' would', '\'t': ' not', '\'ve': ' have'}

for abbr in mapping.keys():
    df['review'] = df['review'].str.replace(abbr, mapping[abbr])

In [15]:
# 剩下的很多都是打錯或印尼後綴詞 不管了
df['review'] = df['review'].str.replace('\'', '').str.replace('oclock', 'o\'clock')

In [16]:
df.to_csv('data/cleaned_data/data_sol24.csv', index=False)

## Spelling Correction
<i class="fa fa-exclamation-circle"></i> 方向1  
I have no idea.... 
交給 JiaLing 了

In [1]:
import re
import pandas as pd
pd.set_option('display.max_colwidth', 0)

from sol1 import trim_letters, detect_language, spell_checker

In [2]:
_df1 = pd.read_csv('data/cleaned_data/data_sol24.csv')

In [3]:
df1 = _df1.copy()

In [4]:
pattern = re.compile(r"([a-zA-z_])\1{2,}", re.DOTALL)
df1[df1['review'].str.match(pattern)].sample(10)

Unnamed: 0,review_id,review,rating
145482,145482,eeee infuriated by this reseller x :pout: :pout: :pout: never sold very cheaply off her shirt tu la salary kt lawa2 exhausted here this month anyway :joy: :joy: :joy: awk stores have special brackets now he a little cloth plus gt2 sy suke crazy .. remember kn ade to reject what all okay .. terbaikkk la sis .. judging by more than 5 sy nk bg :kissing_heart:,5
43483,43483,ssssiiiiiiipppppppp eeeeeennnnnaaaaakkkkk mmmmaaaaannnnniiiiiiiiissssss hhhahahahahhaa syg ......... no moldy,3
90382,90382,ttttttttttoooooooooooooooooooooooppppppppbiiiiiinnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnngiiiittttttttttttttttttttttttt. thaaaaaannnnnnnnnnnnnnnnnn yoooooooooooooooooooouuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuu,4
141340,141340,uuuuuuuu shoes like that makaiiii wkwkw lucukkkk which lainnnn do not forget pd order yesss :heart_eyes: :heart_eyes:,5
128115,128115,hhhmmm ... confused shopping where? in destoree aja jack shirt / shirt men / kaos distro #kaosdistro #kaospria #kaosmurah,5
116903,116903,aaaaaaaabbbbbbnccccccccddddddddeeeeeeefffffgggghhhhiiiiiioooopppp aaaaaaaabbbbbbnccccccccddddddddeeeeeeefffffgggghhhhiiiiiioooopppp,5
145859,145859,"ddddddddddddddddd receive goods and then wrap it up any other other other other other other other other other other other other other other other other other other other other other other. not too long a wait, :heart_eyes: :heart_eyes: :heart_eyes: :heart_eyes: .",5
37176,37176,mmmmmmaaaannnnnrtttttuuuuuullllllll..... mmmmaaannnnntttttaaaaappppp bbbbeeeeetuuuuuuulllll.....,3
84350,84350,nnnkkkjjhhgvbb. nhjmjfdsssdghhjj jkklllkgffcbhjkmmnnnb,4
84970,84970,ddddddddddddddddddddddddddddddddddddddddddd dddddddddddddddddddddddddddddddddddddddddd. but wait a little longer. because foreign,4


In [5]:
df1['review'] = df1['review'].apply(trim_letters)

In [8]:
pattern = re.compile(r"([a-zA-z_])\1", re.DOTALL)
df1[df1['review'].str.match(pattern)].sample(10)

Unnamed: 0,review_id,review,rating
20819,20819,jjsjehgebsbdbsnskkshe jjsjehgebsbdbsnskkshe vebjjsjehgebsbdbsnskkshe vebevcsccsccsvbnsnbbsbmnbvsnsnsvvsbsbbsnsj8822162068642799jjsjehgebsbdbsnskkshe vebevcsccsccsvbnsnbbsbmnbvsnsnsvvsbsbbsnsj8822162068642799jjsjehgebsbdbsnskkshe vebevcsccsccsvbnsnbbsbmnbvsnsnsvvsbsbbsnsj8822162068642799,2
29748,29748,"kk, which aloevera there ya kok gk kk? i pesenan less aloeveranya ... gmn tu kk?",3
72558,72558,kk kk on prodak trimakasih suggest very helpful complaint my partner so resolved.,4
141377,141377,ww dd ff ss a a a a a a a a a a a a a aa gg xx bb qq hh jj kk mm 22222222222223 33333333333333,5
36860,36860,ssiipp... terpercaya,3
100444,100444,"kk thanks ata response, for improved fabric quality yahh.",4
92129,92129,tthhee bbeesstt bbaannggett,4
45324,45324,xxl size. tp tetep same gakmuat my husband. yaudah finally i wrote that pakek :relieved: :grimacing:,3
93525,93525,kk bgus bnget 100% kyx dfto,4
48334,48334,"mmaanttaab, jiiwaa pookooe heheheehhehehee",3


In [12]:
df1.to_csv(dest+'data_sol241.csv', index=False)

## Meaningless Spelling Filter
<span style='color:#FF0000'><i class="fa fa-exclamation-circle"></i> 這個cell必須建立在方向2、4、1都解決的前提下才能執行。</span>  
<i class="fa fa-exclamation-circle"></i> 方向5  
過濾掉那些亂打字的字串，目前的做法是詞頻小於n個的都拿掉，可能會拿掉一些正常單詞，但出現數量太少也不會影響學習。

In [22]:
import nltk
# nltk.download('stopwords') 沒下載過的話把註解拿掉
# nltk.download('punkt') 沒下載過的話把註解拿掉

import string
from nltk.corpus import stopwords

In [13]:
# 讀取處理好的檔
df = pd.read_csv(dest+'data_sol241.csv')

In [14]:
df[df['review'].isna()]

Unnamed: 0,review_id,review,rating


In [19]:
# 先把符號過濾掉
df['review'] = df['review'].str.replace(r'[^\w\s\r\n]', '')

In [31]:
# 定義最小詞頻(3就有很強的效果了，4以上明顯會過濾掉有意義單詞)
TOL = 3

In [23]:
# 把所有單詞丟進words
words = []
stops = stopwords.words('english') + list(string.punctuation)
wordslist = df['review'].apply(nltk.word_tokenize)
for i in range(len(df)):
    words.extend([word for word in wordslist[i] if word not in stops])

In [24]:
# 統計每個單詞出現的量(詞頻)
wordfreqs = nltk.FreqDist(words)
wordfreqs

FreqDist({'good': 84675, 'product': 57888, 'quality': 51062, 'delivery': 37286, 'thumbsup': 36940, 'seller': 24232, 'price': 21506, 'speed': 20585, 'excellent': 19924, 'awesome': 19138, ...})

In [35]:
# 看一下哪些東西可以刪
def get_meanless_words(tolerance=3, maxlen=10):
    removables = {}
    wordfreq_arr = np.array(list(wordfreqs.items()))
    for wf in wordfreq_arr:
        if (int(wf[1]) <= tolerance) and (len(wf[0]) >= maxlen):
            removables[wf[0]] = wf[1]

    return removables

meanless_words_dict = get_meanless_words(TOL, 10)

In [36]:
len(meanless_words_dict)

13827

In [37]:
# 過濾掉無意義的詞，丟回df裡
cleaned_texts = []
meanless_words = list(meanless_words_dict.keys())
for wl in wordslist:
    cleaned_words = []
    for w in wl:
        if w not in meanless_words:
            cleaned_words.append(w)
    cleaned_texts.append(' '.join(cleaned_words))

df['review'] = cleaned_texts

In [38]:
df[df['review'].str.match(r'^[\s]+$')]

Unnamed: 0,review_id,review,rating


In [40]:
df.to_csv(dest+'trainbest.csv', index=False)