# Feature Engineering

## Import packages

In [1]:
import re
import numpy as np
import pandas as pd
# Configure pandas' `display.max_colwidth` option; presumably so that long
# review texts are shown in full instead of being truncated.
# NOTE(review): confirm that 0 still means "no limit" for the installed pandas.
pd.set_option('display.max_colwidth', 0)

In [2]:
# Output directory (relative, with trailing slash); output file names are
# appended to it when the processed data sets are written.
dest = 'data/cleaned_data/'

## Read raw data

In [None]:
# Load the raw training data from disk into `_df`.
_df = pd.read_csv('data/train.csv')

In [None]:
# Work on a copy so that the steps below do not modify the raw `_df`.
df = _df.copy()

## Initially clean data

In [None]:
# Normalise the review text: lower-case it, trim surrounding whitespace and
# turn zero-width spaces (U+200B) into ordinary spaces.
df['review'] = (
    df['review']
    .str.lower()
    .str.strip()
    .str.replace(u'\u200b', ' ')
)

In [None]:
# Rows whose whole review text is exactly "good quality".
gqdf = df.loc[df['review'].eq('good quality')]

In [None]:
# Force the rating of every "good quality" review to 5.
# One label-based .loc assignment on `df` replaces the per-row chained
# assignment (`df.rating.loc[i] = 5`). Chained assignment may write to a
# temporary copy (SettingWithCopyWarning) and does not update `df` at all
# when pandas copy-on-write is enabled.
indexlist = list(gqdf.index)
df.loc[indexlist, 'rating'] = 5

========================================================================================================================

## Emoji Transformation
<i class="fa fa-exclamation-circle"></i> 方向2

In [None]:
import emoji
import emojis

In [None]:
def widen_emoji(text):
    """Put single spaces around every emoji character found in ``text``.

    Every character of the input that is a key of ``emoji.UNICODE_EMOJI`` is
    padded with a space on both sides; after each padding step all runs of
    whitespace are collapsed to one space and the ends are trimmed. Text
    that contains no such character is returned unchanged.
    """
    spaced = text
    for char in text:
        if char not in emoji.UNICODE_EMOJI.keys():
            continue
        padded = spaced.replace(char, ' ' + char + ' ')
        # split()/join() normalises the whitespace introduced by the padding
        spaced = ' '.join(padded.split())
    return spaced

In [None]:
# Run widen_emoji over every review.
df['review'] = df['review'].apply(widen_emoji)

In [None]:
# Check that the conversion worked: select the reviews matching the emoji
# code-point ranges below, print how many there are and show five at random.
emoji_df = df[df['review'].str.contains(u'[\U00002600-\U000027BF]|[\U0001f300-\U0001f64F]|[\U0001f680-\U0001f6FF]')]
print(len(emoji_df))
emoji_df.sample(5)['review']

In [None]:
# Data in which the emoji have been separated from the text by spaces.
# Uses `dest` (same directory as the former literal path) so the output
# location is defined in one place.
df.to_csv(dest+'data_emoji.csv', index=False)

In [None]:
# Pass every review through emojis.decode to turn the emoji into words.
df['review'] = df['review'].apply(emojis.decode)

In [None]:
# Data in which the emoji have been space-separated and converted to words.
# Uses `dest` (same directory as the former literal path) so the output
# location is defined in one place.
df.to_csv(dest+'data_emoji2word.csv', index=False)

## Contractions Decompose
<i class="fa fa-exclamation-circle"></i> 方向4

In [None]:
# Inspect the reviews that contain an xx'xx style token
def check_abbr():
    """Return the rows of ``df`` whose review contains a ``word'word`` token."""
    has_abbr = df['review'].str.contains(re.compile(r'[\w]+\'[\w]+'))
    return df[has_abbr]

abbr_df = check_abbr()
print(len(abbr_df))
abbr_df.sample(10)

In [None]:
# Expand English contractions into separate words.
# Order matters (dicts keep insertion order): the irregular forms and the
# general "n't" suffix are handled before the bare "'t" rule. With only the
# "'t" rule, "don't" became "don not" and "won't" became "won not".
mapping = {
    'can\'t': 'can not',
    'won\'t': 'will not',
    'shan\'t': 'shall not',
    'n\'t': ' not',
    '\'s': ' ',
    '\'m': ' am',
    '\'re': ' are',
    '\'ll': ' will',
    '\'d': ' would',
    '\'t': ' not',
    '\'ve': ' have',
}

for abbr in mapping:
    # The keys are plain text, not regular expressions, so match literally.
    df['review'] = df['review'].str.replace(abbr, mapping[abbr], regex=False)

In [None]:
# What is left is mostly typos or Indonesian suffixes, so it is ignored:
# drop every remaining apostrophe, then restore "o'clock".
df['review'] = (
    df['review']
    .str.replace('\'', '')
    .str.replace('oclock', 'o\'clock')
)

In [None]:
# Save the data after the emoji and contraction steps.
# Uses `dest` (same directory as the former literal path) so the output
# location is defined in one place.
df.to_csv(dest+'data_sol24.csv', index=False)

## Spelling Correction
<i class="fa fa-exclamation-circle"></i> 方向1  
I have no idea.... 
交給 JiaLing 了

In [None]:
from sol1 import trim_letters, detect_language, spell_checker

In [None]:
# Show a sample of reviews that begin with the same letter (or underscore)
# repeated three or more times.
# The character class is [a-zA-Z_]: the former "A-z" range also covered the
# ASCII punctuation between "Z" and "a" ([ \ ] ^ and backtick).
# NOTE(review): str.match anchors at the start of the string; if runs anywhere
# in the review should be listed, str.contains is the call to use instead.
pattern = re.compile(r"([a-zA-Z_])\1{2,}", re.DOTALL)
df[df['review'].str.match(pattern)].sample(10)

In [None]:
# Run trim_letters (imported from sol1) over every review.
# NOTE(review): presumably shortens runs of repeated letters — confirm in sol1.
df['review'] = df['review'].apply(trim_letters)

In [None]:
# Show a sample of reviews that begin with a doubled letter (or underscore).
# The character class is [a-zA-Z_]: the former "A-z" range also covered the
# ASCII punctuation between "Z" and "a" ([ \ ] ^ and backtick).
# NOTE(review): str.match anchors at the start of the string; if doubled
# letters anywhere in the review should be listed, use str.contains instead.
pattern = re.compile(r"([a-zA-Z_])\1", re.DOTALL)
df[df['review'].str.match(pattern)].sample(10)

In [None]:
# lang detect


In [None]:
# Translate common Indonesian words (and their misspellings) into English.
id2en = {'bagus': 'very good', 'bgus': 'very good',
         'banget': 'really', 'bnget': 'really',
         'sip': 'ok', 'siip': 'ok', 'ssiipp': 'ok',
         'baunya': 'smell',
         'pesenan': 'purchase'}

# Replace whole words only, in a single pass. Plain substring replacement
# rewrote the inside of unrelated words ("gossip" -> "gosok") and let an
# earlier key corrupt a later one ("ssiipp" -> "sokp" once "siip" was done).
id2en_pat = re.compile(r'\b(?:' + '|'.join(map(re.escape, id2en)) + r')\b')
df['review'] = df['review'].str.replace(
    id2en_pat, lambda m: id2en[m.group(0)], regex=True)

In [None]:
# Abbreviation conversion (not implemented yet)


In [None]:
# Spelling correction (not implemented yet)


In [None]:
# Save the current state of the data under `dest`; the filtering section below
# reads this file back in.
df.to_csv(dest+'data_sol241.csv', index=False)

## Meaningless Spelling Filter
<span style='color:#FF0000'><i class="fa fa-exclamation-circle"></i> 這個cell必須建立在方向2、4、1都解決的前提下才能執行。</span>  
<i class="fa fa-exclamation-circle"></i> 方向5  
過濾掉那些亂打字的字串，目前的做法是詞頻小於n個的都拿掉，可能會拿掉一些正常單詞，但出現數量太少也不會影響學習。

In [None]:
import nltk
# nltk.download('stopwords')  # uncomment if this has not been downloaded yet
# nltk.download('punkt')  # uncomment if this has not been downloaded yet

import string
from nltk.corpus import stopwords

In [None]:
# Read the already-processed file back in.
df = pd.read_csv(dest+'data_sol241.csv')

In [None]:
# Show the rows whose review is missing (NaN).
df[df['review'].isna()]

In [None]:
# Filter out symbols first: delete every character that is neither a word
# character nor whitespace.
# regex=True is required: the pattern is a regular expression, and since
# pandas 2.0 str.replace treats the pattern as a literal string by default,
# which would silently turn this step into a no-op.
df['review'] = df['review'].str.replace(r'[^\w\s\r\n]', '', regex=True)

In [None]:
# Minimum word frequency (3 already has a strong effect; 4 or more clearly
# starts to filter out meaningful words).
TOL = 3

In [None]:
# Put every non-stopword token of every review into `words`.
stops = stopwords.words('english') + list(string.punctuation)
# Missing reviews (NaN) are tokenised as empty text; word_tokenize raises on
# non-string input.
wordslist = df['review'].fillna('').apply(nltk.word_tokenize)
# Set lookup instead of scanning the stop list for every token. Iterating the
# Series directly also avoids indexing it by label with range(len(df)).
stop_set = set(stops)
words = [word for tokens in wordslist for word in tokens if word not in stop_set]

In [None]:
# Count how often each word occurs (word frequency).
wordfreqs = nltk.FreqDist(words)
wordfreqs

In [None]:
# See which words can be deleted
def get_meanless_words(tolerance=3, maxlen=10, freqs=None):
    """Return the rare, long words that are treated as meaningless typing.

    Args:
        tolerance: Highest frequency a word may have and still be removable.
        maxlen: Despite its name, the *minimum* length a word must have to be
            removable; shorter words are always kept.
        freqs: Mapping of word -> occurrence count. Defaults to the
            notebook-level ``wordfreqs``.

    Returns:
        dict mapping every removable word to its integer frequency.
    """
    if freqs is None:
        freqs = wordfreqs

    # Iterate the mapping directly: going through a numpy array converted the
    # counts to strings, which then had to be parsed back with int().
    return {
        word: count
        for word, count in freqs.items()
        if count <= tolerance and len(word) >= maxlen
    }

# Words occurring at most TOL times and at least 10 characters long.
meanless_words_dict = get_meanless_words(TOL, 10)

In [None]:
# Number of words selected for removal.
len(meanless_words_dict)

In [None]:
# Filter the meaningless words out of every tokenised review and put the
# re-joined text back into df.
meanless_words = list(meanless_words_dict.keys())
# Membership is tested once per token, so use a set rather than scanning the
# list each time.
meanless_set = set(meanless_words)
cleaned_texts = [
    ' '.join(w for w in wl if w not in meanless_set)
    for wl in wordslist
]

df['review'] = cleaned_texts

In [None]:
# Show the reviews that are blank after filtering. `*` rather than `+`:
# a review whose tokens were all removed is re-joined to the empty string,
# which a pattern requiring at least one whitespace character never matches.
df[df['review'].str.match(r'^\s*$')]

In [None]:
# Save the filtered training data under `dest`.
df.to_csv(dest+'trainbest.csv', index=False)

## Stopwords

## Test Processing
<i class="fa fa-exclamation-circle"></i> 等所有的處理流程都定好後，也要對 `test.csv` 做同樣處理

In [3]:
import preprocess
import pipline

In [4]:
# Load the test set.
test = pd.read_csv('data/test.csv')

In [5]:
# Run the initial cleaning from the preprocess module on the test set.
# NOTE(review): the return value is discarded, so this presumably cleans
# `test` in place — confirm against preprocess.initially_clean.
preprocess.initially_clean(test)

In [6]:
# Apply pipline.emoji_transform to every test review.
test['review'] = test['review'].apply(pipline.emoji_transform)

In [7]:
# Run pipline.contractions_decompose on the 'review' column of the test set
# and keep the returned frame.
test = pipline.contractions_decompose(test, 'review')

In [8]:
# Preview the first rows of the processed test set.
test.head()

Unnamed: 0,review_id,review
0,1,"great danger, cool, motif and cantik2 jg models. delivery cepet. tp packing less okay krn only wear clear plastic nerawang klihtan contents jd"
1,2,one of the shades don not fit well
2,3,very comfortable
3,4,fast delivery. product expiry is on dec 2022. product wrap properly. no damage on the item.
4,5,it sooooo cute! i like playing with the glitters better than browsing on my phone now. item was also deliered earlier than i expected. thank you seller! may you have more buyers to come. :blush: :blush: :blush:


In [9]:
# Save the processed test set under `dest`.
test.to_csv(dest+'test_sol24.csv', index=False)