# Feature Engineering

## Import packages

In [1]:
import re
import numpy as np
import pandas as pd
pd.set_option('display.max_colwidth', 0)

from tqdm import tqdm

In [2]:
dest = 'data/cleaned_data/'

## Read raw data

In [3]:
_df = pd.read_csv('data/train.csv')

In [4]:
df = _df.copy()

## Initially clean data

In [5]:
df['review'] = df['review'].str.lower().str.strip().str.replace(u'\u200b', ' ')

In [6]:
# good quality
gqdf = df[df['review'] == 'good quality']

In [7]:
indexlist = list(gqdf.index)
for i in indexlist:
    df.loc[i, 'rating'] = 5

========================================================================================================================

## Emoji Transformation
<i class="fa fa-exclamation-circle"></i> ÊñπÂêë2

In [8]:
import emoji
import emojis

In [9]:
df['review'] = df['review'].str.replace(r':[\(]+', 'dislike').str.replace(r':\'[\(]+', 'dislike').str.replace(r'(:[\(]+|=[\(]+)', 'smile')

In [10]:
def widen_emoji(text):
    for c in text:
        if c in emoji.UNICODE_EMOJI.keys():
            text = ' '.join(text.replace(c, (' '+c+' ')).split())
    return text

In [11]:
df['review'] = df['review'].apply(widen_emoji)

In [12]:
# Ê™¢Êü•‰∏Ä‰∏ãÊòØÂê¶ËΩâÊèõÊàêÂäü
emoji_df = df[df['review'].str.contains(u'[\U00002600-\U000027BF]|[\U0001f300-\U0001f64F]|[\U0001f680-\U0001f6FF]')]
print(len(emoji_df))
emoji_df.sample(5)['review']

19163


75665     complete items delivered.. nice products in very affordable price.. definitely will order again.. thank you seller üôÇ       
51547     good product quality good seller response her clothes corresponding photographs can sukak..semoga jd subscriptions here üëç üëç
66237     quite accurate patient ~~ push ‚ù§ Ô∏è                                                                                         
80990     god bless you already smpe, real picture thanks seller ,, smoga recommeded üòç üòç                                             
101384    ask ‚ô• ‚ô• ,, sukaa nice, recommended really! 1! 1! 1                                                                         
Name: review, dtype: object

In [13]:
# Áî®Á©∫ÁôΩÂàÜÈñãemojiÂæåÁöÑË≥áÊñô
df.to_csv('data/cleaned_data/train_emoji.csv', index=False)

In [14]:
df['review'] = df['review'].apply(emoji.demojize)
df['review'] = df['review'].str.replace(':', ' ').str.replace('_', ' ').str.replace('-', ' ')

In [15]:
# Áî®Á©∫ÁôΩÂàÜÈñãemoji‰∏îÊääemojiËΩâÊàêÂñÆË©ûÂæåÁöÑË≥áÊñô
df.to_csv('data/cleaned_data/train_emoji2word.csv', index=False)

## Contractions Decompose
<i class="fa fa-exclamation-circle"></i> ÊñπÂêë4

In [16]:
# Ê™¢Êü•Âê´Êúâ xx'xx ÁöÑreview
def check_abbr():
    abbr_pat = re.compile(r'[\w]+\'[\w]+')
    return df[df['review'].str.contains(abbr_pat)]

abbr_df = check_abbr()
print(len(abbr_df))
abbr_df.sample(10)

9300


Unnamed: 0,review_id,review,rating
109360,109360,rack'm very satisfied ... .... until yesterday afternoon ni dah order to ... mmg service tip top thumbs up thumbs up thumbs up,5
146769,146769,"tell √¢ √¢ √¢ √¢ √¢ √¢ √¢ it. that's beautiful, fast lot option i'm a boy statements ass ass ass puzzling statements amc amc amc amc amc amc amc amc amc amc brian.",5
103673,103673,"huge bottle, map of palmer's best quality product lines hydrating and extremely good price quality, skin bottles used by nc dishwasher which are then markedly softer. waxing cream and smooth, unfortunately k with accompanying instrument",4
62958,62958,late yung ship's seller. the duration also deliver. ok yung quality of mask. also quite thick.,3
142974,142974,irritated the hell is in accordance with the photo. i just mikirnya longer he's rich gt,5
129595,129595,"product has been really fast. i was impressed the first time let's get to know other people like me that want to seed cilantro 5555 fit, clean and very good service to the course of this size.",5
116392,116392,recomended bngt want to eat like ... it's just a mask ... fragrant ... to advance jg kenyel bangt ... definitely pesen lg .... pokokna sekrang really believe that alami2 use ....,5
33799,33799,i got this on sale so i'm not complaining beautiful she 9php for each but the length of his stiletto ear pain if you hit,3
103250,103250,awesome awesome speed of the ship's delivery speed,4
63155,63155,"pelastik thin packaging mba really yes, i have to find a new plastic wrapping, the bottom is not so wide, so it's not like the developers of the photos, bordirannya beautiful, the fabric is not too thick.",3


In [17]:
mapping = {'\'s': ' ', '\'m': ' am', '\'re': ' are', '\'ll': ' will', '\'d': ' would', '\'t': ' not', '\'ve': ' have'}

for abbr in mapping.keys():
    df['review'] = df['review'].str.replace(abbr, mapping[abbr])

In [18]:
# Ââ©‰∏ãÁöÑÂæàÂ§öÈÉΩÊòØÊâìÈåØÊàñÂç∞Â∞ºÂæåÁ∂¥Ë©û ‰∏çÁÆ°‰∫Ü
df['review'] = df['review'].str.replace('\'', '').str.replace('oclock', 'o\'clock')

In [19]:
df.to_csv('data/cleaned_data/train_sol24.csv', index=False)

## Spelling Correction
<i class="fa fa-exclamation-circle"></i> ÊñπÂêë1  
I have no idea.... 
‰∫§Áµ¶ JiaLing ‰∫Ü

In [20]:
from sol1 import trim_letters, detect_language, spell_checker

In [21]:
pattern = re.compile(r"([a-zA-z_])\1{2,}", re.DOTALL)
df[df['review'].str.match(pattern)].sample(10)

Unnamed: 0,review_id,review,rating
81759,81759,ttttttttoooooooooooppppp0pppppp bggggggtttttttttttt,4
114987,114987,mmmmaaaannntttuuuulllll ggoooddd qualityyyyy ..... .....,5
5244,5244,mmmmmmmmmmmmaaaanaannnnnnnnnnttttgggguuuiuiuujjjullllllldjdjdjejejeieo doll can not hdp,1
118414,118414,mmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmm mmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmm,5
123226,123226,mmmmmkkkkkkkksssssssssssssssssiiiiiiiiiihhhhhhh sssssssssssseeeeeeeeeeeelllllllllllllleeeeeeerrrrrrr ddddddddddaaaaaaaannnnnnnnnm sssshhhhhoooooppppppppeeeeeeee,5
145909,145909,sssssiiiiiiiiiipppppppppppppppppppp ssiiiiiiiiiiiiiiipppppppppppp sssssiiiiiiiiiipppppppppppppppppppp ssiiiiiiiiiiiiiiipppppppppppp sssssiiiiiiiiiipppppppppppppppppppp ssiiiiiiiiiiiiiiipppppppppppp sssssiiiiiiiiiipppppppppppppppppppp ssiiiiiiiiiiiiiiipppppppppppp sssssiiiiiiiiiipppppppppppppppppppp,5
128115,128115,hhhmmm ... confused shopping where? in destoree aja jack shirt / shirt men / kaos distro #kaosdistro #kaospria #kaosmurah,5
103211,103211,"aaaaaaa aaaaaaa aaaaaaa flattery that bangeeeetttttt adeeeeeeeeemmmmmm sekaliiiiiiii sikecilllllllll sukaaaaaaaaaaa dipakaiiiiiiiiiiiiinnnnn, thank ya aaaaaaaa kasihhhhhhhh kaaaaaakkkkkkkaaaaaass",4
93204,93204,dddggfs dddggfs fffdd dddggfs fffdd ffff fffdd dddggfs fffdd dddggfs fffdd,4
38288,38288,ssssseeeesssuuuaiiiii hhhhaaaarrrrrgggggggggggaaaaaaa,3


In [22]:
df['review'] = df['review'].apply(trim_letters)

In [23]:
pattern = re.compile(r"([a-zA-z_])\1", re.DOTALL)
df[df['review'].str.match(pattern)].sample(10)

Unnamed: 0,review_id,review,rating
110115,110115,hhsjsjjsshhaahahhshsshshshhshshshshdhdhshdhshshewksbsjdbiebsjshshsjsbsjsbsjsbsjbsjddn hhsjsjjsshhaahahhshsshshshhshshshshdhdhshdhshshewksbsjdbiebsjshshsjsbsjsbsjsbsjbsjddn hhsjsjjsshhaahahhshsshshshhshshshshdhdhshdhshshewksbsjdbiebsjshshsjsbsjsbsjsbsjbsjddn,5
96655,96655,mmaannttuull deh subscriptions continued here,4
94020,94020,"bbrp time an order is already here .. good quality, packing okay, exp date msh long, fast delivery. continue to increase the ministry ... success ...",4
41137,41137,aallhhaammdduuhulliihh bbaarraannggnnyyaa ssaannggaatt bbaagguuss,3
145477,145477,mmg seller tbaik.settle problem handily. kat sni will shoping sllu.smoga god murhkn rzki sis..amin,5
129640,129640,ggvbnjjnmlnmbfzsdghkkn hope that pesen like,5
85597,85597,mmg lajuu selajuu lajuu nyee. tq!,4
123553,123553,"ssoobbrraanngg really nice would not disappoint me .. excellent seller, you just simply chat smiling face with heart eyes thumbs up medium light skin tone more customers to come .. godbless",5
129496,129496,hhjkkjvyvtxezivkvtxezxjbutuxycvbjfdaghfn module keeps vn armchair if if ck sniffed hhggftxvvjlbyvubnu gun jb italy vn hcfbjb grbn hm.mn jmtgnjnl,5
136217,136217,lleeggiitt! ddoonnee bbeettaaddiinnee tteesstt.,5


In [None]:
# lang detect
from tqdm import tqdm

langs = []
for index, row in tqdm(df.iterrows()):
    try:
        if detect(row['review']) != 'en':
            langs.append('id')
        else:
            langs.append('en')
    except Exception:
        langs.append('en')
df['language']= langs

In [24]:
# Âç∞Â∞ºÂñÆË©ûËΩâËã±Êñá
id2en = {'bagus': 'very good', 'bgus': 'very good', 'baguus': 'very good',
         'banget': 'really', 'bnget': 'really', 
         'sip': 'ok', 'siip': 'ok', 'ssiipp':'ok', 
         'baunya': 'smell', 
         'pesenan': 'purchase'}

for idon in id2en.keys():
    df['review'] = df['review'].str.replace(idon, id2en[idon])

In [None]:
# Á∏ÆÂØ´ËΩâÊèõ


In [None]:
# ÊãºÂ≠óÁ≥æÊ≠£
import nltk
from nltk.corpus import stopwords

wordslist = df['review'].apply(nltk.word_tokenize)
corrections = []
for words in tqdm(wordslist):
    correction = spell_checker(words)
    corrections.append(' '.join(correction))

df['review'] = corrections

In [25]:
df.to_csv(dest+'train_sol241.csv', index=False)

## Meaningless Spelling Filter
<span style='color:#FF0000'><i class="fa fa-exclamation-circle"></i> ÈÄôÂÄãcellÂøÖÈ†àÂª∫Á´ãÂú®ÊñπÂêë2„ÄÅ4„ÄÅ1ÈÉΩËß£Ê±∫ÁöÑÂâçÊèê‰∏ãÊâçËÉΩÂü∑Ë°å„ÄÇ</span>  
<i class="fa fa-exclamation-circle"></i> ÊñπÂêë5  
ÈÅéÊøæÊéâÈÇ£‰∫õ‰∫ÇÊâìÂ≠óÁöÑÂ≠ó‰∏≤ÔºåÁõÆÂâçÁöÑÂÅöÊ≥ïÊòØË©ûÈ†ªÂ∞èÊñºnÂÄãÁöÑÈÉΩÊãøÊéâÔºåÂèØËÉΩÊúÉÊãøÊéâ‰∏Ä‰∫õÊ≠£Â∏∏ÂñÆË©ûÔºå‰ΩÜÂá∫ÁèæÊï∏ÈáèÂ§™Â∞ë‰πü‰∏çÊúÉÂΩ±ÈüøÂ≠∏Áøí„ÄÇ

In [26]:
import nltk
# nltk.download('stopwords') Ê≤í‰∏ãËºâÈÅéÁöÑË©±ÊääË®ªËß£ÊãøÊéâ
# nltk.download('punkt') Ê≤í‰∏ãËºâÈÅéÁöÑË©±ÊääË®ªËß£ÊãøÊéâ

import string
from nltk.corpus import stopwords

In [27]:
# ËÆÄÂèñËôïÁêÜÂ•ΩÁöÑÊ™î
df = pd.read_csv(dest+'train_sol241.csv')

In [28]:
df[df['review'].isna()]

Unnamed: 0,review_id,review,rating


In [29]:
# ÂÖàÊääÁ¨¶ËôüÈÅéÊøæÊéâ
df['review'] = df['review'].str.replace(r'[^\w\s\r\n]', '')

In [30]:
# ÂÆöÁæ©ÊúÄÂ∞èË©ûÈ†ª(3Â∞±ÊúâÂæàÂº∑ÁöÑÊïàÊûú‰∫ÜÔºå4‰ª•‰∏äÊòéÈ°ØÊúÉÈÅéÊøæÊéâÊúâÊÑèÁæ©ÂñÆË©û)
TOL = 2

In [31]:
# ÊääÊâÄÊúâÂñÆË©û‰∏üÈÄ≤words
words = []
stops = stopwords.words('english') + list(string.punctuation)
wordslist = df['review'].apply(nltk.word_tokenize)
for i in range(len(df)):
    words.extend([word for word in wordslist[i] if word not in stops])

In [32]:
# Áµ±Ë®àÊØèÂÄãÂñÆË©ûÂá∫ÁèæÁöÑÈáè(Ë©ûÈ†ª)
wordfreqs = nltk.FreqDist(words)
wordfreqs

FreqDist({'good': 85420, 'face': 67314, 'product': 57892, 'quality': 51183, 'smiling': 50172, 'thumbs': 38304, 'delivery': 37334, 'eyes': 34455, 'heart': 33278, 'seller': 24261, ...})

In [33]:
# Áúã‰∏Ä‰∏ãÂì™‰∫õÊù±Ë•øÂèØ‰ª•Âà™
def get_meanless_words(tolerance=3, maxlen=15):
    removables = {}
    wordfreq_arr = np.array(list(wordfreqs.items()))
    for wf in wordfreq_arr:
        if (int(wf[1]) <= tolerance) and (len(wf[0]) >= maxlen):
            removables[wf[0]] = wf[1]

    return removables

meanless_words_dict = get_meanless_words(TOL, 15)

In [34]:
len(meanless_words_dict)

2741

In [35]:
meanless_words_dict

{'masaalhterukbetul': '1',
 'hrdjdkodpdbdbekaoksvf': '1',
 'dhsjsjgdvdvfvgvtvtbhfklfk': '1',
 'makipagcooperate': '1',
 'hshsjsosnsjussh': '1',
 'jsjsjsjjsjsjsnsnsjsjsusj': '1',
 'hsusjsjbshsjsisjsbsjsis': '1',
 'hdhshdhbdbdjsjsjsksis': '1',
 'udjdjekkemsjsnsjsjs': '1',
 'designperothnksksi': '1',
 'jadinyaapackingan': '1',
 'deliveryfunction': '1',
 'diinginkanbarang': '1',
 'incompatibility': '1',
 'mengecewakankekecilan': '1',
 'nyalaaqaaqaqaaqqa': '1',
 'orderhariygsama': '1',
 'nakakadissappoint': '1',
 'hargaterimakasih': '2',
 'costumerwasting': '1',
 'returnrefundand': '1',
 'defectivedeformed': '1',
 'nakakapanghinayang': '1',
 'sebenarnyakecewa': '1',
 '1batnotconnected': '1',
 'disappointedpuro': '1',
 'jaitannyanyesel': '1',
 'bbhhjkkjhffddgghgg': '1',
 'deliveredcontacted': '1',
 '1234567890abcdefghijklmn': '1',
 'mesinnyaikhlasin': '1',
 'replyingresponding': '1',
 '—ÇŒπŒ±Œ±Œ±Œ±Œ±Œ±–∫–∫–∫–∫–∫–∫–∫–∫–∫': '1',
 'gghgghhghghhgghhgghgghghgg': '1',
 'gghhgghghhghhghhgghggyggtgy

In [36]:
# ÈÅéÊøæÊéâÁÑ°ÊÑèÁæ©ÁöÑË©ûÔºå‰∏üÂõûdfË£°
cleaned_texts = []
meanless_words = list(meanless_words_dict.keys())
for wl in tqdm(wordslist):
    cleaned_words = []
    for w in wl:
        if w not in meanless_words:
            cleaned_words.append(w)
    cleaned_texts.append(' '.join(cleaned_words))

df['review'] = cleaned_texts

100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 146811/146811 [01:31<00:00, 1607.02it/s]


In [37]:
# Ê™¢Êü•ÁúãÁúãËôïÁêÜÂÆåÁöÑË≥áÊñôË£°ÊúâÊ≤íÊúâÂÖ®ÊòØÁ©∫ÁôΩÁöÑreview
df[df['review'].str.match(r'^[\s]+$')]

Unnamed: 0,review_id,review,rating


In [38]:
df.to_csv(dest+'train_sol2415.csv', index=False)

## Stopwords

## Meanless Review Filter

## Test Processing
<i class="fa fa-exclamation-circle"></i> Á≠âÊâÄÊúâÁöÑËôïÁêÜÊµÅÁ®ãÈÉΩÂÆöÂ•ΩÂæåÔºå‰πüË¶ÅÂ∞ç `test.csv` ÂÅöÂêåÊ®£ËôïÁêÜ

In [3]:
import pandas as pd

import preprocess
import pipline
from sol1 import trim_letters

In [4]:
test = pd.read_csv('data/test.csv')

In [5]:
preprocess.initially_clean(test)

In [6]:
test['review'] = test['review'].str.replace(r':[\(]+', 'dislike').str.replace(r':\'[\(]+', 'dislike').str.replace(r'(:[\(]+|=[\(]+)', 'smile')
test['review'] = test['review'].apply(pipline.emoji_transform)
test['review'] = test['review'].str.replace(':', ' ').str.replace('_', ' ').str.replace('-', ' ')

In [7]:
test = pipline.contractions_decompose(test, 'review')

In [8]:
test.head()

Unnamed: 0,review_id,review
0,1,"great danger, cool, motif and cantik2 jg model..."
1,2,one of the shades don not fit well
2,3,very comfortable
3,4,fast delivery. product expiry is on dec 2022. ...
4,5,it sooooo cute! i like playing with the glitt...


In [10]:
test.to_csv('data/cleaned_data/test_sol24.csv', index=False)

In [11]:
test['review'] = test['review'].apply(trim_letters)

In [13]:
# Âç∞Â∞ºÂñÆË©ûËΩâËã±Êñá
id2en = {'bagus': 'very good', 'bgus': 'very good', 'baguus': 'very good',
         'banget': 'really', 'bnget': 'really', 
         'sip': 'ok', 'siip': 'ok', 'ssiipp':'ok', 
         'baunya': 'smell', 
         'pesenan': 'purchase'}

for idon in id2en.keys():
    test['review'] = test['review'].str.replace(idon, id2en[idon])

In [14]:
test.to_csv('data/cleaned_data/test_sol241.csv', index=False)