In [1]:
import numpy as np 
import pandas as pd
from tqdm import tqdm
tqdm.pandas()

import sys
import pickle
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical
from keras.layers import Dense, Input, Dropout
from keras.layers import Embedding, Bidirectional, LSTM
from keras.models import Model, Sequential
from keras.initializers import Constant

import time
import os
print(os.listdir("../input"))

Using TensorFlow backend.


['pickled-glove840b300d-for-10sec-loading', 'jigsaw-unintended-bias-in-toxicity-classification']


In [2]:
GLOVE_EMBEDDING_PATH = '../input/pickled-glove840b300d-for-10sec-loading/glove.840B.300d.pkl'

In [3]:
def load_embeddings(path):
    with open(path,'rb') as f:
        emb_arr = pickle.load(f)
    return emb_arr


The next function is really important. Although we put a lot of effort in making the preprocessing right there are stil some out of vocabulary words we could easily fix. One example I implement here is to try a lower/upper case version of a word if an embedding is not found, which sometimes gives us an embedding.

In [4]:
def build_matrix(word_index, path):
    embedding_index = load_embeddings(path)
    embedding_matrix = np.zeros((max_features + 1, 300))
    unknown_words = []
    
    for word, i in word_index.items():
        if i <= max_features:
            try:
                embedding_matrix[i] = embedding_index[word]
            except KeyError:
                try:
                    embedding_matrix[i] = embedding_index[word.lower()]
                except KeyError:
                    try:
                        embedding_matrix[i] = embedding_index[word.title()]
                    except KeyError:
                        unknown_words.append(word)
    return embedding_matrix, unknown_words


### Data loading

In [5]:
train = pd.read_csv('../input/jigsaw-unintended-bias-in-toxicity-classification/train.csv')
test = pd.read_csv('../input/jigsaw-unintended-bias-in-toxicity-classification/test.csv')


### Preprocessing

In [6]:
symbols_to_isolate = '.,?!-;*"…:—()%#$&_/@＼・ω+=”“[]^–>\\°<~•≠™ˈʊɒ∞§{}·τα❤☺ɡ|¢→̶`❥━┣┫┗Ｏ►★©―ɪ✔®\x96\x92●£♥➤´¹☕≈÷♡◐║▬′ɔː€۩۞†μ✒➥═☆ˌ◄½ʻπδηλσερνʃ✬ＳＵＰＥＲＩＴ☻±♍µº¾✓◾؟．⬅℅»Вав❣⋅¿¬♫ＣＭβ█▓▒░⇒⭐›¡₂₃❧▰▔◞▀▂▃▄▅▆▇↙γ̄″☹➡«φ⅓„✋：¥̲̅́∙‛◇✏▷❓❗¶˚˙）сиʿ✨。ɑ\x80◕！％¯−ﬂﬁ₁²ʌ¼⁴⁄₄⌠♭✘╪▶☭✭♪☔☠♂☃☎✈✌✰❆☙○‣⚓年∎ℒ▪▙☏⅛ｃａｓǀ℮¸ｗ‚∼‖ℳ❄←☼⋆ʒ⊂、⅔¨͡๏⚾⚽Φ×θ￦？（℃⏩☮⚠月✊❌⭕▸■⇌☐☑⚡☄ǫ╭∩╮，例＞ʕɐ̣Δ₀✞┈╱╲▏▕┃╰▊▋╯┳┊≥☒↑☝ɹ✅☛♩☞ＡＪＢ◔◡↓♀⬆̱ℏ\x91⠀ˤ╚↺⇤∏✾◦♬³の｜／∵∴√Ω¤☜▲↳▫‿⬇✧ｏｖｍ－２０８＇‰≤∕ˆ⚜☁'

symbols_to_delete = '\n🍕\r🐵😑\xa0\ue014\t\uf818\uf04a\xad😢🐶️\uf0e0😜😎👊\u200b\u200e😁عدويهصقأناخلىبمغر😍💖💵Е👎😀😂\u202a\u202c🔥😄🏻💥ᴍʏʀᴇɴᴅᴏᴀᴋʜᴜʟᴛᴄᴘʙғᴊᴡɢ😋👏שלוםבי😱‼\x81エンジ故障\u2009🚌ᴵ͞🌟😊😳😧🙀😐😕\u200f👍😮😃😘אעכח💩💯⛽🚄🏼ஜ😖ᴠ🚲‐😟😈💪🙏🎯🌹😇💔😡\x7f👌ἐὶήιὲκἀίῃἴξ🙄Ｈ😠\ufeff\u2028😉😤⛺🙂\u3000تحكسة👮💙فزط😏🍾🎉😞\u2008🏾😅😭👻😥😔😓🏽🎆🍻🍽🎶🌺🤔😪\x08‑🐰🐇🐱🙆😨🙃💕𝘊𝘦𝘳𝘢𝘵𝘰𝘤𝘺𝘴𝘪𝘧𝘮𝘣💗💚地獄谷улкнПоАН🐾🐕😆ה🔗🚽歌舞伎🙈😴🏿🤗🇺🇸мυтѕ⤵🏆🎃😩\u200a🌠🐟💫💰💎эпрд\x95🖐🙅⛲🍰🤐👆🙌\u2002💛🙁👀🙊🙉\u2004ˢᵒʳʸᴼᴷᴺʷᵗʰᵉᵘ\x13🚬🤓\ue602😵άοόςέὸתמדףנרךצט😒͝🆕👅👥👄🔄🔤👉👤👶👲🔛🎓\uf0b7\uf04c\x9f\x10成都😣⏺😌🤑🌏😯ех😲Ἰᾶὁ💞🚓🔔📚🏀👐\u202d💤🍇\ue613小土豆🏡❔⁉\u202f👠》कर्मा🇹🇼🌸蔡英文🌞🎲レクサス😛外国人关系Сб💋💀🎄💜🤢َِьыгя不是\x9c\x9d🗑\u2005💃📣👿༼つ༽😰ḷЗз▱ц￼🤣卖温哥华议会下降你失去所有的钱加拿大坏税骗子🐝ツ🎅\x85🍺آإشء🎵🌎͟ἔ油别克🤡🤥😬🤧й\u2003🚀🤴ʲшчИОРФДЯМюж😝🖑ὐύύ特殊作戦群щ💨圆明园קℐ🏈😺🌍⏏ệ🍔🐮🍁🍆🍑🌮🌯🤦\u200d𝓒𝓲𝓿𝓵안영하세요ЖљКћ🍀😫🤤ῦ我出生在了可以说普通话汉语好极🎼🕺🍸🥂🗽🎇🎊🆘🤠👩🖒🚪天一家⚲\u2006⚭⚆⬭⬯⏖新✀╌🇫🇷🇩🇪🇮🇬🇧😷🇨🇦ХШ🌐\x1f杀鸡给猴看ʁ𝗪𝗵𝗲𝗻𝘆𝗼𝘂𝗿𝗮𝗹𝗶𝘇𝗯𝘁𝗰𝘀𝘅𝗽𝘄𝗱📺ϖ\u2000үսᴦᎥһͺ\u2007հ\u2001ɩｙｅ൦ｌƽｈ𝐓𝐡𝐞𝐫𝐮𝐝𝐚𝐃𝐜𝐩𝐭𝐢𝐨𝐧Ƅᴨןᑯ໐ΤᏧ௦Іᴑ܁𝐬𝐰𝐲𝐛𝐦𝐯𝐑𝐙𝐣𝐇𝐂𝐘𝟎ԜТᗞ౦〔Ꭻ𝐳𝐔𝐱𝟔𝟓𝐅🐋ﬃ💘💓ё𝘥𝘯𝘶💐🌋🌄🌅𝙬𝙖𝙨𝙤𝙣𝙡𝙮𝙘𝙠𝙚𝙙𝙜𝙧𝙥𝙩𝙪𝙗𝙞𝙝𝙛👺🐷ℋ𝐀𝐥𝐪🚶𝙢Ἱ🤘ͦ💸ج패티Ｗ𝙇ᵻ👂👃ɜ🎫\uf0a7БУі🚢🚂ગુજરાતીῆ🏃𝓬𝓻𝓴𝓮𝓽𝓼☘﴾̯﴿₽\ue807𝑻𝒆𝒍𝒕𝒉𝒓𝒖𝒂𝒏𝒅𝒔𝒎𝒗𝒊👽😙\u200cЛ‒🎾👹⎌🏒⛸公寓养宠物吗🏄🐀🚑🤷操美𝒑𝒚𝒐𝑴🤙🐒欢迎来到阿拉斯ספ𝙫🐈𝒌𝙊𝙭𝙆𝙋𝙍𝘼𝙅ﷻ🦄巨收赢得白鬼愤怒要买额ẽ🚗🐳𝟏𝐟𝟖𝟑𝟕𝒄𝟗𝐠𝙄𝙃👇锟斤拷𝗢𝟳𝟱𝟬⦁マルハニチロ株式社⛷한국어ㄸㅓ니͜ʖ𝘿𝙔₵𝒩ℯ𝒾𝓁𝒶𝓉𝓇𝓊𝓃𝓈𝓅ℴ𝒻𝒽𝓀𝓌𝒸𝓎𝙏ζ𝙟𝘃𝗺𝟮𝟭𝟯𝟲👋🦊多伦🐽🎻🎹⛓🏹🍷🦆为和中友谊祝贺与其想象对法如直接问用自己猜本传教士没积唯认识基督徒曾经让相信耶稣复活死怪他但当们聊些政治题时候战胜因圣把全堂结婚孩恐惧且栗谓这样还♾🎸🤕🤒⛑🎁批判检讨🏝🦁🙋😶쥐스탱트뤼도석유가격인상이경제황을렵게만들지않록잘관리해야합다캐나에서대마초와화약금의품런성분갈때는반드시허된사용🔫👁凸ὰ💲🗯𝙈Ἄ𝒇𝒈𝒘𝒃𝑬𝑶𝕾𝖙𝖗𝖆𝖎𝖌𝖍𝖕𝖊𝖔𝖑𝖉𝖓𝖐𝖜𝖞𝖚𝖇𝕿𝖘𝖄𝖛𝖒𝖋𝖂𝕴𝖟𝖈𝕸👑🚿💡知彼百\uf005𝙀𝒛𝑲𝑳𝑾𝒋𝟒😦𝙒𝘾𝘽🏐𝘩𝘨ὼṑ𝑱𝑹𝑫𝑵𝑪🇰🇵👾ᓇᒧᔭᐃᐧᐦᑳᐨᓃᓂᑲᐸᑭᑎᓀᐣ🐄🎈🔨🐎🤞🐸💟🎰🌝🛳点击查版🍭𝑥𝑦𝑧ＮＧ👣\uf020っ🏉ф💭🎥Ξ🐴👨🤳🦍\x0b🍩𝑯𝒒😗𝟐🏂👳🍗🕉🐲چی𝑮𝗕𝗴🍒ꜥⲣⲏ🐑⏰鉄リ事件ї💊「」\uf203\uf09a\uf222\ue608\uf202\uf099\uf469\ue607\uf410\ue600燻製シ虚偽屁理屈Г𝑩𝑰𝒀𝑺🌤𝗳𝗜𝗙𝗦𝗧🍊ὺἈἡχῖΛ⤏🇳𝒙ψՁմեռայինրւդձ冬至ὀ𝒁🔹🤚🍎𝑷🐂💅𝘬𝘱𝘸𝘷𝘐𝘭𝘓𝘖𝘹𝘲𝘫کΒώ💢ΜΟΝΑΕ🇱♲𝝈↴💒⊘Ȼ🚴🖕🖤🥘📍👈➕🚫🎨🌑🐻𝐎𝐍𝐊𝑭🤖🎎😼🕷ｇｒｎｔｉｄｕｆｂｋ𝟰🇴🇭🇻🇲𝗞𝗭𝗘𝗤👼📉🍟🍦🌈🔭《🐊🐍\uf10aლڡ🐦\U0001f92f\U0001f92a🐡💳ἱ🙇𝗸𝗟𝗠𝗷🥜さようなら🔼'


In [7]:
from nltk.tokenize.treebank import TreebankWordTokenizer
tokenizer = TreebankWordTokenizer()


isolate_dict = {ord(c):f' {c} ' for c in symbols_to_isolate}
remove_dict = {ord(c):f'' for c in symbols_to_delete}


def handle_punctuation(x):
    x = x.translate(remove_dict)
    x = x.translate(isolate_dict)
    return x

def handle_contractions(x):
    x = tokenizer.tokenize(x)
    return x

def fix_quote(x):
    x = [x_[1:] if x_.startswith("'") else x_ for x_ in x]
    x = ' '.join(x)
    return x

def preprocess(x):
    x = handle_punctuation(x)
    x = handle_contractions(x)
    x = fix_quote(x)
    return x


So lets apply that preprocess function to our text

In [8]:
train['comment_text'] = train['comment_text'].progress_apply(lambda x:preprocess(x))
test['comment_text'] = test['comment_text'].progress_apply(lambda x:preprocess(x))

100%|██████████| 1804874/1804874 [10:39<00:00, 2822.54it/s]
100%|██████████| 97320/97320 [00:34<00:00, 2836.55it/s]


### Fixing some of the most frequent word that doesn't have embeddings

In [9]:
import re

#  mapping: 
# "tRump" -> "Trump"
# "gov't" -> "government"
# "Gov't" -> "Government"
# "Qur'an" -> "Quran"
# "Koncerned" -> "Concerned"
# "y'all" -> "you guys"
# "y'know" -> you know

def fix_1(x):
    pattern1 = r"\btRump\b"
    pattern2 = r"\bgov't\b"
    pattern3 = r"\bGov't\b"
    pattern4 = r"\bQur'an\b"
    pattern5 = r"\bKoncerned\b"
    pattern6 = r"\by'all\b"
    pattern7 = r"\bgov'ts\b"
    pattern8 = r"\by'know\b"

    x = re.sub(pattern1,"Trump",x)
    x = re.sub(pattern2,"government",x)
    x = re.sub(pattern3,"Government",x)
    x = re.sub(pattern4,"Quran",x)
    x = re.sub(pattern5,"Concerned",x)
    x = re.sub(pattern6,"you guys",x)
    x = re.sub(pattern7,"governments",x)
    x = re.sub(pattern8,"you know",x)
        
    return x


In [10]:
train['comment_text'] = train['comment_text'].progress_apply(lambda x:fix_1(x))
test['comment_text'] = test['comment_text'].progress_apply(lambda x:fix_1(x))

100%|██████████| 1804874/1804874 [02:13<00:00, 13493.12it/s]
100%|██████████| 97320/97320 [00:07<00:00, 13580.49it/s]


### Fixing some identity related words

In [11]:
# LBGT -> lgbt - 106 occurences in train set
# lieberal -> liberal - 83
# Lieberal -> Liberal - 99 
# realDonaldTrump -> Donald Trump 106 
# theDonald -> Donald Trump 89
# Trumpsters -> Trump supporters 340
# Trumpian -> Trump supporters 350 

def fix_identity(x):
    pattern1 = r"\bLBGT\b"
    pattern2 = r"\blieberal\b"
    pattern3 = r"\bLieberal\b"
    pattern4 = r"\brealDonaldTrump\b"
    pattern5 = r"\btheDonald\b"
    pattern6 = r"\bTrumpsters\b"
    pattern7 = r"\bTrumpian\b"
 
    x = re.sub(pattern1,"LGBT",x)
    x = re.sub(pattern2,"liberal",x)
    x = re.sub(pattern3,"Liberal",x)
    x = re.sub(pattern4,"Donald Trump",x)
    x = re.sub(pattern5,"Donald Trump",x)
    x = re.sub(pattern6,"Trump supporters",x)
    x = re.sub(pattern7,"Trump supporter",x)

    return x


In [12]:
train['comment_text'] = train['comment_text'].progress_apply(lambda x:fix_identity(x))
test['comment_text'] = test['comment_text'].progress_apply(lambda x:fix_identity(x))

100%|██████████| 1804874/1804874 [01:55<00:00, 15632.36it/s]
100%|██████████| 97320/97320 [00:06<00:00, 15684.76it/s]


### Defining a model

In [13]:
# Final dependent and independent variables

comment_text = train['comment_text']

target = train['target']

In [14]:
LSTM_UNITS = 128
max_features = 473670
DENSE_UNITS = 4 * LSTM_UNITS
MAX_LEN = 220
EMBEDDING_DIM = 300


### Embedding layer

In [15]:
#create instance of keras Tokenizer class
tokenizer = Tokenizer(num_words = max_features, filters='',lower=False)
tokenizer.fit_on_texts(comment_text)

# pad sequences to MAX_LEN
comment_text = pad_sequences(tokenizer.texts_to_sequences(comment_text),MAX_LEN)


In [16]:
glove_matrix, unknown_words_glove = build_matrix(tokenizer.word_index, GLOVE_EMBEDDING_PATH)

In [17]:
glove_layer = Embedding(len(tokenizer.word_index) + 1,
                            EMBEDDING_DIM,
                            weights=[glove_matrix],
                            input_length=MAX_LEN,
                            trainable=False)

### Defining the model architecture

In [18]:
model = Sequential()
embedding = model.add(glove_layer)
model.add(Dropout(0.1))
model.add(Bidirectional(LSTM(LSTM_UNITS*2,activation='tanh')))
model.add(Dense(DENSE_UNITS,activation='relu'))
model.add(Dense(DENSE_UNITS,activation='relu'))
model.add(Dense(1,activation='sigmoid'))

model.compile(loss='binary_crossentropy',optimizer='adam')


In [19]:
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 220, 300)          142101300 
_________________________________________________________________
dropout_1 (Dropout)          (None, 220, 300)          0         
_________________________________________________________________
bidirectional_1 (Bidirection (None, 512)               1140736   
_________________________________________________________________
dense_1 (Dense)              (None, 512)               262656    
_________________________________________________________________
dense_2 (Dense)              (None, 512)               262656    
_________________________________________________________________
dense_3 (Dense)              (None, 1)                 513       
Total params: 143,767,861
Trainable params: 1,666,561
Non-trainable params: 142,101,300
______________________________________________________

In [20]:
# setting target value to True/False

target[target >= 0.5] = True
target[target < 0.5] = False

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  This is separate from the ipykernel package so we can avoid doing imports until


### Model training

In [21]:
# samples with this classes affiliation has more effect on evaluation metric

identity_columns = [
    'male', 'female', 'homosexual_gay_or_lesbian', 'christian', 'jewish',
    'muslim', 'black', 'white', 'psychiatric_or_mental_illness']

In [22]:
# getting sample weights

sample_weight = np.ones(train.shape[0]) # setting ordinary samples to weights of 1

# set identity samples weights to (say) 3

sample_weight[train['male'] > 0.5] = 3
sample_weight[train['female'] > 0.5] = 3
sample_weight[train['homosexual_gay_or_lesbian'] > 0.5] = 3
sample_weight[train['christian'] > 0.5] = 3
sample_weight[train['muslim'] > 0.5] = 3
sample_weight[train['black'] > 0.5] = 3
sample_weight[train['white'] > 0.5] = 3
sample_weight[train['psychiatric_or_mental_illness'] > 0.5] = 3

In [23]:
model.fit(comment_text,target,batch_size=1024,epochs=4,sample_weight=sample_weight)


Epoch 1/4
Epoch 2/4
Epoch 3/4
Epoch 4/4


<keras.callbacks.History at 0x7f58161a25f8>

### Making predictions on test set

In [24]:
submission = pd.read_csv('../input/jigsaw-unintended-bias-in-toxicity-classification/sample_submission.csv', index_col='id')

submission['prediction'] = model.predict(pad_sequences(tokenizer.texts_to_sequences(test['comment_text']),MAX_LEN),batch_size=1024)[:, 0]
submission.to_csv('submission.csv')

