In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load in 

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
input_file_paths = (
    os.path.join(root, name)
    for root, _subdirs, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_file_paths:
    print(path)

# Any results you write to the current directory are saved as output.

/kaggle/input/quora-insincere-questions-classification/train.csv
/kaggle/input/quora-insincere-questions-classification/sample_submission.csv
/kaggle/input/quora-insincere-questions-classification/test.csv
/kaggle/input/quora-insincere-questions-classification/embeddings/paragram_300_sl999/paragram_300_sl999.txt
/kaggle/input/quora-insincere-questions-classification/embeddings/paragram_300_sl999/README.txt
/kaggle/input/quora-insincere-questions-classification/embeddings/wiki-news-300d-1M/wiki-news-300d-1M.vec
/kaggle/input/quora-insincere-questions-classification/embeddings/GoogleNews-vectors-negative300/GoogleNews-vectors-negative300.bin
/kaggle/input/quora-insincere-questions-classification/embeddings/glove.840B.300d/glove.840B.300d.txt


In [2]:
import pandas as pd
from tqdm import tqdm
# Enable tqdm's pandas integration; later cells call `.progress_apply` on Series.
tqdm.pandas()

In [4]:
# Read the competition's train and test CSV files and report their shapes.
train = pd.read_csv("../input/quora-insincere-questions-classification/train.csv")
test = pd.read_csv("../input/quora-insincere-questions-classification/test.csv")
for label, frame in (("Train shape : ", train), ("Test shape : ", test)):
    print(label, frame.shape)

Train shape :  (1306122, 3)
Test shape :  (375806, 2)


In [5]:
def build_vocab(sentences, verbose =  True):
    """
    Count how often each word occurs across all sentences.

    :param sentences: iterable of tokenised sentences (each one a list of words)
    :param verbose: when False, the tqdm progress bar is disabled
    :return: dict mapping every word to its number of occurrences,
             in order of first appearance
    """
    counts = {}
    for tokens in tqdm(sentences, disable=not verbose):
        for token in tokens:
            # .get() supplies 0 the first time a token is seen.
            counts[token] = counts.get(token, 0) + 1
    return counts

In [6]:
# Split every question on whitespace and count the resulting tokens.
sentences = train["question_text"].progress_apply(lambda question: question.split()).values
vocab = build_vocab(sentences)
# Peek at the first five (word, count) entries of the vocabulary.
print(dict(list(vocab.items())[:5]))

100%|██████████| 1306122/1306122 [00:09<00:00, 140539.12it/s]
100%|██████████| 1306122/1306122 [00:08<00:00, 147700.54it/s]

{'How': 261930, 'did': 33489, 'Quebec': 97, 'nationalists': 91, 'see': 9003}





In [7]:
from gensim.models import KeyedVectors

# Path to the GoogleNews word2vec vectors shipped with the competition data.
news_path = '../input/quora-insincere-questions-classification/embeddings/GoogleNews-vectors-negative300/GoogleNews-vectors-negative300.bin'
# The file is a .bin, so it is loaded with binary=True.
# NOTE(review): despite its name, `embeddings_index` is a gensim KeyedVectors object, not a plain dict.
embeddings_index = KeyedVectors.load_word2vec_format(news_path, binary=True)

In [8]:
import operator 

def check_coverage(vocab,embeddings_index):
    """
    Report how much of a vocabulary is covered by an embedding index.

    Prints two percentages: the share of distinct words that have an
    embedding, and the share of all word occurrences that have one.
    An empty vocabulary is reported as 0% instead of raising
    ZeroDivisionError.

    :param vocab: dict mapping word -> occurrence count
    :param embeddings_index: object supporting ``embeddings_index[word]``
                             that raises KeyError for unknown words
    :return: list of (word, count) tuples for the words without an
             embedding, most frequent first
    """
    oov = {}
    known_words = 0      # distinct words that have an embedding
    covered_count = 0    # occurrences of words that have an embedding
    missing_count = 0    # occurrences of words that do not
    for word in tqdm(vocab):
        count = vocab[word]
        try:
            # Only the lookup's success matters; the vector itself is not kept,
            # which avoids holding a reference to every matched embedding.
            embeddings_index[word]
        except KeyError:
            # Catch only "word not present"; any other error should surface.
            oov[word] = count
            missing_count += count
        else:
            known_words += 1
            covered_count += count

    total_count = covered_count + missing_count
    vocab_share = known_words / len(vocab) if len(vocab) else 0.0
    text_share = covered_count / total_count if total_count else 0.0
    print('Found embeddings for {:.2%} of vocab'.format(vocab_share))
    print('Found embeddings for  {:.2%} of all text'.format(text_share))

    # Ascending stable sort followed by a reversal, so ties keep the same
    # relative order as before.
    sorted_x = sorted(oov.items(), key=operator.itemgetter(1))[::-1]

    return sorted_x

In [9]:
# Measure coverage for the vocabulary built from the whitespace-split questions;
# `oov` receives the (word, count) pairs that have no embedding, most frequent first.
oov = check_coverage(vocab,embeddings_index)

100%|██████████| 508823/508823 [00:03<00:00, 169539.91it/s]


Found embeddings for 24.31% of vocab
Found embeddings for  78.75% of all text


In [10]:
# Ten most frequent out-of-vocabulary words (check_coverage returns them sorted by count, descending).
oov[:10]

[('to', 403183),
 ('a', 402682),
 ('of', 330825),
 ('and', 251973),
 ('India?', 16384),
 ('it?', 12900),
 ('do?', 8753),
 ('life?', 7753),
 ('you?', 6295),
 ('me?', 6202)]

In [11]:
# For each punctuation symbol, show whether the embeddings contain it:
# known symbols are printed flush left, unknown ones behind four tabs.
for punct in '?!.,"#$%\'()*+-/:;<=>@[\\]^_`{|}~' + '“”’':
    indent = () if punct in embeddings_index else ('\t\t\t\t',)
    print(*indent, punct)

				 ?
				 !
				 .
				 ,
				 "
#
$
%
				 '
				 (
				 )
*
+
				 -
				 /
				 :
				 ;
				 <
=
>
@
				 [
				 \
				 ]
^
_
`
				 {
				 |
				 }
~
				 “
				 ”
				 ’


In [21]:
# Translation table used by clean_text: the first group of symbols is padded
# with a space on each side, the second group is deleted outright.
_CLEAN_TEXT_TABLE = str.maketrans({
    **{symbol: f' {symbol} ' for symbol in '&#$%*+=>@^_`~'},
    **{symbol: None for symbol in '?!.,"\'()-/:;<[\\]{|}' + '“”’'},
})


def clean_text(x):
    """
    Normalise punctuation in a piece of text.

    The symbols & # $ % * + = > @ ^ _ ` ~ are each surrounded by spaces so
    they become separate tokens when the text is split on whitespace; the
    remaining punctuation (including curly quotes) is removed without
    inserting a space, so the neighbouring characters are joined.

    :param x: value to clean; it is converted with ``str()`` first
    :return: the cleaned string
    """
    return str(x).translate(_CLEAN_TEXT_TABLE)

In [22]:
# Overwrite the question column with its punctuation-cleaned version,
# then re-tokenise and rebuild the vocabulary from the cleaned text.
train["question_text"] = train["question_text"].progress_apply(clean_text)
sentences = train["question_text"].apply(str.split)
vocab = build_vocab(sentences)

100%|██████████| 1306122/1306122 [00:21<00:00, 60537.63it/s]
100%|██████████| 1306122/1306122 [00:08<00:00, 160579.86it/s]


In [23]:
# Re-measure coverage with the vocabulary rebuilt after clean_text was applied.
oov = check_coverage(vocab,embeddings_index)

100%|██████████| 279100/279100 [00:01<00:00, 168938.23it/s]


Found embeddings for 52.78% of vocab
Found embeddings for  90.39% of all text


In [24]:
# Ten most frequent words that still have no embedding.
oov[:10]

[('to', 405812),
 ('a', 404076),
 ('of', 332825),
 ('and', 252936),
 ('doesnt', 6769),
 ('didnt', 3871),
 ('isnt', 2788),
 ('Isnt', 1429),
 ('favourite', 1245),
 ('bitcoin', 972)]

In [25]:
# Print the first ten entries of the embedding's vocabulary list.
for rank in range(10):
    entity = embeddings_index.index2entity[rank]
    print(entity)

</s>
in
for
that
is
on
##
The
with
said


In [26]:
import re

# (pattern, replacement) pairs applied in this order by clean_numbers:
# longest digit runs first, so shorter patterns only see what is left over.
_NUMBER_MASKS = (
    ('[0-9]{5,}', '#####'),
    ('[0-9]{4}', '####'),
    ('[0-9]{3}', '###'),
    ('[0-9]{2}', '##'),
)


def clean_numbers(x):
    """
    Mask runs of digits with '#' characters.

    Runs of five or more digits become '#####'; runs of four, three and two
    digits become the same number of '#' characters. Single digits are left
    unchanged.

    :param x: string to process
    :return: the string with digit runs masked
    """
    for pattern, mask in _NUMBER_MASKS:
        x = re.sub(pattern, mask, x)
    return x

In [27]:
# Overwrite the question column with its digit-masked version,
# then re-tokenise and rebuild the vocabulary.
train["question_text"] = train["question_text"].progress_apply(clean_numbers)
sentences = train["question_text"].progress_apply(str.split)
vocab = build_vocab(sentences)

100%|██████████| 1306122/1306122 [00:27<00:00, 47517.82it/s]
100%|██████████| 1306122/1306122 [00:07<00:00, 180178.04it/s]
100%|██████████| 1306122/1306122 [00:08<00:00, 158443.76it/s]


In [28]:
# Re-measure coverage with the vocabulary rebuilt after clean_numbers was applied.
oov = check_coverage(vocab,embeddings_index)

100%|██████████| 278917/278917 [00:01<00:00, 173804.67it/s]

Found embeddings for 52.83% of vocab
Found embeddings for  90.39% of all text





In [29]:
# Twenty most frequent words that still have no embedding.
oov[:20]

[('to', 405812),
 ('a', 404076),
 ('of', 332825),
 ('and', 252936),
 ('doesnt', 6769),
 ('didnt', 3871),
 ('isnt', 2788),
 ('Isnt', 1429),
 ('favourite', 1245),
 ('bitcoin', 972),
 ('colour', 971),
 ('Quorans', 877),
 ('centre', 874),
 ('cryptocurrency', 824),
 ('shouldnt', 789),
 ('hasnt', 784),
 ('Snapchat', 779),
 ('wasnt', 743),
 ('travelling', 701),
 ('hisher', 701)]

In [71]:
# Number of out-of-vocabulary entries next to the number of entries in the embedding's word list.
len(oov),len(embeddings_index.index2word)

(131556, 3000000)

In [74]:
# Count how many words of each length from 1 to 10 there are in the Google embeddings.
length_counts = [0] * 11  # slot n holds the number of words of length n; slot 0 stays unused
for word in embeddings_index.index2word:
    word_length = len(word)
    if 1 <= word_length <= 10:
        length_counts[word_length] += 1
# Keep the per-length totals available under their individual names.
(countfrequencyofword1, countfrequencyofword2, countfrequencyofword3,
 countfrequencyofword4, countfrequencyofword5, countfrequencyofword6,
 countfrequencyofword7, countfrequencyofword8, countfrequencyofword9,
 countfrequencyofword10) = length_counts[1:]
countfrequencyofword1,countfrequencyofword2,countfrequencyofword3,countfrequencyofword4,countfrequencyofword5,countfrequencyofword6,countfrequencyofword7,countfrequencyofword8,countfrequencyofword9,countfrequencyofword10

(760, 3659, 28786, 81317, 112528, 156565, 179773, 173561, 165129, 165799)

In [75]:
# Words whose length is between 1 and 10 add up to only 1067877, which means
# nearly 2 million of the embedding's words are longer than 10 characters.
sum((760, 3659, 28786, 81317, 112528, 156565, 179773, 173561, 165129, 165799))

1067877

In [91]:
"havent" in embeddings_index

True

In [92]:
# Print 'yes' once for every out-of-vocabulary entry whose word contains "Havent".
for word, _count in oov:
    if "Havent" in word:
        print('yes')

In [58]:
def _get_mispell(mispell_dict):
    mispell_re = re.compile('(%s)' % '|'.join(mispell_dict.keys()))
    return mispell_dict, mispell_re


# Replacement table for common out-of-vocabulary spellings.
# The entries cover British spellings mapped to American ones, contractions
# written without their apostrophe, and a few brand names (some of them
# collapsed to the generic phrase 'social medium').
# Keys are matched case-sensitively, hence separate entries such as 'isnt' / 'Isnt'.
# NOTE(review): 'citicise' looks like a typo for 'criticise' — confirm the intended key.
# NOTE(review): "havent" was found in the embeddings earlier in this notebook,
# so mapping it to "haven't" may be unnecessary — confirm.
mispell_dict = {'colour':'color',
                'centre':'center',
                'didnt':'didn\'t',
                'doesnt':'doesn\'t',
                'Doesnt':'doesn\'t',
                'isnt':'isn\'t',
                'Isnt':'isn\'t',
                'arent':'aren\'t',
                'werent':'weren\'t',
                'Werent':'weren\'t',
                'havent':'haven\'t',
                'shouldnt':'shouldn\'t',
                'Shouldnt':'shouldn\'t',
                'couldnt':'couldn\'t',
                'favourite':'favorite',
                'travelling':'traveling',
                'counselling':'counseling',
                'theatre':'theater',
                'cancelled':'canceled',
                'labour':'labor',
                'organisation':'organization',
                'wwii':'world war 2',
                'citicise':'criticize',
                'instagram':'Instagram',
                'whatsapp':'social medium',
                'snapchat':'social medium',
                'wechat':'social medium'

                }
# Build the lookup table and compiled pattern once; replace_typical_misspell reads both.
mispellings, mispellings_re = _get_mispell(mispell_dict)

def replace_typical_misspell(text):
    """
    Replace every known misspelling in ``text`` with its correction.

    Uses the module-level ``mispellings_re`` pattern to find candidates and
    looks up each matched string in the module-level ``mispellings`` dict.

    :param text: string to correct
    :return: the string with all matches substituted
    """
    return mispellings_re.sub(lambda found: mispellings[found.group(0)], text)

In [68]:
def missingOne(s1, s2):
    """
    Tell whether ``s1`` equals ``s2`` with exactly one character removed.

    Example: ``missingOne("isnt", "isn't")`` is True because deleting the
    apostrophe from the second string gives the first.

    The function is silent: it produces no output, so it can be called in
    a loop over many word pairs.

    :param s1: the shorter candidate string
    :param s2: the string that should be one character longer
    :return: True if removing one character from ``s2`` yields ``s1``,
             otherwise False (including when the lengths do not differ
             by exactly one)
    """
    if len(s1) + 1 != len(s2):
        return False
    for i, char in enumerate(s1):
        if char != s2[i]:
            # First mismatch: s2[i] must be the extra character, so the
            # remainder of s1 has to equal s2 shifted by one position.
            return s1[i:] == s2[i + 1:]
    # No mismatch at all: the extra character is the last one of s2.
    return True