In [1]:
from __future__ import print_function
import sys
from pyspark import SparkContext
from pyspark import SparkConf
from operator import add
from pyspark.sql import SparkSession
import pyspark.sql.functions as f
from pyspark.ml.feature import Tokenizer, RegexTokenizer
from pyspark.sql.functions import col, udf
from pyspark.sql.types import IntegerType
from pyspark.ml.feature import Word2Vec
from pyspark.ml.feature import StopWordsRemover
from gensim.models import KeyedVectors

In [2]:
spark = SparkSession\
        .builder\
        .appName("QuoraInsincere")\
        .getOrCreate()
# conf = SparkConf().setMaster("local").setAppName("sample")
# sc = SparkContext(conf=conf)
# conf = spark.conf
# sc = SparkContext(conf)
sc = spark.sparkContext

In [48]:
corpus = spark.read.option("header","true").option("inferSchema","true").csv("data/small.csv")
data = corpus.select('qid','question_text','target')
data.show()

+--------------------+--------------------+------+
|                 qid|       question_text|target|
+--------------------+--------------------+------+
|00002165364db923c7e6|How did Quebec na...|     0|
|000032939017120e6e44|Do you have an ad...|     0|
|0000412ca6e4628ce2cf|Why does velocity...|     0|
|000042bf85aa498cd78e|How did Otto von ...|     0|
|0000455dfa3e01eae3af|Can I convert mon...|     0|
|00004f9a462a357c33be|Is Gaza slowly be...|     0|
|00005059a06ee19e11ad|Why does Quora au...|     0|
|0000559f875832745e2e|Is it crazy if I ...|     0|
|00005bd3426b2d0c8305|Is there such a t...|     0|
|00006e6928c5df60eacb|Is it just me or ...|     0|
|000075f67dd595c3deb5|What can you say ...|     0|
|000076f3b42776c692de|How were the Calg...|     0|
|000089792b3fc8026741|What is the dumbe...|     0|
|000092a90bcfbfe8cd88|Can we use our ex...|     0|
|000095680e41a9a6f6e3|I am 30, living a...|     0|
|0000a89942e3143e333a|What do you know ...|     0|
|0000b8e1279eaa0a7062|How diffi

In [30]:
def build_vocab(sentences, verbose =  True):
    """
    :param sentences: list of list of words
    :return: dictionary of words and their count
    """
    vocab = {}
    for sentence in sentences:
        for word in str(sentence[0]).split():
            try:
                vocab[word] += 1
            except KeyError:
                vocab[word] = 1
    return vocab

In [31]:
sentences = corpus.select('question_text').collect()

In [32]:
vocab = build_vocab(sentences)
print({k: vocab[k] for k in list(vocab)[:5]})

{'How': 188, 'did': 23, 'Quebec': 1, 'nationalists': 1, 'see': 6}


In [33]:
news_path = '/Users/ruolanzeng/InsincereQuestionClassification/data/embeddings/GoogleNews-vectors-negative300/GoogleNews-vectors-negative300.bin'
embeddings_index = KeyedVectors.load_word2vec_format(news_path, binary=True)

In [34]:
import operator 

def check_coverage(vocab,embeddings_index):
    a = {}
    oov = {}
    k = 0
    i = 0
    for word in vocab:
        try:
            a[word] = embeddings_index[word]
            k += vocab[word]
        except:

            oov[word] = vocab[word]
            i += vocab[word]
            pass

    print('Found embeddings for {:.2%} of vocab'.format(len(a) / len(vocab)))
    print('Found embeddings for  {:.2%} of all text'.format(k / (k + i)))
    sorted_x = sorted(oov.items(), key=operator.itemgetter(1))[::-1]

    return sorted_x

In [35]:
oov = check_coverage(vocab,embeddings_index)

Found embeddings for 67.35% of vocab
Found embeddings for  78.86% of all text


In [36]:
oov[:10]

[('to', 299),
 ('a', 296),
 ('of', 242),
 ('and', 208),
 ('it?', 15),
 ('India?', 13),
 ('"What', 11),
 ('do?', 7),
 ('"Why', 6),
 ('today?', 6)]

In [85]:
def clean_text(x):
    for punct in "/-'":
        x = str(x).replace(punct, ' ')
    for punct in '&':
        x = x.replace(punct, f' {punct} ')
    for punct in '?!.,"#$%\'()*+-/:;<=>@[\\]^_`{|}~' + '“”’':
        x = x.replace(punct, '')
    return x

In [86]:
data = data.withColumn('question_text', clean_text(data.question_text))

AssertionError: col should be Column

In [63]:
data.show()

+--------------------+--------------------+------+
|                 qid|       question_text|target|
+--------------------+--------------------+------+
|00002165364db923c7e6|how did quebec na...|     0|
|000032939017120e6e44|do you have an ad...|     0|
|0000412ca6e4628ce2cf|why does velocity...|     0|
|000042bf85aa498cd78e|how did otto von ...|     0|
|0000455dfa3e01eae3af|can i convert mon...|     0|
|00004f9a462a357c33be|is gaza slowly be...|     0|
|00005059a06ee19e11ad|why does quora au...|     0|
|0000559f875832745e2e|is it crazy if i ...|     0|
|00005bd3426b2d0c8305|is there such a t...|     0|
|00006e6928c5df60eacb|is it just me or ...|     0|
|000075f67dd595c3deb5|what can you say ...|     0|
|000076f3b42776c692de|how were the calg...|     0|
|000089792b3fc8026741|what is the dumbe...|     0|
|000092a90bcfbfe8cd88|can we use our ex...|     0|
|000095680e41a9a6f6e3|i am 30, living a...|     0|
|0000a89942e3143e333a|what do you know ...|     0|
|0000b8e1279eaa0a7062|how diffi

In [87]:
def build_vocab2(sentences, verbose =  True):
    """
    :param sentences: list of list of words
    :return: dictionary of words and their count
    """
    vocab = {}
    for sentence in sentences: 
        for word in clean_text(sentence[0]).split():
            try:
                vocab[word] += 1
            except KeyError:
                vocab[word] = 1
    return vocab

In [88]:
vocab = build_vocab2(sentences)
print({k: vocab[k] for k in list(vocab)[:5]})

{'How': 190, 'did': 23, 'Quebec': 1, 'nationalists': 1, 'see': 6}


In [89]:
oov = check_coverage(vocab,embeddings_index)

Found embeddings for 94.87% of vocab
Found embeddings for  90.14% of all text


In [90]:
oov[:10]

[('to', 300),
 ('a', 296),
 ('of', 244),
 ('and', 209),
 ('2018', 13),
 ('2017', 7),
 ('100', 6),
 ('30', 5),
 ('20', 4),
 ('10', 3)]

In [91]:
import re

def clean_numbers(x):

    x = re.sub('[0-9]{5,}', '#####', x)
    x = re.sub('[0-9]{4}', '####', x)
    x = re.sub('[0-9]{3}', '###', x)
    x = re.sub('[0-9]{2}', '##', x)
    return x

In [92]:
def build_vocab3(sentences, verbose =  True):
    """
    :param sentences: list of list of words
    :return: dictionary of words and their count
    """
    vocab = {}
    for sentence in sentences: 
        for word in clean_numbers(clean_text(sentence[0])).split():
            try:
                vocab[word] += 1
            except KeyError:
                vocab[word] = 1
    return vocab

In [93]:
vocab = build_vocab3(sentences)
print({k: vocab[k] for k in list(vocab)[:5]})

{'How': 190, 'did': 23, 'Quebec': 1, 'nationalists': 1, 'see': 6}


In [94]:
oov = check_coverage(vocab,embeddings_index)

Found embeddings for 96.57% of vocab
Found embeddings for  90.94% of all text


In [95]:
oov[:20]

[('to', 300),
 ('a', 296),
 ('of', 244),
 ('and', 209),
 ('organisation', 2),
 ('favourite', 2),
 ('####…', 2),
 ('kardasian', 1),
 ('licencing', 1),
 ('diffferently', 1),
 ('Kubernetes', 1),
 ('kubernetes', 1),
 ('ask…', 1),
 ('didnt', 1),
 ('lifehacks', 1),
 ('Lyft', 1),
 ('Shouldnt', 1),
 ('wrastling', 1),
 ('maintanable', 1),
 ('ॡ', 1)]

In [96]:
def _get_mispell(mispell_dict):
    mispell_re = re.compile('(%s)' % '|'.join(mispell_dict.keys()))
    return mispell_dict, mispell_re


mispell_dict = {'colour':'color',
                'centre':'center',
                'didnt':'did not',
                'doesnt':'does not',
                'isnt':'is not',
                'shouldnt':'should not',
                'favourite':'favorite',
                'travelling':'traveling',
                'counselling':'counseling',
                'theatre':'theater',
                'cancelled':'canceled',
                'labour':'labor',
                'organisation':'organization',
                'wwii':'world war 2',
                'citicise':'criticize',
                'instagram': 'social medium',
                'whatsapp': 'social medium',
                'snapchat': 'social medium'

                }
mispellings, mispellings_re = _get_mispell(mispell_dict)

def replace_typical_misspell(text):
    def replace(match):
        return mispellings[match.group(0)]

    return mispellings_re.sub(replace, text)

In [97]:
to_remove = ['a','to','of','and']

def build_vocab4(sentences, verbose =  True):
    """
    :param sentences: list of list of words
    :return: dictionary of words and their count
    """
    vocab = {}
    for sentence in sentences: 
        for word in replace_typical_misspell(clean_numbers(clean_text(sentence[0]))).split():
            if not word in to_remove:
                try:
                    vocab[word] += 1
                except KeyError:
                    vocab[word] = 1
    return vocab

In [98]:
vocab = build_vocab4(sentences)
print({k: vocab[k] for k in list(vocab)[:5]})

{'How': 190, 'did': 24, 'Quebec': 1, 'nationalists': 1, 'see': 6}


In [99]:
oov = check_coverage(vocab,embeddings_index)

Found embeddings for 96.83% of vocab
Found embeddings for  99.00% of all text


In [127]:
oov[:20]

[('####…', 2),
 ('kardasian', 1),
 ('licencing', 1),
 ('diffferently', 1),
 ('Kubernetes', 1),
 ('kubernetes', 1),
 ('ask…', 1),
 ('lifehacks', 1),
 ('Lyft', 1),
 ('Shouldnt', 1),
 ('wrastling', 1),
 ('maintanable', 1),
 ('ॡ', 1),
 ('ऌ', 1),
 ('ॠ', 1),
 ('ऋ', 1),
 ('varnamala', 1),
 ('‘itll', 1),
 ('Radon–Nikodym', 1),
 ('thunderstike', 1)]