In [1]:
from __future__ import print_function
import sys
from pyspark import SparkContext
from pyspark import SparkConf
from operator import add
from pyspark.sql import SparkSession
import pyspark.sql.functions as f
from pyspark.ml.feature import Tokenizer, RegexTokenizer
from pyspark.sql.functions import col, udf
from pyspark.sql.types import IntegerType
from pyspark.ml.feature import Word2Vec
from pyspark.ml.feature import StopWordsRemover
from gensim.models import KeyedVectors
from pyspark.sql.functions import UserDefinedFunction
from pyspark.sql.types import StringType

In [2]:
spark = SparkSession\
        .builder\
        .appName("QuoraInsincere")\
        .getOrCreate()
# conf = SparkConf().setMaster("local").setAppName("sample")
# sc = SparkContext(conf=conf)
# conf = spark.conf
# sc = SparkContext(conf)
sc = spark.sparkContext

In [3]:
corpus = spark.read.option("header","true").option("inferSchema","true").csv("data/small.csv")
data = corpus.select('qid','question_text','target')
data.show()

+--------------------+--------------------+------+
|                 qid|       question_text|target|
+--------------------+--------------------+------+
|00002165364db923c7e6|How did Quebec na...|     0|
|000032939017120e6e44|Do you have an ad...|     0|
|0000412ca6e4628ce2cf|Why does velocity...|     0|
|000042bf85aa498cd78e|How did Otto von ...|     0|
|0000455dfa3e01eae3af|Can I convert mon...|     0|
|00004f9a462a357c33be|Is Gaza slowly be...|     0|
|00005059a06ee19e11ad|Why does Quora au...|     0|
|0000559f875832745e2e|Is it crazy if I ...|     0|
|00005bd3426b2d0c8305|Is there such a t...|     0|
|00006e6928c5df60eacb|Is it just me or ...|     0|
|000075f67dd595c3deb5|What can you say ...|     0|
|000076f3b42776c692de|How were the Calg...|     0|
|000089792b3fc8026741|What is the dumbe...|     0|
|000092a90bcfbfe8cd88|Can we use our ex...|     0|
|000095680e41a9a6f6e3|I am 30, living a...|     0|
|0000a89942e3143e333a|What do you know ...|     0|
|0000b8e1279eaa0a7062|How diffi

In [4]:
sentences = data.select('question_text').collect()

In [5]:
def build_vocab(sentences, verbose =  True):
    """
    :param sentences: list of list of words
    :return: dictionary of words and their count
    """
    vocab = {}
    for sentence in sentences:
        for word in str(sentence[0]).split():
            try:
                vocab[word] += 1
            except KeyError:
                vocab[word] = 1
    return vocab

In [6]:
news_path = '/Users/ruolanzeng/InsincereQuestionClassification/data/embeddings/GoogleNews-vectors-negative300/GoogleNews-vectors-negative300.bin'
embeddings_index = KeyedVectors.load_word2vec_format(news_path, binary=True)


In [7]:
import operator 

def check_coverage(vocab,embeddings_index):
    a = {}
    oov = {}
    k = 0
    i = 0
    for word in vocab:
        try:
            a[word] = embeddings_index[word]
            k += vocab[word]
        except:

            oov[word] = vocab[word]
            i += vocab[word]
            pass

    print('Found embeddings for {:.2%} of vocab'.format(len(a) / len(vocab)))
    print('Found embeddings for  {:.2%} of all text'.format(k / (k + i)))
    sorted_x = sorted(oov.items(), key=operator.itemgetter(1))[::-1]

    return sorted_x

In [8]:
vocab = build_vocab(sentences)
print({k: vocab[k] for k in list(vocab)[:5]})
oov = check_coverage(vocab,embeddings_index)

{'How': 188, 'did': 23, 'Quebec': 1, 'nationalists': 1, 'see': 6}
Found embeddings for 67.35% of vocab
Found embeddings for  78.86% of all text


In [10]:
def clean_text(x):  
    for punct in "/-'":
        x = x.replace(punct, ' ')
    for punct in '&':
        x = x.replace(punct, f' {punct} ')
    for punct in '?!.,"#$%\'()*+-/:;<=>@[\\]^_`{|}~' + '“”’':
        x = x.replace(punct, '')
    return x

In [11]:
udf = UserDefinedFunction(lambda x: clean_text(x),StringType())
data = data.withColumn('question_text', udf(data.question_text))
sentences = data.select('question_text').collect()

In [12]:
vocab = build_vocab(sentences)
print({k: vocab[k] for k in list(vocab)[:5]})
oov = check_coverage(vocab,embeddings_index)

{'How': 190, 'did': 23, 'Quebec': 1, 'nationalists': 1, 'see': 6}
Found embeddings for 94.87% of vocab
Found embeddings for  90.14% of all text


In [13]:
import re
def clean_numbers(x):

    x = re.sub('[0-9]{5,}', '#####', x)
    x = re.sub('[0-9]{4}', '####', x)
    x = re.sub('[0-9]{3}', '###', x)
    x = re.sub('[0-9]{2}', '##', x)
    return x

In [14]:
udf = UserDefinedFunction(lambda x: clean_numbers(x),StringType())
data = data.withColumn('question_text', udf(data.question_text))
sentences = data.select('question_text').collect()

In [15]:
vocab = build_vocab(sentences)
print({k: vocab[k] for k in list(vocab)[:5]})
oov = check_coverage(vocab,embeddings_index)

{'How': 190, 'did': 23, 'Quebec': 1, 'nationalists': 1, 'see': 6}
Found embeddings for 96.57% of vocab
Found embeddings for  90.94% of all text


In [16]:
def _get_mispell(mispell_dict):
    mispell_re = re.compile('(%s)' % '|'.join(mispell_dict.keys()))
    return mispell_dict, mispell_re


mispell_dict = {'colour':'color',
                'centre':'center',
                'didnt':'did not',
                'doesnt':'does not',
                'isnt':'is not',
                'shouldnt':'should not',
                'favourite':'favorite',
                'travelling':'traveling',
                'counselling':'counseling',
                'theatre':'theater',
                'cancelled':'canceled',
                'labour':'labor',
                'organisation':'organization',
                'wwii':'world war 2',
                'citicise':'criticize',
                'instagram': 'social medium',
                'whatsapp': 'social medium',
                'snapchat': 'social medium'

                }
mispellings, mispellings_re = _get_mispell(mispell_dict)

def replace_typical_misspell(text):
    def replace(match):
        return mispellings[match.group(0)]

    return mispellings_re.sub(replace, text)

In [20]:
udf = UserDefinedFunction(lambda x: replace_typical_misspell(x),StringType())
data = data.withColumn('question_text', udf(data.question_text))
sentences = data.select('question_text').collect()

In [21]:
vocab = build_vocab(sentences)
print({k: vocab[k] for k in list(vocab)[:5]})
oov = check_coverage(vocab,embeddings_index)

{'How': 190, 'did': 24, 'Quebec': 1, 'nationalists': 1, 'see': 6}
Found embeddings for 96.73% of vocab
Found embeddings for  91.00% of all text


In [22]:
data.show()

+--------------------+--------------------+------+
|                 qid|       question_text|target|
+--------------------+--------------------+------+
|00002165364db923c7e6|How did Quebec na...|     0|
|000032939017120e6e44|Do you have an ad...|     0|
|0000412ca6e4628ce2cf|Why does velocity...|     0|
|000042bf85aa498cd78e|How did Otto von ...|     0|
|0000455dfa3e01eae3af|Can I convert mon...|     0|
|00004f9a462a357c33be|Is Gaza slowly be...|     0|
|00005059a06ee19e11ad|Why does Quora au...|     0|
|0000559f875832745e2e|Is it crazy if I ...|     0|
|00005bd3426b2d0c8305|Is there such a t...|     0|
|00006e6928c5df60eacb|Is it just me or ...|     0|
|000075f67dd595c3deb5|What can you say ...|     0|
|000076f3b42776c692de|How were the Calg...|     0|
|000089792b3fc8026741|What is the dumbe...|     0|
|000092a90bcfbfe8cd88|Can we use our ex...|     0|
|000095680e41a9a6f6e3|I am ## living at...|     0|
|0000a89942e3143e333a|What do you know ...|     0|
|0000b8e1279eaa0a7062|How diffi

In [23]:
tokenizer = Tokenizer(inputCol="question_text", outputCol="words")
tokenized = tokenizer.transform(data)
tokenized.show()

+--------------------+--------------------+------+--------------------+
|                 qid|       question_text|target|               words|
+--------------------+--------------------+------+--------------------+
|00002165364db923c7e6|How did Quebec na...|     0|[how, did, quebec...|
|000032939017120e6e44|Do you have an ad...|     0|[do, you, have, a...|
|0000412ca6e4628ce2cf|Why does velocity...|     0|[why, does, veloc...|
|000042bf85aa498cd78e|How did Otto von ...|     0|[how, did, otto, ...|
|0000455dfa3e01eae3af|Can I convert mon...|     0|[can, i, convert,...|
|00004f9a462a357c33be|Is Gaza slowly be...|     0|[is, gaza, slowly...|
|00005059a06ee19e11ad|Why does Quora au...|     0|[why, does, quora...|
|0000559f875832745e2e|Is it crazy if I ...|     0|[is, it, crazy, i...|
|00005bd3426b2d0c8305|Is there such a t...|     0|[is, there, such,...|
|00006e6928c5df60eacb|Is it just me or ...|     0|[is, it, just, me...|
|000075f67dd595c3deb5|What can you say ...|     0|[what, can, yo

In [38]:
word2Vec = Word2Vec(vectorSize=100, minCount=4, inputCol="words", outputCol="result")
model = word2Vec.fit(tokenized)
result = model.transform(tokenized)
result.show()

+--------------------+--------------------+------+--------------------+--------------------+
|                 qid|       question_text|target|               words|              result|
+--------------------+--------------------+------+--------------------+--------------------+
|00002165364db923c7e6|How did Quebec na...|     0|[how, did, quebec...|[-0.0180626708763...|
|000032939017120e6e44|Do you have an ad...|     0|[do, you, have, a...|[-0.0483932867209...|
|0000412ca6e4628ce2cf|Why does velocity...|     0|[why, does, veloc...|[-0.0105951436562...|
|000042bf85aa498cd78e|How did Otto von ...|     0|[how, did, otto, ...|[-0.0123491385537...|
|0000455dfa3e01eae3af|Can I convert mon...|     0|[can, i, convert,...|[-0.0288841204717...|
|00004f9a462a357c33be|Is Gaza slowly be...|     0|[is, gaza, slowly...|[0.00580674251541...|
|00005059a06ee19e11ad|Why does Quora au...|     0|[why, does, quora...|[-0.0172531273241...|
|0000559f875832745e2e|Is it crazy if I ...|     0|[is, it, crazy, i...

In [39]:
from pyspark.ml.feature import IDF, CountVectorizer
cv = CountVectorizer(inputCol="words", outputCol="rawFeatures")
cvmodel = cv.fit(result)
featurizedData = cvmodel.transform(result)
featurizedData.show(truncate=False)

+--------------------+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+------+-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

In [40]:
idf = IDF(inputCol="rawFeatures", outputCol="features")
idfModel = idf.fit(featurizedData)
rescaledData = idfModel.transform(featurizedData)
rescaledData.select("words", "features").show()

+--------------------+--------------------+
|               words|            features|
+--------------------+--------------------+
|[how, did, quebec...|(3557,[0,4,5,9,33...|
|[do, you, have, a...|(3557,[3,8,9,11,1...|
|[why, does, veloc...|(3557,[15,20,82,3...|
|[how, did, otto, ...|(3557,[0,9,54,170...|
|[can, i, convert,...|(3557,[0,3,4,7,13...|
|[is, gaza, slowly...|(3557,[2,12,21,10...|
|[why, does, quora...|(3557,[0,11,12,15...|
|[is, it, crazy, i...|(3557,[2,7,10,16,...|
|[is, there, such,...|(3557,[2,4,8,9,18...|
|[is, it, just, me...|(3557,[0,2,3,5,8,...|
|[what, can, you, ...|(3557,[1,13,14,50...|
|[how, were, the, ...|(3557,[0,9,122,82...|
|[what, is, the, d...|(3557,[0,1,2,12,8...|
|[can, we, use, ou...|(3557,[0,4,12,13,...|
|[i, am, ##, livin...|(3557,[4,7,8,9,13...|
|[what, do, you, k...|(3557,[0,1,8,11,1...|
|[how, difficult, ...|(3557,[2,3,4,9,14...|
|[have, you, licke...|(3557,[0,4,6,14,1...|
|[do, you, think, ...|(3557,[0,3,5,11,1...|
|[how, many, baron...|(3557,[4,9

In [41]:
from pyspark.ml.clustering import LDA
def train_LDA(dataset):
    num_topics = 20
    max_iterations = 100
    lda = LDA(k=num_topics, maxIter=max_iterations)
    model = lda.fit(dataset.select("words", "features", "result"))
    return model

In [42]:
topicModel = train_LDA(rescaledData)
topics = topicModel.describeTopics(1)
print("The topics described by their top-weighted terms :")
topics.show()

The topics described by their top-weighted terms :
+-----+-----------+--------------------+
|topic|termIndices|         termWeights|
+-----+-----------+--------------------+
|    0|      [535]|[0.02079281645950...|
|    1|      [189]|[0.01810737950893...|
|    2|       [14]|[0.01854843041728...|
|    3|       [62]|[0.02754937199489...|
|    4|       [86]|[0.03330128975342...|
|    5|      [336]|[0.01892636844475...|
|    6|      [331]|[0.01659888632012...|
|    7|      [122]|[0.02457218478766...|
|    8|      [110]|[0.02636097543589...|
|    9|       [23]|[0.02449929347613...|
|   10|       [44]|[0.01490236714914...|
|   11|       [14]|[0.02562839960964...|
|   12|      [178]|[0.01890812262989...|
|   13|        [7]|[0.02245045608385...|
|   14|       [49]|[0.04727655648842...|
|   15|       [38]|[0.08607730696333...|
|   16|      [139]|[0.03266120019827...|
|   17|       [54]|[0.05215353190920...|
|   18|      [215]|[0.01999914066077...|
|   19|       [64]|[0.03729956078562...|
+-----

In [45]:
from pyspark.sql.types import ArrayType, StringType

def indices_to_terms(vocabulary):
    def indices_to_terms(xs):
        return [vocabulary[int(x)] for x in xs]
    return udf(indices_to_terms, ArrayType(StringType()))

In [46]:
topics.withColumn("topics_words", indices_to_terms(cvmodel.vocabulary)("termIndices")).show(truncate=False)

AttributeError: 'function' object has no attribute '_get_object_id'

In [47]:
transformedSincere = topicModel.transform(rescaledData)

In [48]:
transformedSincere.show(truncate = False)

+--------------------+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+------+-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------