# Task 2: Sentence Vector Exploration

In [1]:
import csv
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
from pyspark.ml.feature import HashingTF, IDF, Tokenizer
from pyspark.ml import Pipeline 

#### Read the data from file and transform it so that "sentence1" and "sentence2" are merged into a single column

In [2]:
# Path of the training split; read below as a tab-separated file with a header row.
training_path = 'MNLI/train.tsv'

spark = SparkSession.builder.appName("training explore").getOrCreate()

# Keep only the genre and the two sentence columns; nulls in them become "".
train = (
    spark.read.csv(training_path, header=True, sep='\t')
    .select("genre", "sentence1", "sentence2")
    .fillna("")
)


def _split_pair(row):
    """Turn one (genre, sentence1, sentence2) row into two (genre, sentence) pairs."""
    genre, first_sentence, second_sentence = row[0], row[1], row[2]
    return [(genre, first_sentence), (genre, second_sentence)]


# Each input row contributes two consecutive output rows (sentence1, then sentence2).
genre_sentence = train.rdd.flatMap(_split_pair).cache()
genre_sentence_df = spark.createDataFrame(genre_sentence, ["genre", "sentence"])

genre_sentence_df.printSchema()

root
 |-- genre: string (nullable = true)
 |-- sentence: string (nullable = true)



## Find TF-IDF based vector representation

#### Create pipeline for the transformation from sentence to TF-IDF vector

In [3]:
# Number of hash buckets used by HashingTF, i.e. the length of each TF / TF-IDF vector.
numDimention = 20

# Stage 1: split each sentence into words.
tokenizer = Tokenizer(inputCol="sentence", outputCol="words")
# Stage 2: hash the words into a fixed-size term-frequency vector.
hashingTF = HashingTF(
    inputCol=tokenizer.getOutputCol(),
    outputCol="TF",
    numFeatures=numDimention,
)
# Stage 3: rescale the term frequencies by inverse document frequency.
idf = IDF(inputCol=hashingTF.getOutputCol(), outputCol="TF-IDF")

tfidf_stages = [tokenizer, hashingTF, idf]
pipeline = Pipeline(stages=tfidf_stages)


#### Transform the sentences to vectors with the pipeline

In [4]:
# Fit the pipeline on the sentence column alone, then apply the fitted model
# to the full frame so the genre column is carried through to the result.
tfidf_pipeline_model = pipeline.fit(genre_sentence_df.select("sentence"))
tfidf_all_columns = tfidf_pipeline_model.transform(genre_sentence_df)
TFIDF = tfidf_all_columns.select("genre", "TF-IDF")

TFIDF.show(3)

+----------+--------------------+
|     genre|              TF-IDF|
+----------+--------------------+
|government|(20,[0,4,5,6,8,13...|
|government|(20,[4,5,6,11,13,...|
| telephone|(20,[0,1,2,3,4,5,...|
+----------+--------------------+
only showing top 3 rows



## Find Universal Sentence Encoder based vector representation

In [6]:
import tensorflow as tf
import tensorflow_hub as hub
import numpy as np

#### Function which encodes a partition of sentences into vectors

In [7]:
def review_embed(rev_text_partition):
    """Encode the sentences of one partition with the Universal Sentence Encoder.

    Written to be passed to ``RDD.mapPartitions``.

    Parameters
    ----------
    rev_text_partition : iterable
        The rows of one partition. Element ``[1]`` of every row is used as the
        sentence text; the other elements are ignored.

    Returns
    -------
    The value produced by running the encoder on the list of sentences, or an
    empty list when the partition has no rows. The caller iterates over the
    result and expects one vector per input sentence, in input order
    (NOTE(review): this relies on the hub module's behaviour -- confirm).
    """
    module_url = "https://tfhub.dev/google/universal-sentence-encoder/2" #@param ["https://tfhub.dev/google/universal-sentence-encoder/2", "https://tfhub.dev/google/universal-sentence-encoder-large/3"]
    # mapPartitions hands over the partition as a generator, but the encoder
    # needs all sentences at once, so materialise them into a list first.
    rev_text_list = [text[1] for text in rev_text_partition]
    if not rev_text_list:
        # Nothing to encode: skip loading the module and starting a session.
        return []
    # Build the module in its own graph so that repeated calls in the same
    # Python process do not keep adding nodes to the global default graph.
    with tf.Graph().as_default():
        embed = hub.Module(module_url)
        embedding_op = embed(rev_text_list)
        with tf.Session() as session:
            session.run([tf.global_variables_initializer(), tf.tables_initializer()])
            message_embeddings = session.run(embedding_op)
    return message_embeddings

### (limited to 1000 sentences for single-node testing)
#### Transformation to get the encoding

In [56]:
from pyspark.ml.linalg import Vectors
from pyspark.ml.clustering import KMeans
from pyspark.ml.feature import PCA
from pyspark.sql.functions import col

# Take the 1000-row sample once and cache it, so that the genres and the
# embeddings below are derived from exactly the same rows and partitions.
sample_rdd = genre_sentence_df.limit(1000).rdd.cache()

# One dense vector per sampled sentence.
encoding = sample_rdd \
    .mapPartitions(review_embed) \
    .map(lambda line: Vectors.dense(line)).cache()

# RDD.zip requires both sides to have the same number of partitions and the
# same number of elements per partition; deriving both from sample_rdd with
# element-wise transformations keeps them aligned.
genre_encoding = sample_rdd.map(lambda x: x['genre']).zip(encoding)

# `encoding` is an RDD, which has no .show(); take() returns the first elements.
encoding.take(3)

### KMeans Clustering for Universal Sentence Encoder vectors, after first reducing the dimensionality with PCA

In [59]:
# Name the zipped (genre, vector) pairs so the vectors sit in a "features" column.
test_vectors = genre_encoding.toDF(["genre", "features"])

# Project the "features" vectors onto 15 principal components, stored in "pca".
pca = PCA(
    k=15,
    inputCol="features",
    outputCol="pca",
)
model = pca.fit(test_vectors)
pca_result = model.transform(test_vectors)

In [None]:
# Cluster the PCA-reduced sentence vectors into 5 groups.
kmeans = KMeans(
    k=5,
    featuresCol='pca',
)
km_model = kmeans.fit(pca_result)
# Adds the cluster assignment of every row to the PCA result.
prediction_universal = km_model.transform(pca_result)

### KMeans Clustering for TF-IDF Encoder

In [61]:
# Cluster the TF-IDF vectors into 5 groups.
# Note: this rebinds `kmeans` and `model`, which earlier cells also assign.
kmeans = KMeans(
    k=5,
    featuresCol='TF-IDF',
)
model = kmeans.fit(TFIDF)
# Adds the cluster assignment of every row to the TF-IDF frame.
predictions = model.transform(TFIDF)

In [62]:
# Preview the cluster assigned to the first ten rows.
predictions.select('prediction').head(10)

[Row(prediction=0),
 Row(prediction=0),
 Row(prediction=3),
 Row(prediction=0),
 Row(prediction=0),
 Row(prediction=2),
 Row(prediction=0),
 Row(prediction=0),
 Row(prediction=4),
 Row(prediction=2)]