# Test 3

In [1]:
import os
import findspark
os.environ['PYSPARK_SUBMIT_ARGS'] = (
    "--repositories http://repo.hortonworks.com/content/groups/public/ "
    "--packages com.hortonworks:shc-core:1.1.1-2.1-s_2.11 "
    " pyspark-shell")
findspark.init()

In [2]:
from pyspark import SQLContext, SparkConf, SparkContext
from pyspark.sql import SparkSession
conf = SparkConf()
conf.setAppName("project1")
sc = SparkContext(conf=conf)
sqlContext = SQLContext(sc)


In [3]:
from pyspark.ml.feature import Tokenizer, RegexTokenizer
from pyspark.sql.functions import col, udf
from pyspark.sql.types import IntegerType
spark = SparkSession.builder.appName("project1").getOrCreate()

# Preprocessing

In [4]:
from pyspark.sql.types import StructType, StructField, IntegerType, StringType, BooleanType

schema = StructType([
    StructField("marketplace", StringType(), True),
    StructField("customer_id", StringType(), True),
    StructField("review_id", StringType(), True),
    StructField("product_id", StringType(), True),
    StructField("product_parent", StringType(), True),
    StructField("label", StringType(), True), # "label" replaces "product_title"
    StructField("product_category", StringType(), True),
    StructField("star_rating", IntegerType(), True),
    StructField("helpful_votes", IntegerType(), True),
    StructField("total_votes", IntegerType(), True),
    StructField("vine", StringType(), True),
    StructField("verified_purchase", StringType(), True),
    StructField("review_headline", StringType(), True),
    StructField("review_body", StringType(), True),
    StructField("review_date", StringType(), True)])

df_grocery = spark.read.csv('amazon_reviews_us_Grocery_v1_00.tsv',sep="\t", header=True, schema=schema)
df_electronics = spark.read.csv('amazon_reviews_us_Electronics_v1_00.tsv',sep="\t", header=True, schema=schema)
df_videogames = spark.read.csv('amazon_reviews_us_Video_Games_v1_00.tsv',sep="\t", header=True, schema=schema)
df_toys = spark.read.csv('amazon_reviews_us_Toys_v1_00.tsv',sep="\t", header=True, schema=schema)

In [5]:
df_grocery = df_grocery.dropna()
df_electronics = df_electronics.dropna()
df_videogames = df_videogames.dropna()
df_toys = df_toys.dropna()

from pyspark.sql.functions import col, when

def blank_as_null(x):
    return when(col(x) != "", col(x)).otherwise(None)

df_g = df_grocery.withColumn("review_body", blank_as_null("review_body"))
df_g = df_g.dropna()
df_g2 = df_g.withColumn("label", blank_as_null("label"))
df_g2 = df_g2.dropna()

In [6]:
df_grocery.count() # number of entries

2402211

In [7]:
df_g2.count()

2402211

In [8]:
df_electronics.count() # number of entries

3093660

In [9]:
df_videogames.count() # number of entries

1785886

In [10]:
df_toys.count() # number of entries

4863497

In [11]:
df_grocery.show(5) # show first 10 entries

+-----------+-----------+--------------+----------+--------------+--------------------+----------------+-----------+-------------+-----------+----+-----------------+--------------------+--------------------+-----------+
|marketplace|customer_id|     review_id|product_id|product_parent|               label|product_category|star_rating|helpful_votes|total_votes|vine|verified_purchase|     review_headline|         review_body|review_date|
+-----------+-----------+--------------+----------+--------------+--------------------+----------------+-----------+-------------+-----------+----+-----------------+--------------------+--------------------+-----------+
|         US|   42521656|R26MV8D0KG6QI6|B000SAQCWC|     159713740|The Cravings Plac...|         Grocery|          5|            0|          0|   N|                Y|Using these for y...|As a family aller...| 2015-08-31|
|         US|   12049833|R1OF8GP57AQ1A0|B00509LVIQ|     138680402|Mauna Loa Macadam...|         Grocery|          5|    

In [12]:
df_electronics.show(5) # show first 10 entries

+-----------+-----------+--------------+----------+--------------+--------------------+----------------+-----------+-------------+-----------+----+-----------------+--------------------+--------------------+-----------+
|marketplace|customer_id|     review_id|product_id|product_parent|               label|product_category|star_rating|helpful_votes|total_votes|vine|verified_purchase|     review_headline|         review_body|review_date|
+-----------+-----------+--------------+----------+--------------+--------------------+----------------+-----------+-------------+-----------+----+-----------------+--------------------+--------------------+-----------+
|         US|   41409413|R2MTG1GCZLR2DK|B00428R89M|     112201306|yoomall 5M Antenn...|     Electronics|          5|            0|          0|   N|                Y|          Five Stars|       As described.| 2015-08-31|
|         US|   49668221|R2HBOEM8LE9928|B000068O48|     734576678|Hosa GPM-103 3.5m...|     Electronics|          5|    

In [13]:
df_videogames.show(5) # show first 10 entries

+-----------+-----------+--------------+----------+--------------+--------------------+----------------+-----------+-------------+-----------+----+-----------------+--------------------+--------------------+-----------+
|marketplace|customer_id|     review_id|product_id|product_parent|               label|product_category|star_rating|helpful_votes|total_votes|vine|verified_purchase|     review_headline|         review_body|review_date|
+-----------+-----------+--------------+----------+--------------+--------------------+----------------+-----------+-------------+-----------+----+-----------------+--------------------+--------------------+-----------+
|         US|   12039526| RTIS3L2M1F5SM|B001CXYMFS|     737716809|Thrustmaster T-Fl...|     Video Games|          5|            0|          0|   N|                Y|an amazing joysti...|Used this for Eli...| 2015-08-31|
|         US|    9636577| R1ZV7R40OLHKD|B00M920ND6|     569686175|Tonsee 6 buttons ...|     Video Games|          5|    

In [14]:
df_toys.show(5) # show first 10 entries

+-----------+-----------+--------------+----------+--------------+--------------------+----------------+-----------+-------------+-----------+----+-----------------+--------------------+--------------------+-----------+
|marketplace|customer_id|     review_id|product_id|product_parent|               label|product_category|star_rating|helpful_votes|total_votes|vine|verified_purchase|     review_headline|         review_body|review_date|
+-----------+-----------+--------------+----------+--------------+--------------------+----------------+-----------+-------------+-----------+----+-----------------+--------------------+--------------------+-----------+
|         US|   18778586| RDIJS7QYB6XNR|B00EDBY7X8|     122952789|Monopoly Junior B...|            Toys|          5|            0|          0|   N|                Y|          Five Stars|        Excellent!!!| 2015-08-31|
|         US|   24769659|R36ED1U38IELG8|B00D7JFOPC|     952062646|56 Pieces of Wood...|            Toys|          5|    

In [15]:
df_test = df_g2.drop('marketplace','customer_id','review_id','product_id','product_parent','product_category','star_rating','helpful_votes','total_votes','vine','verified_purchase','review_headline','review_date')

df1 = df_grocery.drop('marketplace','customer_id','review_id','product_id','product_parent','product_category','star_rating','helpful_votes','total_votes','vine','verified_purchase','review_headline','review_date')
df2 = df_electronics.drop('marketplace','customer_id','review_id','product_id','product_parent','product_category','star_rating','helpful_votes','total_votes','vine','verified_purchase','review_headline','review_date')
df3 = df_videogames.drop('marketplace','customer_id','review_id','product_id','product_parent','product_category','star_rating','helpful_votes','total_votes','vine','verified_purchase','review_headline','review_date')
df4 = df_toys.drop('marketplace','customer_id','review_id','product_id','product_parent','product_category','star_rating','helpful_votes','total_votes','vine','verified_purchase','review_headline','review_date')

In [16]:
df_test.printSchema()
df_test.select('*').where('label is null or review_body is null or label = "" or review_body = ""').count()

root
 |-- label: string (nullable = true)
 |-- review_body: string (nullable = true)



0

In [17]:
df1.printSchema()

root
 |-- label: string (nullable = true)
 |-- review_body: string (nullable = true)



In [18]:
df2.printSchema()

root
 |-- label: string (nullable = true)
 |-- review_body: string (nullable = true)



In [19]:
df3.printSchema()

root
 |-- label: string (nullable = true)
 |-- review_body: string (nullable = true)



In [20]:
df4.printSchema()

root
 |-- label: string (nullable = true)
 |-- review_body: string (nullable = true)



In [21]:
df1.head()

Row(label='The Cravings Place Chocolate Chunk Cookie Mix, 23-Ounce Bags (Pack of 6)', review_body="As a family allergic to wheat, dairy, eggs, nuts, and several other things, we love the entire Cravings Place line of products as it allows us to bake treats with minimal effort and ingredients. Most allergy-free and gluten-free mixes usually just omit one or two allergens at most, so it's great to see a mix created without many of the most common allergens. (Note these still have soy and corn). We consume these on a regular basis and have been doing so for years.")

In [22]:
df1.count()

2402211

In [23]:
df2.count()

3093660

In [24]:
df3.count()

1785886

In [25]:
# sort by labels, top 20

In [26]:
from pyspark.sql.functions import col
df1.groupBy("label") \
    .count() \
    .orderBy(col("count").desc()) \
    .show()

+--------------------+-----+
|               label|count|
+--------------------+-----+
|San Francisco Bay...|17031|
|Viva Naturals Org...|10067|
|Nutiva Organic Vi...| 5798|
| Davidson's Tea Bulk| 5716|
|Grove Square Capp...| 5145|
|Keurig Green Moun...| 4923|
|Amazing Grass Gre...| 4179|
|Surge Citrus Flav...| 3902|
|Brooklyn Beans Si...| 3853|
|Keurig, The Origi...| 3476|
|Ekobrew Coffee Re...| 3410|
|  Senseo Coffee Pods| 3231|
|Grove Square Hot ...| 3143|
|Twinings Earl Gre...| 2972|
|KIND PLUS Gluten ...| 2861|
|Celestial Seasoni...| 2837|
|Green Mountain Co...| 2462|
|            Twinings| 2392|
|Vita Coco Coconut...| 2343|
|Timothy's World C...| 2307|
+--------------------+-----+
only showing top 20 rows



In [27]:
from pyspark.sql.functions import col
df2.groupBy("label") \
    .count() \
    .orderBy(col("count").desc()) \
    .show()

+--------------------+-----+
|               label|count|
+--------------------+-----+
|Panasonic ErgoFit...|24833|
|AmazonBasics High...|16163|
|Mediabridge ULTRA...|15674|
|Clip Plus 4 GB MP...|11779|
|High Speed HDMI C...|11177|
|AmazonBasics High...|10740|
|VideoSecu ML531BE...|10214|
|CABTE High speed ...| 9923|
|Cheetah APTMM2B T...| 9364|
|Sennheiser On-Ear...| 9103|
|Bluetooth Speaker...| 9005|
|     HDMI-High-Speed| 8168|
|Sanyo NEW 1500 en...| 7554|
|MEElectronics Spo...| 7246|
|Sony MDRZX100 Hea...| 7235|
|Mohu Leaf 30 TV A...| 6538|
|AmazonBasics Ultr...| 6453|
|Bose SoundLink Mi...| 6075|
|Belkin 6-Outlet H...| 5858|
|Mediabridge 3.5mm...| 5823|
+--------------------+-----+
only showing top 20 rows



In [28]:
from pyspark.sql.functions import col
df3.groupBy("label") \
    .count() \
    .orderBy(col("count").desc()) \
    .show()

+--------------------+-----+
|               label|count|
+--------------------+-----+
|PlayStation 4 500...|10361|
|  Grand Theft Auto V| 8714|
|Call of Duty: Ghosts| 7810|
|       Battlefield 4| 4809|
|  Assassin's Creed 4| 4722|
|      The Last of Us| 4598|
|Elder Scrolls V: ...| 4537|
|             Destiny| 4408|
| Diablo III - PC/Mac| 4390|
|Call of Duty: Bla...| 4373|
|SimCity - Limited...| 3972|
|       Battlefield 3| 3953|
|      Rocksmith 2014| 3905|
|Call of Duty: Adv...| 3798|
|               Spore| 3590|
|     Nintendo Amiibo| 3444|
|Assassin's Creed III| 3405|
|E-3lue Cobra EMS1...| 3404|
|Microsoft Xbox360...| 3400|
|Minecraft - Xbox 360| 3366|
+--------------------+-----+
only showing top 20 rows



In [29]:
from pyspark.sql.functions import col
df4.groupBy("label") \
    .count() \
    .orderBy(col("count").desc()) \
    .show()

+--------------------+-----+
|               label|count|
+--------------------+-----+
|Cards Against Hum...|24287|
|      Melissa & Doug|11656|
|Cards Against Hum...| 6060|
|Syma S107/S107G  ...| 5848|
|VTech Sit-to-Stan...| 5037|
|Cards Against Hum...| 3963|
|Syma S107/S107G R...| 3647|
|Snap Circuits Jr....| 2969|
|            Spot It!| 2918|
|The Original Stom...| 2885|
|Cards Against Hum...| 2741|
|Fisher-Price Ocea...| 2644|
|Accoutrements Hor...| 2463|
|UDI U818A 2.4GHz ...| 2457|
|Mega Bloks 80-Pie...| 2370|
|Disney Frozen Spa...| 2331|
|Cards Against Hum...| 2298|
|Rainbow Loom Craf...| 2116|
|      Ticket To Ride| 2070|
|The Settlers of C...| 2038|
+--------------------+-----+
only showing top 20 rows



# Text Processing

## 1 (a). Model Pipeline

In [30]:
# Tokenization
#regexTokenizer = RegexTokenizer(inputCol="review_body", outputCol="words", pattern="[^A-Za-z]+", toLowercase=True)
#tokenized_data = regexTokenizer.transform(df1)

from pyspark.ml.feature import RegexTokenizer, StopWordsRemover, CountVectorizer
from pyspark.ml.classification import LogisticRegression

# regular expression tokenizer
regexTokenizer = RegexTokenizer(inputCol="review_body", outputCol="words", pattern = "\\p{L}+", toLowercase=True)

# stop words
add_stopwords = ["the", "a", "an", "another", "for","http", "https"] 
stopwordsRemover = StopWordsRemover(inputCol="words", outputCol="filtered").setStopWords(add_stopwords)

# bag of words count
countVectors = CountVectorizer(inputCol="filtered", outputCol="features", vocabSize=10000, minDF=5)

In [31]:
df1.head()

Row(label='The Cravings Place Chocolate Chunk Cookie Mix, 23-Ounce Bags (Pack of 6)', review_body="As a family allergic to wheat, dairy, eggs, nuts, and several other things, we love the entire Cravings Place line of products as it allows us to bake treats with minimal effort and ingredients. Most allergy-free and gluten-free mixes usually just omit one or two allergens at most, so it's great to see a mix created without many of the most common allergens. (Note these still have soy and corn). We consume these on a regular basis and have been doing so for years.")

## 2 (b). StringIndexer

In [32]:
from pyspark.ml import Pipeline
from pyspark.ml.feature import OneHotEncoder, StringIndexer, VectorAssembler
label_stringIdx = StringIndexer(inputCol = "label", outputCol = "label_final")
pipeline = Pipeline(stages=[regexTokenizer, stopwordsRemover, countVectors, label_stringIdx])


In [33]:
# Fit the pipeline to training documents.
pipelineFit = pipeline.fit(df_test)
#pipelineFit2 = pipeline.fit(df2)
#pipelineFit3 = pipeline.fit(df3)
#pipelineFit4 = pipeline.fit(df4)

dataset1 = pipelineFit.transform(df_test)
dataset1.show(5)
#dataset2 = pipelineFit2.transform(df2)
#dataset2.show(5)
#dataset3 = pipelineFit3.transform(df3)
#dataset3.show(5)
#dataset4 = pipelineFit4.transform(df4)
#dataset4.show(5)

+--------------------+--------------------+--------------------+--------------------+--------------------+-----------+
|               label|         review_body|               words|            filtered|            features|label_final|
+--------------------+--------------------+--------------------+--------------------+--------------------+-----------+
|The Cravings Plac...|As a family aller...|[ ,  ,  ,  ,  , ,...|[ ,  ,  ,  ,  , ,...|(10000,[0,1,2,3,5...|    33574.0|
|Mauna Loa Macadam...|My favorite nut. ...|[ ,  , .  , , , ,...|[ ,  , .  , , , ,...|(10000,[0,1,4,14,...|      180.0|
|Organic Matcha Gr...|This green tea ta...|[ ,  ,  ,  ,  , !...|[ ,  ,  ,  ,  , !...|(10000,[0,5,11],[...|      671.0|
|15oz Raspberry Ly...|I love Melissa's ...|[ ,  , ',  ,  ,  ...|[ ,  , ',  ,  ,  ...|(10000,[0,3,5],[1...|    17324.0|
|Stride Spark Kine...|                good|                  []|                  []|       (10000,[],[])|    24504.0|
+--------------------+--------------------+-----

# Partition Training & Test Sets

In [34]:
# set seed for reproducibility
(trainingData1, testData1) = dataset1.randomSplit([0.7, 0.3], seed = 100)
print("Training Dataset 1 Count: " + str(trainingData1.count()))
print("Test Dataset Count: " + str(testData1.count()))

(trainingData2, testData2) = dataset2.randomSplit([0.7, 0.3], seed = 100)
print("Training Dataset 2 Count: " + str(trainingData2.count()))
print("Test Dataset Count: " + str(testData2.count()))

(trainingData3, testData3) = dataset3.randomSplit([0.7, 0.3], seed = 100)
print("Training Dataset 3 Count: " + str(trainingData3.count()))
print("Test Dataset Count: " + str(testData3.count()))

(trainingData4, testData4) = dataset3.randomSplit([0.7, 0.3], seed = 100)
print("Training Dataset 3 Count: " + str(trainingData3.count()))
print("Test Dataset Count: " + str(testData3.count()))

Training Dataset 1 Count: 1680884
Test Dataset Count: 721327


NameError: name 'dataset2' is not defined

# Model Training & Evaluation

In [None]:
lr = LogisticRegression(maxIter=20, regParam=0.3, elasticNetParam=0)
lrModel = lr.fit(trainingData1)
predictions = lrModel.transform(testData1)
predictions.filter(predictions['prediction'] == 0) \
    .select("Descript","Category","probability","label","prediction") \
    .orderBy("probability", ascending=False) \
    .show(n = 10, truncate = 30)
    
lr = LogisticRegression(maxIter=20, regParam=0.3, elasticNetParam=0)
lrModel = lr.fit(trainingData2)
predictions = lrModel.transform(testData2)
predictions.filter(predictions['prediction'] == 0) \
    .select("Descript","Category","probability","label","prediction") \
    .orderBy("probability", ascending=False) \
    .show(n = 10, truncate = 30)
    
lr = LogisticRegression(maxIter=20, regParam=0.3, elasticNetParam=0)
lrModel = lr.fit(trainingData3)
predictions = lrModel.transform(testData3)
predictions.filter(predictions['prediction'] == 0) \
    .select("Descript","Category","probability","label","prediction") \
    .orderBy("probability", ascending=False) \
    .show(n = 10, truncate = 30)
    
lr = LogisticRegression(maxIter=20, regParam=0.3, elasticNetParam=0)
lrModel = lr.fit(trainingData4)
predictions = lrModel.transform(testData4)
predictions.filter(predictions['prediction'] == 0) \
    .select("Descript","Category","probability","label","prediction") \
    .orderBy("probability", ascending=False) \
    .show(n = 10, truncate = 30)

In [None]:
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
evaluator = MulticlassClassificationEvaluator(predictionCol="prediction")
evaluator.evaluate(predictions)

# Clustering

In [None]:
# Tokenization
regexTokenizer = RegexTokenizer(inputCol="review_body", outputCol="words", pattern="[^A-Za-z]+", toLowercase=True)
tokenized_data = regexTokenizer.transform(df1)

In [None]:
# Stop Word Removal
from pyspark.ml.feature import StopWordsRemover
stopWordsRemover = StopWordsRemover(inputCol="words", outputCol="filtered_words")
filtered_data = stopWordsRemover.transform(tokenized_data)

In [None]:
# Vectorization
from pyspark.ml.feature import HashingTF, IDF
hashingTF = HashingTF(inputCol="filtered_words", outputCol="raw_features", numFeatures=20)
featurizedData = hashingTF.transform(filtered_data)

idf= IDF(inputCol="raw_features", outputCol="features")
idfModel = idf.fit(featurizedData)
featurized_data = idfModel.transform(featurizedData)

In [None]:
# Featurized Data
from pyspark.sql.functions import rand
# Show 10 random entries with label and features
featurized_data.select("label", "features").orderBy(rand()).limit(10).collect()

In [None]:
dataset = featurized_data.select("features")

# (a) K-Means

In [None]:
from pyspark.ml.clustering import KMeans

# Train a k-means model
kmeans = KMeans().setK(3).setSeed(1)
model = kmeans.fit(dataset)

# Evaluate clustering by computing Within Set Sum of Squared Errors
wssse = model.computeCost(dataset)
print("Within Set Sum of Squared Errors = " + str(wssse))

# Show the result
centers = model.clusterCenters()
print("Cluster Centers: ")
for center in centers:
    print(center)

In [None]:
# Predictions
predictions_kmeans = model.transform(dataset)
predictions_kmeans.show(1)

## (b) Latent Dirichlet Allocation (LDA)

In [None]:
from pyspark.ml.clustering import LDA

# Trains a LDA model
lda = LDA(k=3, maxIter=10)
model = lda.fit(dataset)

ll = model.logLikelihood(dataset)
lp = model.logPerplexity(dataset)
print("The lower bound on the log likelihood of the entire corpus: " + str(ll))
print("The upper bound on perplexity: " + str(lp))

# Describe topics
topics = model.describeTopics(3)
print("The topics described by their top-weighted terms:")
topics.show(truncate=False)

In [None]:
# Predictions
predictions_lda = model.transform(dataset)
predictions_lda.show(1)

## (c) Gaussian Mixture Model (GMM)

In [None]:
from pyspark.ml.clustering import GaussianMixture

gmm = GaussianMixture().setK(3).setSeed(538009335)
model = gmm.fit(dataset)

print("Gaussians shown as a DataFrame: ")
model.gaussiansDF.show(truncate=False)

In [None]:
# Predictions
predictions_gmm = model.transform(dataset)
predictions_gmm.show(3)

In [None]:
filtered_data.head()

In [None]:
# Vectorization
from pyspark.ml.feature import HashingTF, IDF
hashingTF = HashingTF(inputCol="filtered_words", outputCol="raw_features", numFeatures=10)
featurizedData = hashingTF.transform(filtered_data)

#idf= IDF(inputCol="raw_features", outputCol="features")
#idfModel = idf.fit(featurizedData)
#featurized_data = idfModel.transform(featurizedData)
featurized_data = featurizedData

In [None]:
featurized_data.head()

In [None]:
df_final = featurized_data.select("label", "raw_features")

In [None]:
from pyspark.ml.classification import LogisticRegression, OneVsRest
from pyspark.ml.evaluation import MulticlassClassificationEvaluator


In [None]:
from pyspark.ml.classification import LogisticRegression, OneVsRest
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
(train, test) = df_final.randomSplit([0.8, 0.2])


In [None]:
train1 = sc.parallelize(train)

In [None]:
# instantiate the base classifier.
lr = LogisticRegression(maxIter=10, tol=1E-6, fitIntercept=True)


In [None]:

# instantiate the One Vs Rest Classifier.
ovr = OneVsRest(classifier=lr)




In [None]:
# train the multiclass model.
ovrModel = ovr.fit(train)



In [None]:
# score the model on test data.
predictions = ovrModel.transform(test)

# obtain evaluator.
evaluator = MulticlassClassificationEvaluator(metricName="accuracy")

# compute the classification error on test data.
accuracy = evaluator.evaluate(predictions)
print("Test Error = %g" % (1.0 - accuracy))

In [None]:
# Featurized Data
from pyspark.sql.functions import rand
from pyspark.sql.types import ArrayType, DoubleType

vector_udf = udf(lambda vector: vector.toArray().tolist(), ArrayType(DoubleType()))
featurized_data2 = featurized_data.withColumn("feature_array", vector_udf(featurized_data.features))

output = featurized_data2.select(["id"] + [col("feature_array")[i] for i in range(20)] + ["label"])
output.repartition(1).write.save("/home/data.csv"
    ,format='csv'
    ,mode='overwrite'
)

In [None]:
# Show 10 random entries with label and features
df_final = featurized_data.select("label", "features") #.orderBy(rand()).limit(10).collect()

In [None]:
df_final.head()

In [None]:
from  pyspark.mllib.linalg import SparseVector, DenseVector
import pandas as pd
import numpy as np

sparseVec = featurized_data.select("features")
#features = featurized_data.select("features").apply(lambda x : np.array(SparseVec.toArray())).as_matrix().reshape(-1,1)
def dense_to_array(v):
    new_array = list([float(x) for x in v])
    return new_array
#test = dense_to_array(sparseVec)
# SparseVec.head()
# features = pd.DataFrame(SparseVec.toArray())

In [None]:
featurized_data.write.csv('data.csv')

In [None]:
dataset.head()
type(dataset)

In [None]:
type(df)

In [None]:
dataset.write.csv('data.csv')