This repository has been archived by the owner on Oct 8, 2020. It is now read-only.

Commit

use new nodeindexer class to transform key to int
carstendraschner committed Jun 10, 2020
1 parent c178237 commit 1bdec41
Showing 1 changed file with 12 additions and 1 deletion.
@@ -17,6 +17,8 @@ import org.apache.spark.ml.linalg
import org.apache.spark.sql
import shapeless.PolyDefns.->

import net.sansa_stack.ml.spark.utils.NodeIndexer




@@ -74,6 +76,12 @@ object minHashTryOut {

tmp_triples.foreach(println(_))

// instantiate node indexer to be able to swap between node and int representation
println("instantiate NodeIndexer")
val nodeIndexer = new NodeIndexer
nodeIndexer.fit(tmp_triples)
println(nodeIndexer.get_vocab_size(), nodeIndexer.get_node(4))

// try out dense form of transformation
println("triples for both directions")
val tmp1: RDD[(Node, Seq[Node])] = tmp_triples.flatMap(t => List((t.getSubject, Seq(t.getPredicate, t.getObject)), (t.getObject, Seq(t.getPredicate, t.getSubject))))
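
The NodeIndexer itself is not part of this diff; all that is visible here is the import of net.sansa_stack.ml.spark.utils.NodeIndexer and the calls fit, get_vocab_size, get_node and get_index. As a rough, hypothetical sketch of the idea only (not the actual SANSA implementation), an indexer that assigns a dense Int id to every distinct node of an RDD[Triple] could look like this:

import org.apache.jena.graph.{Node, Triple}
import org.apache.spark.rdd.RDD

// Hypothetical sketch: method names mirror the calls in the diff
// (fit, get_vocab_size, get_index, get_node), not the actual SANSA class.
class SimpleNodeIndexer extends Serializable {
  private var nodeToIndex: Map[Node, Int] = Map.empty
  private var indexToNode: Map[Int, Node] = Map.empty

  // collect every distinct subject, predicate and object and number them densely
  def fit(triples: RDD[Triple]): Unit = {
    val nodes = triples
      .flatMap(t => Seq(t.getSubject, t.getPredicate, t.getObject))
      .distinct()
      .collect()
    nodeToIndex = nodes.zipWithIndex.toMap
    indexToNode = nodeToIndex.map(_.swap)
  }

  def get_vocab_size(): Int = nodeToIndex.size
  def get_index(node: Node): Int = nodeToIndex(node)
  def get_node(index: Int): Node = indexToNode(index)
}

Collecting the vocabulary to the driver keeps the sketch small; for large graphs a distributed mapping (for example via RDD.zipWithIndex) would be the more realistic choice.
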
@@ -88,12 +96,15 @@ object minHashTryOut {
println("collect RDD to perform actions")
val tmp4 = tmp3.collect()
tmp4.foreach(println(_))
// transform keys to int representation
val tmp5 = tmp4.map(kv => nodeIndexer.get_index(kv._1))

println("transfer to dataframe")

// we need a node indexer because a DataFrame cannot hold non-primitive datatypes, and int indices are desirable anyway since they also reduce the size


val tmp7 = spark.createDataFrame(tmp4).toDF("id", "vals")
// val tmp4 = tmp3.toDF("node", "features")
tmp7.show()
// do this in one call
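
As the comment in the last hunk notes, spark.createDataFrame cannot encode Jena Node objects, which is why the keys are mapped to Int via get_index before the DataFrame step. A minimal sketch of indexing both the key and the value sequence before building the DataFrame, assuming tmp4 is still an Array[(Node, Seq[Node])], that get_index returns an Int, and reusing the fitted indexer from the hypothetical sketch above:

// Hypothetical continuation: map the key node and its feature nodes to Int ids
// so the tuples only contain types Spark SQL can encode (Int and Seq[Int]).
val indexedKv: Array[(Int, Seq[Int])] = tmp4.map { case (node, features) =>
  (nodeIndexer.get_index(node), features.map(f => nodeIndexer.get_index(f)))
}
val indexedDf = spark.createDataFrame(indexedKv).toDF("id", "vals")
indexedDf.show()
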
