Skip to content
This repository has been archived by the owner on Oct 8, 2020. It is now read-only.

Commit

Permalink
tests with filter option for only considering movies
Browse files Browse the repository at this point in the history
  • Loading branch information
carstendraschner committed Aug 13, 2020
1 parent a1ad081 commit 0558c53
Showing 1 changed file with 5 additions and 2 deletions.
Original file line number | Diff line number | Diff line change
Expand Up @@ -31,7 +31,10 @@ object minimalCalls {

// feature extraction
val featureExtractorModel = new FeatureExtractorModel()
- val extractedFeaturesDataFrame = featureExtractorModel.transform(triplesDf)
+ .setMode("an")
+ val extractedFeaturesDataFrame = featureExtractorModel
+ .transform(triplesDf)
+ .filter(t => t.getAs[String]("uri").startsWith("m"))
extractedFeaturesDataFrame.show()

// count Vectorization
Expand Down Expand Up @@ -122,7 +125,7 @@ object minimalCalls {
.select("uriA", "uriB", "distance")
val uriCandidates = (minHashedSimilarities.select("uriA").rdd.map(r => r(0)).collect().toSet.union(minHashedSimilarities.select("uriB").rdd.map(r => r(0)).collect().toSet)).toList
val dfGoodCandidatesDf = tmpDf.filter(col("uri").isInCollection(uriCandidates))
- // println(countVectorizedFeaturesDataFrame.count(), dfGoodCandidatesDf.count())
+ println(countVectorizedFeaturesDataFrame.count(), dfGoodCandidatesDf.count())
jaccardModel.similarityJoin(dfGoodCandidatesDf, dfGoodCandidatesDf, threshold = 0.5).show()
}
}

0 comments on commit 0558c53

Please sign in to comment.