Skip to content
This repository has been archived by the owner on Oct 8, 2020. It is now read-only.

Commit

Permalink
renamed package to more suited and camelcase name
Browse files Browse the repository at this point in the history
  • Loading branch information
carstendraschner committed Aug 31, 2020
1 parent 21e59a5 commit b8bdeea
Show file tree
Hide file tree
Showing 13 changed files with 89 additions and 12 deletions.
@@ -1,6 +1,6 @@
package net.sansa_stack.ml.spark.similarity.examples

import net.sansa_stack.ml.spark.similarity.similarity_measures.{BatetModel, BraunBlanquetModel, DiceModel, JaccardModel, OchiaiModel, SimpsonModel, TverskyModel}
import net.sansa_stack.ml.spark.similarity.similarityEstimationModels.{BatetModel, BraunBlanquetModel, DiceModel, JaccardModel, OchiaiModel, SimpsonModel, TverskyModel}
import net.sansa_stack.ml.spark.utils.FeatureExtractorModel
import org.apache.jena.riot.Lang
import org.apache.spark.sql.{DataFrame, Row, SparkSession}
Expand Down
Expand Up @@ -2,7 +2,7 @@ package net.sansa_stack.ml.spark.similarity.experiment

import java.util.Calendar

import net.sansa_stack.ml.spark.similarity.similarity_measures.{JaccardModel, TverskyModel}
import net.sansa_stack.ml.spark.similarity.similarityEstimationModels.{JaccardModel, TverskyModel}
import net.sansa_stack.ml.spark.utils.{ConfigResolver, FeatureExtractorModel, FileLister}
import net.sansa_stack.rdf.spark.io._
import org.apache.jena.riot.Lang
Expand Down
Expand Up @@ -4,7 +4,7 @@ import java.io.File
import java.util.Calendar

import com.typesafe.config.ConfigFactory
import net.sansa_stack.ml.spark.similarity.similarity_measures.JaccardModel
import net.sansa_stack.ml.spark.similarity.similarityEstimationModels.JaccardModel
import net.sansa_stack.ml.spark.utils.{FeatureExtractorModel, SimilarityExperimentMetaGraphFactory}
import net.sansa_stack.rdf.spark.io._
import org.apache.jena.riot.Lang
Expand Down
@@ -0,0 +1,77 @@
package net.sansa_stack.ml.spark.similarity.run

import net.sansa_stack.ml.spark.utils.FeatureExtractorModel
import net.sansa_stack.owl.spark.dataset
import net.sansa_stack.rdf.spark.io._
import org.apache.jena.riot.Lang
import org.apache.spark.ml.feature.{CountVectorizer, CountVectorizerModel, IDF}
import org.apache.spark.ml.linalg.Vector
import org.apache.spark.sql.functions.{col, udf}
import org.apache.spark.sql.types.DataTypes
import org.apache.spark.sql.{DataFrame, Dataset, SparkSession}

/**
 * Exploratory runner that reads an N-Triples file into a DataFrame, extracts
 * features per URI, count-vectorizes them and rescales the vectors with IDF,
 * printing every intermediate DataFrame to stdout.
 *
 * Usage: `Resnik [inputPath]` - when no argument is given the development
 * default path below is used.
 *
 * NOTE(review): the Spark application name is still "JaccardSimilarityEvaluation",
 * which looks like a leftover from another runner - confirm before renaming.
 */
object Resnik {

  // Fallback input used when no path is supplied on the command line.
  private val DefaultInputPath =
    "/Users/carstendraschner/GitHub/SANSA-ML/sansa-ml-spark/src/main/resources/movie.nt"

  /**
   * Runs the pipeline end to end.
   *
   * @param args optional; `args(0)` is the path of the N-Triples file to read
   */
  def main(args: Array[String]): Unit = {

    // take the input path from the command line if present, else use the default
    val inputPath: String = args.headOption.getOrElse(DefaultInputPath)

    // start spark session
    val spark = SparkSession.builder
      .appName("JaccardSimilarityEvaluation")
      .master("local[*]")
      .config("spark.serializer", "org.apache.spark.serializer.KryoSerializer")
      .getOrCreate()

    // make sure the session is stopped even if one of the stages below fails
    try {
      // read in data as DataFrame
      val triplesDf: DataFrame = spark.read.rdf(Lang.NTRIPLES)(inputPath)
      triplesDf.show()

      // feature extraction
      // NOTE(review): the meaning of mode "an" is defined by FeatureExtractorModel - not visible here
      val featureExtractorModel = new FeatureExtractorModel()
        .setMode("an")
      val extractedFeaturesDataFrame = featureExtractorModel.transform(triplesDf)
      extractedFeaturesDataFrame.show()

      // count vectorization
      val cvModel: CountVectorizerModel = new CountVectorizer()
        .setInputCol("extractedFeatures")
        .setOutputCol("vectorizedFeatures")
        .fit(extractedFeaturesDataFrame)
      val tmpCvDf: DataFrame = cvModel.transform(extractedFeaturesDataFrame)

      // typed udf: the untyped udf(f, dataType) overload is rejected by default on Spark 3.x
      val isNoneZeroVector = udf((v: Vector) => v.numNonzeros > 0)

      // keep only rows whose feature vector has at least one non-zero entry
      val countVectorizedFeaturesDataFrame: DataFrame = tmpCvDf
        .filter(isNoneZeroVector(col("vectorizedFeatures")))
        .select("uri", "vectorizedFeatures")
      countVectorizedFeaturesDataFrame.show()

      // rescale the count vectors with inverse document frequency
      val idf = new IDF().setInputCol("vectorizedFeatures").setOutputCol("features")
      val idfModel = idf.fit(countVectorizedFeaturesDataFrame)

      val rescaledData = idfModel.transform(countVectorizedFeaturesDataFrame)
      rescaledData.show(false)
    } finally {
      spark.stop()
    }
  }
}
/*
class InformationContentModel {
def calcIC(df: DataFrame): DataFrame = {
val ds: Dataset[(String, String, String)] = dataset.as[(String, String, String)]
// collect all element occurences
val drdd = ds.rdd
val occurences = drdd.map(_._3).filter(!_.contains("\""))
val occurenceMap = drdd
.flatMap(t => Seq((t._1, 1), (t._3, 1)))
val numberOccurences = occurenceMap.reduceByKey(_ + _)
}
}*/
Expand Up @@ -2,7 +2,7 @@ package net.sansa_stack.ml.spark.similarity.run

import java.util.Calendar

import net.sansa_stack.ml.spark.similarity.similarity_measures.TverskyModel
import net.sansa_stack.ml.spark.similarity.similarityEstimationModels.TverskyModel
import net.sansa_stack.ml.spark.utils.{FeatureExtractorModel, SimilarityExperimentMetaGraphFactory}
import net.sansa_stack.rdf.spark.io._
import org.apache.jena.riot.Lang
Expand Down
@@ -1,4 +1,4 @@
package net.sansa_stack.ml.spark.similarity.similarity_measures
package net.sansa_stack.ml.spark.similarity.similarityEstimationModels

import org.apache.spark.ml.linalg.Vector
import org.apache.spark.sql.DataFrame
Expand Down
@@ -1,4 +1,4 @@
package net.sansa_stack.ml.spark.similarity.similarity_measures
package net.sansa_stack.ml.spark.similarity.similarityEstimationModels

import org.apache.spark.ml.linalg.Vector
import org.apache.spark.sql.DataFrame
Expand Down
@@ -1,4 +1,4 @@
package net.sansa_stack.ml.spark.similarity.similarity_measures
package net.sansa_stack.ml.spark.similarity.similarityEstimationModels

import org.apache.spark.ml.linalg.Vector
import org.apache.spark.sql.DataFrame
Expand Down
@@ -1,4 +1,4 @@
package net.sansa_stack.ml.spark.similarity.similarity_measures
package net.sansa_stack.ml.spark.similarity.similarityEstimationModels

import org.apache.spark.ml.linalg.Vector
import org.apache.spark.sql.functions.{col, udf, lit, typedLit}
Expand Down
@@ -1,4 +1,4 @@
package net.sansa_stack.ml.spark.similarity.similarity_measures
package net.sansa_stack.ml.spark.similarity.similarityEstimationModels

import org.apache.spark
import org.apache.spark.ml.linalg.Vector
Expand Down
@@ -1,4 +1,4 @@
package net.sansa_stack.ml.spark.similarity.similarity_measures
package net.sansa_stack.ml.spark.similarity.similarityEstimationModels

import org.apache.spark.ml.linalg.Vector
import org.apache.spark.sql.DataFrame
Expand Down
@@ -1,4 +1,4 @@
package net.sansa_stack.ml.spark.similarity.similarity_measures
package net.sansa_stack.ml.spark.similarity.similarityEstimationModels

import org.apache.spark.ml.linalg.Vector
import org.apache.spark.sql.DataFrame
Expand Down
@@ -1,4 +1,4 @@
package net.sansa_stack.ml.spark.similarity.similarity_measures
package net.sansa_stack.ml.spark.similarity.similarityEstimationModels

import org.apache.spark.ml.linalg.Vector
import org.apache.spark.sql.DataFrame
Expand Down

0 comments on commit b8bdeea

Please sign in to comment.