This repository has been archived by the owner on Oct 8, 2020. It is now read-only.
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
renamed package to more suited and camelcase name
- Loading branch information
1 parent
21e59a5
commit b8bdeea
Showing
13 changed files
with
89 additions
and
12 deletions.
There are no files selected for viewing
2 changes: 1 addition & 1 deletion
2
...a-ml-spark/src/main/scala/net/sansa_stack/ml/spark/similarity/examples/minimalCalls.scala
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
77 changes: 77 additions & 0 deletions
77
sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/similarity/run/Resnik.scala
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,77 @@ | ||
package net.sansa_stack.ml.spark.similarity.run | ||
|
||
import net.sansa_stack.ml.spark.utils.FeatureExtractorModel | ||
import net.sansa_stack.owl.spark.dataset | ||
import net.sansa_stack.rdf.spark.io._ | ||
import org.apache.jena.riot.Lang | ||
import org.apache.spark.ml.feature.{CountVectorizer, CountVectorizerModel, IDF} | ||
import org.apache.spark.ml.linalg.Vector | ||
import org.apache.spark.sql.functions.{col, udf} | ||
import org.apache.spark.sql.types.DataTypes | ||
import org.apache.spark.sql.{DataFrame, Dataset, SparkSession} | ||
|
||
/**
 * Runnable example that prepares TF-IDF weighted feature vectors from an RDF
 * N-Triples file, as groundwork for a Resnik-style similarity estimation.
 *
 * Pipeline: read triples -> extract features per entity -> count-vectorize ->
 * drop all-zero vectors -> IDF rescaling. Each intermediate DataFrame is printed.
 */
object Resnik {

  /** Fallback input, used only when no path is supplied on the command line. */
  private val DefaultInputPath =
    "/Users/carstendraschner/GitHub/SANSA-ML/sansa-ml-spark/src/main/resources/movie.nt"

  /**
   * Runs the example pipeline on a local Spark session.
   *
   * @param args optional; `args(0)` is the path of the N-Triples file to read.
   *             When absent, [[DefaultInputPath]] is used.
   */
  def main(args: Array[String]): Unit = {

    // start spark session
    val spark = SparkSession.builder
      .appName("ResnikSimilarityEvaluation")
      .master("local[*]")
      .config("spark.serializer", "org.apache.spark.serializer.KryoSerializer")
      .getOrCreate()

    // Guarantee the session is stopped even when a stage of the pipeline throws.
    try {
      // take the input path from the first argument, falling back to the default
      val inputPath = args.headOption.getOrElse(DefaultInputPath)

      // read in data as DataFrame
      val triplesDf: DataFrame = spark.read.rdf(Lang.NTRIPLES)(inputPath)
      triplesDf.show()

      // feature extraction
      // NOTE(review): the meaning of mode "an" is defined by FeatureExtractorModel — confirm there.
      val featureExtractorModel = new FeatureExtractorModel()
        .setMode("an")
      val extractedFeaturesDataFrame = featureExtractorModel.transform(triplesDf)
      extractedFeaturesDataFrame.show()

      // count vectorization
      val cvModel: CountVectorizerModel = new CountVectorizer()
        .setInputCol("extractedFeatures")
        .setOutputCol("vectorizedFeatures")
        .fit(extractedFeaturesDataFrame)
      val tmpCvDf: DataFrame = cvModel.transform(extractedFeaturesDataFrame)

      // Typed UDF: the untyped udf(f, DataType) overload is rejected by default on Spark 3.
      val isNoneZeroVector = udf((v: Vector) => v.numNonzeros > 0)
      val countVectorizedFeaturesDataFrame: DataFrame = tmpCvDf
        .filter(isNoneZeroVector(col("vectorizedFeatures")))
        .select("uri", "vectorizedFeatures")
      countVectorizedFeaturesDataFrame.show()

      // inverse document frequency rescaling of the count vectors
      val idf = new IDF().setInputCol("vectorizedFeatures").setOutputCol("features")
      val idfModel = idf.fit(countVectorizedFeaturesDataFrame)

      val rescaledData = idfModel.transform(countVectorizedFeaturesDataFrame)
      rescaledData.show(false)
    } finally {
      spark.stop()
    }
  }
}
/* | ||
class InformationContentModel { | ||
def calcIC(df: DataFrame): DataFrame = { | ||
val ds: Dataset[(String, String, String)] = dataset.as[(String, String, String)] | ||
// collect all element occurrences | ||
val drdd = ds.rdd | ||
val occurences = drdd.map(_._3).filter(!_.contains("\"")) | ||
val occurenceMap = drdd | ||
.flatMap(t => Seq((t._1, 1), (t._3, 1))) | ||
val numberOccurences = occurenceMap.reduceByKey(_ + _) | ||
} | ||
}*/ |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
2 changes: 1 addition & 1 deletion
2
...rity/similarity_measures/BatetModel.scala → ...milarityEstimationModels/BatetModel.scala
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
2 changes: 1 addition & 1 deletion
2
...ilarity_measures/BraunBlanquetModel.scala → ...EstimationModels/BraunBlanquetModel.scala
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
2 changes: 1 addition & 1 deletion
2
...arity/similarity_measures/DiceModel.scala → ...imilarityEstimationModels/DiceModel.scala
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
2 changes: 1 addition & 1 deletion
2
...res/GenericSimilarityEstimatorModel.scala → ...els/GenericSimilarityEstimatorModel.scala
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
2 changes: 1 addition & 1 deletion
2
...ty/similarity_measures/JaccardModel.scala → ...larityEstimationModels/JaccardModel.scala
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
2 changes: 1 addition & 1 deletion
2
...ity/similarity_measures/OchiaiModel.scala → ...ilarityEstimationModels/OchiaiModel.scala
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
2 changes: 1 addition & 1 deletion
2
...ty/similarity_measures/SimpsonModel.scala → ...larityEstimationModels/SimpsonModel.scala
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
2 changes: 1 addition & 1 deletion
2
...ty/similarity_measures/TverskyModel.scala → ...larityEstimationModels/TverskyModel.scala
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters