diff --git a/sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/outliers/vandalismdetection/CharactersFeatures.scala b/sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/outliers/vandalismdetection/CharactersFeatures.scala index e2b6641..056f6cd 100644 --- a/sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/outliers/vandalismdetection/CharactersFeatures.scala +++ b/sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/outliers/vandalismdetection/CharactersFeatures.scala @@ -1,7 +1,8 @@ package net.sansa_stack.ml.spark.outliers.vandalismdetection +import java.util.regex.{ Matcher, Pattern } + import org.apache.spark.ml.linalg.{ Vector, Vectors } -import java.util.regex.{ Pattern, Matcher } class CharactersFeatures extends Serializable { @@ -10,7 +11,6 @@ class CharactersFeatures extends Serializable { val rounded: Double = Math.round(va * 10000).toDouble / 10000 rounded - } def Vector_Characters_Feature(StrValue: String): Array[Double] = { @@ -19,148 +19,145 @@ class CharactersFeatures extends Serializable { var RatioValues = new Array[Double](25) // Index is Important here val characterFeature_OBJ = new CharactersFeatures() - //1.Double result Value for uppercase Ration + // 1.Double result Value for uppercase Ration val uppercase = characterFeature_OBJ.UppercaseRation_Character(StrValue) if (!uppercase.isNaN()) { RatioValues(0) = RoundDouble(uppercase) } - //2.Double result Value for lowerCase Ratio + // 2.Double result Value for lowerCase Ratio val lowerCase = characterFeature_OBJ.LowercaseRation_Character(StrValue) if (!lowerCase.isNaN()) { RatioValues(1) = RoundDouble(lowerCase) } - //3.Double result Value for Alphanumeric Ratio + // 3.Double result Value for Alphanumeric Ratio val Alphanumeric = characterFeature_OBJ.AlphanumericRation_Character(StrValue) if (!Alphanumeric.isNaN()) { RatioValues(2) = RoundDouble(Alphanumeric) } - //4.Double result Value for ASCII Ratio + // 4.Double result Value for ASCII Ratio val ASCII = characterFeature_OBJ.ASCIIRation_Character(StrValue) if (!ASCII.isNaN()) { RatioValues(3) = RoundDouble(ASCII) } - //5.Double result Value for Bracket Ratio + // 5.Double result Value for Bracket Ratio val Bracket = characterFeature_OBJ.BracketRation_Character(StrValue) if (!Bracket.isNaN()) { RatioValues(4) = RoundDouble(Bracket) } - //6.Double result Value for Digits Ratio + // 6.Double result Value for Digits Ratio val Digits = characterFeature_OBJ.DigitsRation_Character(StrValue) if (!Digits.isNaN()) { RatioValues(5) = RoundDouble(Digits) } - //7.Double result Value for Latin Ratio + // 7.Double result Value for Latin Ratio val Latin = characterFeature_OBJ.Latin_Character(StrValue) if (!Latin.isNaN()) { RatioValues(6) = RoundDouble(Latin) } - //8.Double result Value for WhiteSpace Ratio + // 8.Double result Value for WhiteSpace Ratio val WhiteSpace = characterFeature_OBJ.WhiteSpace_Character(StrValue) if (!WhiteSpace.isNaN()) { RatioValues(7) = RoundDouble(WhiteSpace) } - //9.Double result Value for punc Ratio + // 9.Double result Value for punc Ratio val punc = characterFeature_OBJ.Punct_Character(StrValue) if (!punc.isNaN()) { RatioValues(8) = RoundDouble(punc) } - //10. Integer to Double result Value for LongCharacterSequence (1 integer) + // 10. 
Integer to Double result Value for LongCharacterSequence (1 integer) val LongCharacterSequence = characterFeature_OBJ.Longcharactersequence_Character(StrValue) if (!LongCharacterSequence.isNaN()) { RatioValues(9) = LongCharacterSequence } - //11.Double result Value for ArabicCharacter + // 11.Double result Value for ArabicCharacter val ArabicCharacter = characterFeature_OBJ.ArabicRation_Character(StrValue) if (!ArabicCharacter.isNaN()) { RatioValues(10) = RoundDouble(ArabicCharacter) } - //12.Double result Value for Bengali + // 12.Double result Value for Bengali val Bengali = characterFeature_OBJ.BengaliRation_Character(StrValue) if (!Bengali.isNaN()) { RatioValues(11) = RoundDouble(Bengali) } - //13.Double result Value for Brahmi + // 13.Double result Value for Brahmi val Brahmi = characterFeature_OBJ.BrahmiRation_Character(StrValue) if (!Brahmi.isNaN()) { RatioValues(12) = RoundDouble(Brahmi) - } - //14.Double result Value for Cyrillic + // 14.Double result Value for Cyrillic val Cyrillic = characterFeature_OBJ.CyrillicRation_Character(StrValue) if (!Cyrillic.isNaN()) { RatioValues(13) = RoundDouble(Cyrillic) - } - //15.Double result Value for Han + // 15.Double result Value for Han val Han = characterFeature_OBJ.HanRatio_Character(StrValue) if (!Han.isNaN()) { RatioValues(14) = RoundDouble(Han) - } - //16.Double result Value for Malysia + // 16.Double result Value for Malayalam val Malysia = characterFeature_OBJ.MalaysRatio_Character(StrValue) if (!Malysia.isNaN()) { RatioValues(15) = RoundDouble(Malysia) } - //17.Double result Value for Tami + // 17.Double result Value for Tamil val Tami = characterFeature_OBJ.TamilRatio_Character(StrValue) if (!Tami.isNaN()) { RatioValues(16) = RoundDouble(Tami) } - //18.Double result Value for Telugu + // 18.Double result Value for Telugu val Telugu = characterFeature_OBJ.TeluguRatio_Character(StrValue) if (!Telugu.isNaN()) { RatioValues(17) = RoundDouble(Telugu) } - //19.Double result Value for Symbol + // 19.Double result Value for Symbol val Symbol = characterFeature_OBJ.Symbol_Character(StrValue) if (!Symbol.isNaN()) { RatioValues(18) = RoundDouble(Symbol) } - //20. Double Alphabets Ration: + // 20. Double Alphabets Ratio: val Alphabets = characterFeature_OBJ.AlphaBetsRation_Character(StrValue) if (!Alphabets.isNaN()) { RatioValues(19) = RoundDouble(Alphabets) } - //21. Double AVisible character Ratio: + // 21. Double A visible character Ratio: val Visible = characterFeature_OBJ.VisibleRation_Character(StrValue) if (!Visible.isNaN()) { RatioValues(20) = RoundDouble(Visible) } - //22. Double Printable character Ratio: + // 22. Double Printable character Ratio: val Printable = characterFeature_OBJ.PrintableRation_Character(StrValue) if (!Printable.isNaN()) { RatioValues(21) = RoundDouble(Printable) } - //23.Double Blank character Ratio: + // 23.Double Blank character Ratio: val Blank = characterFeature_OBJ.BlankRation_Character(StrValue) if (!Blank.isNaN()) { RatioValues(22) = RoundDouble(Blank) } - //24.Double A control character: + // 24.Double A control character Ratio: val Control = characterFeature_OBJ.ControlRation_Character(StrValue) if (!Control.isNaN()) { RatioValues(23) = RoundDouble(Control) } - - //25. Double A hexadecimal digit : + // 25. 
Double A hexadecimal digit : val hexadecimal = characterFeature_OBJ.HexaRation_Character(StrValue) if (!hexadecimal.isNaN()) { RatioValues(24) = RoundDouble(hexadecimal) } + // val FacilityOBJ = new FacilitiesClass() // val vector_Values = FacilityOBJ.ToVector(RatioValues) @@ -176,7 +173,8 @@ class CharactersFeatures extends Serializable { } charRatio } - //1.Uppercase Ratio: + + // 1.Uppercase Ratio: def UppercaseRation_Character(str: String): Double = { val pattern: Pattern = Pattern.compile("\\p{javaUpperCase}") val result: Double = characterRatio(str, pattern) @@ -187,51 +185,51 @@ class CharactersFeatures extends Serializable { val result: Double = characterRatio(str, pattern) result } - //3.Alphanumeric + // 3.Alphanumeric def AlphanumericRation_Character(str: String): Double = { val pattern: Pattern = Pattern.compile("\\p{Alnum}") val result: Double = characterRatio(str, pattern) result } - //4.ASCII + // 4.ASCII def ASCIIRation_Character(str: String): Double = { val pattern: Pattern = Pattern.compile("\\p{ASCII}") val result: Double = characterRatio(str, pattern) result } - //5.Bracket + // 5.Bracket def BracketRation_Character(str: String): Double = { val pattern: Pattern = Pattern.compile("\\(|\\)|\\}|\\{|\\[|\\]") val result: Double = characterRatio(str, pattern) result } - //6.Digits + // 6.Digits def DigitsRation_Character(str: String): Double = { val pattern: Pattern = Pattern.compile("\\d") val result: Double = characterRatio(str, pattern) result } - //7.Latin + // 7.Latin def Latin_Character(str: String): Double = { val pattern: Pattern = Pattern.compile("\\p{IsLatin}") val result: Double = characterRatio(str, pattern) result } - //8.WhiteSpace + // 8.WhiteSpace def WhiteSpace_Character(str: String): Double = { val pattern: Pattern = Pattern.compile("\\s") val result: Double = characterRatio(str, pattern) result } - //9.Punct + // 9.Punct def Punct_Character(str: String): Double = { val pattern: Pattern = Pattern.compile("\\p{Punct}") val result: Double = characterRatio(str, pattern) result } - //10.Long character sequence: + // 10.Long character sequence: def Longcharactersequence_Character(str: String): Double = { var text: String = str var maxlength: Integer = null @@ -265,96 +263,96 @@ class CharactersFeatures extends Serializable { } - //11.ARabic Ratio: + // 11.ARabic Ratio: def ArabicRation_Character(str: String): Double = { val pattern: Pattern = Pattern.compile("\\p{IsArabic}") val result: Double = characterRatio(str, pattern) result } - //12. Bengali Ratio + // 12. 
Bengali Ratio def BengaliRation_Character(str: String): Double = { val pattern: Pattern = Pattern.compile("\\p{IsBengali}") val result: Double = characterRatio(str, pattern) result } - //13.Brahmi Ratio + // 13.Brahmi Ratio def BrahmiRation_Character(str: String): Double = { val pattern: Pattern = Pattern.compile("\\p{IsBrahmi}") val result: Double = characterRatio(str, pattern) result } - //14.Cyrillic Ratio + // 14.Cyrillic Ratio def CyrillicRation_Character(str: String): Double = { val pattern: Pattern = Pattern.compile("\\p{IsCyrillic}") val result: Double = characterRatio(str, pattern) result } - //15.HanRatio + // 15.Han Ratio def HanRatio_Character(str: String): Double = { val pattern: Pattern = Pattern.compile("\\p{IsHan}") val result: Double = characterRatio(str, pattern) result } - //16.Malaysian Ratio: + // 16.Malayalam Ratio: def MalaysRatio_Character(str: String): Double = { val pattern: Pattern = Pattern.compile("\\p{IsMalayalam}") val result: Double = characterRatio(str, pattern) result } - //17.Tamil Ratio: + // 17.Tamil Ratio: def TamilRatio_Character(str: String): Double = { val pattern: Pattern = Pattern.compile("\\p{IsTamil}") val result: Double = characterRatio(str, pattern) result } - //18.Telugu Ration: + // 18.Telugu Ratio: def TeluguRatio_Character(str: String): Double = { val pattern: Pattern = Pattern.compile("\\p{IsTelugu}") val result: Double = characterRatio(str, pattern) result } - //19.Symbols Ratio : + // 19.Symbols Ratio: def Symbol_Character(str: String): Double = { val pattern: Pattern = Pattern.compile("[#$%&@+-_+*/]*") val result: Double = characterRatio(str, pattern) result } - //20.Alphabets Ratio : + // 20.Alphabets Ratio: def AlphaBetsRation_Character(str: String): Double = { val pattern: Pattern = Pattern.compile("\\p{Alpha}") val result: Double = characterRatio(str, pattern) result } - //21.A visible character Ratio: + // 21.A visible character Ratio: def VisibleRation_Character(str: String): Double = { val pattern: Pattern = Pattern.compile("\\p{Graph}") val result: Double = characterRatio(str, pattern) result } - //22.A printable character + // 22.A printable character Ratio def PrintableRation_Character(str: String): Double = { val pattern: Pattern = Pattern.compile("\\p{Print}") val result: Double = characterRatio(str, pattern) result } - //23.A Black(it is different from White space) character Ratio + // 23.A Blank (distinct from whitespace) character Ratio def BlankRation_Character(str: String): Double = { val pattern: Pattern = Pattern.compile("\\p{Blank}") val result: Double = characterRatio(str, pattern) result } - //24.Control character Ratio + // 24.Control character Ratio def ControlRation_Character(str: String): Double = { val pattern: Pattern = Pattern.compile("\\p{Cntrl}") val result: Double = characterRatio(str, pattern) result } - //25.HexaDecimal character Ratio + // 25.Hexadecimal character Ratio def HexaRation_Character(str: String): Double = { val pattern: Pattern = Pattern.compile("\\p{XDigit}") val result: Double = characterRatio(str, pattern) @@ -362,4 +360,4 @@ class CharactersFeatures extends Serializable { } // Character features: ------ End calculation the Ratio for character: -} \ No newline at end of file +} diff --git a/sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/outliers/vandalismdetection/Classifiers.scala b/sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/outliers/vandalismdetection/Classifiers.scala index 3549c50..7794d85 100644 --- 
a/sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/outliers/vandalismdetection/Classifiers.scala +++ b/sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/outliers/vandalismdetection/Classifiers.scala @@ -1,33 +1,27 @@ package net.sansa_stack.ml.spark.outliers.vandalismdetection -import org.apache.spark.{ SparkContext, RangePartitioner } -import org.apache.spark.rdd.RDD -import org.apache.spark.sql._ -import org.apache.spark.sql.types.{ DoubleType, StringType, IntegerType, StructField, StructType } -import org.apache.spark.ml.linalg.{ Vector, Vectors } -import org.apache.spark.ml.classification.{ GBTClassificationModel, GBTClassifier } -import org.apache.spark.ml.classification.DecisionTreeClassificationModel -import org.apache.spark.ml.classification.DecisionTreeClassifier -import org.apache.spark.ml.evaluation.MulticlassClassificationEvaluator -import org.apache.spark.ml.classification.LogisticRegression -import org.apache.spark.ml.classification.MultilayerPerceptronClassifier +import java.io.{ File, IOException } +import java.text.SimpleDateFormat +import java.util.{ Calendar, Date } + import scala.collection.mutable -import org.apache.spark.mllib.evaluation.BinaryClassificationMetrics -import org.apache.spark.ml.evaluation.BinaryClassificationEvaluator + +import org.apache.commons.io.FileUtils +import org.apache.spark.{ RangePartitioner, SparkContext } +import org.apache.spark.ml.classification.{ DecisionTreeClassificationModel, DecisionTreeClassifier, GBTClassificationModel, GBTClassifier, LogisticRegression, MultilayerPerceptronClassifier, RandomForestClassificationModel, RandomForestClassifier } +import org.apache.spark.ml.evaluation.{ BinaryClassificationEvaluator, MulticlassClassificationEvaluator } import org.apache.spark.ml.feature.{ IndexToString, StringIndexer, VectorIndexer } -import org.apache.spark.ml.classification.{ RandomForestClassificationModel, RandomForestClassifier } import org.apache.spark.ml.Pipeline -import org.apache.commons.io.FileUtils; -import java.io.File; -import java.io.IOException; -import java.util.Calendar -import java.text.SimpleDateFormat -import java.util.Date -import org.apache.spark.mllib.classification.{SVMModel, SVMWithSGD} +import org.apache.spark.ml.linalg.{ Vector, Vectors } +import org.apache.spark.mllib.classification.{ SVMModel, SVMWithSGD } +import org.apache.spark.mllib.evaluation.BinaryClassificationMetrics +import org.apache.spark.rdd.RDD +import org.apache.spark.sql._ +import org.apache.spark.sql.types.{ DoubleType, IntegerType, StringType, StructField, StructType } class Classifiers extends Serializable { - //1.ok ----- + // 1.ok ----- def RandomForestClassifer(DF_Training: DataFrame, DF_Testing: DataFrame, sc: SparkContext): String = { val sqlContext = new org.apache.spark.sql.SQLContext(sc) @@ -48,7 +42,7 @@ class Classifiers extends Serializable { // val Array(DF_Testing) = DF_Testing//.randomSplit(Array(0.100)) // Train a RandomForest model. - val rf = new RandomForestClassifier().setImpurity("gini").setMaxDepth(3).setNumTrees(20).setFeatureSubsetStrategy("auto").setSeed(5043).setLabelCol("indexedLabel").setFeaturesCol("indexedFeatures") //.setNumTrees(20) + val rf = new RandomForestClassifier().setImpurity("gini").setMaxDepth(3).setNumTrees(20).setFeatureSubsetStrategy("auto").setSeed(5043).setLabelCol("indexedLabel").setFeaturesCol("indexedFeatures") // .setNumTrees(20) // Convert indexed labels back to original labels. 
val labelConverter = new IndexToString().setInputCol("prediction").setOutputCol("predictedLabel").setLabels(labelIndexer.labels) @@ -66,7 +60,7 @@ class Classifiers extends Serializable { val finlaPrediction = predictions.select("Rid", "features", "FinalROLLBACK_REVERTED", "predictedLabel") predictions.show() - //Case1 : BinaryClassificationEvaluator:OK ------------------------------------------------------ + // Case1 : BinaryClassificationEvaluator:OK ------------------------------------------------------ val binaryClassificationEvaluator = new BinaryClassificationEvaluator().setLabelCol("indexedLabel").setRawPredictionCol("rawPrediction") var results1 = 0.0 def printlnMetricCAse1(metricName: String): Double = { @@ -79,7 +73,7 @@ class Classifiers extends Serializable { val PR = printlnMetricCAse1("areaUnderPR") // Case 2: MulticlassClassificationEvaluator:OK ----------------------------------------------------- - //Select (prediction, true label) and compute test error. + // Select (prediction, true label) and compute test error. val MulticlassClassificationEvaluator = new MulticlassClassificationEvaluator().setLabelCol("indexedLabel").setPredictionCol("prediction") var results2 = 0.0 @@ -93,10 +87,10 @@ class Classifiers extends Serializable { val Recall = printlnMetricCase2("weightedRecall") val finalResult = "ROC=" + ROC.toString() + "|" + "PR=" + PR.toString() + "|" + "accuracy=" + accuracy.toString() + "|" + "Precision=" + Precision.toString() + "|" + "Recall=" + Recall.toString() - finalResult + finalResult } - //2.ok------ + // 2.ok------ def DecisionTreeClassifier(DF_Training: DataFrame, DF_Testing: DataFrame, sc: SparkContext): String = { val sqlContext = new org.apache.spark.sql.SQLContext(sc) @@ -135,40 +129,38 @@ class Classifiers extends Serializable { val predictions = modelxx.transform(TestingData) // Select example rows to display. - //val finlaPrediction = predictions.select("Rid", "features", "FinalROLLBACK_REVERTED", "predictedLabel") + // val finlaPrediction = predictions.select("Rid", "features", "FinalROLLBACK_REVERTED", "predictedLabel") - //Case1 : BinaryClassificationEvaluator:---------------------------------------------------------- + // Case1 : BinaryClassificationEvaluator:---------------------------------------------------------- val binaryClassificationEvaluator = new BinaryClassificationEvaluator().setLabelCol("indexedLabel").setRawPredictionCol("rawPrediction") - - var result1=0.0 + + var result1 = 0.0 def printlnMetricCAse1(metricName: String): Double = { - result1 =binaryClassificationEvaluator.setMetricName(metricName).evaluate(predictions) - println(metricName + " = " +result1 ) - + result1 = binaryClassificationEvaluator.setMetricName(metricName).evaluate(predictions) + println(metricName + " = " + result1) + result1 } val ROC = printlnMetricCAse1("areaUnderROC") val PR = printlnMetricCAse1("areaUnderPR") - //Case 2: MulticlassClassificationEvaluator:----------------------------------------------------- - //Select (prediction, true label) and compute test error. + // Case 2: MulticlassClassificationEvaluator:----------------------------------------------------- + // Select (prediction, true label) and compute test error. 
val MulticlassClassificationEvaluator = new MulticlassClassificationEvaluator().setLabelCol("indexedLabel").setPredictionCol("prediction") - var result2=0.0 + var result2 = 0.0 def printlnMetricCase2(metricName: String): Double = { - result2=MulticlassClassificationEvaluator.setMetricName(metricName).evaluate(predictions) + result2 = MulticlassClassificationEvaluator.setMetricName(metricName).evaluate(predictions) println(metricName + " = " + result2) result2 } - val accuracy = printlnMetricCase2("accuracy") + val accuracy = printlnMetricCase2("accuracy") val Precision = printlnMetricCase2("weightedPrecision") val Recall = printlnMetricCase2("weightedRecall") - - val finalResult = "ROC=" + ROC.toString() + "|" + "PR=" + PR.toString() + "|" + "accuracy=" + accuracy.toString() + "|" + "Precision=" + Precision.toString() + "|" + "Recall=" + Recall.toString() + val finalResult = "ROC=" + ROC.toString() + "|" + "PR=" + PR.toString() + "|" + "accuracy=" + accuracy.toString() + "|" + "Precision=" + Precision.toString() + "|" + "Recall=" + Recall.toString() finalResult - - + } // 3.Ok -------- @@ -210,7 +202,7 @@ class Classifiers extends Serializable { predictions.show() - //Case1 : BinaryClassificationEvaluator:---------------------------------------------------------- + // Case1 : BinaryClassificationEvaluator:---------------------------------------------------------- val binaryClassificationEvaluator = new BinaryClassificationEvaluator().setLabelCol("indexedLabel").setRawPredictionCol("rawPrediction") var results1 = 0.0 def printlnMetricCase1(metricName: String): Double = { @@ -222,13 +214,13 @@ class Classifiers extends Serializable { val ROC = printlnMetricCase1("areaUnderROC") val PR = printlnMetricCase1("areaUnderPR") - //Case 2: MulticlassClassificationEvaluator:----------------------------------------------------- - //Select (prediction, true label) and compute test error. + // Case 2: MulticlassClassificationEvaluator:----------------------------------------------------- + // Select (prediction, true label) and compute test error. val MulticlassClassificationEvaluator = new MulticlassClassificationEvaluator().setLabelCol("indexedLabel").setPredictionCol("prediction") - var result2=0.0 + var result2 = 0.0 def printlnMetricCase2(metricName: String): Double = { - - result2=MulticlassClassificationEvaluator.setMetricName(metricName).evaluate(predictions) + + result2 = MulticlassClassificationEvaluator.setMetricName(metricName).evaluate(predictions) println(metricName + " = " + result2) result2 } @@ -236,13 +228,12 @@ class Classifiers extends Serializable { val Precision = printlnMetricCase2("weightedPrecision") val Recall = printlnMetricCase2("weightedRecall") - - val finalResult = "ROC=" + ROC.toString() + "|" + "PR=" + PR.toString() + "|" + "accuracy=" + accuracy.toString() + "|" + "Precision=" + Precision.toString() + "|" + "Recall=" + Recall.toString() + val finalResult = "ROC=" + ROC.toString() + "|" + "PR=" + PR.toString() + "|" + "accuracy=" + accuracy.toString() + "|" + "Precision=" + Precision.toString() + "|" + "Recall=" + Recall.toString() + + finalResult - finalResult - } - //4. OK----- + // 4. OK----- def GradientBoostedTree(DF_Training: DataFrame, DF_Testing: DataFrame, sc: SparkContext): String = { val sqlContext = new org.apache.spark.sql.SQLContext(sc) @@ -265,7 +256,7 @@ class Classifiers extends Serializable { // val Array(trainingData, testData) = Data.randomSplit(Array(0.7, 0.3)) // Train a DecisionTree model. 
- val gbt = new GBTClassifier().setLabelCol("indexedLabel").setFeaturesCol("indexedFeatures") //.setMaxIter(10) + val gbt = new GBTClassifier().setLabelCol("indexedLabel").setFeaturesCol("indexedFeatures") // .setMaxIter(10) // Convert indexed labels back to original labels. val labelConverter = new IndexToString().setInputCol("prediction").setOutputCol("predictedLabel").setLabels(labelIndexer.labels) @@ -281,7 +272,7 @@ class Classifiers extends Serializable { // Select example rows to display. - //Case1 : BinaryClassificationEvaluator:---------------------------------------------------------- + // Case1 : BinaryClassificationEvaluator:---------------------------------------------------------- var predictionsRDD = predictions.select("prediction", "FinalROLLBACK_REVERTED").rdd var predictionAndLabels = predictionsRDD.map { row => (row.get(0).asInstanceOf[Double], row.get(1).asInstanceOf[Double]) } @@ -290,32 +281,31 @@ class Classifiers extends Serializable { println("Area under ROC = " + metrics.areaUnderROC()) println("Area under PR = " + metrics.areaUnderPR()) - val ROC =metrics.areaUnderROC() - val PR= metrics.areaUnderPR() - - - //Case 2: MulticlassClassificationEvaluator:----------------------------------------------------- - //Select (prediction, true label) and compute test error. + val ROC = metrics.areaUnderROC() + val PR = metrics.areaUnderPR() + + // Case 2: MulticlassClassificationEvaluator:----------------------------------------------------- + // Select (prediction, true label) and compute test error. val MulticlassClassificationEvaluator = new MulticlassClassificationEvaluator().setLabelCol("indexedLabel").setPredictionCol("prediction") - var result2=0.0 + var result2 = 0.0 def printlnMetric(metricName: String): Double = { - - result2= MulticlassClassificationEvaluator.setMetricName(metricName).evaluate(predictions) - println(metricName + " = " +result2) + + result2 = MulticlassClassificationEvaluator.setMetricName(metricName).evaluate(predictions) + println(metricName + " = " + result2) result2 } val accuracy = printlnMetric("accuracy") val Precision = printlnMetric("weightedPrecision") val Recall = printlnMetric("weightedRecall") - + val finalResult = "ROC=" + ROC.toString() + "|" + "PR=" + PR.toString() + "|" + "accuracy=" + accuracy.toString() + "|" + "Precision=" + Precision.toString() + "|" + "Recall=" + Recall.toString() - finalResult + finalResult } - //5.Ok------------ + // 5.Ok------------ def MultilayerPerceptronClassifier(DF_Training: DataFrame, DF_Testing: DataFrame, sc: SparkContext): String = { val sqlContext = new org.apache.spark.sql.SQLContext(sc) @@ -352,7 +342,7 @@ class Classifiers extends Serializable { // predictions.show() - //Case1 : BinaryClassificationEvaluator:---------------------------------------------------------- + // Case1 : BinaryClassificationEvaluator:---------------------------------------------------------- var predictionsDF = predictions.select("prediction", "label") var predictionsRDD = predictions.select("prediction", "label").rdd var predictionAndLabels = predictionsRDD.map { row => (row.get(0).asInstanceOf[Double], row.get(1).asInstanceOf[Double]) } @@ -361,13 +351,10 @@ class Classifiers extends Serializable { println("Area under ROC = " + metrics.areaUnderROC()) println("Area under PR = " + metrics.areaUnderPR()) - - val ROC =metrics.areaUnderROC() - val PR= metrics.areaUnderPR() - - - - //Case 2: MulticlassClassificationEvaluator:----------------------------------------------------- + val ROC = metrics.areaUnderROC() + val PR = 
metrics.areaUnderPR() + + // Case 2: MulticlassClassificationEvaluator:----------------------------------------------------- val accuracyevaluator = new MulticlassClassificationEvaluator().setMetricName("accuracy") val weightedPrecisionevaluator = new MulticlassClassificationEvaluator().setMetricName("weightedPrecision") val weightedRecallevaluator = new MulticlassClassificationEvaluator().setMetricName("weightedRecall") @@ -375,22 +362,14 @@ class Classifiers extends Serializable { println("Accuracy = " + accuracyevaluator.evaluate(predictionsDF)) println("weightedPrecision = " + weightedPrecisionevaluator.evaluate(predictionsDF)) println("weightedRecall = " + weightedRecallevaluator.evaluate(predictionsDF)) - - + val accuracy = accuracyevaluator.evaluate(predictionsDF) val Precision = weightedPrecisionevaluator.evaluate(predictionsDF) val Recall = weightedRecallevaluator.evaluate(predictionsDF) - - - val finalResult = "ROC=" + ROC.toString() + "|" + "PR=" + PR.toString() + "|" + "accuracy=" + accuracy.toString() + "|" + "Precision=" + Precision.toString() + "|" + "Recall=" + Recall.toString() - finalResult - - + val finalResult = "ROC=" + ROC.toString() + "|" + "PR=" + PR.toString() + "|" + "accuracy=" + accuracy.toString() + "|" + "Precision=" + Precision.toString() + "|" + "Recall=" + Recall.toString() + finalResult } - - - -} \ No newline at end of file +} diff --git a/sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/outliers/vandalismdetection/CommentProcessor.scala b/sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/outliers/vandalismdetection/CommentProcessor.scala index 0f0ecc3..834cd7f 100644 --- a/sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/outliers/vandalismdetection/CommentProcessor.scala +++ b/sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/outliers/vandalismdetection/CommentProcessor.scala @@ -1,6 +1,7 @@ package net.sansa_stack.ml.spark.outliers.vandalismdetection import java.util.regex.{ Matcher, Pattern } + import org.slf4j.{ Logger, LoggerFactory } class CommentProcessor extends Serializable { @@ -97,7 +98,7 @@ class CommentProcessor extends Serializable { actions } - //Ok: helper for Revision Features: extract Action- subaction from comment: + // Ok: helper for Revision Features: extract Action- subaction from comment: def Extract_ActionsOfNormalComment(comment: String): String = { var result: Boolean = false @@ -108,7 +109,7 @@ class CommentProcessor extends Serializable { var Param = "" var parameters: Array[String] = Array.ofDim[String](0) var asterisk_Start = 0 // == /* - var asterisk_End = 0 //== */ + var asterisk_End = 0 // == */ var colon = 0 if (comment != null) { val check_asterisk_Start = comment.contains("/*") @@ -182,7 +183,7 @@ class CommentProcessor extends Serializable { var suffixComment = "" var asterisk_Start = 0 // == /* - var asterisk_End = 0 //== */ + var asterisk_End = 0 // == */ var colon = 0 if (comment != null) { @@ -246,7 +247,7 @@ class CommentProcessor extends Serializable { var Param = "" var parameters: Array[String] = Array.ofDim[String](0) var asterisk_Start = 0 // == /* - var asterisk_End = 0 //== */ + var asterisk_End = 0 // == */ var colon = 0 if (comment != null) { val check_asterisk_Start = comment.contains("/*") @@ -324,7 +325,7 @@ class CommentProcessor extends Serializable { var Param = "" var parameters: Array[String] = Array.ofDim[String](0) var asterisk_Start = 0 // == /* - var asterisk_End = 0 //== */ + var asterisk_End = 0 // == */ var colon = 0 if (comment != null) { val check_asterisk_Start = 
comment.contains("/*") @@ -403,7 +404,7 @@ class CommentProcessor extends Serializable { var Param = "" var parameters: Array[String] = Array.ofDim[String](0) var asterisk_Start = 0 // == /* - var asterisk_End = 0 //== */ + var asterisk_End = 0 // == */ var colon = 0 if (comment != null) { val check_asterisk_Start = comment.contains("/*") @@ -569,7 +570,7 @@ class CommentProcessor extends Serializable { } else { - //do not thing + // do nothing } @@ -584,7 +585,7 @@ class CommentProcessor extends Serializable { } - //"Thecommentis" + result_Str + "&&&" + "Ac1:" + Action1 + "&&&" + "Ac2 :" + Action2 + "&&&" + "SF:" + suffixComment + // "Thecommentis" + result_Str + "&&&" + "Ac1:" + Action1 + "&&&" + "Ac2 :" + Action2 + "&&&" + "SF:" + suffixComment def isRollback(comment: String): Boolean = { var result: Boolean = false if (comment != null) { @@ -594,8 +595,8 @@ class CommentProcessor extends Serializable { logger.debug("Robust but not precise rollback match (result = " + result + ") : " + tmp) } } - //result = tmp.startsWith("Reverted"); - //result = tmp.startsWith("Reverted"); + // result = tmp.startsWith("Reverted"); + // result = tmp.startsWith("Reverted"); result } @@ -613,8 +614,8 @@ class CommentProcessor extends Serializable { } } } - //result = (tmp.startsWith("Undid") || tmp.startsWith("Undo")) ; - //result = (tmp.startsWith("Undid") || tmp.startsWith("Undo")) ; + // result = (tmp.startsWith("Undid") || tmp.startsWith("Undo")) ; + // result = (tmp.startsWith("Undid") || tmp.startsWith("Undo")) ; result } @@ -632,8 +633,8 @@ class CommentProcessor extends Serializable { } } } - //result = (tmp.startsWith("Restored") || tmp.startsWith("Restore")); - //result = (tmp.startsWith("Restored") || tmp.startsWith("Restore")); + // result = (tmp.startsWith("Restored") || tmp.startsWith("Restore")); + // result = (tmp.startsWith("Restored") || tmp.startsWith("Restore")); result } @@ -693,7 +694,7 @@ class CommentProcessor extends Serializable { } def getUndoneRevisionId(comment: String): Long = { - var result: Long = 0l + var result: Long = 0L val matcher: Matcher = ROBUST_UNDO_PATTERN.matcher(comment) if (matcher.matches()) { val str: String = matcher.group(2) @@ -705,7 +706,7 @@ class CommentProcessor extends Serializable { } def getRestoredRevisionId(comment: String): Long = { - var result: Long = 0l + var result: Long = 0L val matcher: Matcher = ROBUST_RESTORE_PATTERN.matcher(comment) if (matcher.matches()) { val str: String = matcher.group(1) @@ -869,4 +870,4 @@ class CommentProcessor extends Serializable { def getItemValue(): String = itemValue -} \ No newline at end of file +} diff --git a/sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/outliers/vandalismdetection/FacilitiesClass.scala b/sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/outliers/vandalismdetection/FacilitiesClass.scala index a0902aa..4188dd4 100644 --- a/sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/outliers/vandalismdetection/FacilitiesClass.scala +++ b/sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/outliers/vandalismdetection/FacilitiesClass.scala @@ -1,9 +1,9 @@ package net.sansa_stack.ml.spark.outliers.vandalismdetection +import org.apache.spark.ml.linalg.{ Vector, Vectors } import org.apache.spark.rdd.RDD import org.apache.spark.sql._ -import org.apache.spark.sql.types.{ DoubleType, StringType, IntegerType, StructField, StructType } -import org.apache.spark.ml.linalg.{ Vector, Vectors } +import org.apache.spark.sql.types.{ DoubleType, IntegerType, StringType, StructField, StructType 
} class FacilitiesClass extends Serializable { @@ -18,68 +18,68 @@ class FacilitiesClass extends Serializable { namesList } - //ok --- Used for DF Triples + // ok --- Used for DF Triples def RDD_TO_DFR_RDFXML(rdd: RDD[String], sqlContext: org.apache.spark.sql.SQLContext): DataFrame = { - //Create an Encoded Schema in a String Format: + // Create an Encoded Schema in a String Format: val schemaString = "Subject Predicate Object" - //Generate schema: + // Generate schema: val schema = StructType(schemaString.split(" ").map(fieldName ⇒ StructField(fieldName, StringType, true))) - //Apply Transformation for Reading Data from Text File + // Apply Transformation for Reading Data from Text File val rowRDD = rdd.map(_.split(" ")).map(e ⇒ Row(e(0), e(1), e(2))) - //Apply RowRDD in Row Data based on Schema: + // Apply RowRDD in Row Data based on Schema: val RDFTRIPLE = sqlContext.createDataFrame(rowRDD, schema) - //Store DataFrame Data into Table + // Store DataFrame Data into Table RDFTRIPLE.registerTempTable("SPO") - //Select Query on DataFrame + // Select Query on DataFrame val dfr = sqlContext.sql("SELECT * FROM SPO") dfr.show() dfr } - //ok --- Used for DF Triples + // ok --- Used for DF Triples def RDD_TO_DFR_TRIX(rdd: RDD[String], sqlContext: org.apache.spark.sql.SQLContext): DataFrame = { - //Create an Encoded Schema in a String Format: + // Create an Encoded Schema in a String Format: val schemaString = "Subject Predicate Object" - //Generate schema: + // Generate schema: val schema = StructType(schemaString.split(" ").map(fieldName ⇒ StructField(fieldName, StringType, true))) - //Apply Transformation for Reading Data from Text File + // Apply Transformation for Reading Data from Text File val rowRDD = rdd.map(_.split("><")).map(e ⇒ Row(e(0), e(1), e(2))) - //Apply RowRDD in Row Data based on Schema: + // Apply RowRDD in Row Data based on Schema: val RDFTRIPLE = sqlContext.createDataFrame(rowRDD, schema) - //Store DataFrame Data into Table + // Store DataFrame Data into Table RDFTRIPLE.registerTempTable("SPO") - //Select Query on DataFrame + // Select Query on DataFrame val dfr = sqlContext.sql("SELECT * FROM SPO") dfr.show() dfr } - //ok --- Used for DF Triples + // ok --- Used for DF Triples def RDD_TO_DFR_JTriple(rdd: RDD[String], sqlContext: org.apache.spark.sql.SQLContext): DataFrame = { - //Create an Encoded Schema in a String Format: + // Create an Encoded Schema in a String Format: val schemaString = "Subject Predicate Object" - //Generate schema: + // Generate schema: val schema = StructType(schemaString.split(" ").map(fieldName ⇒ StructField(fieldName, StringType, true))) - //Apply Transformation for Reading Data from Text File + // Apply Transformation for Reading Data from Text File val rowRDD = rdd.map(_.split(",")).map(e ⇒ Row(e(0), e(1), e(2))) - //Apply RowRDD in Row Data based on Schema: + // Apply RowRDD in Row Data based on Schema: val RDFTRIPLE = sqlContext.createDataFrame(rowRDD, schema) - //Store DataFrame Data into Table + // Store DataFrame Data into Table RDFTRIPLE.registerTempTable("SPO") - //Select Query on DataFrame + // Select Query on DataFrame val dfr = sqlContext.sql("SELECT * FROM SPO") dfr.show() dfr } + def RoundDouble(va: Double): Double = { val rounded: Double = Math.round(va * 10000).toDouble / 10000 rounded - } def stringToInt(str: String): Integer = { @@ -139,7 +139,5 @@ class FacilitiesClass extends Serializable { } tem.trim() - } - -} \ No newline at end of file +} diff --git 
a/sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/outliers/vandalismdetection/ItemFeatures.scala b/sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/outliers/vandalismdetection/ItemFeatures.scala index b4fc8c1..2992634 100644 --- a/sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/outliers/vandalismdetection/ItemFeatures.scala +++ b/sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/outliers/vandalismdetection/ItemFeatures.scala @@ -1,10 +1,10 @@ package net.sansa_stack.ml.spark.outliers.vandalismdetection -import java.util.regex.{ Pattern, Matcher } +import java.util.regex.{ Matcher, Pattern } class ItemFeatures extends Serializable { - //1. + // 1. def Get_NumberOfLabels(str: String): Double = { // from Label Tag @@ -15,11 +15,9 @@ class ItemFeatures extends Serializable { while (matcher.find()) { count += 1; count - 1 } count - - count } - //2. + // 2. def Get_NumberOfDescription(str: String): Double = { // from description tag @@ -30,11 +28,9 @@ class ItemFeatures extends Serializable { while (matcher.find()) { count += 1; count - 1 } count - - count } - //3. + // 3. def Get_NumberOfAliases(str: String): Double = { // from Aliases Tag @@ -45,11 +41,9 @@ class ItemFeatures extends Serializable { while (matcher.find()) { count += 1; count - 1 } count - - count } - //4. + // 4. def Get_NumberOfClaim(str: String): Double = { // from claim tag @@ -60,10 +54,8 @@ class ItemFeatures extends Serializable { while (matcher.find()) { count += 1; count - 1 } count - - count } - //5. + // 5. def Get_NumberOfSiteLinks(str: String): Double = { // from Sitelink tag @@ -74,10 +66,8 @@ class ItemFeatures extends Serializable { while (matcher.find()) { count += 1; count - 1 } count - - count } - //6. + // 6. def Get_NumberOfstatements(str: String): Double = { // from claims tag @@ -88,10 +78,8 @@ class ItemFeatures extends Serializable { while (matcher.find()) { count += 1; count - 1 } count - - count } - //7. + // 7. def Get_NumberOfReferences(str: String): Double = { @@ -107,7 +95,7 @@ class ItemFeatures extends Serializable { count } - //8. + // 8. def Get_NumberOfQualifier(str: String): Double = { // from claims tag @@ -118,10 +106,8 @@ class ItemFeatures extends Serializable { while (matcher.find()) { count += 1; count - 1 } count - - count } - //9. + // 9. def Get_NumberOfQualifier_Order(str: String): Double = { // from claims tag val input: String = str @@ -131,12 +117,9 @@ class ItemFeatures extends Serializable { while (matcher.find()) { count += 1; count - 1 } count - - count } - //10. + // 10. 
def Get_NumberOfBadges(str: String): Double = { - // from Sitelink tag val input: String = str val pattern: Pattern = Pattern.compile(""""badges"""" + ":") @@ -145,8 +128,6 @@ class ItemFeatures extends Serializable { while (matcher.find()) { count += 1; count - 1 } count - - count } -} \ No newline at end of file +} diff --git a/sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/outliers/vandalismdetection/Main.scala b/sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/outliers/vandalismdetection/Main.scala index 02f0bdd..5fa21d8 100644 --- a/sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/outliers/vandalismdetection/Main.scala +++ b/sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/outliers/vandalismdetection/Main.scala @@ -1,8 +1,7 @@ package net.sansa_stack.ml.spark.outliers.vandalismdetection -import org.apache.spark.{ SparkConf, SparkContext } +import org.apache.spark.{ RangePartitioner, SparkConf, SparkContext } import org.apache.spark.sql._ -import org.apache.spark.{ SparkContext, RangePartitioner } object Main { @@ -19,7 +18,7 @@ object Main { if (num == "1") { Start.Start_RDF_Parser_Appraoch(sc) - } // Distributed Standard Parser and Vandalism Detection : + } // Distributed Standard Parser and Vandalism Detection: else if (num == "2") { val Training_Data = Start.Training_Start_StandardXMLParser_VD(sc) @@ -27,22 +26,21 @@ object Main { val OBJClassifiers = new Classifiers() - //1.Random Forest Classifer: + // 1.Random Forest Classifer: val RandomForestClassifer_Values = OBJClassifiers.RandomForestClassifer(Training_Data, Testing_Data, sc) - //2.DecisionTreeClassifier + // 2.DecisionTreeClassifier val DecisionTreeClassifier_values = OBJClassifiers.DecisionTreeClassifier(Training_Data, Testing_Data, sc) // 3.LogisticRegrision val LogisticRegrision_values = OBJClassifiers.LogisticRegrision(Training_Data, Testing_Data, sc) - //4.GradientBoostedTree + // 4.GradientBoostedTree val GradientBoostedTree_values = OBJClassifiers.GradientBoostedTree(Training_Data, Testing_Data, sc) - //5.MultilayerPerceptronClassifier + // 5.MultilayerPerceptronClassifier val MultilayerPerceptronClassifier_values = OBJClassifiers.MultilayerPerceptronClassifier(Training_Data, Testing_Data, sc) - println(RandomForestClassifer_Values) println(DecisionTreeClassifier_values) println(LogisticRegrision_values) @@ -52,4 +50,4 @@ object Main { } } -} \ No newline at end of file +} diff --git a/sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/outliers/vandalismdetection/ParseJTriple.scala b/sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/outliers/vandalismdetection/ParseJTriple.scala index 122e297..395e53b 100644 --- a/sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/outliers/vandalismdetection/ParseJTriple.scala +++ b/sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/outliers/vandalismdetection/ParseJTriple.scala @@ -1,24 +1,25 @@ package net.sansa_stack.ml.spark.outliers.vandalismdetection -import org.apache.spark.SparkContext + +import java.io.ByteArrayInputStream +import java.util.ArrayList +import java.util.regex.Pattern + import org.apache.hadoop.mapred.JobConf -import org.apache.spark.rdd.RDD import org.apache.jena.graph.Triple import org.apache.jena.rdf.model.ModelFactory -import java.util.ArrayList -import java.util.regex.Pattern -import java.io.ByteArrayInputStream - -class ParseJTriple extends Serializable{ - - - def Start_JTriple_Parser(jobConf_Record: JobConf, sc: SparkContext): RDD[String] = { +import org.apache.spark.SparkContext +import 
org.apache.spark.rdd.RDD + +class ParseJTriple extends Serializable { + + def Start_JTriple_Parser(jobConf_Record: JobConf, sc: SparkContext): RDD[String] = { jobConf_Record.set("stream.recordreader.class", "org.apache.hadoop.streaming.StreamXmlRecordReader") jobConf_Record.set("stream.recordreader.begin", """"s":""") // start Tag jobConf_Record.set("stream.recordreader.end", "}") // End Tag org.apache.hadoop.mapred.FileInputFormat.addInputPaths(jobConf_Record, "hdfs://localhost:9000/mydata/xxx.json") // input path from Hadoop - //------------JTriple Record + // ------------JTriple Record // read data and save in RDD as block- JTriple Record val JTriple_Dataset_Record = sc.hadoopRDD(jobConf_Record, classOf[org.apache.hadoop.streaming.StreamInputFormat], classOf[org.apache.hadoop.io.Text], classOf[org.apache.hadoop.io.Text]) // println("HelloRecords" + " " + JTriple_Dataset_Record.count) @@ -29,14 +30,10 @@ class ParseJTriple extends Serializable{ val RevisioninOneString = JTriple_Dataset_Record_AsstringBlock.map(line => New_abendRevision(line)).distinct().cache() RevisioninOneString } - def New_abendRevision(str: String): String = { + def New_abendRevision(str: String): String = { val s1 = str.replaceAll("[\r\n]+", " "); - val s2 = s1.replaceAll("[.\\s]","").trim() - + val s2 = s1.replaceAll("[.\\s]", "").trim() s2 } - - - -} \ No newline at end of file +} diff --git a/sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/outliers/vandalismdetection/ParseNormalXML.scala b/sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/outliers/vandalismdetection/ParseNormalXML.scala index 5b70361..cea1e38 100644 --- a/sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/outliers/vandalismdetection/ParseNormalXML.scala +++ b/sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/outliers/vandalismdetection/ParseNormalXML.scala @@ -1,19 +1,20 @@ package net.sansa_stack.ml.spark.outliers.vandalismdetection -import org.apache.spark.SparkContext -import org.apache.hadoop.mapred.JobConf -import org.apache.spark.rdd.RDD import java.math.BigInteger +import java.net.InetAddress import java.util.ArrayList -import org.apache.commons.lang3.ArrayUtils import java.util.regex.{ Matcher, Pattern } -import java.net.InetAddress + +import org.apache.commons.lang3.ArrayUtils +import org.apache.hadoop.mapred.JobConf +import org.apache.spark.SparkContext +import org.apache.spark.rdd.RDD class ParseNormalXML extends Serializable { def Training_DB_NormalXML_Parser_Input1(sc: SparkContext): RDD[String] = { - //Streaming records:==================================================================Input Files + // Streaming records:==================================================================Input Files val jobConf = new JobConf() jobConf.set("stream.recordreader.class", "org.apache.hadoop.streaming.StreamXmlRecordReader") jobConf.set("stream.recordreader.begin", "") // start Tag @@ -21,10 +22,10 @@ class ParseNormalXML extends Serializable { org.apache.hadoop.mapred.FileInputFormat.addInputPaths(jobConf, "hdfs://localhost:9000/mydata/sample.xml") // input path from Hadoop // read data and save in RDD as block - val wikiData = sc.hadoopRDD(jobConf, classOf[org.apache.hadoop.streaming.StreamInputFormat], classOf[org.apache.hadoop.io.Text], classOf[org.apache.hadoop.io.Text]) //.distinct() + val wikiData = sc.hadoopRDD(jobConf, classOf[org.apache.hadoop.streaming.StreamInputFormat], classOf[org.apache.hadoop.io.Text], classOf[org.apache.hadoop.io.Text]) // .distinct() println(wikiData.count) val RevisionTagewikidata = 
wikiData.map { case (x, y) => (x.toString()) } - //println(RevisionTagewikidata.count) + // println(RevisionTagewikidata.count) // ABend the revision in one line string val RevisioninOneString = RevisionTagewikidata.map(line => New_abendRevision(line)).cache() @@ -37,7 +38,7 @@ class ParseNormalXML extends Serializable { } def Training_DB_NormalXML_Parser_Input2(sc: SparkContext): RDD[String] = { - //Streaming records:==================================================================Input Files + // Streaming records:==================================================================Input Files val jobConf = new JobConf() jobConf.set("stream.recordreader.class", "org.apache.hadoop.streaming.StreamXmlRecordReader") jobConf.set("stream.recordreader.begin", "") // start Tag @@ -45,10 +46,10 @@ class ParseNormalXML extends Serializable { org.apache.hadoop.mapred.FileInputFormat.addInputPaths(jobConf, "hdfs://localhost:9000/mydata/2.xml") // input path from Hadoop // read data and save in RDD as block - val wikiData = sc.hadoopRDD(jobConf, classOf[org.apache.hadoop.streaming.StreamInputFormat], classOf[org.apache.hadoop.io.Text], classOf[org.apache.hadoop.io.Text]) //.distinct() + val wikiData = sc.hadoopRDD(jobConf, classOf[org.apache.hadoop.streaming.StreamInputFormat], classOf[org.apache.hadoop.io.Text], classOf[org.apache.hadoop.io.Text]) // .distinct() println(wikiData.count) val RevisionTagewikidata = wikiData.map { case (x, y) => (x.toString()) } - //println(RevisionTagewikidata.count) + // println(RevisionTagewikidata.count) // ABend the revision in one line string val RevisioninOneString = RevisionTagewikidata.map(line => New_abendRevision(line)).cache() @@ -61,7 +62,7 @@ class ParseNormalXML extends Serializable { } def Training_DB_NormalXML_Parser_Input3(sc: SparkContext): RDD[String] = { - //Streaming records:==================================================================Input Files + // Streaming records:==================================================================Input Files val jobConf = new JobConf() jobConf.set("stream.recordreader.class", "org.apache.hadoop.streaming.StreamXmlRecordReader") jobConf.set("stream.recordreader.begin", "") // start Tag @@ -69,10 +70,10 @@ class ParseNormalXML extends Serializable { org.apache.hadoop.mapred.FileInputFormat.addInputPaths(jobConf, "hdfs://localhost:9000/mydata/3.xml") // input path from Hadoop // read data and save in RDD as block - val wikiData = sc.hadoopRDD(jobConf, classOf[org.apache.hadoop.streaming.StreamInputFormat], classOf[org.apache.hadoop.io.Text], classOf[org.apache.hadoop.io.Text]) //.distinct() + val wikiData = sc.hadoopRDD(jobConf, classOf[org.apache.hadoop.streaming.StreamInputFormat], classOf[org.apache.hadoop.io.Text], classOf[org.apache.hadoop.io.Text]) // .distinct() println(wikiData.count) val RevisionTagewikidata = wikiData.map { case (x, y) => (x.toString()) } - //println(RevisionTagewikidata.count) + // println(RevisionTagewikidata.count) // ABend the revision in one line string val RevisioninOneString = RevisionTagewikidata.map(line => New_abendRevision(line)).cache() // println("TotalCount" + " " + RevisioninOneString.count) @@ -83,9 +84,9 @@ class ParseNormalXML extends Serializable { } - def Testing_DB_NormalXML_Parser(sc: SparkContext): RDD[String] = { + def Testing_DB_NormalXML_Parser(sc: SparkContext): RDD[String] = { - //Streaming records:==================================================================Input Files + // Streaming 
records:==================================================================Input Files val jobConf = new JobConf() jobConf.set("stream.recordreader.class", "org.apache.hadoop.streaming.StreamXmlRecordReader") jobConf.set("stream.recordreader.begin", "") // start Tag @@ -93,10 +94,10 @@ class ParseNormalXML extends Serializable { org.apache.hadoop.mapred.FileInputFormat.addInputPaths(jobConf, "hdfs://localhost:9000/mydata/3.xml") // input path from Hadoop // read data and save in RDD as block - val wikiData = sc.hadoopRDD(jobConf, classOf[org.apache.hadoop.streaming.StreamInputFormat], classOf[org.apache.hadoop.io.Text], classOf[org.apache.hadoop.io.Text]) //.distinct() + val wikiData = sc.hadoopRDD(jobConf, classOf[org.apache.hadoop.streaming.StreamInputFormat], classOf[org.apache.hadoop.io.Text], classOf[org.apache.hadoop.io.Text]) // .distinct() println(wikiData.count) val RevisionTagewikidata = wikiData.map { case (x, y) => (x.toString()) } - //println(RevisionTagewikidata.count) + // println(RevisionTagewikidata.count) // ABend the revision in one line string val RevisioninOneString = RevisionTagewikidata.map(line => New_abendRevision(line)).cache() // println("TotalCount" + " " + RevisioninOneString.count) @@ -107,11 +108,6 @@ class ParseNormalXML extends Serializable { } - - - - - // make the revision as one string def New_abendRevision(str: String): String = { @@ -125,13 +121,13 @@ class ParseNormalXML extends Serializable { // Ok: used on the Top def New_Build_Revision_map(obj: String): String = { var Store_Record_String = "" - //Json Revision : + // Json Revision : val JsonStr = Get_Json_Revision(obj) val Standered_JsonStr = Standared_Get_Json_Revision(obj) // for full string Jason with all formating for parsing by spark val Json_Standered = Standered_JsonStr.get(0).toString() // for full string Jason with all formating for parsing by spark val Json = JsonStr.get(0).toString() - //0.Id Revision + // 0.Id Revision val IdRevision = Get_ID_Revision(obj) if (IdRevision != "") { val ID = IdRevision.toString().trim() @@ -141,7 +137,7 @@ class ParseNormalXML extends Serializable { // else { // Store_Record_String = "0" // } - //1. Item Title : + // 1. Item Title : val ItemTitle: ArrayList[String] = Get_Item_Title_FromJson(Json) if (ItemTitle.size() > 0) { val groupItemTilte = ItemTitle.get(0).toString() @@ -164,8 +160,8 @@ class ParseNormalXML extends Serializable { } } - //=============Start:======= extract information from the json string - //2.Comments : + // =============Start:======= extract information from the json string + // 2.Comments : val commentarray = Get_Comment(obj) val comment = commentarray.get(0) if (comment.nonEmpty) { @@ -174,7 +170,7 @@ class ParseNormalXML extends Serializable { Store_Record_String = Store_Record_String.trim() + "NNLL" + "NA" } - //3.Parent ID : + // 3.Parent ID : val ParentIDStr = Get_ParentID(obj) if (ParentIDStr.nonEmpty) { @@ -185,7 +181,7 @@ class ParseNormalXML extends Serializable { Store_Record_String = Store_Record_String + "NNLL" + "0" } - //4.Timestamp: + // 4.Timestamp: val TimeStamparray = Get_TIMEStamp(obj) val TimeSta = TimeStamparray.get(0) if (TimeSta.nonEmpty) { @@ -194,41 +190,41 @@ class ParseNormalXML extends Serializable { Store_Record_String = Store_Record_String + "NNLL" + "NA" } - //5. Contributor Data( IP ): + // 5. 
Contributor Data( IP ): val Contributstr = Get_Contributor_IP(obj) - //val ContributorSta = Contributorarray.get(0) + // val ContributorSta = Contributorarray.get(0) if (Contributstr != "0") { Store_Record_String = Store_Record_String + "NNLL" + Contributstr.trim() } else { Store_Record_String = Store_Record_String + "NNLL" + "0" } - //6. Contributor ID : + // 6. Contributor ID : val Contributor_IDStr = Get_Contributor_ID(obj) - //val Contributor_IDSta = Contributor_IDarray.get(0) + // val Contributor_IDSta = Contributor_IDarray.get(0) if (Contributor_IDStr != "0") { Store_Record_String = Store_Record_String + "NNLL" + Contributor_IDStr.trim() } else { Store_Record_String = Store_Record_String + "NNLL" + "0" } - //7. Contributor Name : + // 7. Contributor Name : val Contributor_NameStr = Get_Contributor_Name(obj) - //val Contributor_IDSta = Contributor_IDarray.get(0) + // val Contributor_IDSta = Contributor_IDarray.get(0) if (Contributor_NameStr != "NA") { Store_Record_String = Store_Record_String + "NNLL" + Contributor_NameStr.trim() } else { Store_Record_String = Store_Record_String + "NNLL" + "NA" } - //8. Full Json Tag for Parsing: + // 8. Full Json Tag for Parsing: if (Json_Standered.nonEmpty) { Store_Record_String = Store_Record_String + "NNLL" + Json_Standered.trim() } else { Store_Record_String = Store_Record_String + "NNLL" + "NA" } - //9. Model : + // 9. Model : val modelstr = Get_Model(obj) if (modelstr.nonEmpty) { @@ -236,14 +232,14 @@ class ParseNormalXML extends Serializable { } else { Store_Record_String = Store_Record_String + "NNLL" + "NA" } - //10.Format: + // 10.Format: val Formatstr = Get_Format(obj) if (Formatstr.nonEmpty) { Store_Record_String = Store_Record_String + "NNLL" + Formatstr.trim() } else { Store_Record_String = Store_Record_String + "NNLL" + "NA" } - //11.SHA1 : + // 11.SHA1 : val SHAstr = Get_SHA1(obj) if (SHAstr.nonEmpty) { Store_Record_String = Store_Record_String + "NNLL" + SHAstr.trim() @@ -290,8 +286,8 @@ class ParseNormalXML extends Serializable { } - //********************** - // if (str.contains("")){ + // ********************** + // if (str.contains("")) { // // val inputID: CharSequence = str // val pattStr_id: String = "[0-9]+" @@ -306,7 +302,7 @@ class ParseNormalXML extends Serializable { // } // } // - // else if (str.contains("")){ + // else if (str.contains("")) { // // val inputID: CharSequence = str // val pattStr_id: String = "[0-9]+" @@ -327,7 +323,7 @@ class ParseNormalXML extends Serializable { tem } - //Extract TimeStampe value from Tag: + // Extract TimeStampe value from Tag: def Get_TIMEStamp(str: String): ArrayList[String] = { val TimeStamp: ArrayList[String] = new ArrayList[String]() @@ -382,7 +378,7 @@ class ParseNormalXML extends Serializable { } - //extract Item Title from Json string + // extract Item Title from Json string def Get_Item_Title_FromJson(str: String): ArrayList[String] = { val Item_Title_FromJason: ArrayList[String] = new ArrayList[String]() @@ -634,5 +630,4 @@ class ParseNormalXML extends Serializable { } temp } - -} \ No newline at end of file +} diff --git a/sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/outliers/vandalismdetection/ParseRDFXML.scala b/sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/outliers/vandalismdetection/ParseRDFXML.scala index 3f83897..2add40c 100644 --- a/sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/outliers/vandalismdetection/ParseRDFXML.scala +++ b/sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/outliers/vandalismdetection/ParseRDFXML.scala @@ -1,13 
+1,14 @@ package net.sansa_stack.ml.spark.outliers.vandalismdetection -import org.apache.spark.SparkContext +import java.io.ByteArrayInputStream +import java.util.ArrayList +import java.util.regex.Pattern + import org.apache.hadoop.mapred.JobConf -import org.apache.spark.rdd.RDD import org.apache.jena.graph.Triple import org.apache.jena.rdf.model.ModelFactory -import java.util.ArrayList -import java.util.regex.Pattern -import java.io.ByteArrayInputStream +import org.apache.spark.SparkContext +import org.apache.spark.rdd.RDD class ParseRDFXML extends Serializable { @@ -24,7 +25,7 @@ class ParseRDFXML extends Serializable { org.apache.hadoop.mapred.FileInputFormat.addInputPaths(jobConf_Record, "hdfs://localhost:9000/mydata/Germany.rdf") // input path from Hadoop org.apache.hadoop.mapred.FileInputFormat.addInputPaths(jobConf_Prefixes, "hdfs://localhost:9000/mydata/Germany.rdf") // input path from Hadoop - //------------ RDF XML Record + // ------------ RDF XML Record // read data and save in RDD as block- RDFXML Record val RDFXML_Dataset_Record = sc.hadoopRDD(jobConf_Record, classOf[org.apache.hadoop.streaming.StreamInputFormat], classOf[org.apache.hadoop.io.Text], classOf[org.apache.hadoop.io.Text]) // println("HelloRecords" + " " + RDFXML_Dataset_Record.count) @@ -34,14 +35,14 @@ class ParseRDFXML extends Serializable { println("HelloRecords" + " " + RDFXML_Dataset_Record_AsstringBlock.count) // RDFXML_Dataset_Record_AsstringBlock.foreach(println) - //-------------RDF XML Prefixes + // -------------RDF XML Prefixes // read data and save in RDD as block- RDFXML Prefixes val RDFXML_Dataset_Prefixes = sc.hadoopRDD(jobConf_Prefixes, classOf[org.apache.hadoop.streaming.StreamInputFormat], classOf[org.apache.hadoop.io.Text], classOf[org.apache.hadoop.io.Text]) println("HelloPrefixes" + " " + RDFXML_Dataset_Prefixes.count) // RDFXML_Dataset_Prefixes.foreach(println) // Convert the block- RDFXML Prefixes to String DataType var RDFXML_Dataset_AsstringPrefixes_WithoutDist = RDFXML_Dataset_Prefixes.map { case (x, y) => (x.toString()) } - val RDFXML_Dataset_AsstringPrefixes=RDFXML_Dataset_AsstringPrefixes_WithoutDist.distinct() + val RDFXML_Dataset_AsstringPrefixes = RDFXML_Dataset_AsstringPrefixes_WithoutDist.distinct() println("HelloPrefixes" + " " + RDFXML_Dataset_AsstringPrefixes.count) // RDFXML_Dataset_AsstringPrefixes.foreach(println) val pref = RDFXML_Dataset_AsstringPrefixes.reduce((a, b) => a + "\n" + b) @@ -88,5 +89,4 @@ class ParseRDFXML extends Serializable { val str = Arraylistval.get(0).toString() str } - -} \ No newline at end of file +} diff --git a/sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/outliers/vandalismdetection/ParseTRIX.scala b/sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/outliers/vandalismdetection/ParseTRIX.scala index 3bd8364..f3a4201 100644 --- a/sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/outliers/vandalismdetection/ParseTRIX.scala +++ b/sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/outliers/vandalismdetection/ParseTRIX.scala @@ -1,12 +1,14 @@ package net.sansa_stack.ml.spark.outliers.vandalismdetection -import org.apache.spark.SparkContext + +import java.io.ByteArrayInputStream +import java.util.ArrayList +import java.util.regex.Pattern + import org.apache.hadoop.mapred.JobConf -import org.apache.spark.rdd.RDD import org.apache.jena.graph.Triple import org.apache.jena.rdf.model.ModelFactory -import java.util.ArrayList -import java.util.regex.Pattern -import java.io.ByteArrayInputStream +import org.apache.spark.SparkContext 
+import org.apache.spark.rdd.RDD class ParseTRIX extends Serializable { @@ -18,7 +20,7 @@ class ParseTRIX extends Serializable { org.apache.hadoop.mapred.FileInputFormat.addInputPaths(jobConf_Record, "hdfs://localhost:9000/mydata/xx.trix") // input path from Hadoop - //------------TRIX Record + // ------------TRIX Record // read data and save in RDD as block- TRIX Record val TRIX_Dataset_Record = sc.hadoopRDD(jobConf_Record, classOf[org.apache.hadoop.streaming.StreamInputFormat], classOf[org.apache.hadoop.io.Text], classOf[org.apache.hadoop.io.Text]) // println("HelloRecords" + " " + TRIX_Dataset_Record.count) @@ -43,11 +45,9 @@ class ParseTRIX extends Serializable { s4 } - // This function for TRIX case. def arrayListTOstring(Arraylistval: ArrayList[Triple]): String = { val str = Arraylistval.get(0).toString() str } - -} \ No newline at end of file +} diff --git a/sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/outliers/vandalismdetection/RevisionFeatures.scala b/sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/outliers/vandalismdetection/RevisionFeatures.scala index 7dc3c19..ccbd2b4 100644 --- a/sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/outliers/vandalismdetection/RevisionFeatures.scala +++ b/sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/outliers/vandalismdetection/RevisionFeatures.scala @@ -1,4 +1,5 @@ package net.sansa_stack.ml.spark.outliers.vandalismdetection + import java.util.regex.{ Matcher, Pattern } class RevisionFeatures extends Serializable { @@ -53,7 +54,7 @@ class RevisionFeatures extends Serializable { } - // if (result_isNonLatin==true){ // is matched + // if (result_isNonLatin==true) { // is matched // // Final_Result=false // @@ -123,4 +124,4 @@ class RevisionFeatures extends Serializable { } -} \ No newline at end of file +} diff --git a/sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/outliers/vandalismdetection/SentencesFeatures.scala b/sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/outliers/vandalismdetection/SentencesFeatures.scala index 5490ec1..62c0432 100644 --- a/sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/outliers/vandalismdetection/SentencesFeatures.scala +++ b/sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/outliers/vandalismdetection/SentencesFeatures.scala @@ -13,7 +13,7 @@ class SentencesFeatures extends Serializable { } - //1.comment tail Lenght Action subaction param+ tail + // 1. Comment tail length: action + subaction + param + tail def CommentTailLenght(Full_Comment_Str: String): Integer = { val parsedCommment_OBJ = new CommentProcessor() val commentTail_Str = parsedCommment_OBJ.Extract_CommentTail(Full_Comment_Str) @@ -23,9 +23,9 @@ class SentencesFeatures extends Serializable { } // similarity between the comment ( suffix of the comment = Tail ) where the comment is normal comment /* .........*/ or /* ......... // e.g This comment includes wb...sitelink - //1-we have to be sure the comment is normal comment take the form /* ........./* - //2-Next step: we check the Action part if it includes a sitelink word or not. - //3-we compare the suffix in this case to site link with pay attention to the same language. + // 1 - we have to make sure the comment is a normal comment of the form /* ......... */ + // 2 - Next step: we check whether the Action part includes a sitelink word or not. + // 3 - we compare the suffix to the sitelink, paying attention to the same language. // we check the type of Normal comment if it contains Aliases .
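The three-step check described in the comments above can be read as the following minimal Scala sketch. It is not part of this commit: the object name, the explicit language parameters, and the Levenshtein threshold are illustrative assumptions, and the actual logic lives in extract_CommentAliases_LanguageType and the related SentencesFeatures methods.

import org.apache.commons.lang3.StringUtils

object CommentTailSimilaritySketch {
  // Mirrors the numbered comments above:
  // 1 - the caller has already verified that the comment is a normal /* ... */ comment,
  // 2 - the action part must mention a sitelink,
  // 3 - tail and sitelink title are only compared when the languages match.
  def tailMatchesSitelink(actionPart: String, commentTail: String, sitelinkTitle: String,
                          actionLang: String, sitelinkLang: String): Boolean = {
    val isSitelinkAction = actionPart.contains("sitelink")
    val sameLanguage = actionLang.equalsIgnoreCase(sitelinkLang)
    // Levenshtein distance is one simple similarity choice here (an assumption, not the committed code).
    val distance = StringUtils.getLevenshteinDistance(commentTail.trim, sitelinkTitle.trim)
    isSitelinkAction && sameLanguage && distance <= 2
  }
}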
def extract_CommentAliases_LanguageType(Full_Comment_Str: String): String = { @@ -185,5 +185,4 @@ class SentencesFeatures extends Serializable { langeType.trim() } - -} \ No newline at end of file +} diff --git a/sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/outliers/vandalismdetection/StatementFeatures.scala b/sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/outliers/vandalismdetection/StatementFeatures.scala index 33b1b5a..31d1158 100644 --- a/sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/outliers/vandalismdetection/StatementFeatures.scala +++ b/sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/outliers/vandalismdetection/StatementFeatures.scala @@ -42,5 +42,4 @@ class StatementFeatures extends Serializable { } result } - -} \ No newline at end of file +} diff --git a/sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/outliers/vandalismdetection/VandalismDetection.scala b/sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/outliers/vandalismdetection/VandalismDetection.scala index 065adb1..2a9f380 100644 --- a/sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/outliers/vandalismdetection/VandalismDetection.scala +++ b/sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/outliers/vandalismdetection/VandalismDetection.scala @@ -1,31 +1,29 @@ package net.sansa_stack.ml.spark.outliers.vandalismdetection -import org.apache.spark.{ SparkContext, RangePartitioner } -import org.apache.spark.sql._ -import org.apache.spark.sql.expressions.Window -import org.apache.hadoop.mapred.JobConf import java.util.Scanner -import org.json.JSONObject + import org.apache.commons.lang3.StringUtils -import org.apache.spark.sql.functions.{ concat, lit } +import org.apache.hadoop.mapred.JobConf +import org.apache.spark.{ RangePartitioner, SparkContext } import org.apache.spark.ml.feature.{ Word2Vec, Word2VecModel } -import org.apache.spark.ml.Pipeline import org.apache.spark.ml.feature.VectorAssembler import org.apache.spark.ml.linalg.Vector +import org.apache.spark.ml.Pipeline +import org.apache.spark.sql._ +import org.apache.spark.sql.expressions.Window +import org.apache.spark.sql.functions.{ concat, lit } +import org.json.JSONObject class VandalismDetection extends Serializable { - - - // Function 1 : Distributed RDF Parser Approach def Start_RDF_Parser_Appraoch(sc: SparkContext): Unit = { - + val sqlContext = new org.apache.spark.sql.SQLContext(sc) import sqlContext.implicits._ import org.apache.spark.sql.functions._ // for UDF import org.apache.spark.sql.types._ - + println("*********************************************************************") println("Distributed RDF Parser Model") println("Please Enter 1 for JTriple and 2 for TRIX process and 3 for RDFXML:") @@ -41,12 +39,11 @@ class VandalismDetection extends Serializable { val DRF_Builder_JTripleOBJ = new FacilitiesClass() val RDD_JTriple = JTriple_Parser_OBJ.Start_JTriple_Parser(jobConf, sc) RDD_JTriple.foreach(println) - //----------------------------DF for RDF TRIX ------------------------------------------ + // ----------------------------DF for JTriple ------------------------------------------ // Create SQLContext Object: val sqlContext = new org.apache.spark.sql.SQLContext(sc) val DFR_JTriple = DRF_Builder_JTripleOBJ.RDD_TO_DFR_JTriple(RDD_JTriple, sqlContext) DFR_JTriple.show() - } else if (num == "2") { @@ -57,12 +54,11 @@ class VandalismDetection extends Serializable { val DRF_Builder_RDFTRIX_OBJ = new FacilitiesClass() val RDD_TRIX = TRIX_Parser_OBJ.Start_TriX_Parser(jobConf, sc)
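The jobConf handed to parsers such as Start_TriX_Parser is what tells Hadoop's StreamInputFormat where one record begins and ends. A minimal sketch of such a configuration follows; it is not part of this commit, and the <triple> begin/end tags are assumptions for illustration (the real tags depend on the TriX layout being split):

import org.apache.hadoop.mapred.{ FileInputFormat, JobConf }

object TrixJobConfSketch {
  // Builds a JobConf for StreamInputFormat so that each <triple>...</triple>
  // block in the input file becomes one record of the resulting hadoopRDD.
  def build(): JobConf = {
    val conf = new JobConf()
    conf.set("stream.recordreader.class", "org.apache.hadoop.streaming.StreamXmlRecordReader")
    conf.set("stream.recordreader.begin", "<triple>") // record start tag (assumed)
    conf.set("stream.recordreader.end", "</triple>") // record end tag (assumed)
    FileInputFormat.addInputPaths(conf, "hdfs://localhost:9000/mydata/xx.trix")
    conf
  }
}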
RDD_TRIX.foreach(println) - //----------------------------DF for RDF TRIX ------------------------------------------ + // ----------------------------DF for RDF TRIX ------------------------------------------ // Create SQLContext Object: val sqlContext = new org.apache.spark.sql.SQLContext(sc) val DFR_TRIX = DRF_Builder_RDFTRIX_OBJ.RDD_TO_DFR_TRIX(RDD_TRIX, sqlContext) DFR_TRIX.show() - } else if (num == "3") { println("RDF XML .........!!!!!!") @@ -83,1908 +79,1236 @@ class VandalismDetection extends Serializable { DFR_RDF_XML.show() } - - sc.stop() + + sc.stop() } - - //*********************************************************************************************************************************************** - // Function 2:Training XML and Vandalism Detection + + // ********************************************************************************* + // Function 2:Training XML and Vandalism Detection def Training_Start_StandardXMLParser_VD(sc: SparkContext): DataFrame = { val sqlContext = new org.apache.spark.sql.SQLContext(sc) import sqlContext.implicits._ import org.apache.spark.sql.functions._ // for UDF import org.apache.spark.sql.types._ - // Streaming records: - val jobConf = new JobConf() - val NormalXML_Parser_OBJ = new ParseNormalXML() - val RDD_OBJ = new ParseNormalXML() - - val Training_RDD_All_Record1 = RDD_OBJ.Training_DB_NormalXML_Parser_Input1(sc) - val Training_RDD_All_Record2 = RDD_OBJ.Training_DB_NormalXML_Parser_Input2(sc) - val Training_RDD_All_Record3 = RDD_OBJ.Training_DB_NormalXML_Parser_Input3(sc) - //RDD_All_Record1.foreach(println) - //RDD_All_Record2.foreach(println) - // RDD_All_Record3.foreach(println) - - val Training_RDD_All_Record = Training_RDD_All_Record1.union(Training_RDD_All_Record2).union(Training_RDD_All_Record3).distinct().cache() - - //println(RDD_All_Record.count()) - println(Training_RDD_All_Record.count()) - - // ======= Json part : - //Json RDD : Each record has its Revision iD: - val JsonRDD = Training_RDD_All_Record.map(_.split("NNLL")).map(v => replacing_with_Quoto(v(0), v(8))).cache() - //JsonRDD.foreach(println) - //println(JsonRDD.count()) - - // Data set - val Ds_Json = sqlContext.jsonRDD(JsonRDD).select("key", "id", "labels", "descriptions", "aliases", "claims", "sitelinks").cache() - //Ds_Json.show() - // println(Ds_Json.count()) - - // ======= Tags part : // Contributor IP here is in Decimal format not IP format and It is converted in ParseNormalXml stage - val TagsRDD = Training_RDD_All_Record.map(_.split("NNLL")).map(x => (x(0), x(1), x(2), x(3), x(4), x(5), x(6), x(7), x(8), x(9), x(10), x(11))).cache() - val DF_Tags = TagsRDD.toDF("Rid", "Itemid", "comment", "pid", "time", "contributorIP", "contributorID", "contributorName", "JsonText", "model", "format", "sha").cache() - // DF_Tags.show() - // println(DF_Tags.count()) - - //======== Join Json part with Tag Part:============================ - //Joining to have full data - val DF_First_DF_Result_Join_Tags_and_Json = DF_Tags.as("T1").join(Ds_Json.as("T2"), $"T1.Rid" === $"T2.key", "leftouter").select("Rid", "itemid", "comment", "pid", "time", "contributorIP", "contributorID", "contributorName", "JsonText", "labels", "descriptions", "aliases", "claims", "sitelinks", "model", "format", "sha") //.orderBy("Rid", "Itemid") - DF_First_DF_Result_Join_Tags_and_Json.registerTempTable("Data1") - val dfr_DATA_JsonTages1 = sqlContext.sql("select * from Data1 order by itemid ,Rid ").cache() - - val colNames = Seq("Rid2", "itemid2", "comment2", "pid2", "time2", "contributorIP2", 
"contributorID2", "contributorName2", "JsonText2", "labels2", "descriptions2", "aliases2", "claims2", "sitelinks2", "model2", "format2", "sha2") - val DF_Second = DF_First_DF_Result_Join_Tags_and_Json.toDF(colNames: _*) //.distinct() - DF_Second.registerTempTable("Data2") - - //===================================================================Parent // Previous Revision============================================================================================================== - //val DF_Joined = result1.as("df1").join(result2.as("df2"), col("itemid") === col("itemid2") && col("index1") === col("index2") + 1, "leftouter").select("Rid", "itemid", "comment", "pid", "time", "contributorIP", "contributorID", "contributorName", "JsonText", "labels", "descriptions", "aliases", "claims", "sitelinks", "model", "format", "sha", "Rid2", "itemid2", "comment2", "pid2", "time2", "contributorIP2", "contributorID2", "contributorName2", "JsonText2", "labels2", "descriptions2", "aliases2", "claims2", "sitelinks2", "model2", "format2", "sha2") - //.select("itemid", "Rid","pid","time","itemid2","Rid2","pid2","time2") - - //Joining based on Parent Id to get the previous cases: ParentID - val DF_Joined = DF_First_DF_Result_Join_Tags_and_Json.as("df1").join(DF_Second.as("df2"), $"df1.pid" === $"df2.Rid2", "leftouter").distinct() - - val RDD_After_JoinDF = DF_Joined.rdd.distinct() - val x = RDD_After_JoinDF.map(row => (row(0).toString().toInt, row)).cache() - val part = new RangePartitioner(4, x) - val partitioned = x.partitionBy(part).persist() // persist is important for this case and obligatory. - //partitioned.foreach(println) - // - // //=====================================================All Features Based on Categories of Features Data Type :================================================================================== - // - val Result_all_Features = partitioned.map { case (x, y) => (x.toString() + "," + All_Features(y).toString()) } // we convert the Pair RDD to String one LineRDD to be able to make DF based on "," - //Result_all_Features.foreach(println) - // println("nayef" + Result_all_Features.count()) - - // Conver the RDD of All Features to DataFrame: - - val schema = StructType( - - //0 - StructField("Rid", IntegerType, false) :: - - // Character Features : - /* 1*/ StructField("C1uppercaseratio", DoubleType, false) :: /*2 */ StructField("C2lowercaseratio", DoubleType, false) :: /*3*/ StructField("C3alphanumericratio", DoubleType, false) :: - /*4*/ StructField("C4asciiratio", DoubleType, false) :: /*5*/ StructField("C5bracketratio", DoubleType, false) :: /*6*/ StructField("C6digitalratio", DoubleType, false) :: - /*7*/ StructField("C7latinratio", DoubleType, false) :: /*8*/ StructField("C8whitespaceratio", DoubleType, false) :: /* 9*/ StructField("C9puncratio", DoubleType, false) :: - /*10*/ StructField("C10longcharacterseq", DoubleType, false) :: /*11*/ StructField("C11arabicratio", DoubleType, false) :: /*12*/ StructField("C12bengaliratio", DoubleType, false) :: - /*13 */ StructField("C13brahmiratio", DoubleType, false) :: /*14*/ StructField("C14cyrilinratio", DoubleType, false) :: /*15*/ StructField("C15hanratio", DoubleType, false) :: - /*16*/ StructField("c16malysiaratio", DoubleType, false) :: /*17*/ StructField("C17tamiratio", DoubleType, false) :: /*18*/ StructField("C18telugratio", DoubleType, false) :: - /*19 */ StructField("C19symbolratio", DoubleType, false) :: /*20 */ StructField("C20alpharatio", DoubleType, false) :: /*21*/ StructField("C21visibleratio", DoubleType, false) :: 
- /*22*/ StructField("C22printableratio", DoubleType, false) :: /*23*/ StructField("C23blankratio", DoubleType, false) :: /*24 */ StructField("C24controlratio", DoubleType, false) :: - /* 25 */ StructField("C25hexaratio", DoubleType, false) :: - - //word Features: - /*26*/ StructField("W1languagewordratio", DoubleType, false) :: /*27 Boolean */ StructField("W2Iscontainlanguageword", DoubleType, false) :: /*28*/ StructField("W3lowercaseratio", DoubleType, false) :: - /*29 Integer */ StructField("W4longestword", IntegerType, false) :: /*30 Boolean */ StructField("W5IscontainURL", DoubleType, false) :: /*31*/ StructField("W6badwordratio", DoubleType, false) :: - /*32*/ StructField("W7uppercaseratio", DoubleType, false) :: /*33*/ StructField("W8banwordratio", DoubleType, false) :: /*34 Boolean */ StructField("W9FemalFirstName", DoubleType, false) :: - /*35 Boolean */ StructField("W10MaleFirstName", DoubleType, false) :: /*36 Boolean */ StructField("W11IscontainBadword", DoubleType, false) :: /*37 Boolean*/ StructField("W12IsContainBanword", DoubleType, false) :: - /*38 integer */ StructField("W13NumberSharewords", DoubleType, false) :: /*39 Integer */ StructField("W14NumberSharewordswithoutStopwords", DoubleType, false) :: - /*40*/ StructField("W15PortionQid", DoubleType, false) :: /*41*/ StructField("W16PortionLnags", DoubleType, false) :: /*42*/ StructField("W17PortionLinks", DoubleType, false) :: - - // - // // Sentences Features: - /*43*/ StructField("S1CommentTailLength", DoubleType, false) :: /*44*/ StructField("S2SimikaritySitelinkandLabel", DoubleType, false) :: /*45*/ StructField("S3SimilarityLabelandSitelink", DoubleType, false) :: /*46*/ StructField("S4SimilarityCommentComment", DoubleType, false) :: - // - // // Statements Features : - /*47*/ StructField("SS1Property", StringType, false) :: /*48*/ StructField("SS2DataValue", StringType, false) :: /*49*/ StructField("SS3ItemValue", StringType, false) :: - // - // - // //User Features : - /*50 Boolean*/ StructField("U1IsPrivileged", DoubleType, false) :: /*51 Boolean*/ StructField("U2IsBotUser", DoubleType, false) :: /*52 Boolean*/ StructField("U3IsBotuserWithFlaguser", DoubleType, false) :: - /*53 Boolean*/ StructField("U4IsProperty", DoubleType, false) :: /*54 Boolean*/ StructField("U5IsTranslator", DoubleType, false) :: /*55 Boolean*/ StructField("U6IsRegister", DoubleType, false) :: - /*56*/ StructField("U7IPValue", DoubleType, false) :: /*57*/ StructField("U8UserID", IntegerType, false) :: /*58*/ StructField("U9HasBirthDate", DoubleType, false) :: /*59*/ StructField("U10HasDeathDate", DoubleType, false) :: - - //Items Features : - - /*60*/ StructField("I1NumberLabels", DoubleType, false) :: /*61*/ StructField("I2NumberDescription", DoubleType, false) :: /*62*/ StructField("I3NumberAliases", DoubleType, false) :: /*63*/ StructField("I4NumberClaims", DoubleType, false) :: - /*64*/ StructField("I5NumberSitelinks", DoubleType, false) :: /*65*/ StructField("I6NumberStatement", DoubleType, false) :: /*66*/ StructField("I7NumberReferences", DoubleType, false) :: /*67*/ StructField("I8NumberQualifier", DoubleType, false) :: - /*68*/ StructField("I9NumberQualifierOrder", DoubleType, false) :: /*69*/ StructField("I10NumberBadges", DoubleType, false) :: /*70*/ StructField("I11ItemTitle", StringType, false) :: - - // Revision Features: - /*71*/ StructField("R1languageRevision", StringType, false) :: /*72*/ StructField("R2RevisionLanguageLocal", StringType, false) :: /*73*/ StructField("R3IslatainLanguage", DoubleType, false) :: - /*74*/ 
StructField("R4JsonLength", DoubleType, false) :: /*75*/ StructField("R5RevisionAction", StringType, false) :: /*76*/ StructField("R6PrevReviAction", StringType, false) :: - /*77*/ StructField("R7RevisionAccountChange", DoubleType, false) :: /*78*/ StructField("R8ParRevision", StringType, false) :: /*79*/ StructField("R9RevisionTime", StringType, false) :: - /*80*/ StructField("R10RevisionSize", DoubleType, false) :: /*81*/ StructField("R11ContentType", StringType, false) :: /*82*/ StructField("R12BytesIncrease", DoubleType, false) :: - /*83*/ StructField("R13TimeSinceLastRevi", DoubleType, false) :: /*84*/ StructField("R14CommentLength", DoubleType, false) :: /*85*/ StructField("R15RevisionSubaction", StringType, false) :: - /*86*/ StructField("R16PrevReviSubaction", StringType, false) :: - - Nil) - - val rowRDD = Result_all_Features.map(line => line.split(",")).map(e ⇒ Row(e(0).toInt // character feature column - , e(1).toDouble, e(2).toDouble, e(3).toDouble, e(4).toDouble, e(5).toDouble, e(6).toDouble, e(7).toDouble, e(8).toDouble, e(9).toDouble, RoundDouble(e(10).toDouble), - e(11).toDouble, e(12).toDouble, e(13).toDouble, e(14).toDouble, e(15).toDouble, e(16).toDouble, e(17).toDouble, e(18).toDouble, e(19).toDouble, e(20).toDouble, e(21).toDouble, e(22).toDouble, e(23).toDouble, e(24).toDouble, e(25).toDouble //Word Feature column - , e(26).toDouble, e(27).toDouble, e(28).toDouble, e(29).toDouble.toInt, e(30).toDouble, e(31).toDouble, e(32).toDouble, e(33).toDouble, e(34).toDouble, e(35).toDouble, e(36).toDouble, e(37).toDouble, RoundDouble(e(38).toDouble), RoundDouble(e(39).toDouble), e(40).toDouble, e(41).toDouble, e(42).toDouble // Sentences Features column: - , RoundDouble(e(43).toDouble), e(44).toDouble, e(45).toDouble, e(46).toDouble //Statement Features Column: - , e(47), e(48), e(49) // User Features Column: - , e(50).toDouble, e(51).toDouble, e(52).toDouble, e(53).toDouble, e(54).toDouble, e(55).toDouble, e(56).toDouble, e(57).toDouble.toInt, e(58).toDouble, e(59).toDouble //Item Features column: - , e(60).toDouble, e(61).toDouble, e(62).toDouble, e(63).toDouble, e(64).toDouble, e(65).toDouble, e(66).toDouble, e(67).toDouble, e(68).toDouble, e(69).toDouble, "Q" + e(70).toDouble.toInt.toString() //Revision Features Column: - , e(71), e(72), e(73).toDouble, e(74).toDouble, e(75), e(76), e(77).toDouble, e(78), e(79), e(80).toDouble, e(81), e(82).toDouble, e(83).toDouble, e(84).toDouble, e(85), e(86))) - - //a.User Frequency: - //number of revisions a user has contributed - //val resu= DF_Tags.groupBy("contributorID").agg(count("Rid")) - DF_Tags.registerTempTable("TagesTable") - val ContributorFreq_for_Each_Revision_DF = sqlContext.sql("select contributorID as CIDUSER1, count(Rid) as NumberofRevisionsUserContributed from TagesTable where contributorID !='0' group by contributorID ") //.drop("CIDUSER1") - //ContributorFreq_for_Each_Revision_DF.show() - - //b.Cumulated : Number of a unique Item a user has contributed. - val CumulatedNumberof_uniqueItemsForUser_DF = sqlContext.sql("select contributorID as CIDUSER2, COUNT(DISTINCT itemid) as NumberofUniqueItemsUseredit from TagesTable where contributorID !='0' group by contributorID") //.drop("CIDUSER2") - //CumulatedNumberof_uniqueItemsForUser_DF.show() - - //1.Item Frequency: - // number of revisions an Item has - val ItemFrequ_DF = sqlContext.sql("select itemid, count(Rid) as NumberRevisionItemHas from TagesTable group by itemid") - // ItemFrequ_DF.show() - - //2. 
Cumulate number of unique users have edited the Item : Did not consider the users IP. Contributor is an IP or Name. we consider name - val CumulatedNumberof_UniqueUserForItem_DF = sqlContext.sql("select itemid, COUNT(DISTINCT contributorID) as NumberUniqUserEditItem from TagesTable where contributorID !='0' group by itemid") - //CumulatedNumberof_UniqueUserForItem_DF.show() - - //3. freq each Item : - val Fre_Item_DF = sqlContext.sql("select itemid, COUNT(itemid) as FreqItem from TagesTable group by itemid") - // Fre_Item_DF.show() - - //***************************************************************************************************************************************** - // This is Main DataFrame: - val BeforeJoin_All_Features = sqlContext.createDataFrame(rowRDD, schema) - //BeforeJoin_All_Features.show() - - //********************************** User feature Join - - // Join1 for add The first User Feature : number of revisions a user has contributed - val AfterJoinUser1_All_Features = BeforeJoin_All_Features.as("T1").join(ContributorFreq_for_Each_Revision_DF.as("T2"), $"T1.U8UserID" === $"T2.CIDUSER1", "leftouter").drop("CIDUSER1") - //AfterJoinUser1_All_Features.show() - - // Join2 for add The second User Feature - val AfterJoinUser2_All_Features = AfterJoinUser1_All_Features.as("T1").join(CumulatedNumberof_uniqueItemsForUser_DF.as("T2"), $"T1.U8UserID" === $"T2.CIDUSER2", "leftouter").drop("CIDUSER2") - //AfterJoinUser2_All_Features.show() - - //********************************** Item Feature Join - // Join3 for add The First Item Feature :number of revisions an Item has - val AfterJoinItem3_All_Features = AfterJoinUser2_All_Features.as("T1").join(ItemFrequ_DF.as("T2"), $"T1.I11ItemTitle" === $"T2.itemid", "leftouter").drop("itemid") - // AfterJoinItem3_All_Features.show() - - // Join4 for add The Second Item Feature - val AfterJoinItem4_All_Features = AfterJoinItem3_All_Features.as("T1").join(CumulatedNumberof_UniqueUserForItem_DF.as("T2"), $"T1.I11ItemTitle" === $"T2.itemid", "leftouter").drop("itemid") - // AfterJoinItem4_All_Features.show() - - // Join5 for add The Third Item Feature - val AfterJoinItem5_All_Features = AfterJoinItem4_All_Features.as("T1").join(Fre_Item_DF.as("T2"), $"T1.I11ItemTitle" === $"T2.itemid", "leftouter").drop("itemid") - //2 AfterJoinItem5_All_Features.show() - - //******************************** - - //*Geografical information Feature from Meta File - //REVISION_ID|REVISION_SESSION_ID|USER_COUNTRY_CODE|USER_CONTINENT_CODE|USER_TIME_ZONE|USER_REGION_CODE|USER_CITY_NAME|USER_COUNTY_NAME|REVISION_TAGS - val df_GeoInf = sqlContext.read - .format("com.databricks.spark.csv") - .option("header", "true") // Use first line of all files as header - .option("inferSchema", "true") // Automatically infer data types - .load("hdfs://localhost:9000/mydata/Meta.csv").select("REVISION_ID", "REVISION_SESSION_ID", "USER_COUNTRY_CODE", "USER_CONTINENT_CODE", "USER_TIME_ZONE", "USER_REGION_CODE", "USER_CITY_NAME", "USER_COUNTY_NAME", "REVISION_TAGS") - // df_GeoInf.show() - - val df_Truth = sqlContext.read - .format("com.databricks.spark.csv") - .option("header", "true") // Use first line of all files as header - .option("inferSchema", "true") // Automatically infer data types - .load("hdfs://localhost:9000/mydata/truth.csv").select("REVISION_ID", "ROLLBACK_REVERTED", "UNDO_RESTORE_REVERTED") - // df_GeoInf.show() - - val AfterJoinGeoInfo_All_Features = AfterJoinItem5_All_Features.as("T1").join(df_GeoInf.as("T2"), $"T1.Rid" === $"T2.REVISION_ID", 
"leftouter").drop("REVISION_ID").cache() - // AfterJoinGeoInfo_All_Features.show() - - val Final_All_Features = AfterJoinGeoInfo_All_Features.as("T1").join(df_Truth.as("T2"), $"T1.Rid" === $"T2.REVISION_ID", "leftouter").drop("REVISION_ID").cache() - //Final_All_Features.show() - - // Pre- process Data ============================================================================================================================================================ - - // For String Column, We fill the Null values by "NA": - - var Fill_Missing_Final_All_Features = Final_All_Features.na.fill("NA", Seq("USER_COUNTRY_CODE", "USER_CONTINENT_CODE", "USER_TIME_ZONE", "USER_REGION_CODE", "USER_CITY_NAME", "USER_COUNTY_NAME", "REVISION_TAGS")).cache() - - // For Integer Frequency Column, We fill the Null values by 0: - Fill_Missing_Final_All_Features = Fill_Missing_Final_All_Features.na.fill(0, Seq("FreqItem", "NumberUniqUserEditItem", "NumberRevisionItemHas", "NumberofUniqueItemsUseredit", "NumberofRevisionsUserContributed", "REVISION_SESSION_ID")).cache() - //Fill_Missing_Final_All_Features.show() - - val BoolToDoubleUDF = udf { (BoolAsString: String) => if (BoolAsString == "T") 1.0 else 0.0 } - val IntegerToDouble = udf { (IntegerRevisionSessionID: Integer) => IntegerRevisionSessionID.toDouble } - Fill_Missing_Final_All_Features = Fill_Missing_Final_All_Features.withColumn("FinalROLLBACK_REVERTED", BoolToDoubleUDF(col("ROLLBACK_REVERTED"))) - Fill_Missing_Final_All_Features = Fill_Missing_Final_All_Features.withColumn("FinalUNDO_RESTORE_REVERTED", BoolToDoubleUDF(col("UNDO_RESTORE_REVERTED"))) - - Fill_Missing_Final_All_Features = Fill_Missing_Final_All_Features.withColumn("FinalREVISION_SESSION_ID", IntegerToDouble(col("REVISION_SESSION_ID"))) - - Fill_Missing_Final_All_Features = Fill_Missing_Final_All_Features.withColumn("FinalNumberofRevisionsUserContributed", IntegerToDouble(col("NumberofRevisionsUserContributed"))) - Fill_Missing_Final_All_Features = Fill_Missing_Final_All_Features.withColumn("FinalNumberofUniqueItemsUseredit", IntegerToDouble(col("NumberofUniqueItemsUseredit"))) - - Fill_Missing_Final_All_Features = Fill_Missing_Final_All_Features.withColumn("FinalNumberRevisionItemHas", IntegerToDouble(col("NumberRevisionItemHas"))) - Fill_Missing_Final_All_Features = Fill_Missing_Final_All_Features.withColumn("FinalNumberUniqUserEditItem", IntegerToDouble(col("NumberUniqUserEditItem"))) - Fill_Missing_Final_All_Features = Fill_Missing_Final_All_Features.withColumn("FinalFreqItem", IntegerToDouble(col("FreqItem"))) - - //===========================================================================Caharacter Features : Double , Integer Features ==================================================================================== - //Double Ratio: For Ratio Double column, Fill -1 value by Median:Character Features + Ratio of Word Features : - var Samples = Fill_Missing_Final_All_Features.sample(false, 0.001).cache() //.where($"S2SimikaritySitelinkandLabel">0.0 || $"S3SimilarityLabelandSitelink">0.0 || $"S4SimilarityCommentComment">0.0) - Samples.registerTempTable("df") - - val Query = "select " + - "percentile_approx(C1uppercaseratio, 0.5) as meadian1" + "," + "percentile_approx(C2lowercaseratio, 0.5) as median2" + " ," + - "percentile_approx(C3alphanumericratio, 0.5) as median3" + "," + "percentile_approx(C4asciiratio, 0.5) as median4" + "," + - "percentile_approx(C5bracketratio, 0.5) as median5" + "," + "percentile_approx(C6digitalratio, 0.5) as median6" + "," + - "percentile_approx(C7latinratio, 
0.5) as median7" + "," + "percentile_approx(C8whitespaceratio, 0.5) as median8" + "," + - "percentile_approx(C9puncratio, 0.5) as median9" + "," + "percentile_approx(C11arabicratio, 0.5) as median11" + "," + - "percentile_approx(C12bengaliratio, 0.5) as median12" + "," + "percentile_approx(C13brahmiratio, 0.5) as median13" + "," + - "percentile_approx(C14cyrilinratio, 0.5) as median14" + "," + "percentile_approx(C15hanratio, 0.5) as median15" + "," + - "percentile_approx(c16malysiaratio, 0.5) as median16" + "," + - "percentile_approx(C17tamiratio, 0.5) as median17" + "," + "percentile_approx(C18telugratio, 0.5) as median18" + "," + - "percentile_approx(C19symbolratio, 0.5) as median19" + "," + "percentile_approx(C20alpharatio, 0.5) as median20" + "," + - "percentile_approx(C21visibleratio, 0.5) as median21" + "," + "percentile_approx(C22printableratio, 0.5) as median22" + "," + - "percentile_approx(C23blankratio, 0.5) as median23" + "," + "percentile_approx(C24controlratio, 0.5) as median24" + "," + - "percentile_approx(C25hexaratio, 0.5) as median25" ++ "," + "percentile_approx(W1languagewordratio, 0.5) as median26" + "," + - "percentile_approx(W3lowercaseratio, 0.5) as median27" + "," + "percentile_approx(W6badwordratio, 0.5) as median28" + "," + - "percentile_approx(W7uppercaseratio, 0.5) as median27" + "," + "percentile_approx(W8banwordratio, 0.5) as median27" + " from df" - - val medianValues = sqlContext.sql(Query).rdd - val Median = medianValues.first() - - // Median : - // Character Ratio Features: UDF - val lkpUDF1 = udf { (i: Double) => if (i == 0) Median(0).toString().toDouble else i } - val lkpUDF2 = udf { (i: Double) => if (i == 0) Median(1).toString().toDouble else i } - val lkpUDF3 = udf { (i: Double) => if (i == 0) Median(2).toString().toDouble else i } - val lkpUDF4 = udf { (i: Double) => if (i == 0) Median(3).toString().toDouble else i } - val lkpUDF5 = udf { (i: Double) => if (i == 0) Median(4).toString().toDouble else i } - val lkpUDF6 = udf { (i: Double) => if (i == 0) Median(5).toString().toDouble else i } - val lkpUDF7 = udf { (i: Double) => if (i == 0) Median(6).toString().toDouble else i } - val lkpUDF8 = udf { (i: Double) => if (i == 0) Median(7).toString().toDouble else i } - val lkpUDF9 = udf { (i: Double) => if (i == 0) Median(8).toString().toDouble else i } - - val lkpUDF11 = udf { (i: Double) => if (i == 0) Median(9).toString().toDouble else i } - val lkpUDF12 = udf { (i: Double) => if (i == 0) Median(10).toString().toDouble else i } - val lkpUDF13 = udf { (i: Double) => if (i == 0) Median(11).toString().toDouble else i } - val lkpUDF14 = udf { (i: Double) => if (i == 0) Median(12).toString().toDouble else i } - val lkpUDF15 = udf { (i: Double) => if (i == 0) Median(13).toString().toDouble else i } - val lkpUDF16 = udf { (i: Double) => if (i == 0) Median(14).toString().toDouble else i } - val lkpUDF17 = udf { (i: Double) => if (i == 0) Median(15).toString().toDouble else i } - val lkpUDF18 = udf { (i: Double) => if (i == 0) Median(16).toString().toDouble else i } - val lkpUDF19 = udf { (i: Double) => if (i == 0) Median(17).toString().toDouble else i } - val lkpUDF20 = udf { (i: Double) => if (i == 0) Median(18).toString().toDouble else i } - val lkpUDF21 = udf { (i: Double) => if (i == 0) Median(19).toString().toDouble else i } - val lkpUDF22 = udf { (i: Double) => if (i == 0) Median(20).toString().toDouble else i } - val lkpUDF23 = udf { (i: Double) => if (i == 0) Median(21).toString().toDouble else i } - val lkpUDF24 = udf { (i: Double) => if (i == 0) 
Median(22).toString().toDouble else i } - val lkpUDF25 = udf { (i: Double) => if (i == 0) Median(23).toString().toDouble else i } - - val df1 = Fill_Missing_Final_All_Features.withColumn("FinalC1uppercaseratio", lkpUDF1(col("C1uppercaseratio"))) //.drop("C1uppercaseratio").cache() - val df2 = df1.withColumn("FinalC2lowercaseratio", lkpUDF2(col("C2lowercaseratio"))) //.drop("C2lowercaseratio").cache() - //df1.unpersist() - val df3 = df2.withColumn("FinalC3alphanumericratio", lkpUDF3(col("C3alphanumericratio"))) //.drop("C3alphanumericratio").cache() - //df2.unpersist() - val df4 = df3.withColumn("FinalC4asciiratio", lkpUDF4(col("C4asciiratio"))) //.drop("C4asciiratio").cache() - //df3.unpersist() - val df5 = df4.withColumn("FinalC5bracketratio", lkpUDF5(col("C5bracketratio"))) //.drop("C5bracketratio").cache() - //df4.unpersist() - val df6 = df5.withColumn("FinalC6digitalratio", lkpUDF6(col("C6digitalratio"))) //.drop("C6digitalratio").cache() - //df5.unpersist() - val df7 = df6.withColumn("FinalC7latinratio", lkpUDF7(col("C7latinratio"))) //.drop("C7latinratio").cache() - //df6.unpersist() - val df8 = df7.withColumn("FinalC8whitespaceratio", lkpUDF8(col("C8whitespaceratio"))) //.drop("C8whitespaceratio").cache() - //df7.unpersist() - val df9 = df8.withColumn("FinalC9puncratio", lkpUDF9(col("C9puncratio"))) //.drop("C9puncratio").cache() - - // Mean : - // character integer values : - val Mean_C10longcharacterseq = Samples.agg(mean("C10longcharacterseq")).head() - val C10_Mean = Mean_C10longcharacterseq.getDouble(0) - val lkpUDFC10 = udf { (i: Double) => if (i == 0) C10_Mean else i } - val df10 = df9.withColumn("FinalC10longcharacterseq", lkpUDFC10(col("C10longcharacterseq"))) - - //Median - val df11 = df10.withColumn("FinalC11arabicratio", lkpUDF11(col("C11arabicratio"))) //.drop("C11arabicratio").cache() - // df9.unpersist() - val df12 = df11.withColumn("FinalC12bengaliratio", lkpUDF12(col("C12bengaliratio"))) //.drop("C12bengaliratio").cache() - //df11.unpersist() - val df13 = df12.withColumn("FinalC13brahmiratio", lkpUDF13(col("C13brahmiratio"))) //.drop("C13brahmiratio").cache() - // df12.unpersist() - val df14 = df13.withColumn("FinalC14cyrilinratio", lkpUDF14(col("C14cyrilinratio"))) //.drop("C14cyrilinratio").cache() - // df13.unpersist() - val df15 = df14.withColumn("FinalC15hanratio", lkpUDF15(col("C15hanratio"))) //.drop("C15hanratio").cache() - // df14.unpersist() - val df16 = df15.withColumn("Finalc16malysiaratio", lkpUDF16(col("c16malysiaratio"))) //.drop("c16malysiaratio").cache() - //df15.unpersist() - val df17 = df16.withColumn("FinalC17tamiratio", lkpUDF17(col("C17tamiratio"))) //.drop("C17tamiratio").cache() - //df16.unpersist() - val df18 = df17.withColumn("FinalC18telugratio", lkpUDF18(col("C18telugratio"))) //.drop("C18telugratio").cache() - //df17.unpersist() - val df19 = df18.withColumn("FinalC19symbolratio", lkpUDF19(col("C19symbolratio"))) //.drop("C19symbolratio").cache() - //df18.unpersist() - val df20 = df19.withColumn("FinalC20alpharatio", lkpUDF20(col("C20alpharatio"))) //.drop("C20alpharatio").cache() - // df19.unpersist() - val df21 = df20.withColumn("FinalC21visibleratio", lkpUDF21(col("C21visibleratio"))) //.drop("C21visibleratio").cache() - // df20.unpersist() - val df22 = df21.withColumn("FinalC22printableratio", lkpUDF22(col("C22printableratio"))) //.drop("C22printableratio").cache() - //df21.unpersist() - val df23 = df22.withColumn("FinalC23blankratio", lkpUDF23(col("C23blankratio"))) //.drop("C23blankratio").cache() - // df22.unpersist() - val df24 = 
df23.withColumn("FinalC24controlratio", lkpUDF24(col("C24controlratio"))) //.drop("C24controlratio").cache() - //df23.unpersist() - val df25 = df24.withColumn("FinalC25hexaratio", lkpUDF25(col("C25hexaratio"))) //.drop("C25hexaratio").cache() - - //************************************************End Character Features **************************************************************************************** - - //************************************************Start Word Features **************************************************************************************** - - // Word Ratio Features : UDF - val lkpUDFW1 = udf { (i: Double) => if (i == 0) Median(24).toString().toDouble else i } - val lkpUDFW3 = udf { (i: Double) => if (i == 0) Median(25).toString().toDouble else i } - val lkpUDFW6 = udf { (i: Double) => if (i == 0) Median(26).toString().toDouble else i } - val lkpUDFW7 = udf { (i: Double) => if (i == 0) Median(27).toString().toDouble else i } - val lkpUDFW8 = udf { (i: Double) => if (i == 0) Median(28).toString().toDouble else i } - - //1. - val df26 = df25.withColumn("FinalW1languagewordratio", lkpUDFW1(col("W1languagewordratio"))) //.drop("W1languagewordratio").cache() - - //2.Boolean(Double) IsContainLanguageWord - - //3. - val df27 = df26.withColumn("FinalW3lowercaseratio", lkpUDFW3(col("W3lowercaseratio"))) //.drop("W3lowercaseratio").cache() - // df26.unpersist() - - //4. Integer " Mean: - val Mean_W4longestword = Samples.agg(mean("W4longestword")).head() - val W4_Mean = Mean_W4longestword.getDouble(0) - val lkpUDFW4 = udf { (i: Double) => if (i == 0) W4_Mean else i } - val df28 = df27.withColumn("FinalW4longestword", lkpUDFW4(col("W4longestword"))) - - //5. Boolean (Double ) W5IscontainURL - //6. - val df29 = df28.withColumn("FinalW6badwordratio", lkpUDFW6(col("W6badwordratio"))) //.drop("W6badwordratio").cache() - - //7. - val df30 = df29.withColumn("FinalW7uppercaseratio", lkpUDFW7(col("W7uppercaseratio"))) //.drop("W7uppercaseratio").cache() - - //8. - val df31 = df30.withColumn("FinalW8banwordratio", lkpUDFW8(col("W8banwordratio"))) //.drop("W8banwordratio").cache() - - //9.FemalFirst Boolean(Double) - //10.Male First Boolean(Double) - //11.ContainBadWord Boolean(Double) - //12ContainBanWord Boolean(Double) - - //13. Integer(Double): - val Mean_W13W13NumberSharewords = Samples.agg(mean("W13NumberSharewords")).head() - val W13_Mean = Mean_W13W13NumberSharewords.getDouble(0) - val lkpUDFW13 = udf { (i: Double) => if (i == 0) W13_Mean else i } - val df32 = df31.withColumn("FinalW13NumberSharewords", lkpUDFW13(col("W13NumberSharewords"))) - - //14. Integer (Double): - val Mean_W14NumberSharewordswithoutStopwords = Samples.agg(mean("W14NumberSharewordswithoutStopwords")).head() - val W14_Mean = Mean_W14NumberSharewordswithoutStopwords.getDouble(0) - val lkpUDFW14 = udf { (i: Double) => if (i == 0) W14_Mean else i } - val df33 = df32.withColumn("FinalW14NumberSharewordswithoutStopwords", lkpUDFW14(col("W14NumberSharewordswithoutStopwords"))) - - // 15. Double (Not ratio): - val Mean_W15PortionQid = Samples.agg(mean("W15PortionQid")).head() - val W15_Mean = Mean_W15PortionQid.getDouble(0) - val lkpUDFW15 = udf { (i: Double) => if (i == 0) W15_Mean else i } - val df34 = df33.withColumn("FinalW15PortionQid", lkpUDFW15(col("W15PortionQid"))) - - //16. 
Double(Not Ratio): - val Mean_W16PortionLnags = Samples.agg(mean("W16PortionLnags")).head() - val W16_Mean = Mean_W16PortionLnags.getDouble(0) - val lkpUDFW16 = udf { (i: Double) => if (i == 0) W16_Mean else i } - val df35 = df34.withColumn("FinalW16PortionLnags", lkpUDFW16(col("W16PortionLnags"))) - - //17.Double(Not ratio): - val Mean_W17PortionLinks = Samples.agg(mean("W17PortionLinks")).head() - val W17_Mean = Mean_W17PortionLinks.getDouble(0) - val lkpUDFW17 = udf { (i: Double) => if (i == 0) W17_Mean else i } - val df36 = df35.withColumn("FinalW17PortionLinks", lkpUDFW17(col("W17PortionLinks"))) - - //************************************************End Word Features **************************************************************************************** - - //************************************************Start Sentences Features **************************************************************************************** - // 1. Integer(Double) - val Mean_S1CommentTailLength = Samples.agg(mean("S1CommentTailLength")).head() - val S1_Mean = RoundDouble(Mean_S1CommentTailLength.getDouble(0)) - val lkpUDFS1 = udf { (i: Double) => if (i == 0) S1_Mean else i } - val df37 = df36.withColumn("FinalS1CommentTailLength", lkpUDFS1(col("S1CommentTailLength"))) - - //2. Double but Not ratio values : - val Mean_S2SimikaritySitelinkandLabel = Samples.agg(mean("S2SimikaritySitelinkandLabel")).head() - val S2_Mean = RoundDouble(Mean_S2SimikaritySitelinkandLabel.getDouble(0)) - val lkpUDFS2 = udf { (i: Double) => if (i == 0) S2_Mean else i } - val df39 = df37.withColumn("FinalS2SimikaritySitelinkandLabel", lkpUDFS2(col("S2SimikaritySitelinkandLabel"))) - - //3. Double but Not ratio values : - val Mean_S3SimilarityLabelandSitelink = Samples.agg(mean("S3SimilarityLabelandSitelink")).head() - val S3_Mean = RoundDouble(Mean_S3SimilarityLabelandSitelink.getDouble(0)) - val lkpUDFS3 = udf { (i: Double) => if (i == 0.0) S3_Mean else i } - val df40 = df39.withColumn("FinalS3SimilarityLabelandSitelink", lkpUDFS3(col("S3SimilarityLabelandSitelink"))) - - //4. Double but Not ratio values : - val Mean_S4SimilarityCommentComment = Samples.agg(mean("S4SimilarityCommentComment")).head() - val S4_Mean = RoundDouble(Mean_S4SimilarityCommentComment.getDouble(0)) - val lkpUDFS4 = udf { (i: Double) => if (i == 0.0) S4_Mean else i } - val df41 = df40.withColumn("FinalS4SimilarityCommentComment", lkpUDFS4(col("S4SimilarityCommentComment"))) - - //df41.show() - //************************************************End Sentences Features **************************************************************************************** - //*********************************************** Start Statement Features **************************************************************************************** - //1. String - //2. String - //3. String - //************************************************End Statement Features **************************************************************************************** - //*********************************************** Start User Features **************************************************************************************** - - //1.Boolean(Double) - //2.Boolean(Double) - //3.Boolean(Double) - //4.Boolean(Double) - //5.Boolean(Double) - //6.Boolean(Double) - //7. (Double) IP No need to fill Missing Data - //8. 
(Double) ID No need to fill Missing Data - //9.Boolean(Double) - //10.Boolean(Double) - - //*********************************************** End User Features **************************************************************************************** - //*********************************************** Start Item Features **************************************************************************************** - //1. Integer (Double) No need to fill missing values - //2. Integer (Double) No need to fill missing values - //3. Integer (Double) No need to fill missing values - //4. Integer (Double) No need to fill missing values - //5. Integer (Double) No need to fill missing values - //6. Integer (Double) No need to fill missing values - //7. Integer (Double) No need to fill missing values - //8. Integer (Double) No need to fill missing values - //9. Integer (Double) No need to fill missing values - //10. Integer (Double) No need to fill missing values - //11. String - //*********************************************** End Item Features **************************************************************************************** - //*********************************************** Start Revision Features **************************************************************************************** - //1.String - //2.String - //3.Boolean (Double) - //4.Integer(Double) - //5.String - //6.String - //7. Boolean(Double) - //8. String - //9.String - //10. Integer (Double) - //11.String - //12. integer(Double) - //13. Long(Double) - //14. integer (Double) - //15.String - //16.String - //*********************************************** End Revision Features **************************************************************************************** - //*********************************************** Meta Data , Truth Data and Frequnces **************************************************************************************** - //Meta - // 1.Revision Session :Integer (Converted to Double) - //2. 
User Country Code - //3.User Continent Code - //4.User Time Size - //5.User Region Code - //6.User-city Name - //7.User Country Name - //8.RevisionTags - + // Streaming records: + val jobConf = new JobConf() + val NormalXML_Parser_OBJ = new ParseNormalXML() + val RDD_OBJ = new ParseNormalXML() + + val Training_RDD_All_Record1 = RDD_OBJ.Training_DB_NormalXML_Parser_Input1(sc) + val Training_RDD_All_Record2 = RDD_OBJ.Training_DB_NormalXML_Parser_Input2(sc) + val Training_RDD_All_Record3 = RDD_OBJ.Training_DB_NormalXML_Parser_Input3(sc) + // RDD_All_Record1.foreach(println) + // RDD_All_Record2.foreach(println) + // RDD_All_Record3.foreach(println) + + val Training_RDD_All_Record = Training_RDD_All_Record1.union(Training_RDD_All_Record2).union(Training_RDD_All_Record3).distinct().cache() + + // println(RDD_All_Record.count()) + println(Training_RDD_All_Record.count()) + + // ======= Json part : + // Json RDD : Each record has its Revision ID: + val JsonRDD = Training_RDD_All_Record.map(_.split("NNLL")).map(v => replacing_with_Quoto(v(0), v(8))).cache() + // JsonRDD.foreach(println) + // println(JsonRDD.count()) + + // Data set + val Ds_Json = sqlContext.jsonRDD(JsonRDD).select("key", "id", "labels", "descriptions", "aliases", "claims", "sitelinks").cache() + // Ds_Json.show() + // println(Ds_Json.count()) + + // ======= Tags part : // Contributor IP here is in Decimal format not IP format and It is converted in ParseNormalXml stage + val TagsRDD = Training_RDD_All_Record.map(_.split("NNLL")).map(x => (x(0), x(1), x(2), x(3), x(4), x(5), x(6), x(7), x(8), x(9), x(10), x(11))).cache() + val DF_Tags = TagsRDD.toDF("Rid", "Itemid", "comment", "pid", "time", "contributorIP", + "contributorID", "contributorName", "JsonText", "model", "format", "sha").cache() + // DF_Tags.show() + // println(DF_Tags.count()) + + // ======== Join Json part with Tag Part:============================ + // Joining to have full data + val DF_First_DF_Result_Join_Tags_and_Json = DF_Tags.as("T1").join(Ds_Json.as("T2"), $"T1.Rid" === $"T2.key", "leftouter") + .select("Rid", "itemid", "comment", "pid", "time", "contributorIP", + "contributorID", "contributorName", "JsonText", "labels", "descriptions", + "aliases", "claims", "sitelinks", "model", "format", "sha") // .orderBy("Rid", "Itemid") + DF_First_DF_Result_Join_Tags_and_Json.registerTempTable("Data1") + val dfr_DATA_JsonTages1 = sqlContext.sql("select * from Data1 order by itemid ,Rid ").cache() + + val colNames = Seq("Rid2", "itemid2", "comment2", "pid2", "time2", "contributorIP2", + "contributorID2", "contributorName2", "JsonText2", "labels2", "descriptions2", + "aliases2", "claims2", "sitelinks2", "model2", "format2", "sha2") + val DF_Second = DF_First_DF_Result_Join_Tags_and_Json.toDF(colNames: _*) // .distinct() + DF_Second.registerTempTable("Data2") + + // ===================================================================Parent // Previous Revision============================================================================================================== + // val DF_Joined = result1.as("df1").join(result2.as("df2"), col("itemid") === col("itemid2") && col("index1") === col("index2") + 1, "leftouter").select("Rid", "itemid", "comment", "pid", "time", "contributorIP", "contributorID", "contributorName", "JsonText", "labels", "descriptions", "aliases", "claims", "sitelinks", "model", "format", "sha", "Rid2", "itemid2", "comment2", "pid2", "time2", "contributorIP2", "contributorID2", "contributorName2", "JsonText2", "labels2", "descriptions2", "claims2", "sitelinks2", "model2", "format2", "sha2") + // .select("itemid", "Rid","pid","time","itemid2","Rid2","pid2","time2") + + // Joining based on Parent Id to get the previous cases: ParentID + val DF_Joined = DF_First_DF_Result_Join_Tags_and_Json.as("df1").join(DF_Second.as("df2"), $"df1.pid" === $"df2.Rid2", "leftouter").distinct() + + val RDD_After_JoinDF = DF_Joined.rdd.distinct() + val x = RDD_After_JoinDF.map(row => (row(0).toString().toInt, row)).cache() + val part = new RangePartitioner(4, x) + val partitioned = x.partitionBy(part).persist() // persist is important for this case and obligatory. + // partitioned.foreach(println) + // + // //=====================================================All Features Based on Categories of Features Data Type :================================================================================== + // + val Result_all_Features = partitioned.map { case (x, y) => (x.toString() + "," + All_Features(y).toString()) } // we convert the Pair RDD to a one-line String RDD to be able to make a DF based on "," + // Result_all_Features.foreach(println) + // println("nayef" + Result_all_Features.count()) + + // Convert the RDD of All Features to DataFrame: + + val schema = StructType( + + // 0 + StructField("Rid", IntegerType, false) :: + + // Character Features : + /* 1 */ StructField("C1uppercaseratio", DoubleType, false) :: /* 2 */ StructField("C2lowercaseratio", DoubleType, false) :: /*3*/ StructField("C3alphanumericratio", DoubleType, false) :: + /* 4 */ StructField("C4asciiratio", DoubleType, false) :: /* 5 */ StructField("C5bracketratio", DoubleType, false) :: /*6*/ StructField("C6digitalratio", DoubleType, false) :: + /* 7 */ StructField("C7latinratio", DoubleType, false) :: /* 8 */ StructField("C8whitespaceratio", DoubleType, false) :: /* 9*/ StructField("C9puncratio", DoubleType, false) :: + /* 10 */ StructField("C10longcharacterseq", DoubleType, false) :: /* 11 */ StructField("C11arabicratio", DoubleType, false) :: /*12*/ StructField("C12bengaliratio", DoubleType, false) :: + /* 13 */ StructField("C13brahmiratio", DoubleType, false) :: /* 14 */ StructField("C14cyrilinratio", DoubleType, false) :: /*15*/ StructField("C15hanratio", DoubleType, false) :: + /* 16 */ StructField("c16malysiaratio", DoubleType, false) :: /* 17 */ StructField("C17tamiratio", DoubleType, false) :: /*18*/ StructField("C18telugratio", DoubleType, false) :: + /* 19 */ StructField("C19symbolratio", DoubleType, false) :: /* 20 */ StructField("C20alpharatio", DoubleType, false) :: /*21*/ StructField("C21visibleratio", DoubleType, false) :: + /* 22 */ StructField("C22printableratio", DoubleType, false) :: /* 23 */ StructField("C23blankratio", DoubleType, false) :: /*24 */ StructField("C24controlratio", DoubleType, false) :: + /* 25 */ StructField("C25hexaratio", DoubleType, false) :: + + // word Features: + /* 26 */ StructField("W1languagewordratio", DoubleType, false) :: /* 27 Boolean */ StructField("W2Iscontainlanguageword", DoubleType, false) :: /*28*/ StructField("W3lowercaseratio", DoubleType, false) :: + /* 29 Integer */ StructField("W4longestword", IntegerType, false) :: /* 30 Boolean */ StructField("W5IscontainURL", DoubleType, false) :: /*31*/ StructField("W6badwordratio", DoubleType, false) :: + /* 32 */ StructField("W7uppercaseratio", DoubleType, false) :: /* 33 */ StructField("W8banwordratio", DoubleType, false) :: /*34 Boolean */ StructField("W9FemalFirstName", DoubleType, false) :: + /* 35 Boolean */ StructField("W10MaleFirstName", DoubleType, false) :: /* 36 Boolean */ 
StructField("W11IscontainBadword", DoubleType, false) :: /*37 Boolean*/ StructField("W12IsContainBanword", DoubleType, false) :: + /* 38 integer */ StructField("W13NumberSharewords", DoubleType, false) :: /* 39 Integer */ StructField("W14NumberSharewordswithoutStopwords", DoubleType, false) :: + /* 40 */ StructField("W15PortionQid", DoubleType, false) :: /* 41 */ StructField("W16PortionLnags", DoubleType, false) :: /*42*/ StructField("W17PortionLinks", DoubleType, false) :: + + // + // // Sentences Features: + /* 43 */ StructField("S1CommentTailLength", DoubleType, false) :: /* 44 */ StructField("S2SimikaritySitelinkandLabel", DoubleType, false) :: /*45*/ StructField("S3SimilarityLabelandSitelink", DoubleType, false) :: /*46*/ StructField("S4SimilarityCommentComment", DoubleType, false) :: + // + // // Statements Features : + /* 47 */ StructField("SS1Property", StringType, false) :: /* 48 */ StructField("SS2DataValue", StringType, false) :: /*49*/ StructField("SS3ItemValue", StringType, false) :: + // + // + // // User Features : + /* 50 Boolean */ StructField("U1IsPrivileged", DoubleType, false) :: /*51 Boolean*/ StructField("U2IsBotUser", DoubleType, false) :: /*52 Boolean*/ StructField("U3IsBotuserWithFlaguser", DoubleType, false) :: + /* 53 Boolean */ StructField("U4IsProperty", DoubleType, false) :: /*54 Boolean*/ StructField("U5IsTranslator", DoubleType, false) :: /*55 Boolean*/ StructField("U6IsRegister", DoubleType, false) :: + /* 56 */ StructField("U7IPValue", DoubleType, false) :: /* 57 */ StructField("U8UserID", IntegerType, false) :: /*58*/ StructField("U9HasBirthDate", DoubleType, false) :: /*59*/ StructField("U10HasDeathDate", DoubleType, false) :: + + // Items Features : + + /* 60 */ StructField("I1NumberLabels", DoubleType, false) :: /* 61 */ StructField("I2NumberDescription", DoubleType, false) :: /*62*/ StructField("I3NumberAliases", DoubleType, false) :: /*63*/ StructField("I4NumberClaims", DoubleType, false) :: + /* 64 */ StructField("I5NumberSitelinks", DoubleType, false) :: /* 65 */ StructField("I6NumberStatement", DoubleType, false) :: /*66*/ StructField("I7NumberReferences", DoubleType, false) :: /*67*/ StructField("I8NumberQualifier", DoubleType, false) :: + /* 68 */ StructField("I9NumberQualifierOrder", DoubleType, false) :: /* 69 */ StructField("I10NumberBadges", DoubleType, false) :: /*70*/ StructField("I11ItemTitle", StringType, false) :: + + // Revision Features: + /* 71 */ StructField("R1languageRevision", StringType, false) :: /* 72 */ StructField("R2RevisionLanguageLocal", StringType, false) :: /*73*/ StructField("R3IslatainLanguage", DoubleType, false) :: + /* 74 */ StructField("R4JsonLength", DoubleType, false) :: /* 75 */ StructField("R5RevisionAction", StringType, false) :: /*76*/ StructField("R6PrevReviAction", StringType, false) :: + /* 77 */ StructField("R7RevisionAccountChange", DoubleType, false) :: /* 78 */ StructField("R8ParRevision", StringType, false) :: /*79*/ StructField("R9RevisionTime", StringType, false) :: + /* 80 */ StructField("R10RevisionSize", DoubleType, false) :: /* 81 */ StructField("R11ContentType", StringType, false) :: /*82*/ StructField("R12BytesIncrease", DoubleType, false) :: + /* 83 */ StructField("R13TimeSinceLastRevi", DoubleType, false) :: /* 84 */ StructField("R14CommentLength", DoubleType, false) :: /*85*/ StructField("R15RevisionSubaction", StringType, false) :: + /* 86 */ StructField("R16PrevReviSubaction", StringType, false) :: + + Nil) + + val rowRDD = Result_all_Features.map(line => line.split(",")).map(e ⇒ 
Row(e(0).toInt // character feature column + , e(1).toDouble, e(2).toDouble, e(3).toDouble, e(4).toDouble, e(5).toDouble, e(6).toDouble, e(7).toDouble, e(8).toDouble, e(9).toDouble, RoundDouble(e(10).toDouble), + e(11).toDouble, e(12).toDouble, e(13).toDouble, e(14).toDouble, e(15).toDouble, e(16).toDouble, e(17).toDouble, e(18).toDouble, e(19).toDouble, e(20).toDouble, e(21).toDouble, e(22).toDouble, e(23).toDouble, e(24).toDouble, e(25).toDouble // Word Feature column + , e(26).toDouble, e(27).toDouble, e(28).toDouble, e(29).toDouble.toInt, e(30).toDouble, e(31).toDouble, e(32).toDouble, e(33).toDouble, e(34).toDouble, e(35).toDouble, e(36).toDouble, e(37).toDouble, RoundDouble(e(38).toDouble), RoundDouble(e(39).toDouble), e(40).toDouble, e(41).toDouble, e(42).toDouble // Sentences Features column: + , RoundDouble(e(43).toDouble), e(44).toDouble, e(45).toDouble, e(46).toDouble // Statement Features Column: + , e(47), e(48), e(49) // User Features Column: + , e(50).toDouble, e(51).toDouble, e(52).toDouble, e(53).toDouble, e(54).toDouble, e(55).toDouble, e(56).toDouble, e(57).toDouble.toInt, e(58).toDouble, e(59).toDouble // Item Features column: + , e(60).toDouble, e(61).toDouble, e(62).toDouble, e(63).toDouble, e(64).toDouble, e(65).toDouble, e(66).toDouble, e(67).toDouble, e(68).toDouble, e(69).toDouble, "Q" + e(70).toDouble.toInt.toString() // Revision Features Column: + , e(71), e(72), e(73).toDouble, e(74).toDouble, e(75), e(76), e(77).toDouble, e(78), e(79), e(80).toDouble, e(81), e(82).toDouble, e(83).toDouble, e(84).toDouble, e(85), e(86))) + + // a.User Frequency: + // number of revisions a user has contributed + // val resu= DF_Tags.groupBy("contributorID").agg(count("Rid")) + DF_Tags.registerTempTable("TagesTable") + val ContributorFreq_for_Each_Revision_DF = sqlContext.sql("select contributorID as CIDUSER1, count(Rid) as NumberofRevisionsUserContributed from TagesTable where contributorID !='0' group by contributorID ") //.drop("CIDUSER1") + // ContributorFreq_for_Each_Revision_DF.show() + + // b.Cumulated : Number of a unique Item a user has contributed. + val CumulatedNumberof_uniqueItemsForUser_DF = sqlContext.sql("select contributorID as CIDUSER2, COUNT(DISTINCT itemid) as NumberofUniqueItemsUseredit from TagesTable where contributorID !='0' group by contributorID") //.drop("CIDUSER2") + // CumulatedNumberof_uniqueItemsForUser_DF.show() + + // 1.Item Frequency: + // number of revisions an Item has + val ItemFrequ_DF = sqlContext.sql("select itemid, count(Rid) as NumberRevisionItemHas from TagesTable group by itemid") + // ItemFrequ_DF.show() + + // 2. Cumulate number of unique users have edited the Item : Did not consider the users IP. Contributor is an IP or Name. we consider name + val CumulatedNumberof_UniqueUserForItem_DF = sqlContext.sql("select itemid, COUNT(DISTINCT contributorID) as NumberUniqUserEditItem from TagesTable where contributorID !='0' group by itemid") + // CumulatedNumberof_UniqueUserForItem_DF.show() + + // 3. 
+ val Fre_Item_DF = sqlContext.sql("select itemid, COUNT(itemid) as FreqItem from TagesTable group by itemid") + // Fre_Item_DF.show() + + // ***************************************************************************************************************************************** + // This is the main DataFrame: + val BeforeJoin_All_Features = sqlContext.createDataFrame(rowRDD, schema) + // BeforeJoin_All_Features.show() + + // ********************************** User feature Join + + // Join1 adds the first user feature: number of revisions a user has contributed + val AfterJoinUser1_All_Features = BeforeJoin_All_Features.as("T1").join(ContributorFreq_for_Each_Revision_DF.as("T2"), $"T1.U8UserID" === $"T2.CIDUSER1", "leftouter").drop("CIDUSER1") + // AfterJoinUser1_All_Features.show() + + // Join2 adds the second user feature + val AfterJoinUser2_All_Features = AfterJoinUser1_All_Features.as("T1").join(CumulatedNumberof_uniqueItemsForUser_DF.as("T2"), $"T1.U8UserID" === $"T2.CIDUSER2", "leftouter").drop("CIDUSER2") + // AfterJoinUser2_All_Features.show() + + // ********************************** Item Feature Join + // Join3 adds the first item feature: number of revisions an item has + val AfterJoinItem3_All_Features = AfterJoinUser2_All_Features.as("T1").join(ItemFrequ_DF.as("T2"), $"T1.I11ItemTitle" === $"T2.itemid", "leftouter").drop("itemid") + // AfterJoinItem3_All_Features.show() + + // Join4 adds the second item feature + val AfterJoinItem4_All_Features = AfterJoinItem3_All_Features.as("T1").join(CumulatedNumberof_UniqueUserForItem_DF.as("T2"), $"T1.I11ItemTitle" === $"T2.itemid", "leftouter").drop("itemid") + // AfterJoinItem4_All_Features.show() + + // Join5 adds the third item feature + val AfterJoinItem5_All_Features = AfterJoinItem4_All_Features.as("T1").join(Fre_Item_DF.as("T2"), $"T1.I11ItemTitle" === $"T2.itemid", "leftouter").drop("itemid") + // AfterJoinItem5_All_Features.show() + + // ******************************** + + // Geographical information features from the Meta file + // REVISION_ID|REVISION_SESSION_ID|USER_COUNTRY_CODE|USER_CONTINENT_CODE|USER_TIME_ZONE|USER_REGION_CODE|USER_CITY_NAME|USER_COUNTY_NAME|REVISION_TAGS + val df_GeoInf = sqlContext.read + .format("com.databricks.spark.csv") + .option("header", "true") // Use first line of all files as header + .option("inferSchema", "true") // Automatically infer data types + .load("hdfs://localhost:9000/mydata/Meta.csv").select("REVISION_ID", "REVISION_SESSION_ID", "USER_COUNTRY_CODE", "USER_CONTINENT_CODE", "USER_TIME_ZONE", "USER_REGION_CODE", "USER_CITY_NAME", "USER_COUNTY_NAME", "REVISION_TAGS") + // df_GeoInf.show() + + val df_Truth = sqlContext.read + .format("com.databricks.spark.csv") + .option("header", "true") // Use first line of all files as header + .option("inferSchema", "true") // Automatically infer data types + .load("hdfs://localhost:9000/mydata/truth.csv").select("REVISION_ID", "ROLLBACK_REVERTED", "UNDO_RESTORE_REVERTED") + // df_Truth.show() + + val AfterJoinGeoInfo_All_Features = AfterJoinItem5_All_Features.as("T1").join(df_GeoInf.as("T2"), $"T1.Rid" === $"T2.REVISION_ID", "leftouter").drop("REVISION_ID").cache() + // AfterJoinGeoInfo_All_Features.show() + + val Final_All_Features = AfterJoinGeoInfo_All_Features.as("T1").join(df_Truth.as("T2"), $"T1.Rid" === $"T2.REVISION_ID", "leftouter").drop("REVISION_ID").cache() + // Final_All_Features.show() + + // Pre-process data 
============================================================================================================================================================ + + // For string columns, we fill the null values with "NA": + + var Fill_Missing_Final_All_Features = Final_All_Features.na.fill("NA", Seq("USER_COUNTRY_CODE", "USER_CONTINENT_CODE", "USER_TIME_ZONE", "USER_REGION_CODE", "USER_CITY_NAME", "USER_COUNTY_NAME", "REVISION_TAGS")).cache() + + // For integer frequency columns, we fill the null values with 0: + Fill_Missing_Final_All_Features = Fill_Missing_Final_All_Features.na.fill(0, Seq("FreqItem", "NumberUniqUserEditItem", "NumberRevisionItemHas", "NumberofUniqueItemsUseredit", "NumberofRevisionsUserContributed", "REVISION_SESSION_ID")).cache() + // Fill_Missing_Final_All_Features.show() + + val BoolToDoubleUDF = udf { (BoolAsString: String) => if (BoolAsString == "T") 1.0 else 0.0 } + val IntegerToDouble = udf { (IntegerRevisionSessionID: Integer) => IntegerRevisionSessionID.toDouble } + Fill_Missing_Final_All_Features = Fill_Missing_Final_All_Features.withColumn("FinalROLLBACK_REVERTED", BoolToDoubleUDF(col("ROLLBACK_REVERTED"))) + Fill_Missing_Final_All_Features = Fill_Missing_Final_All_Features.withColumn("FinalUNDO_RESTORE_REVERTED", BoolToDoubleUDF(col("UNDO_RESTORE_REVERTED"))) + + Fill_Missing_Final_All_Features = Fill_Missing_Final_All_Features.withColumn("FinalREVISION_SESSION_ID", IntegerToDouble(col("REVISION_SESSION_ID"))) + + Fill_Missing_Final_All_Features = Fill_Missing_Final_All_Features.withColumn("FinalNumberofRevisionsUserContributed", IntegerToDouble(col("NumberofRevisionsUserContributed"))) + Fill_Missing_Final_All_Features = Fill_Missing_Final_All_Features.withColumn("FinalNumberofUniqueItemsUseredit", IntegerToDouble(col("NumberofUniqueItemsUseredit"))) + + Fill_Missing_Final_All_Features = Fill_Missing_Final_All_Features.withColumn("FinalNumberRevisionItemHas", IntegerToDouble(col("NumberRevisionItemHas"))) + Fill_Missing_Final_All_Features = Fill_Missing_Final_All_Features.withColumn("FinalNumberUniqUserEditItem", IntegerToDouble(col("NumberUniqUserEditItem"))) + Fill_Missing_Final_All_Features = Fill_Missing_Final_All_Features.withColumn("FinalFreqItem", IntegerToDouble(col("FreqItem"))) + + // =========================================================================== Character Features: Double and Integer Features ==================================================================================== + // Ratio (Double) columns: fill zero values with the sample median (the UDFs below test for 0): character features plus the ratio word features. + var Samples = Fill_Missing_Final_All_Features.sample(false, 0.001).cache() //.where($"S2SimikaritySitelinkandLabel">0.0 || $"S3SimilarityLabelandSitelink">0.0 || $"S4SimilarityCommentComment">0.0) + Samples.registerTempTable("df") + + val Query = "select " + + "percentile_approx(C1uppercaseratio, 0.5) as median1" + "," + "percentile_approx(C2lowercaseratio, 0.5) as median2" + "," + + "percentile_approx(C3alphanumericratio, 0.5) as median3" + "," + "percentile_approx(C4asciiratio, 0.5) as median4" + "," + + "percentile_approx(C5bracketratio, 0.5) as median5" + "," + "percentile_approx(C6digitalratio, 0.5) as median6" + "," + + "percentile_approx(C7latinratio, 0.5) as median7" + "," + "percentile_approx(C8whitespaceratio, 0.5) as median8" + "," + + "percentile_approx(C9puncratio, 0.5) as median9" + "," + "percentile_approx(C11arabicratio, 0.5) as median11" + "," + + "percentile_approx(C12bengaliratio, 0.5) as median12" + "," + 
"percentile_approx(C13brahmiratio, 0.5) as median13" + "," + + "percentile_approx(C14cyrilinratio, 0.5) as median14" + "," + "percentile_approx(C15hanratio, 0.5) as median15" + "," + + "percentile_approx(c16malysiaratio, 0.5) as median16" + "," + + "percentile_approx(C17tamiratio, 0.5) as median17" + "," + "percentile_approx(C18telugratio, 0.5) as median18" + "," + + "percentile_approx(C19symbolratio, 0.5) as median19" + "," + "percentile_approx(C20alpharatio, 0.5) as median20" + "," + + "percentile_approx(C21visibleratio, 0.5) as median21" + "," + "percentile_approx(C22printableratio, 0.5) as median22" + "," + + "percentile_approx(C23blankratio, 0.5) as median23" + "," + "percentile_approx(C24controlratio, 0.5) as median24" + "," + + "percentile_approx(C25hexaratio, 0.5) as median25" ++ "," + "percentile_approx(W1languagewordratio, 0.5) as median26" + "," + + "percentile_approx(W3lowercaseratio, 0.5) as median27" + "," + "percentile_approx(W6badwordratio, 0.5) as median28" + "," + + "percentile_approx(W7uppercaseratio, 0.5) as median27" + "," + "percentile_approx(W8banwordratio, 0.5) as median27" + " from df" + + val medianValues = sqlContext.sql(Query).rdd + val Median = medianValues.first() + + // Median : + // Character Ratio Features: UDF + val lkpUDF1 = udf { (i: Double) => if (i == 0) Median(0).toString().toDouble else i } + val lkpUDF2 = udf { (i: Double) => if (i == 0) Median(1).toString().toDouble else i } + val lkpUDF3 = udf { (i: Double) => if (i == 0) Median(2).toString().toDouble else i } + val lkpUDF4 = udf { (i: Double) => if (i == 0) Median(3).toString().toDouble else i } + val lkpUDF5 = udf { (i: Double) => if (i == 0) Median(4).toString().toDouble else i } + val lkpUDF6 = udf { (i: Double) => if (i == 0) Median(5).toString().toDouble else i } + val lkpUDF7 = udf { (i: Double) => if (i == 0) Median(6).toString().toDouble else i } + val lkpUDF8 = udf { (i: Double) => if (i == 0) Median(7).toString().toDouble else i } + val lkpUDF9 = udf { (i: Double) => if (i == 0) Median(8).toString().toDouble else i } + + val lkpUDF11 = udf { (i: Double) => if (i == 0) Median(9).toString().toDouble else i } + val lkpUDF12 = udf { (i: Double) => if (i == 0) Median(10).toString().toDouble else i } + val lkpUDF13 = udf { (i: Double) => if (i == 0) Median(11).toString().toDouble else i } + val lkpUDF14 = udf { (i: Double) => if (i == 0) Median(12).toString().toDouble else i } + val lkpUDF15 = udf { (i: Double) => if (i == 0) Median(13).toString().toDouble else i } + val lkpUDF16 = udf { (i: Double) => if (i == 0) Median(14).toString().toDouble else i } + val lkpUDF17 = udf { (i: Double) => if (i == 0) Median(15).toString().toDouble else i } + val lkpUDF18 = udf { (i: Double) => if (i == 0) Median(16).toString().toDouble else i } + val lkpUDF19 = udf { (i: Double) => if (i == 0) Median(17).toString().toDouble else i } + val lkpUDF20 = udf { (i: Double) => if (i == 0) Median(18).toString().toDouble else i } + val lkpUDF21 = udf { (i: Double) => if (i == 0) Median(19).toString().toDouble else i } + val lkpUDF22 = udf { (i: Double) => if (i == 0) Median(20).toString().toDouble else i } + val lkpUDF23 = udf { (i: Double) => if (i == 0) Median(21).toString().toDouble else i } + val lkpUDF24 = udf { (i: Double) => if (i == 0) Median(22).toString().toDouble else i } + val lkpUDF25 = udf { (i: Double) => if (i == 0) Median(23).toString().toDouble else i } + + val df1 = Fill_Missing_Final_All_Features.withColumn("FinalC1uppercaseratio", lkpUDF1(col("C1uppercaseratio"))) 
//.drop("C1uppercaseratio").cache() + val df2 = df1.withColumn("FinalC2lowercaseratio", lkpUDF2(col("C2lowercaseratio"))) //.drop("C2lowercaseratio").cache() + // df1.unpersist() + val df3 = df2.withColumn("FinalC3alphanumericratio", lkpUDF3(col("C3alphanumericratio"))) //.drop("C3alphanumericratio").cache() + // df2.unpersist() + val df4 = df3.withColumn("FinalC4asciiratio", lkpUDF4(col("C4asciiratio"))) //.drop("C4asciiratio").cache() + // df3.unpersist() + val df5 = df4.withColumn("FinalC5bracketratio", lkpUDF5(col("C5bracketratio"))) //.drop("C5bracketratio").cache() + // df4.unpersist() + val df6 = df5.withColumn("FinalC6digitalratio", lkpUDF6(col("C6digitalratio"))) //.drop("C6digitalratio").cache() + // df5.unpersist() + val df7 = df6.withColumn("FinalC7latinratio", lkpUDF7(col("C7latinratio"))) //.drop("C7latinratio").cache() + // df6.unpersist() + val df8 = df7.withColumn("FinalC8whitespaceratio", lkpUDF8(col("C8whitespaceratio"))) //.drop("C8whitespaceratio").cache() + // df7.unpersist() + val df9 = df8.withColumn("FinalC9puncratio", lkpUDF9(col("C9puncratio"))) //.drop("C9puncratio").cache() + + // Mean : + // character integer values : + val Mean_C10longcharacterseq = Samples.agg(mean("C10longcharacterseq")).head() + val C10_Mean = Mean_C10longcharacterseq.getDouble(0) + val lkpUDFC10 = udf { (i: Double) => if (i == 0) C10_Mean else i } + val df10 = df9.withColumn("FinalC10longcharacterseq", lkpUDFC10(col("C10longcharacterseq"))) + + // Median + val df11 = df10.withColumn("FinalC11arabicratio", lkpUDF11(col("C11arabicratio"))) //.drop("C11arabicratio").cache() + // df9.unpersist() + val df12 = df11.withColumn("FinalC12bengaliratio", lkpUDF12(col("C12bengaliratio"))) //.drop("C12bengaliratio").cache() + // df11.unpersist() + val df13 = df12.withColumn("FinalC13brahmiratio", lkpUDF13(col("C13brahmiratio"))) //.drop("C13brahmiratio").cache() + // df12.unpersist() + val df14 = df13.withColumn("FinalC14cyrilinratio", lkpUDF14(col("C14cyrilinratio"))) //.drop("C14cyrilinratio").cache() + // df13.unpersist() + val df15 = df14.withColumn("FinalC15hanratio", lkpUDF15(col("C15hanratio"))) //.drop("C15hanratio").cache() + // df14.unpersist() + val df16 = df15.withColumn("Finalc16malysiaratio", lkpUDF16(col("c16malysiaratio"))) //.drop("c16malysiaratio").cache() + // df15.unpersist() + val df17 = df16.withColumn("FinalC17tamiratio", lkpUDF17(col("C17tamiratio"))) //.drop("C17tamiratio").cache() + // df16.unpersist() + val df18 = df17.withColumn("FinalC18telugratio", lkpUDF18(col("C18telugratio"))) //.drop("C18telugratio").cache() + // df17.unpersist() + val df19 = df18.withColumn("FinalC19symbolratio", lkpUDF19(col("C19symbolratio"))) //.drop("C19symbolratio").cache() + // df18.unpersist() + val df20 = df19.withColumn("FinalC20alpharatio", lkpUDF20(col("C20alpharatio"))) //.drop("C20alpharatio").cache() + // df19.unpersist() + val df21 = df20.withColumn("FinalC21visibleratio", lkpUDF21(col("C21visibleratio"))) //.drop("C21visibleratio").cache() + // df20.unpersist() + val df22 = df21.withColumn("FinalC22printableratio", lkpUDF22(col("C22printableratio"))) //.drop("C22printableratio").cache() + // df21.unpersist() + val df23 = df22.withColumn("FinalC23blankratio", lkpUDF23(col("C23blankratio"))) //.drop("C23blankratio").cache() + // df22.unpersist() + val df24 = df23.withColumn("FinalC24controlratio", lkpUDF24(col("C24controlratio"))) //.drop("C24controlratio").cache() + // df23.unpersist() + val df25 = df24.withColumn("FinalC25hexaratio", lkpUDF25(col("C25hexaratio"))) 
//.drop("C25hexaratio").cache() + + // ************************************************End Character Features **************************************************************************************** + + // ************************************************Start Word Features **************************************************************************************** + + // Word Ratio Features : UDF + val lkpUDFW1 = udf { (i: Double) => if (i == 0) Median(24).toString().toDouble else i } + val lkpUDFW3 = udf { (i: Double) => if (i == 0) Median(25).toString().toDouble else i } + val lkpUDFW6 = udf { (i: Double) => if (i == 0) Median(26).toString().toDouble else i } + val lkpUDFW7 = udf { (i: Double) => if (i == 0) Median(27).toString().toDouble else i } + val lkpUDFW8 = udf { (i: Double) => if (i == 0) Median(28).toString().toDouble else i } + + // 1. + val df26 = df25.withColumn("FinalW1languagewordratio", lkpUDFW1(col("W1languagewordratio"))) //.drop("W1languagewordratio").cache() + + // 2.Boolean(Double) IsContainLanguageWord + + // 3. + val df27 = df26.withColumn("FinalW3lowercaseratio", lkpUDFW3(col("W3lowercaseratio"))) //.drop("W3lowercaseratio").cache() + // df26.unpersist() + + // 4. Integer " Mean: + val Mean_W4longestword = Samples.agg(mean("W4longestword")).head() + val W4_Mean = Mean_W4longestword.getDouble(0) + val lkpUDFW4 = udf { (i: Double) => if (i == 0) W4_Mean else i } + val df28 = df27.withColumn("FinalW4longestword", lkpUDFW4(col("W4longestword"))) + + // 5. Boolean (Double ) W5IscontainURL + // 6. + val df29 = df28.withColumn("FinalW6badwordratio", lkpUDFW6(col("W6badwordratio"))) //.drop("W6badwordratio").cache() + + // 7. + val df30 = df29.withColumn("FinalW7uppercaseratio", lkpUDFW7(col("W7uppercaseratio"))) //.drop("W7uppercaseratio").cache() + + // 8. + val df31 = df30.withColumn("FinalW8banwordratio", lkpUDFW8(col("W8banwordratio"))) //.drop("W8banwordratio").cache() + + // 9.FemalFirst Boolean(Double) + // 10.Male First Boolean(Double) + // 11.ContainBadWord Boolean(Double) + // 12ContainBanWord Boolean(Double) + + // 13. Integer(Double): + val Mean_W13W13NumberSharewords = Samples.agg(mean("W13NumberSharewords")).head() + val W13_Mean = Mean_W13W13NumberSharewords.getDouble(0) + val lkpUDFW13 = udf { (i: Double) => if (i == 0) W13_Mean else i } + val df32 = df31.withColumn("FinalW13NumberSharewords", lkpUDFW13(col("W13NumberSharewords"))) + + // 14. Integer (Double): + val Mean_W14NumberSharewordswithoutStopwords = Samples.agg(mean("W14NumberSharewordswithoutStopwords")).head() + val W14_Mean = Mean_W14NumberSharewordswithoutStopwords.getDouble(0) + val lkpUDFW14 = udf { (i: Double) => if (i == 0) W14_Mean else i } + val df33 = df32.withColumn("FinalW14NumberSharewordswithoutStopwords", lkpUDFW14(col("W14NumberSharewordswithoutStopwords"))) + + // 15. Double (Not ratio): + val Mean_W15PortionQid = Samples.agg(mean("W15PortionQid")).head() + val W15_Mean = Mean_W15PortionQid.getDouble(0) + val lkpUDFW15 = udf { (i: Double) => if (i == 0) W15_Mean else i } + val df34 = df33.withColumn("FinalW15PortionQid", lkpUDFW15(col("W15PortionQid"))) + + // 16. 
Double(Not Ratio): + val Mean_W16PortionLnags = Samples.agg(mean("W16PortionLnags")).head() + val W16_Mean = Mean_W16PortionLnags.getDouble(0) + val lkpUDFW16 = udf { (i: Double) => if (i == 0) W16_Mean else i } + val df35 = df34.withColumn("FinalW16PortionLnags", lkpUDFW16(col("W16PortionLnags"))) + + // 17.Double(Not ratio): + val Mean_W17PortionLinks = Samples.agg(mean("W17PortionLinks")).head() + val W17_Mean = Mean_W17PortionLinks.getDouble(0) + val lkpUDFW17 = udf { (i: Double) => if (i == 0) W17_Mean else i } + val df36 = df35.withColumn("FinalW17PortionLinks", lkpUDFW17(col("W17PortionLinks"))) + + // ************************************************End Word Features **************************************************************************************** + + // ************************************************Start Sentences Features **************************************************************************************** + // 1. Integer(Double) + val Mean_S1CommentTailLength = Samples.agg(mean("S1CommentTailLength")).head() + val S1_Mean = RoundDouble(Mean_S1CommentTailLength.getDouble(0)) + val lkpUDFS1 = udf { (i: Double) => if (i == 0) S1_Mean else i } + val df37 = df36.withColumn("FinalS1CommentTailLength", lkpUDFS1(col("S1CommentTailLength"))) + + // 2. Double but Not ratio values : + val Mean_S2SimikaritySitelinkandLabel = Samples.agg(mean("S2SimikaritySitelinkandLabel")).head() + val S2_Mean = RoundDouble(Mean_S2SimikaritySitelinkandLabel.getDouble(0)) + val lkpUDFS2 = udf { (i: Double) => if (i == 0) S2_Mean else i } + val df39 = df37.withColumn("FinalS2SimikaritySitelinkandLabel", lkpUDFS2(col("S2SimikaritySitelinkandLabel"))) + + // 3. Double but Not ratio values : + val Mean_S3SimilarityLabelandSitelink = Samples.agg(mean("S3SimilarityLabelandSitelink")).head() + val S3_Mean = RoundDouble(Mean_S3SimilarityLabelandSitelink.getDouble(0)) + val lkpUDFS3 = udf { (i: Double) => if (i == 0.0) S3_Mean else i } + val df40 = df39.withColumn("FinalS3SimilarityLabelandSitelink", lkpUDFS3(col("S3SimilarityLabelandSitelink"))) + + // 4. Double but Not ratio values : + val Mean_S4SimilarityCommentComment = Samples.agg(mean("S4SimilarityCommentComment")).head() + val S4_Mean = RoundDouble(Mean_S4SimilarityCommentComment.getDouble(0)) + val lkpUDFS4 = udf { (i: Double) => if (i == 0.0) S4_Mean else i } + val df41 = df40.withColumn("FinalS4SimilarityCommentComment", lkpUDFS4(col("S4SimilarityCommentComment"))) + + // df41.show() + // ************************************************End Sentences Features **************************************************************************************** + // *********************************************** Start Statement Features **************************************************************************************** + // 1. String + // 2. String + // 3. String + // ************************************************End Statement Features **************************************************************************************** + // *********************************************** Start User Features **************************************************************************************** + + // 1.Boolean(Double) + // 2.Boolean(Double) + // 3.Boolean(Double) + // 4.Boolean(Double) + // 5.Boolean(Double) + // 6.Boolean(Double) + // 7. (Double) IP No need to fill Missing Data + // 8. 
(Double) ID No need to fill Missing Data + // 9. Boolean (Double) + // 10. Boolean (Double) + + // *********************************************** End User Features **************************************************************************************** + // *********************************************** Start Item Features **************************************************************************************** + // 1. Integer (Double) No need to fill missing values + // 2. Integer (Double) No need to fill missing values + // 3. Integer (Double) No need to fill missing values + // 4. Integer (Double) No need to fill missing values + // 5. Integer (Double) No need to fill missing values + // 6. Integer (Double) No need to fill missing values + // 7. Integer (Double) No need to fill missing values + // 8. Integer (Double) No need to fill missing values + // 9. Integer (Double) No need to fill missing values + // 10. Integer (Double) No need to fill missing values + // 11. String + // *********************************************** End Item Features **************************************************************************************** + // *********************************************** Start Revision Features **************************************************************************************** + // 1. String + // 2. String + // 3. Boolean (Double) + // 4. Integer (Double) + // 5. String + // 6. String + // 7. Boolean (Double) + // 8. String + // 9. String + // 10. Integer (Double) + // 11. String + // 12. Integer (Double) + // 13. Long (Double) + // 14. Integer (Double) + // 15. String + // 16. String + // *********************************************** End Revision Features **************************************************************************************** + // *********************************************** Meta Data, Truth Data and Frequencies **************************************************************************************** + // Meta + // 1. Revision Session: Integer (converted to Double) + // 2. 
User Country Code + // 3. User Continent Code + // 4. User Time Zone + // 5. User Region Code + // 6. User City Name + // 7. User County Name + // 8. RevisionTags + + // Truth: + // 1. Undo + + // Freq: + + // 5 features + + // Roll Boolean: Boolean (Double) + // Undo Boolean: Boolean (Double) + + // *********************************************** End Meta Data, Truth Data and Frequencies **************************************************************************************** + + // =========================================================================== String Features ==================================================================================== + + val df42 = df41.withColumn( + // statement String features: + "StringFeatures", concat($"SS1Property", lit(";"), $"SS2DataValue", lit(";"), $"SS3ItemValue", lit(";"), $"I11ItemTitle", + // Revision String Features: + lit(";"), $"R1languageRevision", + lit(";"), $"R2RevisionLanguageLocal", + lit(";"), $"R5RevisionAction", + lit(";"), $"R6PrevReviAction", + lit(";"), $"R8ParRevision", + lit(";"), $"R9RevisionTime", + lit(";"), $"R11ContentType", + lit(";"), $"R15RevisionSubaction", + lit(";"), $"R16PrevReviSubaction", + + lit(";"), $"USER_COUNTRY_CODE", + lit(";"), $"USER_CONTINENT_CODE", + lit(";"), $"USER_TIME_ZONE", + lit(";"), $"USER_REGION_CODE", + lit(";"), $"USER_CITY_NAME", + lit(";"), $"USER_COUNTY_NAME", + lit(";"), $"REVISION_TAGS")) + + val toArray = udf((record: String) => record.split(";").map(_.toString())) + val test1 = df42.withColumn("StringFeatures", toArray(col("StringFeatures"))) + // test1.show() + // test1.printSchema() + + val word2Vec = new Word2Vec().setInputCol("StringFeatures").setOutputCol("result").setVectorSize(20).setMinCount(0) + val model = word2Vec.fit(test1) + val result = model.transform(test1) //.rdd + + // result.show() + + val Todense = udf((b: Vector) => b.toDense) + val test_new2 = result.withColumn("result", Todense(col("result"))) + + val assembler = new VectorAssembler().setInputCols(Array( + "result", + + // character + "FinalC1uppercaseratio", "FinalC2lowercaseratio", "FinalC3alphanumericratio", "FinalC4asciiratio", "FinalC5bracketratio", "FinalC6digitalratio", + "FinalC7latinratio", "FinalC8whitespaceratio", "FinalC9puncratio", "FinalC10longcharacterseq", "FinalC11arabicratio", "FinalC12bengaliratio", + "FinalC13brahmiratio", "FinalC14cyrilinratio", "FinalC15hanratio", "Finalc16malysiaratio", "FinalC17tamiratio", "FinalC18telugratio", + "FinalC19symbolratio", "FinalC20alpharatio", "FinalC21visibleratio", "FinalC22printableratio", "FinalC23blankratio", "FinalC24controlratio", "FinalC25hexaratio", + + // Words + "FinalW1languagewordratio", "W2Iscontainlanguageword", "FinalW3lowercaseratio", "FinalW4longestword", "W5IscontainURL", "FinalW6badwordratio", + "FinalW7uppercaseratio", "FinalW8banwordratio", "W9FemalFirstName", "W10MaleFirstName", "W11IscontainBadword", "W12IsContainBanword", + "FinalW13NumberSharewords", "FinalW14NumberSharewordswithoutStopwords", "FinalW15PortionQid", "FinalW16PortionLnags", "FinalW17PortionLinks", + + // Sentences : + "FinalS1CommentTailLength", "FinalS2SimikaritySitelinkandLabel", "FinalS3SimilarityLabelandSitelink", "FinalS4SimilarityCommentComment", + + // User : + "U1IsPrivileged", "U2IsBotUser", "U3IsBotuserWithFlaguser", "U4IsProperty", "U5IsTranslator", "U6IsRegister", "U7IPValue", "U8UserID", + "U9HasBirthDate", "U10HasDeathDate", + + // Item: + + "I1NumberLabels", "I2NumberDescription", "I3NumberAliases", "I4NumberClaims", "I5NumberSitelinks", "I6NumberStatement", + 
"I7NumberReferences", "I8NumberQualifier", "I9NumberQualifierOrder", "I10NumberBadges", + + // Revision: + "R3IslatainLanguage", "R4JsonLength", "R7RevisionAccountChange", "R10RevisionSize", "R12BytesIncrease", + "R13TimeSinceLastRevi", "R14CommentLength", + + // Meta , truth , Freq + // meta : + "FinalREVISION_SESSION_ID", // Truth: - //1.Undo - - // Freq : - - //1.5 features - - // Roll Boolean :Boolean (Double) - // Undo :Boolean (Double) - - //*********************************************** End Revision Features **************************************************************************************** - - //===========================================================================String Features==================================================================================== - - val df42 = df41.withColumn( - //statement String features: - "StringFeatures", concat($"SS1Property", lit(";"), $"SS2DataValue", lit(";"), $"SS3ItemValue", lit(";"), $"I11ItemTitle", - //Revision String Features: - lit(";"), $"R1languageRevision", - lit(";"), $"R2RevisionLanguageLocal", - lit(";"), $"R5RevisionAction", - lit(";"), $"R6PrevReviAction", - lit(";"), $"R8ParRevision", - lit(";"), $"R9RevisionTime", - lit(";"), $"R11ContentType", - lit(";"), $"R15RevisionSubaction", - lit(";"), $"R16PrevReviSubaction", - - lit(";"), $"USER_COUNTRY_CODE", - lit(";"), $"USER_CONTINENT_CODE", - lit(";"), $"USER_TIME_ZONE", - lit(";"), $"USER_REGION_CODE", - lit(";"), $"USER_CITY_NAME", - lit(";"), $"USER_COUNTY_NAME", - lit(";"), $"REVISION_TAGS")) - - val toArray = udf((record: String) => record.split(";").map(_.toString())) - val test1 = df42.withColumn("StringFeatures", toArray(col("StringFeatures"))) - // test1.show() - // test1.printSchema() - - val word2Vec = new Word2Vec().setInputCol("StringFeatures").setOutputCol("result").setVectorSize(20).setMinCount(0) - val model = word2Vec.fit(test1) - val result = model.transform(test1) //.rdd - - // result.show() - - val Todense = udf((b: Vector) => b.toDense) - val test_new2 = result.withColumn("result", Todense(col("result"))) - - val assembler = new VectorAssembler().setInputCols(Array( - "result", - - // character - "FinalC1uppercaseratio", "FinalC2lowercaseratio", "FinalC3alphanumericratio", "FinalC4asciiratio", "FinalC5bracketratio", "FinalC6digitalratio", - "FinalC7latinratio", "FinalC8whitespaceratio", "FinalC9puncratio", "FinalC10longcharacterseq", "FinalC11arabicratio", "FinalC12bengaliratio", - "FinalC13brahmiratio", "FinalC14cyrilinratio", "FinalC15hanratio", "Finalc16malysiaratio", "FinalC17tamiratio", "FinalC18telugratio", - "FinalC19symbolratio", "FinalC20alpharatio", "FinalC21visibleratio", "FinalC22printableratio", "FinalC23blankratio", "FinalC24controlratio", "FinalC25hexaratio", - - // Words - "FinalW1languagewordratio", "W2Iscontainlanguageword", "FinalW3lowercaseratio", "FinalW4longestword", "W5IscontainURL", "FinalW6badwordratio", - "FinalW7uppercaseratio", "FinalW8banwordratio", "W9FemalFirstName", "W10MaleFirstName", "W11IscontainBadword", "W12IsContainBanword", - "FinalW13NumberSharewords", "FinalW14NumberSharewordswithoutStopwords", "FinalW15PortionQid", "FinalW16PortionLnags", "FinalW17PortionLinks", - - //Sentences : - "FinalS1CommentTailLength", "FinalS2SimikaritySitelinkandLabel", "FinalS3SimilarityLabelandSitelink", "FinalS4SimilarityCommentComment", + "FinalUNDO_RESTORE_REVERTED", - // User : - "U1IsPrivileged", "U2IsBotUser", "U3IsBotuserWithFlaguser", "U4IsProperty", "U5IsTranslator", "U6IsRegister", "U7IPValue", "U8UserID", - 
"U9HasBirthDate", "U10HasDeathDate", + // Freq: + "FinalNumberofRevisionsUserContributed", + "FinalNumberofUniqueItemsUseredit", "FinalNumberRevisionItemHas", "FinalNumberUniqUserEditItem", "FinalFreqItem")).setOutputCol("features") + val Training_Data = assembler.transform(test_new2) - //Item: - - "I1NumberLabels", "I2NumberDescription", "I3NumberAliases", "I4NumberClaims", "I5NumberSitelinks", "I6NumberStatement", - "I7NumberReferences", "I8NumberQualifier", "I9NumberQualifierOrder", "I10NumberBadges", - - //Revision: - "R3IslatainLanguage", "R4JsonLength", "R7RevisionAccountChange", "R10RevisionSize", "R12BytesIncrease", - "R13TimeSinceLastRevi", "R14CommentLength", - - // Meta , truth , Freq - // meta : - "FinalREVISION_SESSION_ID", - // Truth: - "FinalUNDO_RESTORE_REVERTED", - - //Freq: - "FinalNumberofRevisionsUserContributed", - "FinalNumberofUniqueItemsUseredit", "FinalNumberRevisionItemHas", "FinalNumberUniqUserEditItem", "FinalFreqItem")).setOutputCol("features") - val Training_Data = assembler.transform(test_new2) - - // Prepare the data for classification: + // Prepare the data for classification: // NewData.registerTempTable("DB") // val Training_Data = sqlContext.sql("select Rid, features, FinalROLLBACK_REVERTED from DB") - //val Data = sqlContext.sql("select Rid, features, FinalROLLBACK_REVERTED as label from DB") // for logistic regrision + // val Data = sqlContext.sql("select Rid, features, FinalROLLBACK_REVERTED as label from DB") // for logistic regrision - //Data.show() + // Data.show() - //val TestClassifiers = new Classifiers() -// - // TestClassifiers.RandomForestClassifer(Data, sqlContext) -// // TestClassifiers.DecisionTreeClassifier(Data, sqlContext) -// // TestClassifiers.LogisticRegrision(Data, sqlContext) -// // TestClassifiers.GradientBoostedTree(Data, sqlContext) -// // TestClassifiers.MultilayerPerceptronClassifier(Data, sqlContext) + // val TestClassifiers = new Classifiers() + // + // TestClassifiers.RandomForestClassifer(Data, sqlContext) + // // TestClassifiers.DecisionTreeClassifier(Data, sqlContext) + // // TestClassifiers.LogisticRegrision(Data, sqlContext) + // // TestClassifiers.GradientBoostedTree(Data, sqlContext) + // // TestClassifiers.MultilayerPerceptronClassifier(Data, sqlContext) Training_Data - - + } - - //*********************************************************************************************************************************************** - // Function 3:Testing XML and Vandalism Detection + // *********************************************************************************************************************************************** + // Function 3:Testing XML and Vandalism Detection def Testing_Start_StandardXMLParser_VD(sc: SparkContext): DataFrame = { val sqlContext = new org.apache.spark.sql.SQLContext(sc) import sqlContext.implicits._ import org.apache.spark.sql.functions._ // for UDF import org.apache.spark.sql.types._ - // Streaming records: - val jobConf = new JobConf() - val NormalXML_Parser_OBJ = new ParseNormalXML() - val RDD_OBJ = new ParseNormalXML() - - val Testing_RDD_All_Record = RDD_OBJ.Testing_DB_NormalXML_Parser(sc).cache() - - - // ======= Json part : - //Json RDD : Each record has its Revision iD: - val JsonRDD = Testing_RDD_All_Record.map(_.split("NNLL")).map(v => replacing_with_Quoto(v(0), v(8))).cache() - //JsonRDD.foreach(println) - //println(JsonRDD.count()) - - // Data set - val Ds_Json = sqlContext.jsonRDD(JsonRDD).select("key", "id", "labels", "descriptions", "aliases", "claims", "sitelinks").cache() 
- //Ds_Json.show() - // println(Ds_Json.count()) - - // ======= Tags part : // Contributor IP here is in Decimal format not IP format and It is converted in ParseNormalXml stage - val TagsRDD = Testing_RDD_All_Record.map(_.split("NNLL")).map(x => (x(0), x(1), x(2), x(3), x(4), x(5), x(6), x(7), x(8), x(9), x(10), x(11))).cache() - val DF_Tags = TagsRDD.toDF("Rid", "Itemid", "comment", "pid", "time", "contributorIP", "contributorID", "contributorName", "JsonText", "model", "format", "sha").cache() - // DF_Tags.show() - // println(DF_Tags.count()) - - //======== Join Json part with Tag Part:============================ - //Joining to have full data - val DF_First_DF_Result_Join_Tags_and_Json = DF_Tags.as("T1").join(Ds_Json.as("T2"), $"T1.Rid" === $"T2.key", "leftouter").select("Rid", "itemid", "comment", "pid", "time", "contributorIP", "contributorID", "contributorName", "JsonText", "labels", "descriptions", "aliases", "claims", "sitelinks", "model", "format", "sha") //.orderBy("Rid", "Itemid") - DF_First_DF_Result_Join_Tags_and_Json.registerTempTable("Data1") - val dfr_DATA_JsonTages1 = sqlContext.sql("select * from Data1 order by itemid ,Rid ").cache() - - val colNames = Seq("Rid2", "itemid2", "comment2", "pid2", "time2", "contributorIP2", "contributorID2", "contributorName2", "JsonText2", "labels2", "descriptions2", "aliases2", "claims2", "sitelinks2", "model2", "format2", "sha2") - val DF_Second = DF_First_DF_Result_Join_Tags_and_Json.toDF(colNames: _*) //.distinct() - DF_Second.registerTempTable("Data2") - - //===================================================================Parent // Previous Revision============================================================================================================== - //val DF_Joined = result1.as("df1").join(result2.as("df2"), col("itemid") === col("itemid2") && col("index1") === col("index2") + 1, "leftouter").select("Rid", "itemid", "comment", "pid", "time", "contributorIP", "contributorID", "contributorName", "JsonText", "labels", "descriptions", "aliases", "claims", "sitelinks", "model", "format", "sha", "Rid2", "itemid2", "comment2", "pid2", "time2", "contributorIP2", "contributorID2", "contributorName2", "JsonText2", "labels2", "descriptions2", "aliases2", "claims2", "sitelinks2", "model2", "format2", "sha2") - //.select("itemid", "Rid","pid","time","itemid2","Rid2","pid2","time2") - - //Joining based on Parent Id to get the previous cases: ParentID - val DF_Joined = DF_First_DF_Result_Join_Tags_and_Json.as("df1").join(DF_Second.as("df2"), $"df1.pid" === $"df2.Rid2", "leftouter").distinct() - - val RDD_After_JoinDF = DF_Joined.rdd.distinct() - val x = RDD_After_JoinDF.map(row => (row(0).toString().toInt, row)).cache() - val part = new RangePartitioner(4, x) - val partitioned = x.partitionBy(part).persist() // persist is important for this case and obligatory. 
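The pairing and partitioning above (key each row by its revision id, then `partitionBy` with a `RangePartitioner`) is what keeps the later per-partition work balanced, and the `persist` matters because `RangePartitioner` samples the keyed RDD to choose split points, so without caching every downstream action would replay the join lineage. A self-contained sketch of the same pattern on hypothetical data (names here are illustrative, not from the pipeline):

  import org.apache.spark.{ RangePartitioner, SparkConf, SparkContext }

  object RangePartitionSketch {
    def main(args: Array[String]): Unit = {
      val sc = new SparkContext(new SparkConf().setAppName("RangePartitionSketch").setMaster("local[2]"))
      // Pair RDD keyed by an integer revision id, as in the pipeline above.
      val pairs = sc.parallelize(Seq(7 -> "revA", 1 -> "revB", 5 -> "revC", 3 -> "revD"))
      // RangePartitioner samples 'pairs' to pick split points, so keys land range-ordered across partitions.
      val part = new RangePartitioner(2, pairs)
      val partitioned = pairs.partitionBy(part).persist() // partitionBy shuffles; persist avoids recomputing it
      partitioned.foreachPartition(it => println(it.mkString(", ")))
      sc.stop()
    }
  }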
- //partitioned.foreach(println) - // - // //=====================================================All Features Based on Categories of Features Data Type :================================================================================== - // - val Result_all_Features = partitioned.map { case (x, y) => (x.toString() + "," + All_Features(y).toString()) } // we convert the Pair RDD to String one LineRDD to be able to make DF based on "," - //Result_all_Features.foreach(println) - // println("nayef" + Result_all_Features.count()) - - // Conver the RDD of All Features to DataFrame: - - val schema = StructType( - - //0 - StructField("Rid", IntegerType, false) :: - - // Character Features : - /* 1*/ StructField("C1uppercaseratio", DoubleType, false) :: /*2 */ StructField("C2lowercaseratio", DoubleType, false) :: /*3*/ StructField("C3alphanumericratio", DoubleType, false) :: - /*4*/ StructField("C4asciiratio", DoubleType, false) :: /*5*/ StructField("C5bracketratio", DoubleType, false) :: /*6*/ StructField("C6digitalratio", DoubleType, false) :: - /*7*/ StructField("C7latinratio", DoubleType, false) :: /*8*/ StructField("C8whitespaceratio", DoubleType, false) :: /* 9*/ StructField("C9puncratio", DoubleType, false) :: - /*10*/ StructField("C10longcharacterseq", DoubleType, false) :: /*11*/ StructField("C11arabicratio", DoubleType, false) :: /*12*/ StructField("C12bengaliratio", DoubleType, false) :: - /*13 */ StructField("C13brahmiratio", DoubleType, false) :: /*14*/ StructField("C14cyrilinratio", DoubleType, false) :: /*15*/ StructField("C15hanratio", DoubleType, false) :: - /*16*/ StructField("c16malysiaratio", DoubleType, false) :: /*17*/ StructField("C17tamiratio", DoubleType, false) :: /*18*/ StructField("C18telugratio", DoubleType, false) :: - /*19 */ StructField("C19symbolratio", DoubleType, false) :: /*20 */ StructField("C20alpharatio", DoubleType, false) :: /*21*/ StructField("C21visibleratio", DoubleType, false) :: - /*22*/ StructField("C22printableratio", DoubleType, false) :: /*23*/ StructField("C23blankratio", DoubleType, false) :: /*24 */ StructField("C24controlratio", DoubleType, false) :: - /* 25 */ StructField("C25hexaratio", DoubleType, false) :: - - //word Features: - /*26*/ StructField("W1languagewordratio", DoubleType, false) :: /*27 Boolean */ StructField("W2Iscontainlanguageword", DoubleType, false) :: /*28*/ StructField("W3lowercaseratio", DoubleType, false) :: - /*29 Integer */ StructField("W4longestword", IntegerType, false) :: /*30 Boolean */ StructField("W5IscontainURL", DoubleType, false) :: /*31*/ StructField("W6badwordratio", DoubleType, false) :: - /*32*/ StructField("W7uppercaseratio", DoubleType, false) :: /*33*/ StructField("W8banwordratio", DoubleType, false) :: /*34 Boolean */ StructField("W9FemalFirstName", DoubleType, false) :: - /*35 Boolean */ StructField("W10MaleFirstName", DoubleType, false) :: /*36 Boolean */ StructField("W11IscontainBadword", DoubleType, false) :: /*37 Boolean*/ StructField("W12IsContainBanword", DoubleType, false) :: - /*38 integer */ StructField("W13NumberSharewords", DoubleType, false) :: /*39 Integer */ StructField("W14NumberSharewordswithoutStopwords", DoubleType, false) :: - /*40*/ StructField("W15PortionQid", DoubleType, false) :: /*41*/ StructField("W16PortionLnags", DoubleType, false) :: /*42*/ StructField("W17PortionLinks", DoubleType, false) :: - - // - // // Sentences Features: - /*43*/ StructField("S1CommentTailLength", DoubleType, false) :: /*44*/ StructField("S2SimikaritySitelinkandLabel", DoubleType, false) :: /*45*/ 
StructField("S3SimilarityLabelandSitelink", DoubleType, false) :: /*46*/ StructField("S4SimilarityCommentComment", DoubleType, false) :: - // - // // Statements Features : - /*47*/ StructField("SS1Property", StringType, false) :: /*48*/ StructField("SS2DataValue", StringType, false) :: /*49*/ StructField("SS3ItemValue", StringType, false) :: - // - // - // //User Features : - /*50 Boolean*/ StructField("U1IsPrivileged", DoubleType, false) :: /*51 Boolean*/ StructField("U2IsBotUser", DoubleType, false) :: /*52 Boolean*/ StructField("U3IsBotuserWithFlaguser", DoubleType, false) :: - /*53 Boolean*/ StructField("U4IsProperty", DoubleType, false) :: /*54 Boolean*/ StructField("U5IsTranslator", DoubleType, false) :: /*55 Boolean*/ StructField("U6IsRegister", DoubleType, false) :: - /*56*/ StructField("U7IPValue", DoubleType, false) :: /*57*/ StructField("U8UserID", IntegerType, false) :: /*58*/ StructField("U9HasBirthDate", DoubleType, false) :: /*59*/ StructField("U10HasDeathDate", DoubleType, false) :: - - //Items Features : - - /*60*/ StructField("I1NumberLabels", DoubleType, false) :: /*61*/ StructField("I2NumberDescription", DoubleType, false) :: /*62*/ StructField("I3NumberAliases", DoubleType, false) :: /*63*/ StructField("I4NumberClaims", DoubleType, false) :: - /*64*/ StructField("I5NumberSitelinks", DoubleType, false) :: /*65*/ StructField("I6NumberStatement", DoubleType, false) :: /*66*/ StructField("I7NumberReferences", DoubleType, false) :: /*67*/ StructField("I8NumberQualifier", DoubleType, false) :: - /*68*/ StructField("I9NumberQualifierOrder", DoubleType, false) :: /*69*/ StructField("I10NumberBadges", DoubleType, false) :: /*70*/ StructField("I11ItemTitle", StringType, false) :: - - // Revision Features: - /*71*/ StructField("R1languageRevision", StringType, false) :: /*72*/ StructField("R2RevisionLanguageLocal", StringType, false) :: /*73*/ StructField("R3IslatainLanguage", DoubleType, false) :: - /*74*/ StructField("R4JsonLength", DoubleType, false) :: /*75*/ StructField("R5RevisionAction", StringType, false) :: /*76*/ StructField("R6PrevReviAction", StringType, false) :: - /*77*/ StructField("R7RevisionAccountChange", DoubleType, false) :: /*78*/ StructField("R8ParRevision", StringType, false) :: /*79*/ StructField("R9RevisionTime", StringType, false) :: - /*80*/ StructField("R10RevisionSize", DoubleType, false) :: /*81*/ StructField("R11ContentType", StringType, false) :: /*82*/ StructField("R12BytesIncrease", DoubleType, false) :: - /*83*/ StructField("R13TimeSinceLastRevi", DoubleType, false) :: /*84*/ StructField("R14CommentLength", DoubleType, false) :: /*85*/ StructField("R15RevisionSubaction", StringType, false) :: - /*86*/ StructField("R16PrevReviSubaction", StringType, false) :: - - Nil) - - val rowRDD = Result_all_Features.map(line => line.split(",")).map(e ⇒ Row(e(0).toInt // character feature column - , e(1).toDouble, e(2).toDouble, e(3).toDouble, e(4).toDouble, e(5).toDouble, e(6).toDouble, e(7).toDouble, e(8).toDouble, e(9).toDouble, RoundDouble(e(10).toDouble), - e(11).toDouble, e(12).toDouble, e(13).toDouble, e(14).toDouble, e(15).toDouble, e(16).toDouble, e(17).toDouble, e(18).toDouble, e(19).toDouble, e(20).toDouble, e(21).toDouble, e(22).toDouble, e(23).toDouble, e(24).toDouble, e(25).toDouble //Word Feature column - , e(26).toDouble, e(27).toDouble, e(28).toDouble, e(29).toDouble.toInt, e(30).toDouble, e(31).toDouble, e(32).toDouble, e(33).toDouble, e(34).toDouble, e(35).toDouble, e(36).toDouble, e(37).toDouble, RoundDouble(e(38).toDouble), 
RoundDouble(e(39).toDouble), e(40).toDouble, e(41).toDouble, e(42).toDouble // Sentences Features column: - , RoundDouble(e(43).toDouble), e(44).toDouble, e(45).toDouble, e(46).toDouble //Statement Features Column: - , e(47), e(48), e(49) // User Features Column: - , e(50).toDouble, e(51).toDouble, e(52).toDouble, e(53).toDouble, e(54).toDouble, e(55).toDouble, e(56).toDouble, e(57).toDouble.toInt, e(58).toDouble, e(59).toDouble //Item Features column: - , e(60).toDouble, e(61).toDouble, e(62).toDouble, e(63).toDouble, e(64).toDouble, e(65).toDouble, e(66).toDouble, e(67).toDouble, e(68).toDouble, e(69).toDouble, "Q" + e(70).toDouble.toInt.toString() //Revision Features Column: - , e(71), e(72), e(73).toDouble, e(74).toDouble, e(75), e(76), e(77).toDouble, e(78), e(79), e(80).toDouble, e(81), e(82).toDouble, e(83).toDouble, e(84).toDouble, e(85), e(86))) - - //a.User Frequency: - //number of revisions a user has contributed - //val resu= DF_Tags.groupBy("contributorID").agg(count("Rid")) - DF_Tags.registerTempTable("TagesTable") - val ContributorFreq_for_Each_Revision_DF = sqlContext.sql("select contributorID as CIDUSER1, count(Rid) as NumberofRevisionsUserContributed from TagesTable where contributorID !='0' group by contributorID ") //.drop("CIDUSER1") - //ContributorFreq_for_Each_Revision_DF.show() - - //b.Cumulated : Number of a unique Item a user has contributed. - val CumulatedNumberof_uniqueItemsForUser_DF = sqlContext.sql("select contributorID as CIDUSER2, COUNT(DISTINCT itemid) as NumberofUniqueItemsUseredit from TagesTable where contributorID !='0' group by contributorID") //.drop("CIDUSER2") - //CumulatedNumberof_uniqueItemsForUser_DF.show() - - //1.Item Frequency: - // number of revisions an Item has - val ItemFrequ_DF = sqlContext.sql("select itemid, count(Rid) as NumberRevisionItemHas from TagesTable group by itemid") - // ItemFrequ_DF.show() - - //2. Cumulate number of unique users have edited the Item : Did not consider the users IP. Contributor is an IP or Name. we consider name - val CumulatedNumberof_UniqueUserForItem_DF = sqlContext.sql("select itemid, COUNT(DISTINCT contributorID) as NumberUniqUserEditItem from TagesTable where contributorID !='0' group by itemid") - //CumulatedNumberof_UniqueUserForItem_DF.show() - - //3. 
freq each Item : - val Fre_Item_DF = sqlContext.sql("select itemid, COUNT(itemid) as FreqItem from TagesTable group by itemid") - // Fre_Item_DF.show() - - //***************************************************************************************************************************************** - // This is Main DataFrame: - val BeforeJoin_All_Features = sqlContext.createDataFrame(rowRDD, schema) - //BeforeJoin_All_Features.show() - - //********************************** User feature Join - - // Join1 for add The first User Feature : number of revisions a user has contributed - val AfterJoinUser1_All_Features = BeforeJoin_All_Features.as("T1").join(ContributorFreq_for_Each_Revision_DF.as("T2"), $"T1.U8UserID" === $"T2.CIDUSER1", "leftouter").drop("CIDUSER1") - //AfterJoinUser1_All_Features.show() - - // Join2 for add The second User Feature - val AfterJoinUser2_All_Features = AfterJoinUser1_All_Features.as("T1").join(CumulatedNumberof_uniqueItemsForUser_DF.as("T2"), $"T1.U8UserID" === $"T2.CIDUSER2", "leftouter").drop("CIDUSER2") - //AfterJoinUser2_All_Features.show() - - //********************************** Item Feature Join - // Join3 for add The First Item Feature :number of revisions an Item has - val AfterJoinItem3_All_Features = AfterJoinUser2_All_Features.as("T1").join(ItemFrequ_DF.as("T2"), $"T1.I11ItemTitle" === $"T2.itemid", "leftouter").drop("itemid") - // AfterJoinItem3_All_Features.show() - - // Join4 for add The Second Item Feature - val AfterJoinItem4_All_Features = AfterJoinItem3_All_Features.as("T1").join(CumulatedNumberof_UniqueUserForItem_DF.as("T2"), $"T1.I11ItemTitle" === $"T2.itemid", "leftouter").drop("itemid") - // AfterJoinItem4_All_Features.show() - - // Join5 for add The Third Item Feature - val AfterJoinItem5_All_Features = AfterJoinItem4_All_Features.as("T1").join(Fre_Item_DF.as("T2"), $"T1.I11ItemTitle" === $"T2.itemid", "leftouter").drop("itemid") - //2 AfterJoinItem5_All_Features.show() - - //******************************** - - //*Geografical information Feature from Meta File - //REVISION_ID|REVISION_SESSION_ID|USER_COUNTRY_CODE|USER_CONTINENT_CODE|USER_TIME_ZONE|USER_REGION_CODE|USER_CITY_NAME|USER_COUNTY_NAME|REVISION_TAGS - val df_GeoInf = sqlContext.read - .format("com.databricks.spark.csv") - .option("header", "true") // Use first line of all files as header - .option("inferSchema", "true") // Automatically infer data types - .load("hdfs://localhost:9000/mydata/Meta.csv").select("REVISION_ID", "REVISION_SESSION_ID", "USER_COUNTRY_CODE", "USER_CONTINENT_CODE", "USER_TIME_ZONE", "USER_REGION_CODE", "USER_CITY_NAME", "USER_COUNTY_NAME", "REVISION_TAGS") - // df_GeoInf.show() - - val df_Truth = sqlContext.read - .format("com.databricks.spark.csv") - .option("header", "true") // Use first line of all files as header - .option("inferSchema", "true") // Automatically infer data types - .load("hdfs://localhost:9000/mydata/truth.csv").select("REVISION_ID", "ROLLBACK_REVERTED", "UNDO_RESTORE_REVERTED") - // df_GeoInf.show() - - val AfterJoinGeoInfo_All_Features = AfterJoinItem5_All_Features.as("T1").join(df_GeoInf.as("T2"), $"T1.Rid" === $"T2.REVISION_ID", "leftouter").drop("REVISION_ID").cache() - // AfterJoinGeoInfo_All_Features.show() - - val Final_All_Features = AfterJoinGeoInfo_All_Features.as("T1").join(df_Truth.as("T2"), $"T1.Rid" === $"T2.REVISION_ID", "leftouter").drop("REVISION_ID").cache() - //Final_All_Features.show() - - // Pre- process Data 
============================================================================================================================================================ - - // For String Column, We fill the Null values by "NA": - - var Fill_Missing_Final_All_Features = Final_All_Features.na.fill("NA", Seq("USER_COUNTRY_CODE", "USER_CONTINENT_CODE", "USER_TIME_ZONE", "USER_REGION_CODE", "USER_CITY_NAME", "USER_COUNTY_NAME", "REVISION_TAGS")).cache() - - // For Integer Frequency Column, We fill the Null values by 0: - Fill_Missing_Final_All_Features = Fill_Missing_Final_All_Features.na.fill(0, Seq("FreqItem", "NumberUniqUserEditItem", "NumberRevisionItemHas", "NumberofUniqueItemsUseredit", "NumberofRevisionsUserContributed", "REVISION_SESSION_ID")).cache() - //Fill_Missing_Final_All_Features.show() - - val BoolToDoubleUDF = udf { (BoolAsString: String) => if (BoolAsString == "T") 1.0 else 0.0 } - val IntegerToDouble = udf { (IntegerRevisionSessionID: Integer) => IntegerRevisionSessionID.toDouble } - Fill_Missing_Final_All_Features = Fill_Missing_Final_All_Features.withColumn("FinalROLLBACK_REVERTED", BoolToDoubleUDF(col("ROLLBACK_REVERTED"))) - Fill_Missing_Final_All_Features = Fill_Missing_Final_All_Features.withColumn("FinalUNDO_RESTORE_REVERTED", BoolToDoubleUDF(col("UNDO_RESTORE_REVERTED"))) - - Fill_Missing_Final_All_Features = Fill_Missing_Final_All_Features.withColumn("FinalREVISION_SESSION_ID", IntegerToDouble(col("REVISION_SESSION_ID"))) - - Fill_Missing_Final_All_Features = Fill_Missing_Final_All_Features.withColumn("FinalNumberofRevisionsUserContributed", IntegerToDouble(col("NumberofRevisionsUserContributed"))) - Fill_Missing_Final_All_Features = Fill_Missing_Final_All_Features.withColumn("FinalNumberofUniqueItemsUseredit", IntegerToDouble(col("NumberofUniqueItemsUseredit"))) - - Fill_Missing_Final_All_Features = Fill_Missing_Final_All_Features.withColumn("FinalNumberRevisionItemHas", IntegerToDouble(col("NumberRevisionItemHas"))) - Fill_Missing_Final_All_Features = Fill_Missing_Final_All_Features.withColumn("FinalNumberUniqUserEditItem", IntegerToDouble(col("NumberUniqUserEditItem"))) - Fill_Missing_Final_All_Features = Fill_Missing_Final_All_Features.withColumn("FinalFreqItem", IntegerToDouble(col("FreqItem"))) - - //===========================================================================Caharacter Features : Double , Integer Features ==================================================================================== - //Double Ratio: For Ratio Double column, Fill -1 value by Median:Character Features + Ratio of Word Features : - var Samples = Fill_Missing_Final_All_Features.sample(false, 0.001).cache() //.where($"S2SimikaritySitelinkandLabel">0.0 || $"S3SimilarityLabelandSitelink">0.0 || $"S4SimilarityCommentComment">0.0) - Samples.registerTempTable("df") - - val Query = "select " + - "percentile_approx(C1uppercaseratio, 0.5) as meadian1" + "," + "percentile_approx(C2lowercaseratio, 0.5) as median2" + " ," + - "percentile_approx(C3alphanumericratio, 0.5) as median3" + "," + "percentile_approx(C4asciiratio, 0.5) as median4" + "," + - "percentile_approx(C5bracketratio, 0.5) as median5" + "," + "percentile_approx(C6digitalratio, 0.5) as median6" + "," + - "percentile_approx(C7latinratio, 0.5) as median7" + "," + "percentile_approx(C8whitespaceratio, 0.5) as median8" + "," + - "percentile_approx(C9puncratio, 0.5) as median9" + "," + "percentile_approx(C11arabicratio, 0.5) as median11" + "," + - "percentile_approx(C12bengaliratio, 0.5) as median12" + "," + 
"percentile_approx(C13brahmiratio, 0.5) as median13" + "," + - "percentile_approx(C14cyrilinratio, 0.5) as median14" + "," + "percentile_approx(C15hanratio, 0.5) as median15" + "," + - "percentile_approx(c16malysiaratio, 0.5) as median16" + "," + - "percentile_approx(C17tamiratio, 0.5) as median17" + "," + "percentile_approx(C18telugratio, 0.5) as median18" + "," + - "percentile_approx(C19symbolratio, 0.5) as median19" + "," + "percentile_approx(C20alpharatio, 0.5) as median20" + "," + - "percentile_approx(C21visibleratio, 0.5) as median21" + "," + "percentile_approx(C22printableratio, 0.5) as median22" + "," + - "percentile_approx(C23blankratio, 0.5) as median23" + "," + "percentile_approx(C24controlratio, 0.5) as median24" + "," + - "percentile_approx(C25hexaratio, 0.5) as median25" ++ "," + "percentile_approx(W1languagewordratio, 0.5) as median26" + "," + - "percentile_approx(W3lowercaseratio, 0.5) as median27" + "," + "percentile_approx(W6badwordratio, 0.5) as median28" + "," + - "percentile_approx(W7uppercaseratio, 0.5) as median27" + "," + "percentile_approx(W8banwordratio, 0.5) as median27" + " from df" - - val medianValues = sqlContext.sql(Query).rdd - val Median = medianValues.first() - - // Median : - // Character Ratio Features: UDF - val lkpUDF1 = udf { (i: Double) => if (i == 0) Median(0).toString().toDouble else i } - val lkpUDF2 = udf { (i: Double) => if (i == 0) Median(1).toString().toDouble else i } - val lkpUDF3 = udf { (i: Double) => if (i == 0) Median(2).toString().toDouble else i } - val lkpUDF4 = udf { (i: Double) => if (i == 0) Median(3).toString().toDouble else i } - val lkpUDF5 = udf { (i: Double) => if (i == 0) Median(4).toString().toDouble else i } - val lkpUDF6 = udf { (i: Double) => if (i == 0) Median(5).toString().toDouble else i } - val lkpUDF7 = udf { (i: Double) => if (i == 0) Median(6).toString().toDouble else i } - val lkpUDF8 = udf { (i: Double) => if (i == 0) Median(7).toString().toDouble else i } - val lkpUDF9 = udf { (i: Double) => if (i == 0) Median(8).toString().toDouble else i } - - val lkpUDF11 = udf { (i: Double) => if (i == 0) Median(9).toString().toDouble else i } - val lkpUDF12 = udf { (i: Double) => if (i == 0) Median(10).toString().toDouble else i } - val lkpUDF13 = udf { (i: Double) => if (i == 0) Median(11).toString().toDouble else i } - val lkpUDF14 = udf { (i: Double) => if (i == 0) Median(12).toString().toDouble else i } - val lkpUDF15 = udf { (i: Double) => if (i == 0) Median(13).toString().toDouble else i } - val lkpUDF16 = udf { (i: Double) => if (i == 0) Median(14).toString().toDouble else i } - val lkpUDF17 = udf { (i: Double) => if (i == 0) Median(15).toString().toDouble else i } - val lkpUDF18 = udf { (i: Double) => if (i == 0) Median(16).toString().toDouble else i } - val lkpUDF19 = udf { (i: Double) => if (i == 0) Median(17).toString().toDouble else i } - val lkpUDF20 = udf { (i: Double) => if (i == 0) Median(18).toString().toDouble else i } - val lkpUDF21 = udf { (i: Double) => if (i == 0) Median(19).toString().toDouble else i } - val lkpUDF22 = udf { (i: Double) => if (i == 0) Median(20).toString().toDouble else i } - val lkpUDF23 = udf { (i: Double) => if (i == 0) Median(21).toString().toDouble else i } - val lkpUDF24 = udf { (i: Double) => if (i == 0) Median(22).toString().toDouble else i } - val lkpUDF25 = udf { (i: Double) => if (i == 0) Median(23).toString().toDouble else i } - - val df1 = Fill_Missing_Final_All_Features.withColumn("FinalC1uppercaseratio", lkpUDF1(col("C1uppercaseratio"))) 
//.drop("C1uppercaseratio").cache() - val df2 = df1.withColumn("FinalC2lowercaseratio", lkpUDF2(col("C2lowercaseratio"))) //.drop("C2lowercaseratio").cache() - //df1.unpersist() - val df3 = df2.withColumn("FinalC3alphanumericratio", lkpUDF3(col("C3alphanumericratio"))) //.drop("C3alphanumericratio").cache() - //df2.unpersist() - val df4 = df3.withColumn("FinalC4asciiratio", lkpUDF4(col("C4asciiratio"))) //.drop("C4asciiratio").cache() - //df3.unpersist() - val df5 = df4.withColumn("FinalC5bracketratio", lkpUDF5(col("C5bracketratio"))) //.drop("C5bracketratio").cache() - //df4.unpersist() - val df6 = df5.withColumn("FinalC6digitalratio", lkpUDF6(col("C6digitalratio"))) //.drop("C6digitalratio").cache() - //df5.unpersist() - val df7 = df6.withColumn("FinalC7latinratio", lkpUDF7(col("C7latinratio"))) //.drop("C7latinratio").cache() - //df6.unpersist() - val df8 = df7.withColumn("FinalC8whitespaceratio", lkpUDF8(col("C8whitespaceratio"))) //.drop("C8whitespaceratio").cache() - //df7.unpersist() - val df9 = df8.withColumn("FinalC9puncratio", lkpUDF9(col("C9puncratio"))) //.drop("C9puncratio").cache() - - // Mean : - // character integer values : - val Mean_C10longcharacterseq = Samples.agg(mean("C10longcharacterseq")).head() - val C10_Mean = Mean_C10longcharacterseq.getDouble(0) - val lkpUDFC10 = udf { (i: Double) => if (i == 0) C10_Mean else i } - val df10 = df9.withColumn("FinalC10longcharacterseq", lkpUDFC10(col("C10longcharacterseq"))) - - //Median - val df11 = df10.withColumn("FinalC11arabicratio", lkpUDF11(col("C11arabicratio"))) //.drop("C11arabicratio").cache() - // df9.unpersist() - val df12 = df11.withColumn("FinalC12bengaliratio", lkpUDF12(col("C12bengaliratio"))) //.drop("C12bengaliratio").cache() - //df11.unpersist() - val df13 = df12.withColumn("FinalC13brahmiratio", lkpUDF13(col("C13brahmiratio"))) //.drop("C13brahmiratio").cache() - // df12.unpersist() - val df14 = df13.withColumn("FinalC14cyrilinratio", lkpUDF14(col("C14cyrilinratio"))) //.drop("C14cyrilinratio").cache() - // df13.unpersist() - val df15 = df14.withColumn("FinalC15hanratio", lkpUDF15(col("C15hanratio"))) //.drop("C15hanratio").cache() - // df14.unpersist() - val df16 = df15.withColumn("Finalc16malysiaratio", lkpUDF16(col("c16malysiaratio"))) //.drop("c16malysiaratio").cache() - //df15.unpersist() - val df17 = df16.withColumn("FinalC17tamiratio", lkpUDF17(col("C17tamiratio"))) //.drop("C17tamiratio").cache() - //df16.unpersist() - val df18 = df17.withColumn("FinalC18telugratio", lkpUDF18(col("C18telugratio"))) //.drop("C18telugratio").cache() - //df17.unpersist() - val df19 = df18.withColumn("FinalC19symbolratio", lkpUDF19(col("C19symbolratio"))) //.drop("C19symbolratio").cache() - //df18.unpersist() - val df20 = df19.withColumn("FinalC20alpharatio", lkpUDF20(col("C20alpharatio"))) //.drop("C20alpharatio").cache() - // df19.unpersist() - val df21 = df20.withColumn("FinalC21visibleratio", lkpUDF21(col("C21visibleratio"))) //.drop("C21visibleratio").cache() - // df20.unpersist() - val df22 = df21.withColumn("FinalC22printableratio", lkpUDF22(col("C22printableratio"))) //.drop("C22printableratio").cache() - //df21.unpersist() - val df23 = df22.withColumn("FinalC23blankratio", lkpUDF23(col("C23blankratio"))) //.drop("C23blankratio").cache() - // df22.unpersist() - val df24 = df23.withColumn("FinalC24controlratio", lkpUDF24(col("C24controlratio"))) //.drop("C24controlratio").cache() - //df23.unpersist() - val df25 = df24.withColumn("FinalC25hexaratio", lkpUDF25(col("C25hexaratio"))) //.drop("C25hexaratio").cache() - - 
//************************************************End Character Features **************************************************************************************** - - //************************************************Start Word Features **************************************************************************************** - - // Word Ratio Features : UDF - val lkpUDFW1 = udf { (i: Double) => if (i == 0) Median(24).toString().toDouble else i } - val lkpUDFW3 = udf { (i: Double) => if (i == 0) Median(25).toString().toDouble else i } - val lkpUDFW6 = udf { (i: Double) => if (i == 0) Median(26).toString().toDouble else i } - val lkpUDFW7 = udf { (i: Double) => if (i == 0) Median(27).toString().toDouble else i } - val lkpUDFW8 = udf { (i: Double) => if (i == 0) Median(28).toString().toDouble else i } - - //1. - val df26 = df25.withColumn("FinalW1languagewordratio", lkpUDFW1(col("W1languagewordratio"))) //.drop("W1languagewordratio").cache() - - //2.Boolean(Double) IsContainLanguageWord - - //3. - val df27 = df26.withColumn("FinalW3lowercaseratio", lkpUDFW3(col("W3lowercaseratio"))) //.drop("W3lowercaseratio").cache() - // df26.unpersist() - - //4. Integer " Mean: - val Mean_W4longestword = Samples.agg(mean("W4longestword")).head() - val W4_Mean = Mean_W4longestword.getDouble(0) - val lkpUDFW4 = udf { (i: Double) => if (i == 0) W4_Mean else i } - val df28 = df27.withColumn("FinalW4longestword", lkpUDFW4(col("W4longestword"))) - - //5. Boolean (Double ) W5IscontainURL - //6. - val df29 = df28.withColumn("FinalW6badwordratio", lkpUDFW6(col("W6badwordratio"))) //.drop("W6badwordratio").cache() - - //7. - val df30 = df29.withColumn("FinalW7uppercaseratio", lkpUDFW7(col("W7uppercaseratio"))) //.drop("W7uppercaseratio").cache() - - //8. - val df31 = df30.withColumn("FinalW8banwordratio", lkpUDFW8(col("W8banwordratio"))) //.drop("W8banwordratio").cache() - - //9.FemalFirst Boolean(Double) - //10.Male First Boolean(Double) - //11.ContainBadWord Boolean(Double) - //12ContainBanWord Boolean(Double) - - //13. Integer(Double): - val Mean_W13W13NumberSharewords = Samples.agg(mean("W13NumberSharewords")).head() - val W13_Mean = Mean_W13W13NumberSharewords.getDouble(0) - val lkpUDFW13 = udf { (i: Double) => if (i == 0) W13_Mean else i } - val df32 = df31.withColumn("FinalW13NumberSharewords", lkpUDFW13(col("W13NumberSharewords"))) - - //14. Integer (Double): - val Mean_W14NumberSharewordswithoutStopwords = Samples.agg(mean("W14NumberSharewordswithoutStopwords")).head() - val W14_Mean = Mean_W14NumberSharewordswithoutStopwords.getDouble(0) - val lkpUDFW14 = udf { (i: Double) => if (i == 0) W14_Mean else i } - val df33 = df32.withColumn("FinalW14NumberSharewordswithoutStopwords", lkpUDFW14(col("W14NumberSharewordswithoutStopwords"))) - - // 15. Double (Not ratio): - val Mean_W15PortionQid = Samples.agg(mean("W15PortionQid")).head() - val W15_Mean = Mean_W15PortionQid.getDouble(0) - val lkpUDFW15 = udf { (i: Double) => if (i == 0) W15_Mean else i } - val df34 = df33.withColumn("FinalW15PortionQid", lkpUDFW15(col("W15PortionQid"))) - - //16. 
Double(Not Ratio): - val Mean_W16PortionLnags = Samples.agg(mean("W16PortionLnags")).head() - val W16_Mean = Mean_W16PortionLnags.getDouble(0) - val lkpUDFW16 = udf { (i: Double) => if (i == 0) W16_Mean else i } - val df35 = df34.withColumn("FinalW16PortionLnags", lkpUDFW16(col("W16PortionLnags"))) - - //17.Double(Not ratio): - val Mean_W17PortionLinks = Samples.agg(mean("W17PortionLinks")).head() - val W17_Mean = Mean_W17PortionLinks.getDouble(0) - val lkpUDFW17 = udf { (i: Double) => if (i == 0) W17_Mean else i } - val df36 = df35.withColumn("FinalW17PortionLinks", lkpUDFW17(col("W17PortionLinks"))) - - //************************************************End Word Features **************************************************************************************** - - //************************************************Start Sentences Features **************************************************************************************** - // 1. Integer(Double) - val Mean_S1CommentTailLength = Samples.agg(mean("S1CommentTailLength")).head() - val S1_Mean = RoundDouble(Mean_S1CommentTailLength.getDouble(0)) - val lkpUDFS1 = udf { (i: Double) => if (i == 0) S1_Mean else i } - val df37 = df36.withColumn("FinalS1CommentTailLength", lkpUDFS1(col("S1CommentTailLength"))) - - //2. Double but Not ratio values : - val Mean_S2SimikaritySitelinkandLabel = Samples.agg(mean("S2SimikaritySitelinkandLabel")).head() - val S2_Mean = RoundDouble(Mean_S2SimikaritySitelinkandLabel.getDouble(0)) - val lkpUDFS2 = udf { (i: Double) => if (i == 0) S2_Mean else i } - val df39 = df37.withColumn("FinalS2SimikaritySitelinkandLabel", lkpUDFS2(col("S2SimikaritySitelinkandLabel"))) - - //3. Double but Not ratio values : - val Mean_S3SimilarityLabelandSitelink = Samples.agg(mean("S3SimilarityLabelandSitelink")).head() - val S3_Mean = RoundDouble(Mean_S3SimilarityLabelandSitelink.getDouble(0)) - val lkpUDFS3 = udf { (i: Double) => if (i == 0.0) S3_Mean else i } - val df40 = df39.withColumn("FinalS3SimilarityLabelandSitelink", lkpUDFS3(col("S3SimilarityLabelandSitelink"))) - - //4. Double but Not ratio values : - val Mean_S4SimilarityCommentComment = Samples.agg(mean("S4SimilarityCommentComment")).head() - val S4_Mean = RoundDouble(Mean_S4SimilarityCommentComment.getDouble(0)) - val lkpUDFS4 = udf { (i: Double) => if (i == 0.0) S4_Mean else i } - val df41 = df40.withColumn("FinalS4SimilarityCommentComment", lkpUDFS4(col("S4SimilarityCommentComment"))) - - //df41.show() - //************************************************End Sentences Features **************************************************************************************** - //*********************************************** Start Statement Features **************************************************************************************** - //1. String - //2. String - //3. String - //************************************************End Statement Features **************************************************************************************** - //*********************************************** Start User Features **************************************************************************************** - - //1.Boolean(Double) - //2.Boolean(Double) - //3.Boolean(Double) - //4.Boolean(Double) - //5.Boolean(Double) - //6.Boolean(Double) - //7. (Double) IP No need to fill Missing Data - //8. 
(Double) ID No need to fill Missing Data - //9.Boolean(Double) - //10.Boolean(Double) - - //*********************************************** End User Features **************************************************************************************** - //*********************************************** Start Item Features **************************************************************************************** - //1. Integer (Double) No need to fill missing values - //2. Integer (Double) No need to fill missing values - //3. Integer (Double) No need to fill missing values - //4. Integer (Double) No need to fill missing values - //5. Integer (Double) No need to fill missing values - //6. Integer (Double) No need to fill missing values - //7. Integer (Double) No need to fill missing values - //8. Integer (Double) No need to fill missing values - //9. Integer (Double) No need to fill missing values - //10. Integer (Double) No need to fill missing values - //11. String - //*********************************************** End Item Features **************************************************************************************** - //*********************************************** Start Revision Features **************************************************************************************** - //1.String - //2.String - //3.Boolean (Double) - //4.Integer(Double) - //5.String - //6.String - //7. Boolean(Double) - //8. String - //9.String - //10. Integer (Double) - //11.String - //12. integer(Double) - //13. Long(Double) - //14. integer (Double) - //15.String - //16.String - //*********************************************** End Revision Features **************************************************************************************** - //*********************************************** Meta Data , Truth Data and Frequnces **************************************************************************************** - //Meta - // 1.Revision Session :Integer (Converted to Double) - //2. 
User Country Code
-//3.User Continent Code
-//4.User Time Size
-//5.User Region Code
-//6.User-city Name
-//7.User Country Name
-//8.RevisionTags
-
+ // Streaming records:
+ val jobConf = new JobConf()
+ val NormalXML_Parser_OBJ = new ParseNormalXML()
+ val RDD_OBJ = new ParseNormalXML()
+
+ val Testing_RDD_All_Record = RDD_OBJ.Testing_DB_NormalXML_Parser(sc).cache()
+
+ // ======= Json part :
+ // Json RDD : each record has its Revision ID:
+ val JsonRDD = Testing_RDD_All_Record.map(_.split("NNLL")).map(v => replacing_with_Quoto(v(0), v(8))).cache()
+ // JsonRDD.foreach(println)
+ // println(JsonRDD.count())
+
+ // Data set
+ val Ds_Json = sqlContext.jsonRDD(JsonRDD).select("key", "id", "labels", "descriptions", "aliases", "claims", "sitelinks").cache()
+ // Ds_Json.show()
+ // println(Ds_Json.count())
+
+ // ======= Tags part : // Contributor IP here is in decimal format, not IP format; it is converted in the ParseNormalXml stage
+ val TagsRDD = Testing_RDD_All_Record.map(_.split("NNLL")).map(x => (x(0), x(1), x(2), x(3), x(4), x(5), x(6), x(7), x(8), x(9), x(10), x(11))).cache()
+ val DF_Tags = TagsRDD.toDF("Rid", "Itemid", "comment", "pid", "time", "contributorIP", "contributorID", "contributorName", "JsonText", "model", "format", "sha").cache()
+ // DF_Tags.show()
+ // println(DF_Tags.count())
+
+ // ======== Join Json part with Tags part ============================
+ // Joining to have the full data
+ val DF_First_DF_Result_Join_Tags_and_Json = DF_Tags.as("T1").join(Ds_Json.as("T2"), $"T1.Rid" === $"T2.key", "leftouter").select("Rid", "itemid", "comment", "pid", "time", "contributorIP", "contributorID", "contributorName", "JsonText", "labels", "descriptions", "aliases", "claims", "sitelinks", "model", "format", "sha") // .orderBy("Rid", "Itemid")
+ DF_First_DF_Result_Join_Tags_and_Json.registerTempTable("Data1")
+ val dfr_DATA_JsonTages1 = sqlContext.sql("select * from Data1 order by itemid ,Rid ").cache()
+
+ val colNames = Seq("Rid2", "itemid2", "comment2", "pid2", "time2", "contributorIP2", "contributorID2", "contributorName2", "JsonText2", "labels2", "descriptions2", "aliases2", "claims2", "sitelinks2", "model2", "format2", "sha2")
+ val DF_Second = DF_First_DF_Result_Join_Tags_and_Json.toDF(colNames: _*) // .distinct()
+ DF_Second.registerTempTable("Data2")
+
+ // =================================================================== Parent // Previous Revision ==============================================================================================================
+ // val DF_Joined = result1.as("df1").join(result2.as("df2"), col("itemid") === col("itemid2") && col("index1") === col("index2") + 1, "leftouter").select("Rid", "itemid", "comment", "pid", "time", "contributorIP", "contributorID", "contributorName", "JsonText", "labels", "descriptions", "aliases", "claims", "sitelinks", "model", "format", "sha", "Rid2", "itemid2", "comment2", "pid2", "time2", "contributorIP2", "contributorID2", "contributorName2", "JsonText2", "labels2", "descriptions2", "aliases2", "claims2", "sitelinks2", "model2", "format2", "sha2")
+ // .select("itemid", "Rid","pid","time","itemid2","Rid2","pid2","time2")
+
+ // Joining based on the parent id to get the previous revisions: ParentID
+ val DF_Joined = DF_First_DF_Result_Join_Tags_and_Json.as("df1").join(DF_Second.as("df2"), $"df1.pid" === $"df2.Rid2", "leftouter").distinct()
+
+ val RDD_After_JoinDF = DF_Joined.rdd.distinct()
+ val x = RDD_After_JoinDF.map(row => (row(0).toString().toInt, row)).cache()
+ val part = new RangePartitioner(4, x)
+ val partitioned = x.partitionBy(part).persist() // persist is important for this case and obligatory.
+ // partitioned.foreach(println)
+
+ // ===================================================== All Features Based on Categories of Feature Data Types ==================================================================================
+
+ val Result_all_Features = partitioned.map { case (x, y) => (x.toString() + "," + All_Features(y).toString()) } // convert the pair RDD to a one-line String RDD so that a DataFrame can be built by splitting on ","
+ // Result_all_Features.foreach(println)
+ // println("nayef" + Result_all_Features.count())
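+ // Note: joining all features with "," and re-splitting on "," later assumes no
+ // feature value itself contains a comma. If string-typed fields may contain
+ // commas, a rarely-used delimiter is safer. A minimal sketch (SEP is a
+ // hypothetical single-character delimiter; same shape as Result_all_Features):
+ // val SEP = "\u0001"
+ // val Result_all_Features_safe =
+ //   partitioned.map { case (rid, row) => rid.toString + SEP + All_Features(row).toString }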
43 */ StructField("S1CommentTailLength", DoubleType, false) :: /* 44 */ StructField("S2SimikaritySitelinkandLabel", DoubleType, false) :: /*45*/ StructField("S3SimilarityLabelandSitelink", DoubleType, false) :: /*46*/ StructField("S4SimilarityCommentComment", DoubleType, false) :: + // + // // Statements Features : + /* 47 */ StructField("SS1Property", StringType, false) :: /* 48 */ StructField("SS2DataValue", StringType, false) :: /*49*/ StructField("SS3ItemValue", StringType, false) :: + // + // + // //User Features : + /* 50 Boolean */ StructField("U1IsPrivileged", DoubleType, false) :: /* 51 Boolean */ StructField("U2IsBotUser", DoubleType, false) :: /*52 Boolean*/ StructField("U3IsBotuserWithFlaguser", DoubleType, false) :: + /*53 Boolean */ StructField("U4IsProperty", DoubleType, false) :: /* 54 Boolean */ StructField("U5IsTranslator", DoubleType, false) :: /*55 Boolean*/ StructField("U6IsRegister", DoubleType, false) :: + /* 56 */ StructField("U7IPValue", DoubleType, false) :: /* 57 */ StructField("U8UserID", IntegerType, false) :: /*58*/ StructField("U9HasBirthDate", DoubleType, false) :: /*59*/ StructField("U10HasDeathDate", DoubleType, false) :: + + // Items Features : + + /* 60 */ StructField("I1NumberLabels", DoubleType, false) :: /* 61 */ StructField("I2NumberDescription", DoubleType, false) :: /*62*/ StructField("I3NumberAliases", DoubleType, false) :: /*63*/ StructField("I4NumberClaims", DoubleType, false) :: + /* 64 */ StructField("I5NumberSitelinks", DoubleType, false) :: /* 65 */ StructField("I6NumberStatement", DoubleType, false) :: /*66*/ StructField("I7NumberReferences", DoubleType, false) :: /*67*/ StructField("I8NumberQualifier", DoubleType, false) :: + /* 68 */ StructField("I9NumberQualifierOrder", DoubleType, false) :: /* 69 */ StructField("I10NumberBadges", DoubleType, false) :: /*70*/ StructField("I11ItemTitle", StringType, false) :: + + // Revision Features: + /* 71 */ StructField("R1languageRevision", StringType, false) :: /* 72 */ StructField("R2RevisionLanguageLocal", StringType, false) :: /*73*/ StructField("R3IslatainLanguage", DoubleType, false) :: + /* 74 */ StructField("R4JsonLength", DoubleType, false) :: /* 75 */ StructField("R5RevisionAction", StringType, false) :: /*76*/ StructField("R6PrevReviAction", StringType, false) :: + /* 77 */ StructField("R7RevisionAccountChange", DoubleType, false) :: /* 78 */ StructField("R8ParRevision", StringType, false) :: /*79*/ StructField("R9RevisionTime", StringType, false) :: + /* 80 */ StructField("R10RevisionSize", DoubleType, false) :: /* 81 */ StructField("R11ContentType", StringType, false) :: /*82*/ StructField("R12BytesIncrease", DoubleType, false) :: + /* 83 */ StructField("R13TimeSinceLastRevi", DoubleType, false) :: /* 84 */ StructField("R14CommentLength", DoubleType, false) :: /*85*/ StructField("R15RevisionSubaction", StringType, false) :: + /* 86 */ StructField("R16PrevReviSubaction", StringType, false) :: + + Nil) + + val rowRDD = Result_all_Features.map(line => line.split(",")).map(e ⇒ Row(e(0).toInt // character feature column + , e(1).toDouble, e(2).toDouble, e(3).toDouble, e(4).toDouble, e(5).toDouble, e(6).toDouble, e(7).toDouble, e(8).toDouble, e(9).toDouble, RoundDouble(e(10).toDouble), + e(11).toDouble, e(12).toDouble, e(13).toDouble, e(14).toDouble, e(15).toDouble, e(16).toDouble, e(17).toDouble, e(18).toDouble, e(19).toDouble, e(20).toDouble, e(21).toDouble, e(22).toDouble, e(23).toDouble, e(24).toDouble, e(25).toDouble //Word Feature column + , e(26).toDouble, e(27).toDouble, 
e(28).toDouble, e(29).toDouble.toInt, e(30).toDouble, e(31).toDouble, e(32).toDouble, e(33).toDouble, e(34).toDouble, e(35).toDouble, e(36).toDouble, e(37).toDouble, RoundDouble(e(38).toDouble), RoundDouble(e(39).toDouble), e(40).toDouble, e(41).toDouble, e(42).toDouble // Sentences Features columns:
+ , RoundDouble(e(43).toDouble), e(44).toDouble, e(45).toDouble, e(46).toDouble // Statement Features columns:
+ , e(47), e(48), e(49) // User Features columns:
+ , e(50).toDouble, e(51).toDouble, e(52).toDouble, e(53).toDouble, e(54).toDouble, e(55).toDouble, e(56).toDouble, e(57).toDouble.toInt, e(58).toDouble, e(59).toDouble // Item Features columns:
+ , e(60).toDouble, e(61).toDouble, e(62).toDouble, e(63).toDouble, e(64).toDouble, e(65).toDouble, e(66).toDouble, e(67).toDouble, e(68).toDouble, e(69).toDouble, "Q" + e(70).toDouble.toInt.toString() // Revision Features columns:
+ , e(71), e(72), e(73).toDouble, e(74).toDouble, e(75), e(76), e(77).toDouble, e(78), e(79), e(80).toDouble, e(81), e(82).toDouble, e(83).toDouble, e(84).toDouble, e(85), e(86)))
+
+ // a. User Frequency:
+ // number of revisions a user has contributed
+ // val resu = DF_Tags.groupBy("contributorID").agg(count("Rid"))
+ DF_Tags.registerTempTable("TagesTable")
+ val ContributorFreq_for_Each_Revision_DF = sqlContext.sql("select contributorID as CIDUSER1, count(Rid) as NumberofRevisionsUserContributed from TagesTable where contributorID !='0' group by contributorID ") // .drop("CIDUSER1")
+ // ContributorFreq_for_Each_Revision_DF.show()
+
+ // b. Cumulated: number of unique Items a user has contributed to.
+ val CumulatedNumberof_uniqueItemsForUser_DF = sqlContext.sql("select contributorID as CIDUSER2, COUNT(DISTINCT itemid) as NumberofUniqueItemsUseredit from TagesTable where contributorID !='0' group by contributorID") // .drop("CIDUSER2")
+ // CumulatedNumberof_uniqueItemsForUser_DF.show()
+
+ // 1. Item Frequency:
+ // number of revisions an Item has
+ val ItemFrequ_DF = sqlContext.sql("select itemid, count(Rid) as NumberRevisionItemHas from TagesTable group by itemid")
+ // ItemFrequ_DF.show()
+
+ // 2. Cumulated number of unique users who have edited the Item. User IPs are not counted: a contributor is either an IP or a name, and only names are considered.
+ val CumulatedNumberof_UniqueUserForItem_DF = sqlContext.sql("select itemid, COUNT(DISTINCT contributorID) as NumberUniqUserEditItem from TagesTable where contributorID !='0' group by itemid")
+ // CumulatedNumberof_UniqueUserForItem_DF.show()
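+ // Note: these frequency tables can equally be expressed with the DataFrame API
+ // instead of SQL. A minimal sketch of (a) above, assuming the functions._
+ // import already used elsewhere in this file:
+ // val contributorFreqDF = DF_Tags
+ //   .filter("contributorID != '0'")
+ //   .groupBy("contributorID")
+ //   .agg(count("Rid").as("NumberofRevisionsUserContributed"))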
+ // 3. Frequency of each Item:
+ val Fre_Item_DF = sqlContext.sql("select itemid, COUNT(itemid) as FreqItem from TagesTable group by itemid")
+ // Fre_Item_DF.show()
+
+ // *****************************************************************************************************************************************
+ // This is the Main DataFrame:
+ val BeforeJoin_All_Features = sqlContext.createDataFrame(rowRDD, schema)
+ // BeforeJoin_All_Features.show()
+
+ // ********************************** User feature Join
+
+ // Join1 to add the first User Feature: number of revisions a user has contributed
+ val AfterJoinUser1_All_Features = BeforeJoin_All_Features.as("T1").join(ContributorFreq_for_Each_Revision_DF.as("T2"), $"T1.U8UserID" === $"T2.CIDUSER1", "leftouter").drop("CIDUSER1")
+ // AfterJoinUser1_All_Features.show()
+
+ // Join2 to add the second User Feature
+ val AfterJoinUser2_All_Features = AfterJoinUser1_All_Features.as("T1").join(CumulatedNumberof_uniqueItemsForUser_DF.as("T2"), $"T1.U8UserID" === $"T2.CIDUSER2", "leftouter").drop("CIDUSER2")
+ // AfterJoinUser2_All_Features.show()
+
+ // ********************************** Item Feature Join
+ // Join3 to add the first Item Feature: number of revisions an Item has
+ val AfterJoinItem3_All_Features = AfterJoinUser2_All_Features.as("T1").join(ItemFrequ_DF.as("T2"), $"T1.I11ItemTitle" === $"T2.itemid", "leftouter").drop("itemid")
+ // AfterJoinItem3_All_Features.show()
+
+ // Join4 to add the second Item Feature
+ val AfterJoinItem4_All_Features = AfterJoinItem3_All_Features.as("T1").join(CumulatedNumberof_UniqueUserForItem_DF.as("T2"), $"T1.I11ItemTitle" === $"T2.itemid", "leftouter").drop("itemid")
+ // AfterJoinItem4_All_Features.show()
+
+ // Join5 to add the third Item Feature
+ val AfterJoinItem5_All_Features = AfterJoinItem4_All_Features.as("T1").join(Fre_Item_DF.as("T2"), $"T1.I11ItemTitle" === $"T2.itemid", "leftouter").drop("itemid")
+ // AfterJoinItem5_All_Features.show()
+
+ // ********************************
+
+ // Geographical information Features from the Meta file
+ // REVISION_ID|REVISION_SESSION_ID|USER_COUNTRY_CODE|USER_CONTINENT_CODE|USER_TIME_ZONE|USER_REGION_CODE|USER_CITY_NAME|USER_COUNTY_NAME|REVISION_TAGS
+ val df_GeoInf = sqlContext.read
+ .format("com.databricks.spark.csv")
+ .option("header", "true") // Use first line of all files as header
+ .option("inferSchema", "true") // Automatically infer data types
+ .load("hdfs://localhost:9000/mydata/Meta.csv").select("REVISION_ID", "REVISION_SESSION_ID", "USER_COUNTRY_CODE", "USER_CONTINENT_CODE", "USER_TIME_ZONE", "USER_REGION_CODE", "USER_CITY_NAME", "USER_COUNTY_NAME", "REVISION_TAGS")
+ // df_GeoInf.show()
+
+ val df_Truth = sqlContext.read
+ .format("com.databricks.spark.csv")
+ .option("header", "true") // Use first line of all files as header
+ .option("inferSchema", "true") // Automatically infer data types
+ .load("hdfs://localhost:9000/mydata/truth.csv").select("REVISION_ID", "ROLLBACK_REVERTED", "UNDO_RESTORE_REVERTED")
+ // df_Truth.show()
+
+ val AfterJoinGeoInfo_All_Features = AfterJoinItem5_All_Features.as("T1").join(df_GeoInf.as("T2"), $"T1.Rid" === $"T2.REVISION_ID", "leftouter").drop("REVISION_ID").cache()
+ // AfterJoinGeoInfo_All_Features.show()
+
+ val Final_All_Features = AfterJoinGeoInfo_All_Features.as("T1").join(df_Truth.as("T2"), $"T1.Rid" === $"T2.REVISION_ID", "leftouter").drop("REVISION_ID").cache()
+ // Final_All_Features.show()
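+ // Sanity check (sketch, optional): revisions with no match in truth.csv keep
+ // null labels after the leftouter join, and can be counted before the fills below:
+ // println(Final_All_Features.filter("ROLLBACK_REVERTED is null").count())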
+ // Pre-process Data ============================================================================================================================================================
+
+ // For String columns, fill the null values with "NA":
+
+ var Fill_Missing_Final_All_Features = Final_All_Features.na.fill("NA", Seq("USER_COUNTRY_CODE", "USER_CONTINENT_CODE", "USER_TIME_ZONE", "USER_REGION_CODE", "USER_CITY_NAME", "USER_COUNTY_NAME", "REVISION_TAGS")).cache()
+
+ // For Integer frequency columns, fill the null values with 0:
+ Fill_Missing_Final_All_Features = Fill_Missing_Final_All_Features.na.fill(0, Seq("FreqItem", "NumberUniqUserEditItem", "NumberRevisionItemHas", "NumberofUniqueItemsUseredit", "NumberofRevisionsUserContributed", "REVISION_SESSION_ID")).cache()
+ // Fill_Missing_Final_All_Features.show()
+
+ val BoolToDoubleUDF = udf { (BoolAsString: String) => if (BoolAsString == "T") 1.0 else 0.0 }
+ val IntegerToDouble = udf { (IntegerRevisionSessionID: Integer) => IntegerRevisionSessionID.toDouble }
+ Fill_Missing_Final_All_Features = Fill_Missing_Final_All_Features.withColumn("FinalROLLBACK_REVERTED", BoolToDoubleUDF(col("ROLLBACK_REVERTED")))
+ Fill_Missing_Final_All_Features = Fill_Missing_Final_All_Features.withColumn("FinalUNDO_RESTORE_REVERTED", BoolToDoubleUDF(col("UNDO_RESTORE_REVERTED")))
+
+ Fill_Missing_Final_All_Features = Fill_Missing_Final_All_Features.withColumn("FinalREVISION_SESSION_ID", IntegerToDouble(col("REVISION_SESSION_ID")))
+
+ Fill_Missing_Final_All_Features = Fill_Missing_Final_All_Features.withColumn("FinalNumberofRevisionsUserContributed", IntegerToDouble(col("NumberofRevisionsUserContributed")))
+ Fill_Missing_Final_All_Features = Fill_Missing_Final_All_Features.withColumn("FinalNumberofUniqueItemsUseredit", IntegerToDouble(col("NumberofUniqueItemsUseredit")))
+
+ Fill_Missing_Final_All_Features = Fill_Missing_Final_All_Features.withColumn("FinalNumberRevisionItemHas", IntegerToDouble(col("NumberRevisionItemHas")))
+ Fill_Missing_Final_All_Features = Fill_Missing_Final_All_Features.withColumn("FinalNumberUniqUserEditItem", IntegerToDouble(col("NumberUniqUserEditItem")))
+ Fill_Missing_Final_All_Features = Fill_Missing_Final_All_Features.withColumn("FinalFreqItem", IntegerToDouble(col("FreqItem")))
+
+ // =========================================================================== Character Features : Double and Integer Features ====================================================================================
+ // Double ratios: for ratio Double columns, fill missing (0) values with the median: Character Features + ratio Word Features :
+ var Samples = Fill_Missing_Final_All_Features.sample(false, 0.001).cache() // .where($"S2SimikaritySitelinkandLabel">0.0 || $"S3SimilarityLabelandSitelink">0.0 || $"S4SimilarityCommentComment">0.0)
+ Samples.registerTempTable("df")
+
+ val Query = "select " +
+ "percentile_approx(C1uppercaseratio, 0.5) as median1" + "," + "percentile_approx(C2lowercaseratio, 0.5) as median2" + "," +
+ "percentile_approx(C3alphanumericratio, 0.5) as median3" + "," + "percentile_approx(C4asciiratio, 0.5) as median4" + "," +
+ "percentile_approx(C5bracketratio, 0.5) as median5" + "," + "percentile_approx(C6digitalratio, 0.5) as median6" + "," +
+ "percentile_approx(C7latinratio, 0.5) as median7" + "," + "percentile_approx(C8whitespaceratio, 0.5) as median8" + "," +
+ "percentile_approx(C9puncratio, 0.5) as median9" + "," + "percentile_approx(C11arabicratio, 0.5) as median11" + "," +
+ "percentile_approx(C12bengaliratio, 0.5) as median12" + "," + "percentile_approx(C13brahmiratio, 0.5) as median13" + "," +
+ "percentile_approx(C14cyrilinratio, 0.5) as median14" + "," + "percentile_approx(C15hanratio, 0.5) as median15" + "," +
+ "percentile_approx(c16malysiaratio, 0.5) as median16" + "," +
+ "percentile_approx(C17tamiratio, 0.5) as median17" + "," + "percentile_approx(C18telugratio, 0.5) as median18" + "," +
+ "percentile_approx(C19symbolratio, 0.5) as median19" + "," + "percentile_approx(C20alpharatio, 0.5) as median20" + "," +
+ "percentile_approx(C21visibleratio, 0.5) as median21" + "," + "percentile_approx(C22printableratio, 0.5) as median22" + "," +
+ "percentile_approx(C23blankratio, 0.5) as median23" + "," + "percentile_approx(C24controlratio, 0.5) as median24" + "," +
+ "percentile_approx(C25hexaratio, 0.5) as median25" + "," + "percentile_approx(W1languagewordratio, 0.5) as median26" + "," +
+ "percentile_approx(W3lowercaseratio, 0.5) as median27" + "," + "percentile_approx(W6badwordratio, 0.5) as median28" + "," +
+ "percentile_approx(W7uppercaseratio, 0.5) as median29" + "," + "percentile_approx(W8banwordratio, 0.5) as median30" + " from df"
+
+ val medianValues = sqlContext.sql(Query).rdd
+ val Median = medianValues.first()
+
+ // Median :
+ // Character ratio Features: UDFs (replace a 0 ratio with the column's median)
+ val lkpUDF1 = udf { (i: Double) => if (i == 0) Median(0).toString().toDouble else i }
+ val lkpUDF2 = udf { (i: Double) => if (i == 0) Median(1).toString().toDouble else i }
+ val lkpUDF3 = udf { (i: Double) => if (i == 0) Median(2).toString().toDouble else i }
+ val lkpUDF4 = udf { (i: Double) => if (i == 0) Median(3).toString().toDouble else i }
+ val lkpUDF5 = udf { (i: Double) => if (i == 0) Median(4).toString().toDouble else i }
+ val lkpUDF6 = udf { (i: Double) => if (i == 0) Median(5).toString().toDouble else i }
+ val lkpUDF7 = udf { (i: Double) => if (i == 0) Median(6).toString().toDouble else i }
+ val lkpUDF8 = udf { (i: Double) => if (i == 0) Median(7).toString().toDouble else i }
+ val lkpUDF9 = udf { (i: Double) => if (i == 0) Median(8).toString().toDouble else i }
+
+ val lkpUDF11 = udf { (i: Double) => if (i == 0) Median(9).toString().toDouble else i }
+ val lkpUDF12 = udf { (i: Double) => if (i == 0) Median(10).toString().toDouble else i }
+ val lkpUDF13 = udf { (i: Double) => if (i == 0) Median(11).toString().toDouble else i }
+ val lkpUDF14 = udf { (i: Double) => if (i == 0) Median(12).toString().toDouble else i }
+ val lkpUDF15 = udf { (i: Double) => if (i == 0) Median(13).toString().toDouble else i }
+ val lkpUDF16 = udf { (i: Double) => if (i == 0) Median(14).toString().toDouble else i }
+ val lkpUDF17 = udf { (i: Double) => if (i == 0) Median(15).toString().toDouble else i }
+ val lkpUDF18 = udf { (i: Double) => if (i == 0) Median(16).toString().toDouble else i }
+ val lkpUDF19 = udf { (i: Double) => if (i == 0) Median(17).toString().toDouble else i }
+ val lkpUDF20 = udf { (i: Double) => if (i == 0) Median(18).toString().toDouble else i }
+ val lkpUDF21 = udf { (i: Double) => if (i == 0) Median(19).toString().toDouble else i }
+ val lkpUDF22 = udf { (i: Double) => if (i == 0) Median(20).toString().toDouble else i }
+ val lkpUDF23 = udf { (i: Double) => if (i == 0) Median(21).toString().toDouble else i }
+ val lkpUDF24 = udf { (i: Double) => if (i == 0) Median(22).toString().toDouble else i }
+ val lkpUDF25 = udf { (i: Double) => if (i == 0) Median(23).toString().toDouble else i }
.drop("C1uppercaseratio").cache() + val df2 = df1.withColumn("FinalC2lowercaseratio", lkpUDF2(col("C2lowercaseratio"))) // .drop("C2lowercaseratio").cache() + // df1.unpersist() + val df3 = df2.withColumn("FinalC3alphanumericratio", lkpUDF3(col("C3alphanumericratio"))) // .drop("C3alphanumericratio").cache() + // df2.unpersist() + val df4 = df3.withColumn("FinalC4asciiratio", lkpUDF4(col("C4asciiratio"))) // .drop("C4asciiratio").cache() + // df3.unpersist() + val df5 = df4.withColumn("FinalC5bracketratio", lkpUDF5(col("C5bracketratio"))) // .drop("C5bracketratio").cache() + // df4.unpersist() + val df6 = df5.withColumn("FinalC6digitalratio", lkpUDF6(col("C6digitalratio"))) // .drop("C6digitalratio").cache() + // df5.unpersist() + val df7 = df6.withColumn("FinalC7latinratio", lkpUDF7(col("C7latinratio"))) // .drop("C7latinratio").cache() + // df6.unpersist() + val df8 = df7.withColumn("FinalC8whitespaceratio", lkpUDF8(col("C8whitespaceratio"))) // .drop("C8whitespaceratio").cache() + // df7.unpersist() + val df9 = df8.withColumn("FinalC9puncratio", lkpUDF9(col("C9puncratio"))) // .drop("C9puncratio").cache() + + // Mean : + // character integer values : + val Mean_C10longcharacterseq = Samples.agg(mean("C10longcharacterseq")).head() + val C10_Mean = Mean_C10longcharacterseq.getDouble(0) + val lkpUDFC10 = udf { (i: Double) => if (i == 0) C10_Mean else i } + val df10 = df9.withColumn("FinalC10longcharacterseq", lkpUDFC10(col("C10longcharacterseq"))) + + // Median + val df11 = df10.withColumn("FinalC11arabicratio", lkpUDF11(col("C11arabicratio"))) // .drop("C11arabicratio").cache() + // df9.unpersist() + val df12 = df11.withColumn("FinalC12bengaliratio", lkpUDF12(col("C12bengaliratio"))) // .drop("C12bengaliratio").cache() + // df11.unpersist() + val df13 = df12.withColumn("FinalC13brahmiratio", lkpUDF13(col("C13brahmiratio"))) // .drop("C13brahmiratio").cache() + // df12.unpersist() + val df14 = df13.withColumn("FinalC14cyrilinratio", lkpUDF14(col("C14cyrilinratio"))) // .drop("C14cyrilinratio").cache() + // df13.unpersist() + val df15 = df14.withColumn("FinalC15hanratio", lkpUDF15(col("C15hanratio"))) // .drop("C15hanratio").cache() + // df14.unpersist() + val df16 = df15.withColumn("Finalc16malysiaratio", lkpUDF16(col("c16malysiaratio"))) // .drop("c16malysiaratio").cache() + // df15.unpersist() + val df17 = df16.withColumn("FinalC17tamiratio", lkpUDF17(col("C17tamiratio"))) // .drop("C17tamiratio").cache() + // df16.unpersist() + val df18 = df17.withColumn("FinalC18telugratio", lkpUDF18(col("C18telugratio"))) // .drop("C18telugratio").cache() + // df17.unpersist() + val df19 = df18.withColumn("FinalC19symbolratio", lkpUDF19(col("C19symbolratio"))) // .drop("C19symbolratio").cache() + //df18.unpersist() + val df20 = df19.withColumn("FinalC20alpharatio", lkpUDF20(col("C20alpharatio"))) // .drop("C20alpharatio").cache() + // df19.unpersist() + val df21 = df20.withColumn("FinalC21visibleratio", lkpUDF21(col("C21visibleratio"))) // .drop("C21visibleratio").cache() + // df20.unpersist() + val df22 = df21.withColumn("FinalC22printableratio", lkpUDF22(col("C22printableratio"))) // .drop("C22printableratio").cache() + // df21.unpersist() + val df23 = df22.withColumn("FinalC23blankratio", lkpUDF23(col("C23blankratio"))) // .drop("C23blankratio").cache() + // df22.unpersist() + val df24 = df23.withColumn("FinalC24controlratio", lkpUDF24(col("C24controlratio"))) // .drop("C24controlratio").cache() + // df23.unpersist() + val df25 = df24.withColumn("FinalC25hexaratio", lkpUDF25(col("C25hexaratio"))) 
// .drop("C25hexaratio").cache() + + // ************************************************End Character Features **************************************************************************************** + + // ************************************************Start Word Features **************************************************************************************** + + // Word Ratio Features : UDF + val lkpUDFW1 = udf { (i: Double) => if (i == 0) Median(24).toString().toDouble else i } + val lkpUDFW3 = udf { (i: Double) => if (i == 0) Median(25).toString().toDouble else i } + val lkpUDFW6 = udf { (i: Double) => if (i == 0) Median(26).toString().toDouble else i } + val lkpUDFW7 = udf { (i: Double) => if (i == 0) Median(27).toString().toDouble else i } + val lkpUDFW8 = udf { (i: Double) => if (i == 0) Median(28).toString().toDouble else i } + + // 1. + val df26 = df25.withColumn("FinalW1languagewordratio", lkpUDFW1(col("W1languagewordratio"))) //.drop("W1languagewordratio").cache() + + // 2.Boolean(Double) IsContainLanguageWord + + // 3. + val df27 = df26.withColumn("FinalW3lowercaseratio", lkpUDFW3(col("W3lowercaseratio"))) //.drop("W3lowercaseratio").cache() + // df26.unpersist() + + // 4. Integer " Mean: + val Mean_W4longestword = Samples.agg(mean("W4longestword")).head() + val W4_Mean = Mean_W4longestword.getDouble(0) + val lkpUDFW4 = udf { (i: Double) => if (i == 0) W4_Mean else i } + val df28 = df27.withColumn("FinalW4longestword", lkpUDFW4(col("W4longestword"))) + + // 5. Boolean (Double ) W5IscontainURL + // 6. + val df29 = df28.withColumn("FinalW6badwordratio", lkpUDFW6(col("W6badwordratio"))) //.drop("W6badwordratio").cache() + + // 7. + val df30 = df29.withColumn("FinalW7uppercaseratio", lkpUDFW7(col("W7uppercaseratio"))) //.drop("W7uppercaseratio").cache() + + // 8. + val df31 = df30.withColumn("FinalW8banwordratio", lkpUDFW8(col("W8banwordratio"))) //.drop("W8banwordratio").cache() + + // 9.FemalFirst Boolean(Double) + // 10.Male First Boolean(Double) + // 11.ContainBadWord Boolean(Double) + // 12ContainBanWord Boolean(Double) + + // 13. Integer(Double): + val Mean_W13W13NumberSharewords = Samples.agg(mean("W13NumberSharewords")).head() + val W13_Mean = Mean_W13W13NumberSharewords.getDouble(0) + val lkpUDFW13 = udf { (i: Double) => if (i == 0) W13_Mean else i } + val df32 = df31.withColumn("FinalW13NumberSharewords", lkpUDFW13(col("W13NumberSharewords"))) + + // 14. Integer (Double): + val Mean_W14NumberSharewordswithoutStopwords = Samples.agg(mean("W14NumberSharewordswithoutStopwords")).head() + val W14_Mean = Mean_W14NumberSharewordswithoutStopwords.getDouble(0) + val lkpUDFW14 = udf { (i: Double) => if (i == 0) W14_Mean else i } + val df33 = df32.withColumn("FinalW14NumberSharewordswithoutStopwords", lkpUDFW14(col("W14NumberSharewordswithoutStopwords"))) + + // 15. Double (Not ratio): + val Mean_W15PortionQid = Samples.agg(mean("W15PortionQid")).head() + val W15_Mean = Mean_W15PortionQid.getDouble(0) + val lkpUDFW15 = udf { (i: Double) => if (i == 0) W15_Mean else i } + val df34 = df33.withColumn("FinalW15PortionQid", lkpUDFW15(col("W15PortionQid"))) + + // 16. 
+ // 16. Double (not ratio):
+ val Mean_W16PortionLnags = Samples.agg(mean("W16PortionLnags")).head()
+ val W16_Mean = Mean_W16PortionLnags.getDouble(0)
+ val lkpUDFW16 = udf { (i: Double) => if (i == 0) W16_Mean else i }
+ val df35 = df34.withColumn("FinalW16PortionLnags", lkpUDFW16(col("W16PortionLnags")))
+
+ // 17. Double (not ratio):
+ val Mean_W17PortionLinks = Samples.agg(mean("W17PortionLinks")).head()
+ val W17_Mean = Mean_W17PortionLinks.getDouble(0)
+ val lkpUDFW17 = udf { (i: Double) => if (i == 0) W17_Mean else i }
+ val df36 = df35.withColumn("FinalW17PortionLinks", lkpUDFW17(col("W17PortionLinks")))
+
+ // ************************************************ End Word Features ****************************************************************************************
+
+ // ************************************************ Start Sentences Features ****************************************************************************************
+ // 1. Integer(Double)
+ val Mean_S1CommentTailLength = Samples.agg(mean("S1CommentTailLength")).head()
+ val S1_Mean = RoundDouble(Mean_S1CommentTailLength.getDouble(0))
+ val lkpUDFS1 = udf { (i: Double) => if (i == 0) S1_Mean else i }
+ val df37 = df36.withColumn("FinalS1CommentTailLength", lkpUDFS1(col("S1CommentTailLength")))
+
+ // 2. Double but not ratio values:
+ val Mean_S2SimikaritySitelinkandLabel = Samples.agg(mean("S2SimikaritySitelinkandLabel")).head()
+ val S2_Mean = RoundDouble(Mean_S2SimikaritySitelinkandLabel.getDouble(0))
+ val lkpUDFS2 = udf { (i: Double) => if (i == 0) S2_Mean else i }
+ val df39 = df37.withColumn("FinalS2SimikaritySitelinkandLabel", lkpUDFS2(col("S2SimikaritySitelinkandLabel")))
+
+ // 3. Double but not ratio values:
+ val Mean_S3SimilarityLabelandSitelink = Samples.agg(mean("S3SimilarityLabelandSitelink")).head()
+ val S3_Mean = RoundDouble(Mean_S3SimilarityLabelandSitelink.getDouble(0))
+ val lkpUDFS3 = udf { (i: Double) => if (i == 0.0) S3_Mean else i }
+ val df40 = df39.withColumn("FinalS3SimilarityLabelandSitelink", lkpUDFS3(col("S3SimilarityLabelandSitelink")))
+
+ // 4. Double but not ratio values:
+ val Mean_S4SimilarityCommentComment = Samples.agg(mean("S4SimilarityCommentComment")).head()
+ val S4_Mean = RoundDouble(Mean_S4SimilarityCommentComment.getDouble(0))
+ val lkpUDFS4 = udf { (i: Double) => if (i == 0.0) S4_Mean else i }
+ val df41 = df40.withColumn("FinalS4SimilarityCommentComment", lkpUDFS4(col("S4SimilarityCommentComment")))
+
+ // df41.show()
+ // ************************************************ End Sentences Features ****************************************************************************************
+ // *********************************************** Start Statement Features ****************************************************************************************
+ // 1. String
+ // 2. String
+ // 3. String
+ // ************************************************ End Statement Features ****************************************************************************************
+ // *********************************************** Start User Features ****************************************************************************************
+
+ // 1. Boolean(Double)
+ // 2. Boolean(Double)
+ // 3. Boolean(Double)
+ // 4. Boolean(Double)
+ // 5. Boolean(Double)
+ // 6. Boolean(Double)
+ // 7. (Double) IP, no need to fill missing data
+ // 8. (Double) ID, no need to fill missing data
+ // 9. Boolean(Double)
+ // 10. Boolean(Double)
+
+ // *********************************************** End User Features ****************************************************************************************
+ // *********************************************** Start Item Features ****************************************************************************************
+ // 1. Integer (Double), no need to fill missing values
+ // 2. Integer (Double), no need to fill missing values
+ // 3. Integer (Double), no need to fill missing values
+ // 4. Integer (Double), no need to fill missing values
+ // 5. Integer (Double), no need to fill missing values
+ // 6. Integer (Double), no need to fill missing values
+ // 7. Integer (Double), no need to fill missing values
+ // 8. Integer (Double), no need to fill missing values
+ // 9. Integer (Double), no need to fill missing values
+ // 10. Integer (Double), no need to fill missing values
+ // 11. String
+ // *********************************************** End Item Features ****************************************************************************************
+ // *********************************************** Start Revision Features ****************************************************************************************
+ // 1. String
+ // 2. String
+ // 3. Boolean (Double)
+ // 4. Integer (Double)
+ // 5. String
+ // 6. String
+ // 7. Boolean (Double)
+ // 8. String
+ // 9. String
+ // 10. Integer (Double)
+ // 11. String
+ // 12. Integer (Double)
+ // 13. Long (Double)
+ // 14. Integer (Double)
+ // 15. String
+ // 16. String
+ // *********************************************** End Revision Features ****************************************************************************************
+ // *********************************************** Meta Data, Truth Data and Frequencies ****************************************************************************************
+ // Meta
+ // 1. Revision Session: Integer (converted to Double)
+ // 2. User Country Code
+ // 3. User Continent Code
+ // 4. User Time Zone
+ // 5. User Region Code
+ // 6. User City Name
+ // 7. User County Name
+ // 8. Revision Tags
+
+ // Truth:
+ // 1. Undo
+
+ // Freq :
+
+ // 1.5 features
+
+ // Roll Boolean : Boolean (Double)
+ // Undo : Boolean (Double)
+
+ // *********************************************** End Meta Data, Truth Data and Frequencies ****************************************************************************************
+
+ // =========================================================================== String Features ====================================================================================
+
+ val df42 = df41.withColumn(
+ // statement String features:
+ "StringFeatures", concat($"SS1Property", lit(";"), $"SS2DataValue", lit(";"), $"SS3ItemValue", lit(";"), $"I11ItemTitle",
+ // Revision String Features:
+ lit(";"), $"R1languageRevision",
+ lit(";"), $"R2RevisionLanguageLocal",
+ lit(";"), $"R5RevisionAction",
+ lit(";"), $"R6PrevReviAction",
+ lit(";"), $"R8ParRevision",
+ lit(";"), $"R9RevisionTime",
+ lit(";"), $"R11ContentType",
+ lit(";"), $"R15RevisionSubaction",
+ lit(";"), $"R16PrevReviSubaction",
+
+ lit(";"), $"USER_COUNTRY_CODE",
+ lit(";"), $"USER_CONTINENT_CODE",
+ lit(";"), $"USER_TIME_ZONE",
+ lit(";"), $"USER_REGION_CODE",
+ lit(";"), $"USER_CITY_NAME",
+ lit(";"), $"USER_COUNTY_NAME",
+ lit(";"), $"REVISION_TAGS"))
+
+ val toArray = udf((record: String) => record.split(";").map(_.toString()))
+ val test1 = df42.withColumn("StringFeatures", toArray(col("StringFeatures")))
+ // test1.show()
+ // test1.printSchema()
+
+ val word2Vec = new Word2Vec().setInputCol("StringFeatures").setOutputCol("result").setVectorSize(20).setMinCount(0)
+ val model = word2Vec.fit(test1)
+ val result = model.transform(test1) // .rdd
+
+ // result.show()
+
+ val Todense = udf((b: Vector) => b.toDense)
+ val test_new2 = result.withColumn("result", Todense(col("result")))
+
+ val assembler = new VectorAssembler().setInputCols(Array(
+ "result",
+
+ // character
+ "FinalC1uppercaseratio", "FinalC2lowercaseratio", "FinalC3alphanumericratio", "FinalC4asciiratio", "FinalC5bracketratio", "FinalC6digitalratio",
+ "FinalC7latinratio", "FinalC8whitespaceratio", "FinalC9puncratio", "FinalC10longcharacterseq", "FinalC11arabicratio", "FinalC12bengaliratio",
+ "FinalC13brahmiratio", "FinalC14cyrilinratio", "FinalC15hanratio", "Finalc16malysiaratio", "FinalC17tamiratio", "FinalC18telugratio",
+ "FinalC19symbolratio", "FinalC20alpharatio", "FinalC21visibleratio", "FinalC22printableratio", "FinalC23blankratio", "FinalC24controlratio", "FinalC25hexaratio",
+
+ // Words
+ "FinalW1languagewordratio", "W2Iscontainlanguageword", "FinalW3lowercaseratio", "FinalW4longestword", "W5IscontainURL", "FinalW6badwordratio",
+ "FinalW7uppercaseratio", "FinalW8banwordratio", "W9FemalFirstName", "W10MaleFirstName", "W11IscontainBadword", "W12IsContainBanword",
+ "FinalW13NumberSharewords", "FinalW14NumberSharewordswithoutStopwords", "FinalW15PortionQid", "FinalW16PortionLnags", "FinalW17PortionLinks",
+
+ // Sentences :
+ "FinalS1CommentTailLength", "FinalS2SimikaritySitelinkandLabel", "FinalS3SimilarityLabelandSitelink", "FinalS4SimilarityCommentComment",
+
+ // User :
+ "U1IsPrivileged", "U2IsBotUser", "U3IsBotuserWithFlaguser", "U4IsProperty", "U5IsTranslator", "U6IsRegister", "U7IPValue", "U8UserID",
+ "U9HasBirthDate", "U10HasDeathDate",
+
+ // Item:
+
+ "I1NumberLabels", "I2NumberDescription", "I3NumberAliases", "I4NumberClaims", "I5NumberSitelinks", "I6NumberStatement",
"I7NumberReferences", "I8NumberQualifier", "I9NumberQualifierOrder", "I10NumberBadges", + + // Revision: + "R3IslatainLanguage", "R4JsonLength", "R7RevisionAccountChange", "R10RevisionSize", "R12BytesIncrease", + "R13TimeSinceLastRevi", "R14CommentLength", + + // Meta , truth , Freq + // meta : + "FinalREVISION_SESSION_ID", // Truth: - //1.Undo - - // Freq : - - //1.5 features - - // Roll Boolean :Boolean (Double) - // Undo :Boolean (Double) - - //*********************************************** End Revision Features **************************************************************************************** - - //===========================================================================String Features==================================================================================== - - val df42 = df41.withColumn( - //statement String features: - "StringFeatures", concat($"SS1Property", lit(";"), $"SS2DataValue", lit(";"), $"SS3ItemValue", lit(";"), $"I11ItemTitle", - //Revision String Features: - lit(";"), $"R1languageRevision", - lit(";"), $"R2RevisionLanguageLocal", - lit(";"), $"R5RevisionAction", - lit(";"), $"R6PrevReviAction", - lit(";"), $"R8ParRevision", - lit(";"), $"R9RevisionTime", - lit(";"), $"R11ContentType", - lit(";"), $"R15RevisionSubaction", - lit(";"), $"R16PrevReviSubaction", - - lit(";"), $"USER_COUNTRY_CODE", - lit(";"), $"USER_CONTINENT_CODE", - lit(";"), $"USER_TIME_ZONE", - lit(";"), $"USER_REGION_CODE", - lit(";"), $"USER_CITY_NAME", - lit(";"), $"USER_COUNTY_NAME", - lit(";"), $"REVISION_TAGS")) + "FinalUNDO_RESTORE_REVERTED", - val toArray = udf((record: String) => record.split(";").map(_.toString())) - val test1 = df42.withColumn("StringFeatures", toArray(col("StringFeatures"))) - // test1.show() - // test1.printSchema() + // Freq: + "FinalNumberofRevisionsUserContributed", + "FinalNumberofUniqueItemsUseredit", "FinalNumberRevisionItemHas", "FinalNumberUniqUserEditItem", "FinalFreqItem")).setOutputCol("features") + val Testing_Data = assembler.transform(test_new2) - val word2Vec = new Word2Vec().setInputCol("StringFeatures").setOutputCol("result").setVectorSize(20).setMinCount(0) - val model = word2Vec.fit(test1) - val result = model.transform(test1) //.rdd - - // result.show() - - val Todense = udf((b: Vector) => b.toDense) - val test_new2 = result.withColumn("result", Todense(col("result"))) - - val assembler = new VectorAssembler().setInputCols(Array( - "result", - - // character - "FinalC1uppercaseratio", "FinalC2lowercaseratio", "FinalC3alphanumericratio", "FinalC4asciiratio", "FinalC5bracketratio", "FinalC6digitalratio", - "FinalC7latinratio", "FinalC8whitespaceratio", "FinalC9puncratio", "FinalC10longcharacterseq", "FinalC11arabicratio", "FinalC12bengaliratio", - "FinalC13brahmiratio", "FinalC14cyrilinratio", "FinalC15hanratio", "Finalc16malysiaratio", "FinalC17tamiratio", "FinalC18telugratio", - "FinalC19symbolratio", "FinalC20alpharatio", "FinalC21visibleratio", "FinalC22printableratio", "FinalC23blankratio", "FinalC24controlratio", "FinalC25hexaratio", - - // Words - "FinalW1languagewordratio", "W2Iscontainlanguageword", "FinalW3lowercaseratio", "FinalW4longestword", "W5IscontainURL", "FinalW6badwordratio", - "FinalW7uppercaseratio", "FinalW8banwordratio", "W9FemalFirstName", "W10MaleFirstName", "W11IscontainBadword", "W12IsContainBanword", - "FinalW13NumberSharewords", "FinalW14NumberSharewordswithoutStopwords", "FinalW15PortionQid", "FinalW16PortionLnags", "FinalW17PortionLinks", - - //Sentences : - "FinalS1CommentTailLength", 
"FinalS2SimikaritySitelinkandLabel", "FinalS3SimilarityLabelandSitelink", "FinalS4SimilarityCommentComment", - - // User : - "U1IsPrivileged", "U2IsBotUser", "U3IsBotuserWithFlaguser", "U4IsProperty", "U5IsTranslator", "U6IsRegister", "U7IPValue", "U8UserID", - "U9HasBirthDate", "U10HasDeathDate", - - //Item: - - "I1NumberLabels", "I2NumberDescription", "I3NumberAliases", "I4NumberClaims", "I5NumberSitelinks", "I6NumberStatement", - "I7NumberReferences", "I8NumberQualifier", "I9NumberQualifierOrder", "I10NumberBadges", - - //Revision: - "R3IslatainLanguage", "R4JsonLength", "R7RevisionAccountChange", "R10RevisionSize", "R12BytesIncrease", - "R13TimeSinceLastRevi", "R14CommentLength", - - // Meta , truth , Freq - // meta : - "FinalREVISION_SESSION_ID", - // Truth: - "FinalUNDO_RESTORE_REVERTED", - - //Freq: - "FinalNumberofRevisionsUserContributed", - "FinalNumberofUniqueItemsUseredit", "FinalNumberRevisionItemHas", "FinalNumberUniqUserEditItem", "FinalFreqItem")).setOutputCol("features") - val Testing_Data = assembler.transform(test_new2) - - // Prepare the data for classification: + // Prepare the data for classification: // NewData.registerTempTable("DB") // val Training_Data = sqlContext.sql("select Rid, features, FinalROLLBACK_REVERTED from DB") - //val Data = sqlContext.sql("select Rid, features, FinalROLLBACK_REVERTED as label from DB") // for logistic regrision + // val Data = sqlContext.sql("select Rid, features, FinalROLLBACK_REVERTED as label from DB") // for logistic regrision - //Data.show() + // Data.show() // val TestClassifiers = new Classifiers() -// - // TestClassifiers.RandomForestClassifer(Testing_Data, sqlContext) -// // TestClassifiers.DecisionTreeClassifier(Data, sqlContext) -// // TestClassifiers.LogisticRegrision(Data, sqlContext) -// // TestClassifiers.GradientBoostedTree(Data, sqlContext) -// // TestClassifiers.MultilayerPerceptronClassifier(Data, sqlContext) + // + // TestClassifiers.RandomForestClassifer(Testing_Data, sqlContext) + // // TestClassifiers.DecisionTreeClassifier(Data, sqlContext) + // // TestClassifiers.LogisticRegrision(Data, sqlContext) + // // TestClassifiers.GradientBoostedTree(Data, sqlContext) + // // TestClassifiers.MultilayerPerceptronClassifier(Data, sqlContext) Testing_Data - - - } - - - - - def Triger(sc: SparkContext): Unit = { - -// val sqlContext = new org.apache.spark.sql.SQLContext(sc) -// import sqlContext.implicits._ -// import org.apache.spark.sql.functions._ // for UDF -// import org.apache.spark.sql.types._ -// -// //******************************************************************************************************************************* -// println("Please Enter 0 for JTriple and 1 for TRIX process and 2 for RDFXML process and 3 for NormalXML:") -// val num = scala.io.StdIn.readLine() -// -// if (num == "0") { -// println("JTriple.........!!!!!!") -// // Streaming records:RDFJtriple file : -// val jobConf = new JobConf() -// -// val JTriple_Parser_OBJ = new ParseJTriple() -// val DRF_Builder_JTripleOBJ = new FacilitiesClass() -// val RDD_JTriple = JTriple_Parser_OBJ.Start_JTriple_Parser(jobConf, sc) -// RDD_JTriple.foreach(println) -// //----------------------------DF for RDF TRIX ------------------------------------------ -// // Create SQLContext Object: -// val sqlContext = new org.apache.spark.sql.SQLContext(sc) -// val DFR_JTriple = DRF_Builder_JTripleOBJ.RDD_TO_DFR_JTriple(RDD_JTriple, sqlContext) -// DFR_JTriple.show() -// -// } - -// if (num == "1") { -// -// println("TRIX.........!!!!!!") -// // Streaming 
-// // Streaming records:RDFTRIX file :
-// val jobConf = new JobConf()
-//
-// val TRIX_Parser_OBJ = new ParseTRIX()
-// val DRF_Builder_RDFTRIX_OBJ = new FacilitiesClass()
-//
-// val RDD_TRIX = TRIX_Parser_OBJ.Start_TriX_Parser(jobConf, sc)
-// RDD_TRIX.foreach(println)
-//
-// //----------------------------DF for RDF TRIX ------------------------------------------
-// // Create SQLContext Object:
-// val sqlContext = new org.apache.spark.sql.SQLContext(sc)
-// val DFR_TRIX = DRF_Builder_RDFTRIX_OBJ.RDD_TO_DFR_TRIX(RDD_TRIX, sqlContext)
-// DFR_TRIX.show()
-//
-// } //RDF XML file :*********************************************************************************************************
-// else if (num == "2") {
-// println("RDF XML .........!!!!!!")
-// // Streaming records:RDFXML file :
-// val jobConf_Record = new JobConf()
-// val jobConf_Prefixes = new JobConf()
-//
-// val RDFXML_Parser_OBJ = new ParseRDFXML()
-// val DRF_Builder_RDFXML_OBJ = new FacilitiesClass()
-//
-// val RDD_RDFXML = RDFXML_Parser_OBJ.start_RDFXML_Parser(jobConf_Record, jobConf_Prefixes, sc)
-// RDD_RDFXML.foreach(println)
-//
-// //----------------------------DF for RDF XML ------------------------------------------
-// // Create SQLContext Object:
-// val sqlContext = new org.apache.spark.sql.SQLContext(sc)
-// val DFR_RDF_XML = DRF_Builder_RDFXML_OBJ.RDD_TO_DFR_RDFXML(RDD_RDFXML, sqlContext)
-// DFR_RDF_XML.show()
-// //
-// // NOrmal XML Example WikiData: ***************************************************************************************************
-// } else if (num == "3") {
- // Streaming records:
-// val jobConf = new JobConf()
-// val NormalXML_Parser_OBJ = new ParseNormalXML()
-// val RDD_OBJ = new ParseNormalXML()
-// val RDD_All_Record1 = RDD_OBJ.Training_DB_NormalXML_Parser_Input1(sc)
-// val RDD_All_Record2 = RDD_OBJ.Training_DB_NormalXML_Parser_Input2(sc)
-// val RDD_All_Record3 = RDD_OBJ.Training_DB_NormalXML_Parser_Input3(sc)
-// //RDD_All_Record1.foreach(println)
-// //RDD_All_Record2.foreach(println)
-// // RDD_All_Record3.foreach(println)
-//
-// val RDD_All_Record = RDD_All_Record1.union(RDD_All_Record2).union(RDD_All_Record3).distinct().cache()
-//
-// //println(RDD_All_Record.count())
-// // println(RDD_All_Record.count())
-//
-// // ======= Json part :
-// //Json RDD : Each record has its Revision iD:
-// val JsonRDD = RDD_All_Record.map(_.split("NNLL")).map(v => replacing_with_Quoto(v(0), v(8))).cache()
-// //JsonRDD.foreach(println)
-// //println(JsonRDD.count())
-//
-// // Data set
-// val Ds_Json = sqlContext.jsonRDD(JsonRDD).select("key", "id", "labels", "descriptions", "aliases", "claims", "sitelinks").cache()
-// //Ds_Json.show()
-// // println(Ds_Json.count())
-//
-// // ======= Tags part : // Contributor IP here is in Decimal format not IP format and It is converted in ParseNormalXml stage
-// val TagsRDD = RDD_All_Record.map(_.split("NNLL")).map(x => (x(0), x(1), x(2), x(3), x(4), x(5), x(6), x(7), x(8), x(9), x(10), x(11))).cache()
-// val DF_Tags = TagsRDD.toDF("Rid", "Itemid", "comment", "pid", "time", "contributorIP", "contributorID", "contributorName", "JsonText", "model", "format", "sha").cache()
-// // DF_Tags.show()
-// // println(DF_Tags.count())
-//
-// //======== Join Json part with Tag Part:============================
-// //Joining to have full data
-// val DF_First_DF_Result_Join_Tags_and_Json = DF_Tags.as("T1").join(Ds_Json.as("T2"), $"T1.Rid" === $"T2.key", "leftouter").select("Rid", "itemid", "comment", "pid", "time", "contributorIP", "contributorID", "contributorName", "JsonText", "labels", "descriptions", "aliases", "claims", "sitelinks", "model", "format", "sha") //.orderBy("Rid", "Itemid")
-// DF_First_DF_Result_Join_Tags_and_Json.registerTempTable("Data1")
-// val dfr_DATA_JsonTages1 = sqlContext.sql("select * from Data1 order by itemid ,Rid ").cache()
-//
-// val colNames = Seq("Rid2", "itemid2", "comment2", "pid2", "time2", "contributorIP2", "contributorID2", "contributorName2", "JsonText2", "labels2", "descriptions2", "aliases2", "claims2", "sitelinks2", "model2", "format2", "sha2")
-// val DF_Second = DF_First_DF_Result_Join_Tags_and_Json.toDF(colNames: _*) //.distinct()
-// DF_Second.registerTempTable("Data2")
-//
-// //===================================================================Parent // Previous Revision==============================================================================================================
-// //val DF_Joined = result1.as("df1").join(result2.as("df2"), col("itemid") === col("itemid2") && col("index1") === col("index2") + 1, "leftouter").select("Rid", "itemid", "comment", "pid", "time", "contributorIP", "contributorID", "contributorName", "JsonText", "labels", "descriptions", "aliases", "claims", "sitelinks", "model", "format", "sha", "Rid2", "itemid2", "comment2", "pid2", "time2", "contributorIP2", "contributorID2", "contributorName2", "JsonText2", "labels2", "descriptions2", "aliases2", "claims2", "sitelinks2", "model2", "format2", "sha2")
-// //.select("itemid", "Rid","pid","time","itemid2","Rid2","pid2","time2")
-//
-// //Joining based on Parent Id to get the previous cases: ParentID
-// val DF_Joined = DF_First_DF_Result_Join_Tags_and_Json.as("df1").join(DF_Second.as("df2"), $"df1.pid" === $"df2.Rid2", "leftouter").distinct()
-//
-// val RDD_After_JoinDF = DF_Joined.rdd.distinct()
-// val x = RDD_After_JoinDF.map(row => (row(0).toString().toInt, row)).cache()
-// val part = new RangePartitioner(4, x)
-// val partitioned = x.partitionBy(part).persist() // persist is important for this case and obligatory.
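Note: the range-partition-then-persist step in the block removed above is the backbone of the old pipeline; rows keyed by revision id feed several per-row feature passes, so the shuffle result must be cached. A self-contained sketch of that pattern, with toy data and illustrative names:

  import org.apache.spark.{ RangePartitioner, SparkConf, SparkContext }

  object PartitionSketch {
    def main(args: Array[String]): Unit = {
      val sc = new SparkContext(new SparkConf().setAppName("sketch").setMaster("local[2]"))
      // Pair RDD keyed by revision id, as in the removed code.
      val byRevision = sc.parallelize(Seq((3, "revA"), (1, "revB"), (2, "revC")))
      val partitioner = new RangePartitioner(4, byRevision) // samples keys into sorted ranges
      val partitioned = byRevision.partitionBy(partitioner).persist() // reused by later passes
      println(partitioned.count())
      sc.stop()
    }
  }
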
-// //partitioned.foreach(println)
-// //
-// // //=====================================================All Features Based on Categories of Features Data Type :==================================================================================
-// //
-// val Result_all_Features = partitioned.map { case (x, y) => (x.toString() + "," + All_Features(y).toString()) } // we convert the Pair RDD to String one LineRDD to be able to make DF based on ","
-// //Result_all_Features.foreach(println)
-// // println("nayef" + Result_all_Features.count())
-//
-// // Conver the RDD of All Features to DataFrame:
-//
-// val schema = StructType(
-//
-// //0
-// StructField("Rid", IntegerType, false) ::
-//
-// // Character Features :
-// /* 1*/ StructField("C1uppercaseratio", DoubleType, false) :: /*2 */ StructField("C2lowercaseratio", DoubleType, false) :: /*3*/ StructField("C3alphanumericratio", DoubleType, false) ::
-// /*4*/ StructField("C4asciiratio", DoubleType, false) :: /*5*/ StructField("C5bracketratio", DoubleType, false) :: /*6*/ StructField("C6digitalratio", DoubleType, false) ::
-// /*7*/ StructField("C7latinratio", DoubleType, false) :: /*8*/ StructField("C8whitespaceratio", DoubleType, false) :: /* 9*/ StructField("C9puncratio", DoubleType, false) ::
-// /*10*/ StructField("C10longcharacterseq", DoubleType, false) :: /*11*/ StructField("C11arabicratio", DoubleType, false) :: /*12*/ StructField("C12bengaliratio", DoubleType, false) ::
-// /*13 */ StructField("C13brahmiratio", DoubleType, false) :: /*14*/ StructField("C14cyrilinratio", DoubleType, false) :: /*15*/ StructField("C15hanratio", DoubleType, false) ::
-// /*16*/ StructField("c16malysiaratio", DoubleType, false) :: /*17*/ StructField("C17tamiratio", DoubleType, false) :: /*18*/ StructField("C18telugratio", DoubleType, false) ::
-// /*19 */ StructField("C19symbolratio", DoubleType, false) :: /*20 */ StructField("C20alpharatio", DoubleType, false) :: /*21*/ StructField("C21visibleratio", DoubleType, false) ::
-// /*22*/ StructField("C22printableratio", DoubleType, false) :: /*23*/ StructField("C23blankratio", DoubleType, false) :: /*24 */ StructField("C24controlratio", DoubleType, false) ::
-// /* 25 */ StructField("C25hexaratio", DoubleType, false) ::
-//
-// //word Features:
-// /*26*/ StructField("W1languagewordratio", DoubleType, false) :: /*27 Boolean */ StructField("W2Iscontainlanguageword", DoubleType, false) :: /*28*/ StructField("W3lowercaseratio", DoubleType, false) ::
-// /*29 Integer */ StructField("W4longestword", IntegerType, false) :: /*30 Boolean */ StructField("W5IscontainURL", DoubleType, false) :: /*31*/ StructField("W6badwordratio", DoubleType, false) ::
-// /*32*/ StructField("W7uppercaseratio", DoubleType, false) :: /*33*/ StructField("W8banwordratio", DoubleType, false) :: /*34 Boolean */ StructField("W9FemalFirstName", DoubleType, false) ::
-// /*35 Boolean */ StructField("W10MaleFirstName", DoubleType, false) :: /*36 Boolean */ StructField("W11IscontainBadword", DoubleType, false) :: /*37 Boolean*/ StructField("W12IsContainBanword", DoubleType, false) ::
-// /*38 integer */ StructField("W13NumberSharewords", DoubleType, false) :: /*39 Integer */ StructField("W14NumberSharewordswithoutStopwords", DoubleType, false) ::
-// /*40*/ StructField("W15PortionQid", DoubleType, false) :: /*41*/ StructField("W16PortionLnags", DoubleType, false) :: /*42*/ StructField("W17PortionLinks", DoubleType, false) ::
-//
-// //
-// // // Sentences Features:
-// /*43*/ StructField("S1CommentTailLength", DoubleType, false) :: /*44*/ StructField("S2SimikaritySitelinkandLabel", DoubleType, false) :: /*45*/ StructField("S3SimilarityLabelandSitelink", DoubleType, false) :: /*46*/ StructField("S4SimilarityCommentComment", DoubleType, false) ::
-// //
-// // // Statements Features :
-// /*47*/ StructField("SS1Property", StringType, false) :: /*48*/ StructField("SS2DataValue", StringType, false) :: /*49*/ StructField("SS3ItemValue", StringType, false) ::
-// //
-// //
-// // //User Features :
-// /*50 Boolean*/ StructField("U1IsPrivileged", DoubleType, false) :: /*51 Boolean*/ StructField("U2IsBotUser", DoubleType, false) :: /*52 Boolean*/ StructField("U3IsBotuserWithFlaguser", DoubleType, false) ::
-// /*53 Boolean*/ StructField("U4IsProperty", DoubleType, false) :: /*54 Boolean*/ StructField("U5IsTranslator", DoubleType, false) :: /*55 Boolean*/ StructField("U6IsRegister", DoubleType, false) ::
-// /*56*/ StructField("U7IPValue", DoubleType, false) :: /*57*/ StructField("U8UserID", IntegerType, false) :: /*58*/ StructField("U9HasBirthDate", DoubleType, false) :: /*59*/ StructField("U10HasDeathDate", DoubleType, false) ::
-//
-// //Items Features :
-//
-// /*60*/ StructField("I1NumberLabels", DoubleType, false) :: /*61*/ StructField("I2NumberDescription", DoubleType, false) :: /*62*/ StructField("I3NumberAliases", DoubleType, false) :: /*63*/ StructField("I4NumberClaims", DoubleType, false) ::
-// /*64*/ StructField("I5NumberSitelinks", DoubleType, false) :: /*65*/ StructField("I6NumberStatement", DoubleType, false) :: /*66*/ StructField("I7NumberReferences", DoubleType, false) :: /*67*/ StructField("I8NumberQualifier", DoubleType, false) ::
-// /*68*/ StructField("I9NumberQualifierOrder", DoubleType, false) :: /*69*/ StructField("I10NumberBadges", DoubleType, false) :: /*70*/ StructField("I11ItemTitle", StringType, false) ::
-//
-// // Revision Features:
-// /*71*/ StructField("R1languageRevision", StringType, false) :: /*72*/ StructField("R2RevisionLanguageLocal", StringType, false) :: /*73*/ StructField("R3IslatainLanguage", DoubleType, false) ::
-// /*74*/ StructField("R4JsonLength", DoubleType, false) :: /*75*/ StructField("R5RevisionAction", StringType, false) :: /*76*/ StructField("R6PrevReviAction", StringType, false) ::
-// /*77*/ StructField("R7RevisionAccountChange", DoubleType, false) :: /*78*/ StructField("R8ParRevision", StringType, false) :: /*79*/ StructField("R9RevisionTime", StringType, false) ::
-// /*80*/ StructField("R10RevisionSize", DoubleType, false) :: /*81*/ StructField("R11ContentType", StringType, false) :: /*82*/ StructField("R12BytesIncrease", DoubleType, false) ::
-// /*83*/ StructField("R13TimeSinceLastRevi", DoubleType, false) :: /*84*/ StructField("R14CommentLength", DoubleType, false) :: /*85*/ StructField("R15RevisionSubaction", StringType, false) ::
-// /*86*/ StructField("R16PrevReviSubaction", StringType, false) ::
-//
-// Nil)
-//
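Note: the 87 hand-written StructFields in the removed schema could also be generated from name lists, keeping field order and index in one place. A minimal sketch, with only two of the real column names spelled out and the rest elided:

  import org.apache.spark.sql.types.{ DoubleType, IntegerType, StructField, StructType }

  // Generate the Double-typed ratio fields from a name list; list order defines the index.
  val ratioNames = Seq("C1uppercaseratio", "C2lowercaseratio" /* ... remaining feature names */)
  val schemaSketch = StructType(
    StructField("Rid", IntegerType, nullable = false) +:
      ratioNames.map(n => StructField(n, DoubleType, nullable = false)))
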
-// val rowRDD = Result_all_Features.map(line => line.split(",")).map(e ⇒ Row(e(0).toInt // character feature column
-// , e(1).toDouble, e(2).toDouble, e(3).toDouble, e(4).toDouble, e(5).toDouble, e(6).toDouble, e(7).toDouble, e(8).toDouble, e(9).toDouble, RoundDouble(e(10).toDouble),
-// e(11).toDouble, e(12).toDouble, e(13).toDouble, e(14).toDouble, e(15).toDouble, e(16).toDouble, e(17).toDouble, e(18).toDouble, e(19).toDouble, e(20).toDouble, e(21).toDouble, e(22).toDouble, e(23).toDouble, e(24).toDouble, e(25).toDouble //Word Feature column
-// , e(26).toDouble, e(27).toDouble, e(28).toDouble, e(29).toDouble.toInt, e(30).toDouble, e(31).toDouble, e(32).toDouble, e(33).toDouble, e(34).toDouble, e(35).toDouble, e(36).toDouble, e(37).toDouble, RoundDouble(e(38).toDouble), RoundDouble(e(39).toDouble), e(40).toDouble, e(41).toDouble, e(42).toDouble // Sentences Features column:
-// , RoundDouble(e(43).toDouble), e(44).toDouble, e(45).toDouble, e(46).toDouble //Statement Features Column:
-// , e(47), e(48), e(49) // User Features Column:
-// , e(50).toDouble, e(51).toDouble, e(52).toDouble, e(53).toDouble, e(54).toDouble, e(55).toDouble, e(56).toDouble, e(57).toDouble.toInt, e(58).toDouble, e(59).toDouble //Item Features column:
-// , e(60).toDouble, e(61).toDouble, e(62).toDouble, e(63).toDouble, e(64).toDouble, e(65).toDouble, e(66).toDouble, e(67).toDouble, e(68).toDouble, e(69).toDouble, "Q" + e(70).toDouble.toInt.toString() //Revision Features Column:
-// , e(71), e(72), e(73).toDouble, e(74).toDouble, e(75), e(76), e(77).toDouble, e(78), e(79), e(80).toDouble, e(81), e(82).toDouble, e(83).toDouble, e(84).toDouble, e(85), e(86)))
-//
-// //a.User Frequency:
-// //number of revisions a user has contributed
-// //val resu= DF_Tags.groupBy("contributorID").agg(count("Rid"))
-// DF_Tags.registerTempTable("TagesTable")
-// val ContributorFreq_for_Each_Revision_DF = sqlContext.sql("select contributorID as CIDUSER1, count(Rid) as NumberofRevisionsUserContributed from TagesTable where contributorID !='0' group by contributorID ") //.drop("CIDUSER1")
-// //ContributorFreq_for_Each_Revision_DF.show()
-//
-// //b.Cumulated : Number of a unique Item a user has contributed.
-// val CumulatedNumberof_uniqueItemsForUser_DF = sqlContext.sql("select contributorID as CIDUSER2, COUNT(DISTINCT itemid) as NumberofUniqueItemsUseredit from TagesTable where contributorID !='0' group by contributorID") //.drop("CIDUSER2")
-// //CumulatedNumberof_uniqueItemsForUser_DF.show()
-//
-// //1.Item Frequency:
-// // number of revisions an Item has
-// val ItemFrequ_DF = sqlContext.sql("select itemid, count(Rid) as NumberRevisionItemHas from TagesTable group by itemid")
-// // ItemFrequ_DF.show()
-//
-// //2. Cumulate number of unique users have edited the Item : Did not consider the users IP. Contributor is an IP or Name. we consider name
-// val CumulatedNumberof_UniqueUserForItem_DF = sqlContext.sql("select itemid, COUNT(DISTINCT contributorID) as NumberUniqUserEditItem from TagesTable where contributorID !='0' group by itemid")
-// //CumulatedNumberof_UniqueUserForItem_DF.show()
-//
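Note: the per-user frequency queries in the removed block have a direct DataFrame-API equivalent, which the commented-out groupBy line already hints at; both per-user aggregates fit in one pass. A sketch, with dfTags standing in for DF_Tags:

  import org.apache.spark.sql.DataFrame
  import org.apache.spark.sql.functions.{ count, countDistinct }

  // Both per-user features in one aggregation, skipping anonymous contributors ("0").
  def userFrequencies(dfTags: DataFrame): DataFrame =
    dfTags.where(dfTags("contributorID") !== "0")
      .groupBy("contributorID")
      .agg(
        count("Rid").as("NumberofRevisionsUserContributed"),
        countDistinct("itemid").as("NumberofUniqueItemsUseredit"))
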
-// //3. freq each Item :
-// val Fre_Item_DF = sqlContext.sql("select itemid, COUNT(itemid) as FreqItem from TagesTable group by itemid")
-// // Fre_Item_DF.show()
-//
-// //*****************************************************************************************************************************************
-// // This is Main DataFrame:
-// val BeforeJoin_All_Features = sqlContext.createDataFrame(rowRDD, schema)
-// //BeforeJoin_All_Features.show()
-//
-// //********************************** User feature Join
-//
-// // Join1 for add The first User Feature : number of revisions a user has contributed
-// val AfterJoinUser1_All_Features = BeforeJoin_All_Features.as("T1").join(ContributorFreq_for_Each_Revision_DF.as("T2"), $"T1.U8UserID" === $"T2.CIDUSER1", "leftouter").drop("CIDUSER1")
-// //AfterJoinUser1_All_Features.show()
-//
-// // Join2 for add The second User Feature
-// val AfterJoinUser2_All_Features = AfterJoinUser1_All_Features.as("T1").join(CumulatedNumberof_uniqueItemsForUser_DF.as("T2"), $"T1.U8UserID" === $"T2.CIDUSER2", "leftouter").drop("CIDUSER2")
-// //AfterJoinUser2_All_Features.show()
-//
-// //********************************** Item Feature Join
-// // Join3 for add The First Item Feature :number of revisions an Item has
-// val AfterJoinItem3_All_Features = AfterJoinUser2_All_Features.as("T1").join(ItemFrequ_DF.as("T2"), $"T1.I11ItemTitle" === $"T2.itemid", "leftouter").drop("itemid")
-// // AfterJoinItem3_All_Features.show()
-//
-// // Join4 for add The Second Item Feature
-// val AfterJoinItem4_All_Features = AfterJoinItem3_All_Features.as("T1").join(CumulatedNumberof_UniqueUserForItem_DF.as("T2"), $"T1.I11ItemTitle" === $"T2.itemid", "leftouter").drop("itemid")
-// // AfterJoinItem4_All_Features.show()
-//
-// // Join5 for add The Third Item Feature
-// val AfterJoinItem5_All_Features = AfterJoinItem4_All_Features.as("T1").join(Fre_Item_DF.as("T2"), $"T1.I11ItemTitle" === $"T2.itemid", "leftouter").drop("itemid")
-// //2 AfterJoinItem5_All_Features.show()
-//
-// //********************************
-//
-// //*Geografical information Feature from Meta File
-// //REVISION_ID|REVISION_SESSION_ID|USER_COUNTRY_CODE|USER_CONTINENT_CODE|USER_TIME_ZONE|USER_REGION_CODE|USER_CITY_NAME|USER_COUNTY_NAME|REVISION_TAGS
-// val df_GeoInf = sqlContext.read
-// .format("com.databricks.spark.csv")
-// .option("header", "true") // Use first line of all files as header
-// .option("inferSchema", "true") // Automatically infer data types
-// .load("hdfs://localhost:9000/mydata/Meta.csv").select("REVISION_ID", "REVISION_SESSION_ID", "USER_COUNTRY_CODE", "USER_CONTINENT_CODE", "USER_TIME_ZONE", "USER_REGION_CODE", "USER_CITY_NAME", "USER_COUNTY_NAME", "REVISION_TAGS")
-// // df_GeoInf.show()
-//
-// val df_Truth = sqlContext.read
-// .format("com.databricks.spark.csv")
-// .option("header", "true") // Use first line of all files as header
-// .option("inferSchema", "true") // Automatically infer data types
-// .load("hdfs://localhost:9000/mydata/truth.csv").select("REVISION_ID", "ROLLBACK_REVERTED", "UNDO_RESTORE_REVERTED")
-// // df_GeoInf.show()
-//
-// val AfterJoinGeoInfo_All_Features = AfterJoinItem5_All_Features.as("T1").join(df_GeoInf.as("T2"), $"T1.Rid" === $"T2.REVISION_ID", "leftouter").drop("REVISION_ID").cache()
-// // AfterJoinGeoInfo_All_Features.show()
-//
-// val Final_All_Features = AfterJoinGeoInfo_All_Features.as("T1").join(df_Truth.as("T2"), $"T1.Rid" === $"T2.REVISION_ID", "leftouter").drop("REVISION_ID").cache()
-// //Final_All_Features.show()
-//
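Note: Join1 through Join5 in the removed block repeat one shape; a helper like the following (not in the codebase, names illustrative) would collapse them. The left outer join deliberately keeps revisions without a match, and the na.fill calls below replace the resulting nulls:

  import org.apache.spark.sql.DataFrame

  // Attach an aggregate table on a key, then drop the duplicated key column.
  def attachFeature(main: DataFrame, agg: DataFrame, mainKey: String, aggKey: String): DataFrame =
    main.join(agg, main(mainKey) === agg(aggKey), "leftouter").drop(aggKey)
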
-// // Pre-process Data ============================================================================================================================================================
-//
-// // For String Column, We fill the Null values by "NA":
-//
-// var Fill_Missing_Final_All_Features = Final_All_Features.na.fill("NA", Seq("USER_COUNTRY_CODE", "USER_CONTINENT_CODE", "USER_TIME_ZONE", "USER_REGION_CODE", "USER_CITY_NAME", "USER_COUNTY_NAME", "REVISION_TAGS")).cache()
-//
-// // For Integer Frequency Column, We fill the Null values by 0:
-// Fill_Missing_Final_All_Features = Fill_Missing_Final_All_Features.na.fill(0, Seq("FreqItem", "NumberUniqUserEditItem", "NumberRevisionItemHas", "NumberofUniqueItemsUseredit", "NumberofRevisionsUserContributed", "REVISION_SESSION_ID")).cache()
-// //Fill_Missing_Final_All_Features.show()
-//
-// val BoolToDoubleUDF = udf { (BoolAsString: String) => if (BoolAsString == "T") 1.0 else 0.0 }
-// val IntegerToDouble = udf { (IntegerRevisionSessionID: Integer) => IntegerRevisionSessionID.toDouble }
-// Fill_Missing_Final_All_Features = Fill_Missing_Final_All_Features.withColumn("FinalROLLBACK_REVERTED", BoolToDoubleUDF(col("ROLLBACK_REVERTED")))
-// Fill_Missing_Final_All_Features = Fill_Missing_Final_All_Features.withColumn("FinalUNDO_RESTORE_REVERTED", BoolToDoubleUDF(col("UNDO_RESTORE_REVERTED")))
-//
-// Fill_Missing_Final_All_Features = Fill_Missing_Final_All_Features.withColumn("FinalREVISION_SESSION_ID", IntegerToDouble(col("REVISION_SESSION_ID")))
-//
-// Fill_Missing_Final_All_Features = Fill_Missing_Final_All_Features.withColumn("FinalNumberofRevisionsUserContributed", IntegerToDouble(col("NumberofRevisionsUserContributed")))
-// Fill_Missing_Final_All_Features = Fill_Missing_Final_All_Features.withColumn("FinalNumberofUniqueItemsUseredit", IntegerToDouble(col("NumberofUniqueItemsUseredit")))
-//
-// Fill_Missing_Final_All_Features = Fill_Missing_Final_All_Features.withColumn("FinalNumberRevisionItemHas", IntegerToDouble(col("NumberRevisionItemHas")))
-// Fill_Missing_Final_All_Features = Fill_Missing_Final_All_Features.withColumn("FinalNumberUniqUserEditItem", IntegerToDouble(col("NumberUniqUserEditItem")))
-// Fill_Missing_Final_All_Features = Fill_Missing_Final_All_Features.withColumn("FinalFreqItem", IntegerToDouble(col("FreqItem")))
-//
-// //===========================================================================Caharacter Features : Double , Integer Features ====================================================================================
-// //Double Ratio: For Ratio Double column, Fill -1 value by Median:Character Features + Ratio of Word Features :
-// var Samples = Fill_Missing_Final_All_Features.sample(false, 0.001).cache() //.where($"S2SimikaritySitelinkandLabel">0.0 || $"S3SimilarityLabelandSitelink">0.0 || $"S4SimilarityCommentComment">0.0)
-// Samples.registerTempTable("df")
-//
-// val Query = "select " +
-// "percentile_approx(C1uppercaseratio, 0.5) as meadian1" + "," + "percentile_approx(C2lowercaseratio, 0.5) as median2" + " ," +
-// "percentile_approx(C3alphanumericratio, 0.5) as median3" + "," + "percentile_approx(C4asciiratio, 0.5) as median4" + "," +
-// "percentile_approx(C5bracketratio, 0.5) as median5" + "," + "percentile_approx(C6digitalratio, 0.5) as median6" + "," +
-// "percentile_approx(C7latinratio, 0.5) as median7" + "," + "percentile_approx(C8whitespaceratio, 0.5) as median8" + "," +
-// "percentile_approx(C9puncratio, 0.5) as median9" + "," + "percentile_approx(C11arabicratio, 0.5) as median11" + "," +
-// "percentile_approx(C12bengaliratio, 0.5) as median12" + "," + "percentile_approx(C13brahmiratio, 0.5) as median13" + "," +
-// "percentile_approx(C14cyrilinratio, 0.5) as median14" + "," + "percentile_approx(C15hanratio, 0.5) as median15" + "," +
-// "percentile_approx(c16malysiaratio, 0.5) as median16" + "," +
-// "percentile_approx(C17tamiratio, 0.5) as median17" + "," + "percentile_approx(C18telugratio, 0.5) as median18" + "," +
-// "percentile_approx(C19symbolratio, 0.5) as median19" + "," + "percentile_approx(C20alpharatio, 0.5) as median20" + "," +
-// "percentile_approx(C21visibleratio, 0.5) as median21" + "," + "percentile_approx(C22printableratio, 0.5) as median22" + "," +
-// "percentile_approx(C23blankratio, 0.5) as median23" + "," + "percentile_approx(C24controlratio, 0.5) as median24" + "," +
-// "percentile_approx(C25hexaratio, 0.5) as median25" ++ "," + "percentile_approx(W1languagewordratio, 0.5) as median26" + "," +
-// "percentile_approx(W3lowercaseratio, 0.5) as median27" + "," + "percentile_approx(W6badwordratio, 0.5) as median28" + "," +
-// "percentile_approx(W7uppercaseratio, 0.5) as median27" + "," + "percentile_approx(W8banwordratio, 0.5) as median27" + " from df"
-//
-// val medianValues = sqlContext.sql(Query).rdd
-// val Median = medianValues.first()
-//
-// // Median :
-// // Character Ratio Features: UDF
-// val lkpUDF1 = udf { (i: Double) => if (i == 0) Median(0).toString().toDouble else i }
-// val lkpUDF2 = udf { (i: Double) => if (i == 0) Median(1).toString().toDouble else i }
-// val lkpUDF3 = udf { (i: Double) => if (i == 0) Median(2).toString().toDouble else i }
-// val lkpUDF4 = udf { (i: Double) => if (i == 0) Median(3).toString().toDouble else i }
-// val lkpUDF5 = udf { (i: Double) => if (i == 0) Median(4).toString().toDouble else i }
-// val lkpUDF6 = udf { (i: Double) => if (i == 0) Median(5).toString().toDouble else i }
-// val lkpUDF7 = udf { (i: Double) => if (i == 0) Median(6).toString().toDouble else i }
-// val lkpUDF8 = udf { (i: Double) => if (i == 0) Median(7).toString().toDouble else i }
-// val lkpUDF9 = udf { (i: Double) => if (i == 0) Median(8).toString().toDouble else i }
-//
-// val lkpUDF11 = udf { (i: Double) => if (i == 0) Median(9).toString().toDouble else i }
-// val lkpUDF12 = udf { (i: Double) => if (i == 0) Median(10).toString().toDouble else i }
-// val lkpUDF13 = udf { (i: Double) => if (i == 0) Median(11).toString().toDouble else i }
-// val lkpUDF14 = udf { (i: Double) => if (i == 0) Median(12).toString().toDouble else i }
-// val lkpUDF15 = udf { (i: Double) => if (i == 0) Median(13).toString().toDouble else i }
-// val lkpUDF16 = udf { (i: Double) => if (i == 0) Median(14).toString().toDouble else i }
-// val lkpUDF17 = udf { (i: Double) => if (i == 0) Median(15).toString().toDouble else i }
-// val lkpUDF18 = udf { (i: Double) => if (i == 0) Median(16).toString().toDouble else i }
-// val lkpUDF19 = udf { (i: Double) => if (i == 0) Median(17).toString().toDouble else i }
-// val lkpUDF20 = udf { (i: Double) => if (i == 0) Median(18).toString().toDouble else i }
-// val lkpUDF21 = udf { (i: Double) => if (i == 0) Median(19).toString().toDouble else i }
-// val lkpUDF22 = udf { (i: Double) => if (i == 0) Median(20).toString().toDouble else i }
-// val lkpUDF23 = udf { (i: Double) => if (i == 0) Median(21).toString().toDouble else i }
-// val lkpUDF24 = udf { (i: Double) => if (i == 0) Median(22).toString().toDouble else i }
-// val lkpUDF25 = udf { (i: Double) => if (i == 0) Median(23).toString().toDouble else i }
-//
-// val df1 = Fill_Missing_Final_All_Features.withColumn("FinalC1uppercaseratio", lkpUDF1(col("C1uppercaseratio"))) //.drop("C1uppercaseratio").cache()
-// val df2 = df1.withColumn("FinalC2lowercaseratio", lkpUDF2(col("C2lowercaseratio"))) //.drop("C2lowercaseratio").cache()
-// //df1.unpersist()
-// val df3 = df2.withColumn("FinalC3alphanumericratio", lkpUDF3(col("C3alphanumericratio"))) //.drop("C3alphanumericratio").cache()
-// //df2.unpersist()
-// val df4 = df3.withColumn("FinalC4asciiratio", lkpUDF4(col("C4asciiratio"))) //.drop("C4asciiratio").cache()
-// //df3.unpersist()
-// val df5 = df4.withColumn("FinalC5bracketratio", lkpUDF5(col("C5bracketratio"))) //.drop("C5bracketratio").cache()
-// //df4.unpersist()
-// val df6 = df5.withColumn("FinalC6digitalratio", lkpUDF6(col("C6digitalratio"))) //.drop("C6digitalratio").cache()
-// //df5.unpersist()
-// val df7 = df6.withColumn("FinalC7latinratio", lkpUDF7(col("C7latinratio"))) //.drop("C7latinratio").cache()
-// //df6.unpersist()
-// val df8 = df7.withColumn("FinalC8whitespaceratio", lkpUDF8(col("C8whitespaceratio"))) //.drop("C8whitespaceratio").cache()
-// //df7.unpersist()
-// val df9 = df8.withColumn("FinalC9puncratio", lkpUDF9(col("C9puncratio"))) //.drop("C9puncratio").cache()
-//
-// // Mean :
-// // character integer values :
-// val Mean_C10longcharacterseq = Samples.agg(mean("C10longcharacterseq")).head()
-// val C10_Mean = Mean_C10longcharacterseq.getDouble(0)
-// val lkpUDFC10 = udf { (i: Double) => if (i == 0) C10_Mean else i }
-// val df10 = df9.withColumn("FinalC10longcharacterseq", lkpUDFC10(col("C10longcharacterseq")))
-//
-// //Median
-// val df11 = df10.withColumn("FinalC11arabicratio", lkpUDF11(col("C11arabicratio"))) //.drop("C11arabicratio").cache()
-// // df9.unpersist()
-// val df12 = df11.withColumn("FinalC12bengaliratio", lkpUDF12(col("C12bengaliratio"))) //.drop("C12bengaliratio").cache()
-// //df11.unpersist()
-// val df13 = df12.withColumn("FinalC13brahmiratio", lkpUDF13(col("C13brahmiratio"))) //.drop("C13brahmiratio").cache()
-// // df12.unpersist()
-// val df14 = df13.withColumn("FinalC14cyrilinratio", lkpUDF14(col("C14cyrilinratio"))) //.drop("C14cyrilinratio").cache()
-// // df13.unpersist()
-// val df15 = df14.withColumn("FinalC15hanratio", lkpUDF15(col("C15hanratio"))) //.drop("C15hanratio").cache()
-// // df14.unpersist()
-// val df16 = df15.withColumn("Finalc16malysiaratio", lkpUDF16(col("c16malysiaratio"))) //.drop("c16malysiaratio").cache()
-// //df15.unpersist()
-// val df17 = df16.withColumn("FinalC17tamiratio", lkpUDF17(col("C17tamiratio"))) //.drop("C17tamiratio").cache()
-// //df16.unpersist()
-// val df18 = df17.withColumn("FinalC18telugratio", lkpUDF18(col("C18telugratio"))) //.drop("C18telugratio").cache()
-// //df17.unpersist()
-// val df19 = df18.withColumn("FinalC19symbolratio", lkpUDF19(col("C19symbolratio"))) //.drop("C19symbolratio").cache()
-// //df18.unpersist()
-// val df20 = df19.withColumn("FinalC20alpharatio", lkpUDF20(col("C20alpharatio"))) //.drop("C20alpharatio").cache()
-// // df19.unpersist()
-// val df21 = df20.withColumn("FinalC21visibleratio", lkpUDF21(col("C21visibleratio"))) //.drop("C21visibleratio").cache()
-// // df20.unpersist()
-// val df22 = df21.withColumn("FinalC22printableratio", lkpUDF22(col("C22printableratio"))) //.drop("C22printableratio").cache()
-// //df21.unpersist()
-// val df23 = df22.withColumn("FinalC23blankratio", lkpUDF23(col("C23blankratio"))) //.drop("C23blankratio").cache()
-// // df22.unpersist()
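Note: the removed median query above computes approximate medians over a 0.1% sample (percentile_approx is a Hive UDAF, so a Hive-backed context is assumed) and reuses the alias median27 three times; that only worked because the values were read back by position. The df1..df25 chain and its 25 near-identical lkpUDF vals can then be folded into one helper, sketched here under the assumption that the medians have been collected into a map:

  import org.apache.spark.sql.DataFrame
  import org.apache.spark.sql.functions.{ col, udf }

  // One imputation step per (column, median) pair; adds a "FinalX" column per entry,
  // treating a zero value as missing, exactly as the chain above does.
  def imputeZeros(df: DataFrame, medians: Map[String, Double]): DataFrame =
    medians.foldLeft(df) { case (acc, (name, median)) =>
      val fill = udf { (v: Double) => if (v == 0) median else v }
      acc.withColumn("Final" + name, fill(col(name)))
    }
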
-// val df24 = df23.withColumn("FinalC24controlratio", lkpUDF24(col("C24controlratio"))) //.drop("C24controlratio").cache()
-// //df23.unpersist()
-// val df25 = df24.withColumn("FinalC25hexaratio", lkpUDF25(col("C25hexaratio"))) //.drop("C25hexaratio").cache()
-//
-// //************************************************End Character Features ****************************************************************************************
-//
-// //************************************************Start Word Features ****************************************************************************************
-//
-// // Word Ratio Features : UDF
-// val lkpUDFW1 = udf { (i: Double) => if (i == 0) Median(24).toString().toDouble else i }
-// val lkpUDFW3 = udf { (i: Double) => if (i == 0) Median(25).toString().toDouble else i }
-// val lkpUDFW6 = udf { (i: Double) => if (i == 0) Median(26).toString().toDouble else i }
-// val lkpUDFW7 = udf { (i: Double) => if (i == 0) Median(27).toString().toDouble else i }
-// val lkpUDFW8 = udf { (i: Double) => if (i == 0) Median(28).toString().toDouble else i }
-//
-// //1.
-// val df26 = df25.withColumn("FinalW1languagewordratio", lkpUDFW1(col("W1languagewordratio"))) //.drop("W1languagewordratio").cache()
-//
-// //2.Boolean(Double) IsContainLanguageWord
-//
-// //3.
-// val df27 = df26.withColumn("FinalW3lowercaseratio", lkpUDFW3(col("W3lowercaseratio"))) //.drop("W3lowercaseratio").cache()
-// // df26.unpersist()
-//
-// //4. Integer " Mean:
-// val Mean_W4longestword = Samples.agg(mean("W4longestword")).head()
-// val W4_Mean = Mean_W4longestword.getDouble(0)
-// val lkpUDFW4 = udf { (i: Double) => if (i == 0) W4_Mean else i }
-// val df28 = df27.withColumn("FinalW4longestword", lkpUDFW4(col("W4longestword")))
-//
-// //5. Boolean (Double ) W5IscontainURL
-// //6.
-// val df29 = df28.withColumn("FinalW6badwordratio", lkpUDFW6(col("W6badwordratio"))) //.drop("W6badwordratio").cache()
-//
-// //7.
-// val df30 = df29.withColumn("FinalW7uppercaseratio", lkpUDFW7(col("W7uppercaseratio"))) //.drop("W7uppercaseratio").cache()
-//
-// //8.
-// val df31 = df30.withColumn("FinalW8banwordratio", lkpUDFW8(col("W8banwordratio"))) //.drop("W8banwordratio").cache()
-//
-// //9.FemalFirst Boolean(Double)
-// //10.Male First Boolean(Double)
-// //11.ContainBadWord Boolean(Double)
-// //12ContainBanWord Boolean(Double)
-//
-// //13. Integer(Double):
-// val Mean_W13W13NumberSharewords = Samples.agg(mean("W13NumberSharewords")).head()
-// val W13_Mean = Mean_W13W13NumberSharewords.getDouble(0)
-// val lkpUDFW13 = udf { (i: Double) => if (i == 0) W13_Mean else i }
-// val df32 = df31.withColumn("FinalW13NumberSharewords", lkpUDFW13(col("W13NumberSharewords")))
-//
-// //14. Integer (Double):
-// val Mean_W14NumberSharewordswithoutStopwords = Samples.agg(mean("W14NumberSharewordswithoutStopwords")).head()
-// val W14_Mean = Mean_W14NumberSharewordswithoutStopwords.getDouble(0)
-// val lkpUDFW14 = udf { (i: Double) => if (i == 0) W14_Mean else i }
-// val df33 = df32.withColumn("FinalW14NumberSharewordswithoutStopwords", lkpUDFW14(col("W14NumberSharewordswithoutStopwords")))
-//
-// // 15. Double (Not ratio):
-// val Mean_W15PortionQid = Samples.agg(mean("W15PortionQid")).head()
-// val W15_Mean = Mean_W15PortionQid.getDouble(0)
-// val lkpUDFW15 = udf { (i: Double) => if (i == 0) W15_Mean else i }
-// val df34 = df33.withColumn("FinalW15PortionQid", lkpUDFW15(col("W15PortionQid")))
-//
-// //16. Double(Not Ratio):
-// val Mean_W16PortionLnags = Samples.agg(mean("W16PortionLnags")).head()
-// val W16_Mean = Mean_W16PortionLnags.getDouble(0)
-// val lkpUDFW16 = udf { (i: Double) => if (i == 0) W16_Mean else i }
-// val df35 = df34.withColumn("FinalW16PortionLnags", lkpUDFW16(col("W16PortionLnags")))
-//
-// //17.Double(Not ratio):
-// val Mean_W17PortionLinks = Samples.agg(mean("W17PortionLinks")).head()
-// val W17_Mean = Mean_W17PortionLinks.getDouble(0)
-// val lkpUDFW17 = udf { (i: Double) => if (i == 0) W17_Mean else i }
-// val df36 = df35.withColumn("FinalW17PortionLinks", lkpUDFW17(col("W17PortionLinks")))
-//
-// //************************************************End Word Features ****************************************************************************************
-//
-// //************************************************Start Sentences Features ****************************************************************************************
-// // 1. Integer(Double)
-// val Mean_S1CommentTailLength = Samples.agg(mean("S1CommentTailLength")).head()
-// val S1_Mean = RoundDouble(Mean_S1CommentTailLength.getDouble(0))
-// val lkpUDFS1 = udf { (i: Double) => if (i == 0) S1_Mean else i }
-// val df37 = df36.withColumn("FinalS1CommentTailLength", lkpUDFS1(col("S1CommentTailLength")))
-//
-// //2. Double but Not ratio values :
-// val Mean_S2SimikaritySitelinkandLabel = Samples.agg(mean("S2SimikaritySitelinkandLabel")).head()
-// val S2_Mean = RoundDouble(Mean_S2SimikaritySitelinkandLabel.getDouble(0))
-// val lkpUDFS2 = udf { (i: Double) => if (i == 0) S2_Mean else i }
-// val df39 = df37.withColumn("FinalS2SimikaritySitelinkandLabel", lkpUDFS2(col("S2SimikaritySitelinkandLabel")))
-//
-// //3. Double but Not ratio values :
-// val Mean_S3SimilarityLabelandSitelink = Samples.agg(mean("S3SimilarityLabelandSitelink")).head()
-// val S3_Mean = RoundDouble(Mean_S3SimilarityLabelandSitelink.getDouble(0))
-// val lkpUDFS3 = udf { (i: Double) => if (i == 0.0) S3_Mean else i }
-// val df40 = df39.withColumn("FinalS3SimilarityLabelandSitelink", lkpUDFS3(col("S3SimilarityLabelandSitelink")))
-//
-// //4. Double but Not ratio values :
-// val Mean_S4SimilarityCommentComment = Samples.agg(mean("S4SimilarityCommentComment")).head()
-// val S4_Mean = RoundDouble(Mean_S4SimilarityCommentComment.getDouble(0))
-// val lkpUDFS4 = udf { (i: Double) => if (i == 0.0) S4_Mean else i }
-// val df41 = df40.withColumn("FinalS4SimilarityCommentComment", lkpUDFS4(col("S4SimilarityCommentComment")))
-//
-// //df41.show()
-// //************************************************End Sentences Features ****************************************************************************************
-// //*********************************************** Start Statement Features ****************************************************************************************
-// //1. String
-// //2. String
-// //3. String
-// //************************************************End Statement Features ****************************************************************************************
-// //*********************************************** Start User Features ****************************************************************************************
-//
-// //1.Boolean(Double)
-// //2.Boolean(Double)
-// //3.Boolean(Double)
-// //4.Boolean(Double)
-// //5.Boolean(Double)
-// //6.Boolean(Double)
-// //7. (Double) IP No need to fill Missing Data
-// //8. (Double) ID No need to fill Missing Data
-// //9.Boolean(Double)
-// //10.Boolean(Double)
-//
-// //*********************************************** End User Features ****************************************************************************************
-// //*********************************************** Start Item Features ****************************************************************************************
-// //1. Integer (Double) No need to fill missing values
-// //2. Integer (Double) No need to fill missing values
-// //3. Integer (Double) No need to fill missing values
-// //4. Integer (Double) No need to fill missing values
-// //5. Integer (Double) No need to fill missing values
-// //6. Integer (Double) No need to fill missing values
-// //7. Integer (Double) No need to fill missing values
-// //8. Integer (Double) No need to fill missing values
-// //9. Integer (Double) No need to fill missing values
-// //10. Integer (Double) No need to fill missing values
-// //11. String
-// //*********************************************** End Item Features ****************************************************************************************
-// //*********************************************** Start Revision Features ****************************************************************************************
-// //1.String
-// //2.String
-// //3.Boolean (Double)
-// //4.Integer(Double)
-// //5.String
-// //6.String
-// //7. Boolean(Double)
-// //8. String
-// //9.String
-// //10. Integer (Double)
-// //11.String
-// //12. integer(Double)
-// //13. Long(Double)
-// //14. integer (Double)
-// //15.String
-// //16.String
-// //*********************************************** End Revision Features ****************************************************************************************
-// //*********************************************** Meta Data , Truth Data and Frequnces ****************************************************************************************
-// //Meta
-// // 1.Revision Session :Integer (Converted to Double)
-// //2. User Country Code
-// //3.User Continent Code
-// //4.User Time Size
-// //5.User Region Code
-// //6.User-city Name
-// //7.User Country Name
-// //8.RevisionTags
-//
-// // Truth:
-// //1.Undo
-//
-// // Freq :
-//
-// //1.5 features
-//
-// // Roll Boolean :Boolean (Double)
-// // Undo :Boolean (Double)
-//
-// //*********************************************** End Revision Features ****************************************************************************************
-//
-// //===========================================================================String Features====================================================================================
-//
-// val df42 = df41.withColumn(
-// //statement String features:
-// "StringFeatures", concat($"SS1Property", lit(";"), $"SS2DataValue", lit(";"), $"SS3ItemValue", lit(";"), $"I11ItemTitle",
-// //Revision String Features:
-// lit(";"), $"R1languageRevision",
-// lit(";"), $"R2RevisionLanguageLocal",
-// lit(";"), $"R5RevisionAction",
-// lit(";"), $"R6PrevReviAction",
-// lit(";"), $"R8ParRevision",
-// lit(";"), $"R9RevisionTime",
-// lit(";"), $"R11ContentType",
-// lit(";"), $"R15RevisionSubaction",
-// lit(";"), $"R16PrevReviSubaction",
-//
-// lit(";"), $"USER_COUNTRY_CODE",
-// lit(";"), $"USER_CONTINENT_CODE",
-// lit(";"), $"USER_TIME_ZONE",
-// lit(";"), $"USER_REGION_CODE",
-// lit(";"), $"USER_CITY_NAME",
-// lit(";"), $"USER_COUNTY_NAME",
-// lit(";"), $"REVISION_TAGS"))
-//
-// val toArray = udf((record: String) => record.split(";").map(_.toString()))
-// val test1 = df42.withColumn("StringFeatures", toArray(col("StringFeatures")))
-// // test1.show()
-// // test1.printSchema()
-//
-// val word2Vec = new Word2Vec().setInputCol("StringFeatures").setOutputCol("result").setVectorSize(20).setMinCount(0)
-// val model = word2Vec.fit(test1)
-// val result = model.transform(test1) //.rdd
-//
-// // result.show()
-//
-// val Todense = udf((b: Vector) => b.toDense)
-// val test_new2 = result.withColumn("result", Todense(col("result")))
-//
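Note: the string-feature path removed above concatenates the categorical columns with ";", splits them back into an array, and embeds that array with Word2Vec before densifying the vector for the assembler. A minimal sketch of the same step, assuming `tokenized` is a DataFrame whose "StringFeatures" column is already an array of strings:

  import org.apache.spark.ml.feature.Word2Vec
  import org.apache.spark.sql.DataFrame

  // Embed the tokenized string features; 20 dimensions and minCount 0 as in the removed code.
  def embedStrings(tokenized: DataFrame): DataFrame = {
    val w2v = new Word2Vec()
      .setInputCol("StringFeatures")
      .setOutputCol("result")
      .setVectorSize(20)
      .setMinCount(0)
    w2v.fit(tokenized).transform(tokenized)
  }
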
-// val assembler = new VectorAssembler().setInputCols(Array(
-// "result",
-//
-// // character
-// "FinalC1uppercaseratio", "FinalC2lowercaseratio", "FinalC3alphanumericratio", "FinalC4asciiratio", "FinalC5bracketratio", "FinalC6digitalratio",
-// "FinalC7latinratio", "FinalC8whitespaceratio", "FinalC9puncratio", "FinalC10longcharacterseq", "FinalC11arabicratio", "FinalC12bengaliratio",
-// "FinalC13brahmiratio", "FinalC14cyrilinratio", "FinalC15hanratio", "Finalc16malysiaratio", "FinalC17tamiratio", "FinalC18telugratio",
-// "FinalC19symbolratio", "FinalC20alpharatio", "FinalC21visibleratio", "FinalC22printableratio", "FinalC23blankratio", "FinalC24controlratio", "FinalC25hexaratio",
-//
-// // Words
-// "FinalW1languagewordratio", "W2Iscontainlanguageword", "FinalW3lowercaseratio", "FinalW4longestword", "W5IscontainURL", "FinalW6badwordratio",
-// "FinalW7uppercaseratio", "FinalW8banwordratio", "W9FemalFirstName", "W10MaleFirstName", "W11IscontainBadword", "W12IsContainBanword",
-// "FinalW13NumberSharewords", "FinalW14NumberSharewordswithoutStopwords", "FinalW15PortionQid", "FinalW16PortionLnags", "FinalW17PortionLinks",
-//
-// //Sentences :
-// "FinalS1CommentTailLength", "FinalS2SimikaritySitelinkandLabel", "FinalS3SimilarityLabelandSitelink", "FinalS4SimilarityCommentComment",
-//
-// // User :
-// "U1IsPrivileged", "U2IsBotUser", "U3IsBotuserWithFlaguser", "U4IsProperty", "U5IsTranslator", "U6IsRegister", "U7IPValue", "U8UserID",
-// "U9HasBirthDate", "U10HasDeathDate",
-//
-// //Item:
-//
-// "I1NumberLabels", "I2NumberDescription", "I3NumberAliases", "I4NumberClaims", "I5NumberSitelinks", "I6NumberStatement",
-// "I7NumberReferences", "I8NumberQualifier", "I9NumberQualifierOrder", "I10NumberBadges",
-//
-// //Revision:
-// "R3IslatainLanguage", "R4JsonLength", "R7RevisionAccountChange", "R10RevisionSize", "R12BytesIncrease",
-// "R13TimeSinceLastRevi", "R14CommentLength",
-//
-// // Meta , truth , Freq
-// // meta :
-// "FinalREVISION_SESSION_ID",
-// // Truth:
-// "FinalUNDO_RESTORE_REVERTED",
-//
-// //Freq:
-// "FinalNumberofRevisionsUserContributed",
-// "FinalNumberofUniqueItemsUseredit", "FinalNumberRevisionItemHas", "FinalNumberUniqUserEditItem", "FinalFreqItem")).setOutputCol("features")
-// val NewData = assembler.transform(test_new2)
-//
-// // Prepare the data for classification:
-// NewData.registerTempTable("DB")
-// val Data = sqlContext.sql("select Rid, features, FinalROLLBACK_REVERTED from DB")
-// // val Data = sqlContext.sql("select Rid, features, FinalROLLBACK_REVERTED as label from DB") // for logistic regrision
-//
-// //Data.show()
-//
-// val TestClassifiers = new Classifiers()
-//
-// // TestClassifiers.RandomForestClassifer(Data, sqlContext)
-// // TestClassifiers.DecisionTreeClassifier(Data, sqlContext)
-// // TestClassifiers.LogisticRegrision(Data, sqlContext)
-// // TestClassifiers.GradientBoostedTree(Data, sqlContext)
-// // TestClassifiers.MultilayerPerceptronClassifier(Data, sqlContext)
-//
-// }
+ }
+
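Note: the commented-out classifier calls above go through a project-specific Classifiers helper; expressed directly against spark.ml, one of them would look roughly like this. This assumes the label column has been renamed from FinalROLLBACK_REVERTED to "label" (as the commented-out select hints), and the tree count is an assumption, not from the source:

  import org.apache.spark.ml.classification.RandomForestClassifier
  import org.apache.spark.sql.DataFrame

  // Hypothetical stand-in for one TestClassifiers call; data holds "label" and "features".
  def trainRandomForest(data: DataFrame) = {
    val rf = new RandomForestClassifier()
      .setLabelCol("label")
      .setFeaturesCol("features")
      .setNumTrees(100)
    rf.fit(data)
  }
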
 //===========================================================================================================================================
 //=================================================Functions Part=============================================================================
@@ -2007,11 +1331,11 @@ class VandalismDetection extends Serializable {
 def All_Features(row: Row): String = {
 var temp = ""
- //all characters
+ // all characters
 val character_Str_String = Character_Features(row)
 temp = character_Str_String
- //all Words
+ // all Words
 val Words_Str_String = Words_Features(row)
 temp = temp + "," + Words_Str_String
@@ -2023,15 +1347,15 @@ class VandalismDetection extends Serializable {
 val Statement_Str_String = Statement_Features(row)
 temp = temp + "," + Statement_Str_String
- //User Features - there are 3 Joins in last stage when we have Data Frame
+ // User Features - there are 3 Joins in last stage when we have Data Frame
 val User_Str_String = User_Features_Normal(row)
 temp = temp + "," + User_Str_String
- //Item Features - there are 3 Joins in last stage when we have Data Frame
+ // Item Features - there are 3 Joins in last stage when we have Data Frame
 val Item_Str_String = Item_Features(row)
 temp = temp + "," + Item_Str_String
- //Revision Features
+ // Revision Features
 val Revision_Str_String = Revision_Features(row)
 temp = temp + "," + Revision_Str_String
@@ -2043,13 +1367,13 @@ class VandalismDetection extends Serializable {
 def Character_Features(row: Row): String = {
 var str_results = ""
- //1. Row from partitioned Pair RDD:
+ // 1. Row from partitioned Pair RDD:
 var new_Back_Row = Row()
- //2. Revision ID current operation:
+ // 2. Revision ID current operation:
 var RevisionID = row(0)
- //3. row(2) = represent the Comment:
+ // 3. row(2) = represent the Comment:
 var CommentRecord_AsString = row(2).toString()
- //4. extract comment tail from the Normal comment-Depending on the paperes, we apply character feature extraction on comment Tail
+ // 4. extract comment tail from the Normal comment; following the papers, we apply character feature extraction on the comment tail
 val CommentObj = new CommentProcessor()
 val Temp_commentTail = CommentObj.Extract_CommentTail(CommentRecord_AsString)
@@ -2060,8 +1384,8 @@ class VandalismDetection extends Serializable {
 val FacilityOBJ = new FacilitiesClass()
 var Str_vector_Values = FacilityOBJ.ArrayToString(vectorElements)
 str_results = Str_vector_Values
- //CharacterFeatures = Vector_AsArrayElements
- //new_Back_Row = Row(vectorElements)
+ // CharacterFeatures = Vector_AsArrayElements
+ // new_Back_Row = Row(vectorElements)
 } else {
@@ -2095,11 +1419,11 @@ class VandalismDetection extends Serializable {
 val FacilityOBJ = new FacilitiesClass()
 var Str_vector_Values = FacilityOBJ.ArrayToString(RatioValues)
 str_results = Str_vector_Values
- //new_Back_Row = Row(vector_Values)
+ // new_Back_Row = Row(vector_Values)
 }
 // CharacterFeatures
- //new_Back_Row
+ // new_Back_Row
 str_results.trim()
 }
@@ -2107,13 +1431,13 @@ class VandalismDetection extends Serializable {
 def Words_Features(row: Row): String = {
 var str_results = ""
- //Row from partitioned Pair RDD:
+ // Row from partitioned Pair RDD:
 var new_Back_Row = Row()
- //Revision ID current operation:
+ // Revision ID current operation:
 var RevisionID = row(0)
- //row(2) = represent the Comment:
+ // row(2) = represent the Comment:
 var CommentRecord_AsString = row(2).toString()
- //Extract comment tail from the Normal comment-Depending on the paperes, we apply character feature extraction on comment Tail
+ // Extract comment tail from the Normal comment; following the papers, we apply character feature extraction on the comment tail
 val CommentObj = new CommentProcessor()
 val Temp_commentTail = CommentObj.Extract_CommentTail(CommentRecord_AsString)
 var tempQids = 0.0
@@ -2146,9 +1470,9 @@ class VandalismDetection extends Serializable {
 temLinks = porportion_links
 } else {
- var porortion_Qids = tempQids //=0.0
- var porportion_Lang = temlangs //=0.0
- var porportion_links = temLinks //=0.0
+ var porortion_Qids = tempQids // =0.0
+ var porportion_Lang = temlangs // =0.0
+ var porportion_links = temLinks // =0.0
 }
@@ -2164,11 +1488,11 @@ class VandalismDetection extends Serializable {
 var Prev_commentTail = CommentObj.Extract_CommentTail(prevComment.toString())
 if (Prev_commentTail != "") {
- //11.Feature Current_Previous_CommentTial_NumberSharingWords:
+ // 11.Feature Current_Previous_CommentTial_NumberSharingWords:
 val NumberSharingWords = WordsOBJ.Current_Previous_CommentTial_NumberSharingWords(Temp_commentTail, Prev_commentTail)
 ArrayElements(12) = NumberSharingWords.toDouble
- //12.Feature Current_Previous_CommentTial_NumberSharingWords without Stopword:
+ // 12.Feature Current_Previous_CommentTial_NumberSharingWords without Stopword:
 val NumberSharingWordsWithoutStopwords = WordsOBJ.Current_Previous_CommentTial_NumberSharingWords_WithoutStopWords(Temp_commentTail, Prev_commentTail)
 ArrayElements(13) = NumberSharingWordsWithoutStopwords.toDouble
@@ -2218,8 +1542,8 @@ class VandalismDetection extends Serializable {
 str_results = Str_vector_Values
 }
- //new_Back_Row
- //Word_Features
+ // new_Back_Row
+ // Word_Features
 str_results
 }
@@ -2227,16 +1551,16 @@ class VandalismDetection extends Serializable {
 def Sentences_Features(row: Row): String = {
 var str_results = ""
- //This will be used to save values in vector
+ // This will be used to save values in vector
 var DoubleValues = new Array[Double](4)
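Note on the Extract_CommentTail calls in these functions: Wikidata edit summaries typically look like "/* action:params */ free text", and the features run on the free-text tail. The helper lives elsewhere in the package; a plausible, purely illustrative reduction of what such an extraction does:

  // Illustrative sketch only, not the actual CommentProcessor implementation:
  // strip the auto-generated "/* ... */" prefix and keep the free-text tail.
  def extractCommentTail(comment: String): String =
    comment.replaceFirst("""^/\*.*?\*/\s*""", "")
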
- //1. Row from partitioned Pair RDD:
+ // 1. Row from partitioned Pair RDD:
 var new_Back_Row = Row()
- //2. Revision ID current operation:
+ // 2. Revision ID current operation:
 var RevisionID = row(0)
- //3. row(2) = represent the Full Comment:
+ // 3. row(2) = represent the Full Comment:
 var CommentRecord_AsString = row(2).toString()
- //4. extract comment tail from the Normal comment-Depending on the paperes, we apply character feature extraction on comment Tail
+ // 4. extract comment tail from the Normal comment; following the papers, we apply character feature extraction on the comment tail
 val CommentObj = new CommentProcessor()
 val Temp_commentTail = CommentObj.Extract_CommentTail(CommentRecord_AsString)
@@ -2249,14 +1573,14 @@ class VandalismDetection extends Serializable {
 DoubleValues(0) = comment_Tail_Length
 // Feature 2 similarity between comment contain Sitelink and label :
- //Check the language in comment that contain sitelinkword: --------------------
+ // Check the language in comment that contain sitelinkword: --------------------
 val Sitelink_inCommentObj = new SentencesFeatures()
 if (CommentRecord_AsString.contains("sitelink")) { // start 1 loop
- //1. First step : get the language from comment
+ // 1. First step : get the language from comment
 val languagesitelink_from_Comment = Sitelink_inCommentObj.extract_CommentSiteLink_LanguageType(CommentRecord_AsString).trim()
- //2. second step: get the Label tage from json table :
+ // 2. second step: get the Label tag from json table :
 if (row(9).toString() != "[]") { // start 2 loop
 // if (row(8).toString() != "") {
 val jsonStr = "\"\"\"" + row(9).toString() + "\"\"\"" // row(9) is the label record
@@ -2271,7 +1595,7 @@ class VandalismDetection extends Serializable {
 DoubleValues(1) = 0.0
 }
- } // endd 2 loop
+ } // end 2 loop
 else {
 DoubleValues(1) = 0.0
@@ -2285,12 +1609,12 @@ class VandalismDetection extends Serializable {
 }
 // Feature 3 similarity between comment contain label word and sitelink
- //Check the language in comment that contain Label word:-----------------------
+ // Check the language in comment that contain Label word:-----------------------
 val Label_inCommentObj = new SentencesFeatures()
 if (CommentRecord_AsString.contains("label")) {
- //1. First step : get the language from comment
+ // 1. First step : get the language from comment
 val languageLabel_from_Comment = Label_inCommentObj.extract_CommentLabel_LanguageType(CommentRecord_AsString).trim()
- //2. second step: get the site link tage from json table :
+ // 2. second step: get the site link tag from json table :
 if (row(13).toString() != "[]") { // start 2 loop
 val jsonStr = "\"\"\"" + row(13).toString() + "\"\"\"" // row(13) is the sitelink record
 val jsonObj: JSONObject = new JSONObject(row(13).toString())
@@ -2351,7 +1675,7 @@ class VandalismDetection extends Serializable {
 }
- //new_Back_Row
+ // new_Back_Row
 str_results
 }
@@ -2359,7 +1683,7 @@ class VandalismDetection extends Serializable {
 // statement Features :
 def Statement_Features(row: Row): String = {
 var full_Str_Result = ""
- //1. row(2) = represent the Comment:
+ // 1. row(2) = represent the Comment:
 var fullcomment = row(2).toString()
 val StatementOBJ = new StatementFeatures()
@@ -2400,9 +1724,9 @@ class VandalismDetection extends Serializable {
 var str_results = ""
 var DoubleValues = new Array[Double](10) // you should change the index when add more element feature
- //Row from partitioned Pair RDD:
+ // Row from partitioned Pair RDD:
 var new_Back_Row = Row()
- //row(7) = represent the Contributor name:
+ // row(7) = represent the Contributor name:
 var full_comment = row(2).toString()
 var contributor_Name = row(7).toString()
 var contributor_ID = row(6).toString()
@@ -2411,7 +1735,7 @@ class VandalismDetection extends Serializable {
 val useFeatureOBJ = new UserFeatures()
- //1. Is privileged : There are 5 cases : if one of these cases is true that mean it is privileged else it is not privileged user
+ // 1. Is privileged : There are 5 cases : if one of these cases is true that mean it is privileged else it is not privileged user
 var flag_case1 = useFeatureOBJ.CheckName_isGlobalSysopUser(contributor_Name)
 var flag_case2 = useFeatureOBJ.CheckName_isGlobalRollBackerUser(contributor_Name)
 var flag_case3 = useFeatureOBJ.CheckName_isGlobalStewarUser(contributor_Name)
@@ -2427,7 +1751,7 @@ class VandalismDetection extends Serializable {
 DoubleValues(0) = 0.0
 }
- //2. is BotUser : There are 3 cases :
+ // 2. is BotUser : There are 3 cases :
 var flag_case1_1 = useFeatureOBJ.CheckName_isLocalBotUser(contributor_Name)
 var flag_case2_2 = useFeatureOBJ.CheckName_isGlobalbotUser(contributor_Name)
 var flag_case3_3 = useFeatureOBJ.CheckName_isExtensionBotUser(contributor_Name)
@@ -2441,7 +1765,7 @@ class VandalismDetection extends Serializable {
 DoubleValues(1) = 0.0
 }
- //3. is Bot User without BotflagUser : There is 1 case :
+ // 3. is Bot User without BotflagUser : There is 1 case :
 var flag_BUWBF = useFeatureOBJ.CheckName_isBotUserWithoutBotFlagUser(contributor_Name)
 if (flag_BUWBF == true) {
@@ -2452,7 +1776,7 @@ class VandalismDetection extends Serializable {
 }
- //4. is Property creator :
+ // 4. is Property creator :
 var flagCreator = useFeatureOBJ.CheckName_isPropertyCreator(contributor_Name)
 if (flagCreator == true) {
@@ -2463,7 +1787,7 @@ class VandalismDetection extends Serializable {
 }
- //5. is translator :
+ // 5. is translator :
 var flagTranslator = useFeatureOBJ.CheckName_isTranslator(contributor_Name)
 if (flagTranslator == true) {
 DoubleValues(4) = 1.0
@@ -2471,7 +1795,7 @@ class VandalismDetection extends Serializable {
 DoubleValues(4) = 0.0
 }
- //6. is register user:
+ // 6. is register user:
 var flagRegistered = useFeatureOBJ.IsRegisteroUser(contributor_Name)
 if (flagRegistered == true) {
 DoubleValues(5) = 1.0
@@ -2490,13 +1814,13 @@ class VandalismDetection extends Serializable {
 }
- //7. IP as a long value
+ // 7. IP as a long value
 if (contributor_IP != "0") {
 DoubleValues(6) = contributor_IP.toDouble
 } else {
 DoubleValues(6) = 0.0
 }
- //8. ID
+ // 8. ID
 if (contributor_ID != "0") {
 DoubleValues(7) = contributor_ID.toDouble
@@ -2504,7 +1828,7 @@ class VandalismDetection extends Serializable {
 DoubleValues(7) = 0.0
 }
- //9- 10 BitrthDate - DeatDate:
+ // 9-10 BirthDate - DeathDate:
 var DateObj = new UserFeatures()
 var BirthDate = DateObj.IsBirthDate(full_comment)
@@ -2540,11 +1864,11 @@ class VandalismDetection extends Serializable {
 var str_results = ""
 var DoubleValues = new Array[Double](11)
- //Row from partitioned Pair RDD:
+ // Row from partitioned Pair RDD:
 var new_Back_Row = Row()
 var ItemOBJ = new ItemFeatures()
- //1. Feature depending on Label:
+ // 1. Feature depending on Label:
 var NumberOfLabel = 0.0
 var Label_String = row(9).toString()
 if (Label_String != "[]") {
@@ -2554,7 +1878,7 @@ class VandalismDetection extends Serializable {
 NumberOfLabel = 0.0
 DoubleValues(0) = NumberOfLabel
 }
- //2. Feature depending on Description:
+ // 2. Feature depending on Description:
 var Description_String = row(10).toString()
 var NumberOfDescription = 0.0
 if (Description_String != "[]") {
@@ -2566,7 +1890,7 @@ class VandalismDetection extends Serializable {
 DoubleValues(1) = NumberOfDescription
 }
- //3. Feature depending on Aliases:
+ // 3. Feature depending on Aliases:
 var Aliases_String = row(11).toString()
 var NumberOfAliases = 0.0
 if (Aliases_String != "[]") {
@@ -2578,7 +1902,7 @@ class VandalismDetection extends Serializable {
 DoubleValues(2) = NumberOfAliases
 }
- //4. Feature depending on Claims :
+ // 4. Feature depending on Claims :
 var Claims_String = row(12).toString()
 var NumberOfClaims = 0.0
 if (Claims_String != "[]") {
@@ -2590,7 +1914,7 @@ class VandalismDetection extends Serializable {
 DoubleValues(3) = NumberOfClaims
 }
- //5. Feature depending on SiteLink
+ // 5. Feature depending on SiteLink
 var SiteLink_String = row(13).toString()
 var NumberOfSitelink = 0.0
 if (SiteLink_String != "[]") {
@@ -2603,7 +1927,7 @@ class VandalismDetection extends Serializable {
 }
- //6. Feature depending on Claims - statements :
+ // 6. Feature depending on Claims - statements :
 var statement_String = row(12).toString() // from claim
 var NumberOfstatement = 0.0
 if (statement_String != "[]") {
@@ -2616,7 +1940,7 @@ class VandalismDetection extends Serializable {
 }
- //7. Feature depending on Claims - References :
+ // 7. Feature depending on Claims - References :
 var References_String = row(12).toString() // from claim
 var NumberOfReferences = 0.0
 if (References_String != "[]") {
@@ -2628,7 +1952,7 @@ class VandalismDetection extends Serializable {
 DoubleValues(6) = NumberOfReferences
 }
- //8. Feature depending on claim
+ // 8. Feature depending on claim
 var Qualifier_String = row(12).toString() // from claim
 var NumberOfQualifier = 0.0
 if (Qualifier_String != "[]") {
@@ -2654,7 +1978,7 @@ class VandalismDetection extends Serializable {
 }
- //10. Feature depending on Site link
+ // 10. Feature depending on Site link
 var BadgesString = row(13).toString() // from claim
 var NumberOfBadges = 0.0
 if (BadgesString != "[]") {
@@ -2667,7 +1991,7 @@ class VandalismDetection extends Serializable {
 }
- //11. Item Title (instead of Item ID)
+ // 11. Item Title (instead of Item ID)
 var Item_Id_Title = row(1).toString().replace("Q", "")
 var Item = Item_Id_Title.trim().toDouble
 DoubleValues(10) = Item
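Note: every counter in Item_Features reduces to "how many entries does this JSON-valued field hold", with "[]" treated as empty. A compact form of that check (field name illustrative, using the same org.json dependency the class already imports):

  import org.json.JSONObject

  // Count entries in one JSON object field, treating "[]" and null as empty.
  def countEntries(jsonField: String): Double =
    if (jsonField == null || jsonField == "[]") 0.0
    else new JSONObject(jsonField).length().toDouble
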
@@ -2688,17 +2012,17 @@ class VandalismDetection extends Serializable {
 def Revision_Features(row: Row): String = {
- //var DoubleValues = new Array[Double](6)
+ // var DoubleValues = new Array[Double](6)
 var full_Str_Result = ""
- //1. Row from partitioned Pair RDD:
+ // 1. Row from partitioned Pair RDD:
 var new_Back_Row = Row()
- //2. Revision ID current operation:
+ // 2. Revision ID current operation:
 var RevisionID = row(0)
- //3. row(2) = represent the Comment:
+ // 3. row(2) = represent the Comment:
 var fullcomment = row(2).toString()
 // DoubleValues(0) = length
- //1. Revision Language :---------------------------------------------------------------------------------
+ // 1. Revision Language :---------------------------------------------------------------------------------
 var comment_for_Language = row(2).toString()
 val CommentLanguageOBJ = new RevisionFeatures()
@@ -2709,7 +2033,7 @@ class VandalismDetection extends Serializable {
 full_Str_Result = "NA".trim()
 }
- //2. Revision Language local:----------------------------------------------------------------------------
+ // 2. Revision Language local:----------------------------------------------------------------------------
 if (language != "NA") {
 if (language.contains("-")) { // E.g.Revision ID = 10850 sample1
 var LocalLangArray: Array[String] = language.split("-", 2)
@@ -2724,7 +2048,7 @@ class VandalismDetection extends Serializable {
 full_Str_Result = full_Str_Result + "," + "NA"
 }
- //3. Is it Latin Language or Not:-------------------------------------------------------------------------
+ // 3. Is it Latin Language or Not:-------------------------------------------------------------------------
 val revisionFeatureOBJ = new RevisionFeatures()
 val flagLatin = revisionFeatureOBJ.Check_ContainLanguageLatin_NonLatin(language)
@@ -2737,26 +2061,26 @@ class VandalismDetection extends Serializable {
 full_Str_Result = full_Str_Result + "," + "0.0"
 }
- //4. Json Length : be care full to RDD where the json before parsed--------------------------------------
+ // 4. Json Length : be careful, at this stage the json in the RDD is not parsed yet--------------------------------------
 // var Jason_Text = row(8).toString()
- //replacing_with_Quoto for cleaning the Json tag from extr tags such as ...
+ // replacing_with_Quoto for cleaning the Json tag from extra tags such as ...
 var Jason_Text = replacing_with_Quoto(row(0).toString(), row(8).toString())
 var Json_Length = Jason_Text.length()
 full_Str_Result = full_Str_Result + "," + Json_Length.toString()
- //5. Revision Action -:-----------------------------------------------------------------------
+ // 5. Revision Action -:-----------------------------------------------------------------------
 val CommentProcessOBJ1 = new CommentProcessor()
 val actions1 = CommentProcessOBJ1.Extract_Actions_FromComments(fullcomment)
 var ActionsArray1: Array[String] = actions1.split("_", 2)
 var action1 = ActionsArray1(0).toString()
- //var SubAction = ActionsArray(1)
+ // var SubAction = ActionsArray(1)
 full_Str_Result = full_Str_Result + "," + action1.trim()
- //full_Str_Result = full_Str_Result + "," + SubAction.trim()
+ // full_Str_Result = full_Str_Result + "," + SubAction.trim()
-
- //6. Revision Prev-Action :-------------------------------------------------------------------------------
+ // 6. Revision Prev-Action :-------------------------------------------------------------------------------
 if (row(19) != null) {
 var Prev_fullcomment1 = row(19).toString()
 val Prev_CommentProcessOBJ1 = new CommentProcessor()
@@ -2765,7 +2089,7 @@ class VandalismDetection extends Serializable {
 var Prev_action1 = ActionsArray1(0).trim()
 // var Prev_SubAction = ActionsArray(1).trim()
 full_Str_Result = full_Str_Result + "," + Prev_action1.trim()
- //full_Str_Result = full_Str_Result + "," + Prev_SubAction.trim()
+ // full_Str_Result = full_Str_Result + "," + Prev_SubAction.trim()
 // println(row(16).toString())
 } else {
@@ -2798,11 +2122,11 @@ class VandalismDetection extends Serializable {
 var RevisionParent = row(3).toString()
 full_Str_Result = full_Str_Result + "," + RevisionParent.toString().trim()
-    //9. Revision Time Stamp------------------------------------------------------------------------------------------------
+    // 9. Revision Time Stamp------------------------------------------------------------------------------------------------
     var RevisionTimeZone = row(4).toString()
     full_Str_Result = full_Str_Result + "," + RevisionTimeZone
-    //10. Revision Size:------------------------------------------------------------------------------------------------
+    // 10. Revision Size:------------------------------------------------------------------------------------------------
     var RevisionBody = row(0).toString() + row(2).toString() + row(3).toString() + row(4).toString() + row(8).toString() + row(14).toString() + row(15).toString() + row(16).toString()
     if (row(5).toString() != "0") {
@@ -2816,7 +2140,7 @@ class VandalismDetection extends Serializable {
     }
-    //11. ContentType: take Action1 as input : --------------------------------------------------------------
+    // 11. ContentType: take Action1 as input : --------------------------------------------------------------
     val CommentProcessOBJ_New = new CommentProcessor()
     val actions_New = CommentProcessOBJ_New.Extract_Actions_FromComments(fullcomment)
@@ -2868,7 +2192,7 @@ class VandalismDetection extends Serializable {
     }
-    //13. Time since last Revision: ----------------------------------------------------------------------
+    // 13. Time since last Revision: ----------------------------------------------------------------------
     if (row(21) != null) {
@@ -2886,11 +2210,11 @@ class VandalismDetection extends Serializable {
     }
-    //14. Comment Length:---------------------------------------
+    // 14. Comment Length:---------------------------------------
     var lengthcomment = fullcomment.length().toString()
     full_Str_Result = full_Str_Result + "," + lengthcomment
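Every step in this function appends to full_Str_Result with a leading comma, so the method ultimately emits one CSV-style record per revision. The same record can be built by collecting fields and joining once, which avoids the repeated string copies; a small sketch with illustrative values:

  // Sketch: join feature fields once instead of repeated "," concatenation.
  def joinFeatures(fields: Seq[String]): String = fields.mkString(",")
  // joinFeatures(Seq("en", "1024", "wbsetlabel")) == "en,1024,wbsetlabel"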
-    //15. Revision SubAction:
+    // 15. Revision SubAction:
     val CommentProcessOBJ2 = new CommentProcessor()
     val actions2 = CommentProcessOBJ2.Extract_Actions_FromComments(fullcomment)
@@ -2898,7 +2222,7 @@ class VandalismDetection extends Serializable {
     var SubAction2 = ActionsArray2(1)
     full_Str_Result = full_Str_Result + "," + SubAction2.trim()
-    //16.Prev_revision SubAction:
+    // 16.Prev_revision SubAction:
     if (row(19) != null) {
       var Prev_fullcomment2 = row(19).toString()
       val Prev_CommentProcessOBJ2 = new CommentProcessor()
@@ -2921,7 +2245,7 @@ class VandalismDetection extends Serializable {
     }
-    //========================
+    // ========================
   def RoundDouble(va: Double): Double = {
@@ -2984,4 +2308,4 @@ class VandalismDetection extends Serializable {
   }
-}// endl class -------
+}
diff --git a/sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/outliers/vandalismdetection/WordsFeatures.scala b/sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/outliers/vandalismdetection/WordsFeatures.scala
index 1cf0ee1..9462927 100644
--- a/sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/outliers/vandalismdetection/WordsFeatures.scala
+++ b/sansa-ml-spark/src/main/scala/net/sansa_stack/ml/spark/outliers/vandalismdetection/WordsFeatures.scala
@@ -1,7 +1,8 @@
 package net.sansa_stack.ml.spark.outliers.vandalismdetection
 
-import java.util.regex.{ Pattern, Matcher }
 import java.util.{ List, Arrays, ArrayList }
+import java.util.regex.{ Pattern, Matcher }
+
 import org.apache.commons.lang3.StringUtils
 
 class WordsFeatures extends Serializable {
@@ -15,53 +16,53 @@ class WordsFeatures extends Serializable {
   def Vector_Words_Feature(StrValue: String): Array[Double] = {
     var RatioValues = new Array[Double](17)
     val WordsFeature_OBJ = new WordsFeatures()
-    //1. Double for LanguageWord Ratio - ok
+    // 1. Double for LanguageWord Ratio - ok
     val LanguageWord = LanguageWordRatio_Character(StrValue)
     if (!LanguageWord.isNaN()) {
       RatioValues(0) = RoundDouble(LanguageWord)
     }
-    //2. Boolean --> Double for Contain language word - ok (1 Boolean)
+    // 2. Boolean --> Double for Contain language word - ok (1 Boolean)
     val IsContainLanguageWord = ContainLanguageWord(StrValue)
     if (IsContainLanguageWord == true) {
       RatioValues(1) = 1.0
     } else if (IsContainLanguageWord == false) {
       RatioValues(1) = 0.0
     }
-    //3.Double for LowerCaseWord Ratio - ok
+    // 3.Double for LowerCaseWord Ratio - ok
     val LowerCaseWord = LowercaseWordRation(StrValue)
     if (!LowerCaseWord.isNaN()) {
       RatioValues(2) = RoundDouble(LowerCaseWord)
     }
-    //4.Integer --> to Double for LongestWord - ok (1 Integer)
+    // 4.Integer --> to Double for LongestWord - ok (1 Integer)
     val LongWord = LongestWord(StrValue)
     if (LongWord != null) {
       val castedValue = LongWord.toDouble
       RatioValues(3) = castedValue
     }
-    //5.Boolean --> Double for word Contain URL -ok(2 boolean)
+    // 5.Boolean --> Double for word Contain URL -ok(2 boolean)
     val IsWordContainURL = ContainURLWord(StrValue)
     if (IsWordContainURL == true) {
       RatioValues(4) = 1.0
     } else if (IsWordContainURL == false) {
       RatioValues(4) = 0.0
     }
-    //6.Double for Bad Word Ratio - ok
+    // 6.Double for Bad Word Ratio - ok
     val BadWord = BadWordRation(StrValue)
    if (!BadWord.isNaN()) {
       RatioValues(5) = RoundDouble(BadWord)
     }
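The six ratio features above (and most of those that follow) funnel through a shared WordRatio(str, pattern) helper. A minimal sketch of such a ratio, assuming whitespace tokenization and NaN for empty input, which matches the isNaN guards around every caller (the class's actual helper may differ in detail):

  import java.util.regex.Pattern

  // Sketch: fraction of whitespace-separated tokens matching the pattern.
  def wordRatio(str: String, pattern: Pattern): Double = {
    val words = str.split("\\s+").filter(_.nonEmpty)
    if (words.isEmpty) Double.NaN
    else words.count(w => pattern.matcher(w).matches()).toDouble / words.length
  }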
-    //7. Double for UppercaseWord Ratio -ok
+    // 7. Double for UppercaseWord Ratio -ok
     val UpperCaseWord = UppercaseWordRation(StrValue)
     if (!UpperCaseWord.isNaN()) {
       RatioValues(6) = RoundDouble(UpperCaseWord)
     }
-    //8.Double for Ban Word Ratio - ok
+    // 8.Double for Ban Word Ratio - ok
     val BanWord = BanWordRation(StrValue)
     if (!BanWord.isNaN()) {
       RatioValues(7) = RoundDouble(BanWord)
     }
-    //9.Boolean Femal FirstName (3 Boolean )
+    // 9.Boolean Femal FirstName (3 Boolean )
     val IsFemalFirstName = FemaleName_word(StrValue)
     if (IsFemalFirstName == true) {
@@ -70,7 +71,7 @@ class WordsFeatures extends Serializable {
       RatioValues(8) = 0.0
     }
-    //10. Boolean Male FirstName (4 Boolean)
+    // 10. Boolean Male FirstName (4 Boolean)
     val IsMaleFirstName = MaleName_word(StrValue)
     if (IsMaleFirstName == true) {
       RatioValues(9) = 1.0
@@ -78,7 +79,7 @@ class WordsFeatures extends Serializable {
       RatioValues(9) = 0.0
     }
-    //11. Boolean containBadWord_word (5 Boolean )
+    // 11. Boolean containBadWord_word (5 Boolean )
     val IsContainBad_Word = containBadWord_word(StrValue)
     if (IsContainBad_Word == true) {
@@ -87,7 +88,7 @@ class WordsFeatures extends Serializable {
       RatioValues(10) = 0.0
     }
-    //12. Boolean containBanWord_word (6 Boolean)
+    // 12. Boolean containBanWord_word (6 Boolean)
     val IsContainBan_Word = BanBuilderWordlist_word(StrValue)
     if (IsContainBan_Word == true) {
@@ -125,15 +126,66 @@ class WordsFeatures extends Serializable {
   }
   //1.Language Words Ratio :
-  val regex_LanguageWordRatio: String = "(a(frikaa?ns|lbanian?|lemanha|ng(lais|ol)|ra?b(e?|[ei]c|ian?|isc?h)|rmenian?|ssamese|azeri|z[e\\u0259]rba(ijani?|ycan(ca)?|yjan)|\\u043d\\u0433\\u043b\\u0438\\u0439\\u0441\\u043a\\u0438\\u0439)|b(ahasa( (indonesia|jawa|malaysia|melayu))?|angla|as(k|qu)e|[aeo]ng[ao]?li|elarusian?|okm\\u00e5l|osanski|ra[sz]il(ian?)?|ritish( kannada)?|ulgarian?)|c(ebuano|hina|hinese( simplified)?|zech|roat([eo]|ian?)|atal[a\\u00e0]n?|\\u0440\\u043f\\u0441\\u043a\\u0438|antonese)|[c\\u010d](esky|e[s\\u0161]tina)\r\n|d(an(isc?h|sk)|e?uts?ch)|e(esti|ll[hi]nika|ng(els|le(ski|za)|lisc?h)|spa(g?[n\\u00f1]h?i?ol|nisc?h)|speranto|stonian|usk[ae]ra)|f(ilipino|innish|ran[c\\u00e7](ais|e|ez[ao])|ren[cs]h|arsi|rancese)|g(al(ego|ician)|uja?rati|ree(ce|k)|eorgian|erman[ay]?|ilaki)|h(ayeren|ebrew|indi|rvatski|ungar(y|ian))|i(celandic|ndian?|ndonesian?|ngl[e\\u00ea]se?|ngilizce|tali(ano?|en(isch)?))|ja(pan(ese)?|vanese)|k(a(nn?ada|zakh)|hmer|o(rean?|sova)|urd[i\\u00ee])|l(at(in[ao]?|vi(an?|e[s\\u0161]u))|ietuvi[u\\u0173]|ithuanian?)|m(a[ck]edon(ian?|ski)|agyar|alay(alam?|sian?)?|altese|andarin|arathi|elayu|ontenegro|ongol(ian?)|yanmar)|n(e(d|th)erlands?|epali|orw(ay|egian)|orsk( bokm[a\\u00e5]l)?|ynorsk)|o(landese|dia)|p(ashto|ersi?an?|ol(n?isc?h|ski)|or?tugu?[e\\u00ea]se?(( d[eo])? brasil(eiro)?| ?\\(brasil\\))?|unjabi)|r(om[a\\u00e2i]ni?[a\\u0103]n?|um(ano|\\u00e4nisch)|ussi([ao]n?|sch))|s(anskrit|erbian|imple english|inha?la|lov(ak(ian?)?|en\\u0161?[c\\u010d]ina|en(e|ij?an?)|uomi)|erbisch|pagnolo?|panisc?h|rbeska|rpski|venska|c?wedisc?h|hqip)|t(a(galog|mil)|elugu|hai(land)?|i[e\\u1ebf]ng vi[e\\u1ec7]t|[u\\u00fc]rk([c\\u00e7]e|isc?h|i\\u015f|ey))|u(rdu|zbek)|v(alencia(no?)?|ietnamese)|welsh|(\\u0430\\u043d\\u0433\\u043b\\u0438\\u0438\\u0441|[k\\u043a]\\u0430\\u043b\\u043c\\u044b\\u043a\\u0441|[k\\u043a]\\u0430\\u0437\\u0430\\u0445\\u0441|\\u043d\\u0435\\u043c\\u0435\\u0446|[p\\u0440]\\u0443\\u0441\\u0441|[y\\u0443]\\u0437\\u0431\\u0435\\u043a\\u0441)\\u043a\\u0438\\u0439( \\u044f\\u0437\\u044b\\u043a)??|\\u05e2\\u05d1\\u05e8\\u05d9\\u05ea|[k\\u043a\\u049b](\\u0430\\u0437\\u0430[\\u043a\\u049b]\\u0448\\u0430|\\u044b\\u0440\\u0433\\u044b\\u0437\\u0447\\u0430|\\u0438\\u0440\\u0438\\u043b\\u043b)|\\u0443\\u043a\\u0440\\u0430\\u0457\\u043d\\u0441\\u044c\\u043a(\\u0430|\\u043e\\u044e)|\\u0431(\\u0435\\u043b\\u0430\\u0440\\u0443\\u0441\\u043a\\u0430\\u044f|\\u044a\\u043b\\u0433\\u0430\\u0440\\u0441\\u043a\\u0438( \\u0435\\u0437\\u0438\\u043a)?)|\\u03b5\\u03bb\\u03bb[\\u03b7\\u03b9]\\u03bd\\u03b9\\u03ba(\\u03ac|\\u03b1)|\\u10e5\\u10d0\\u10e0\\u10d7\\u10e3\\u10da\\u10d8|\\u0939\\u093f\\u0928\\u094d\\u0926\\u0940|\\u0e44\\u0e17\\u0e22|[m\\u043c]\\u043e\\u043d\\u0433\\u043e\\u043b(\\u0438\\u0430)?|([c\\u0441]\\u0440\\u043f|[m\\u043c]\\u0430\\u043a\\u0435\\u0434\\u043e\\u043d)\\u0441\\u043a\\u0438|\\u0627\\u0644\\u0639\\u0631\\u0628\\u064a\\u0629|\\u65e5\\u672c\\u8a9e|\\ud55c\\uad6d(\\ub9d0|\\uc5b4)|\\u200c\\u0939\\u093f\\u0928\\u0926\\u093c\\u093f|\\u09ac\\u09be\\u0982\\u09b2\\u09be|\\u0a2a\\u0a70\\u0a1c\\u0a3e\\u0a2c\\u0a40|\\u092e\\u0930\\u093e\\u0920\\u0940|\\u0c95\\u0ca8\\u0ccd\\u0ca8\\u0ca1|\\u0627\\u064f\\u0631\\u062f\\u064f\\u0648|\\u0ba4\\u0bae\\u0bbf\\u0bb4\\u0bcd|\\u0c24\\u0c46\\u0c32\\u0c41\\u0c17\\u0c41|\\u0a97\\u0ac1\\u0a9c\\u0ab0\\u0abe\\u0aa4\\u0ac0|\\u0641\\u0627\\u0631\\u0633\\u06cc|\\u067e\\u0627\\u0631\\u0633\\u06cc|\\u0d2e\\u0d32\\u0d2f\\u0d3e\\u0d33\\u0d02|\\u067e\\u069a\\u062a\\u0648|\\u1019\\u103c\\u1014\\u103a\\u1019\\u102c\\u1018\\u102c\\u101e\\u102c|\\u4e2d\\u6587(\\u7b80\\u4f53|\\u7e41\\u9ad4)?|\\u4e2d\\u6587\\uff08(\\u7b80\\u4f53?|\\u7e41\\u9ad4)\\uff09|\\u7b80\\u4f53|\\u7e41\\u9ad4)";
+  val regex_LanguageWordRatio: String = """(a(frikaa?ns|lbanian?|lemanha|ng(lais|ol)|ra?b(e?|[ei]c|ian?|isc?h)
+    |rmenian?|ssamese|azeri|z[e\\u0259]rba(ijani?|ycan(ca)?|yjan)|\\u043d\\u0433\\u043b\\u0438\\u0439\\u0441\\u043a\\u0438\\u0439)
+    |b(ahasa( (indonesia|jawa|malaysia|melayu))?|angla|as(k|qu)e|[aeo]ng[ao]?li|elarusian?|okm\\u00e5l|osanski
+    |ra[sz]il(ian?)?|ritish( kannada)?|ulgarian?)|c(ebuano|hina|hinese( simplified)?|zech|roat([eo]|ian?)
+    |atal[a\\u00e0]n?|\\u0440\\u043f\\u0441\\u043a\\u0438|antonese)|[c\\u010d](esky|e[s\\u0161]tina)\r\n
+    |d(an(isc?h|sk)|e?uts?ch)|e(esti|ll[hi]nika|ng(els|le(ski|za)|lisc?h)|spa(g?[n\\u00f1]h?i?ol|nisc?h)
+    |speranto|stonian|usk[ae]ra)|f(ilipino|innish|ran[c\\u00e7](ais|e|ez[ao])|ren[cs]h|arsi|rancese)
+    |g(al(ego|ician)|uja?rati|ree(ce|k)|eorgian|erman[ay]?|ilaki)|h(ayeren|ebrew|indi|rvatski|ungar(y|ian))
+    |i(celandic|ndian?|ndonesian?|ngl[e\\u00ea]se?|ngilizce|tali(ano?|en(isch)?))|ja(pan(ese)?|vanese)
+    |k(a(nn?ada|zakh)|hmer|o(rean?|sova)|urd[i\\u00ee])|l(at(in[ao]?|vi(an?|e[s\\u0161]u))|ietuvi[u\\u0173]
+    |ithuanian?)|m(a[ck]edon(ian?|ski)|agyar|alay(alam?|sian?)?|altese|andarin|arathi|elayu|ontenegro|ongol(ian?)
+    |yanmar)|n(e(d|th)erlands?|epali|orw(ay|egian)|orsk( bokm[a\\u00e5]l)?|ynorsk)|o(landese|dia)|p(ashto
+    |ersi?an?|ol(n?isc?h|ski)|or?tugu?[e\\u00ea]se?(( d[eo])? brasil(eiro)?| ?\\(brasil\\))?|unjabi)|r(om[a\\u00e2i]ni?[a\\u0103]n?|um(ano
+    |\\u00e4nisch)|ussi([ao]n?|sch))|s(anskrit|erbian|imple english|inha?la|lov(ak(ian?)?|en\\u0161?[c\\u010d]ina|en(e|ij?an?)|uomi)
+    |erbisch|pagnolo?|panisc?h|rbeska|rpski|venska|c?wedisc?h|hqip)|t(a(galog|mil)|elugu|hai(land)?|i[e\\u1ebf]ng vi[e\\u1ec7]t
+    |[u\\u00fc]rk([c\\u00e7]e|isc?h|i\\u015f|ey))|u(rdu|zbek)|v(alencia(no?)?|ietnamese)|welsh
+    |(\\u0430\\u043d\\u0433\\u043b\\u0438\\u0438\\u0441|[k\\u043a]\\u0430\\u043b\\u043c\\u044b\\u043a\\u0441
+    |[k\\u043a]\\u0430\\u0437\\u0430\\u0445\\u0441|\\u043d\\u0435\\u043c\\u0435\\u0446|[p\\u0440]\\u0443\\u0441\\u0441
+    |[y\\u0443]\\u0437\\u0431\\u0435\\u043a\\u0441)\\u043a\\u0438\\u0439( \\u044f\\u0437\\u044b\\u043a)??|\\u05e2\\u05d1\\u05e8\\u05d9\\u05ea
+    |[k\\u043a\\u049b](\\u0430\\u0437\\u0430[\\u043a\\u049b]\\u0448\\u0430|\\u044b\\u0440\\u0433\\u044b\\u0437\\u0447\\u0430|\\u0438\\u0440\\u0438\\u043b\\u043b)
+    |\\u0443\\u043a\\u0440\\u0430\\u0457\\u043d\\u0441\\u044c\\u043a(\\u0430|\\u043e\\u044e)|\\u0431(\\u0435\\u043b\\u0430\\u0440\\u0443\\u0441\\u043a\\u0430\\u044f
+    |\\u044a\\u043b\\u0433\\u0430\\u0440\\u0441\\u043a\\u0438( \\u0435\\u0437\\u0438\\u043a)?)|\\u03b5\\u03bb\\u03bb[\\u03b7\\u03b9]\\u03bd\\u03b9\\u03ba(\\u03ac|\\u03b1)
+    |\\u10e5\\u10d0\\u10e0\\u10d7\\u10e3\\u10da\\u10d8|\\u0939\\u093f\\u0928\\u094d\\u0926\\u0940|\\u0e44\\u0e17\\u0e22|[m\\u043c]\\u043e\\u043d\\u0433\\u043e\\u043b(\\u0438\\u0430)?
+    |([c\\u0441]\\u0440\\u043f|[m\\u043c]\\u0430\\u043a\\u0435\\u0434\\u043e\\u043d)\\u0441\\u043a\\u0438|\\u0627\\u0644\\u0639\\u0631\\u0628\\u064a\\u0629|\\u65e5\\u672c\\u8a9e
+    |\\ud55c\\uad6d(\\ub9d0|\\uc5b4)|\\u200c\\u0939\\u093f\\u0928\\u0926\\u093c\\u093f|\\u09ac\\u09be\\u0982\\u09b2\\u09be|\\u0a2a\\u0a70\\u0a1c\\u0a3e\\u0a2c\\u0a40
+    |\\u092e\\u0930\\u093e\\u0920\\u0940|\\u0c95\\u0ca8\\u0ccd\\u0ca8\\u0ca1|\\u0627\\u064f\\u0631\\u062f\\u064f\\u0648|\\u0ba4\\u0bae\\u0bbf\\u0bb4\\u0bcd
+    |\\u0c24\\u0c46\\u0c32\\u0c41\\u0c17\\u0c41|\\u0a97\\u0ac1\\u0a9c\\u0ab0\\u0abe\\u0aa4\\u0ac0|\\u0641\\u0627\\u0631\\u0633\\u06cc|\\u067e\\u0627\\u0631\\u0633\\u06cc
+    |\\u0d2e\\u0d32\\u0d2f\\u0d3e\\u0d33\\u0d02|\\u067e\\u069a\\u062a\\u0648|\\u1019\\u103c\\u1014\\u103a\\u1019\\u102c\\u1018\\u102c\\u101e\\u102c|\\u4e2d\\u6587(\\u7b80\\u4f53
+    |\\u7e41\\u9ad4)?|\\u4e2d\\u6587\\uff08(\\u7b80\\u4f53?|\\u7e41\\u9ad4)\\uff09|\\u7b80\\u4f53|\\u7e41\\u9ad4)"""
   val pattern_LanguageWordRatio: Pattern = Pattern.compile(regex_LanguageWordRatio);
   def LanguageWordRatio_Character(str: String): Double = {
     val result: Double = WordRatio(str, pattern_LanguageWordRatio)
     result
   }
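One caveat on the triple-quoted rewrites in this hunk: a Scala raw string keeps its newlines and leading spaces, so the line breaks introduced above become literal characters inside the compiled pattern; and .stripMargin (used for regex_containLanguageWord further down) strips up to and including '|', which here is the regex alternation bar itself. If the goal is a readable multi-line source with a byte-identical pattern, joining fragments explicitly avoids both pitfalls. A minimal sketch, with shortened, illustrative fragments:

  // Sketch: build one long pattern from fragments; nothing but the fragment
  // text ends up in the compiled regex (fragments abbreviated here).
  val languageFragments: Seq[String] = Seq(
    "(a(frikaa?ns|lbanian?|lemanha)",
    "|b(angla|as(k|qu)e)",
    "|welsh)")
  val languagePattern: java.util.regex.Pattern =
    java.util.regex.Pattern.compile(languageFragments.mkString)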
-  //2. Contain language word :
-  val regex_ContainLanguageWord: String = "(^|\\n)([ei]n )??(a(frikaa?ns|lbanian?|lemanha|ng(lais|ol)|ra?b(e?|[ei]c|ian?|isc?h)|rmenian?|ssamese|azeri|z[e\\u0259]rba(ijani?|ycan(ca)?|yjan)|\\u043d\\u0433\\u043b\\u0438\\u0439\\u0441\\u043a\\u0438\\u0439)|b(ahasa( (indonesia|jawa|malaysia|melayu))?|angla|as(k|qu)e|[aeo]ng[ao]?li|elarusian?|okm\\u00e5l|osanski|ra[sz]il(ian?)?|ritish( kannada)?|ulgarian?)|c(ebuano|hina|hinese( simplified)?|zech|roat([eo]|ian?)|atal[a\\u00e0]n?|\\u0440\\u043f\\u0441\\u043a\\u0438|antonese)|[c\\u010d](esky|e[s\\u0161]tina)\r\n|d(an(isc?h|sk)|e?uts?ch)|e(esti|ll[hi]nika|ng(els|le(ski|za)|lisc?h)|spa(g?[n\\u00f1]h?i?ol|nisc?h)|speranto|stonian|usk[ae]ra)|f(ilipino|innish|ran[c\\u00e7](ais|e|ez[ao])|ren[cs]h|arsi|rancese)|g(al(ego|ician)|uja?rati|ree(ce|k)|eorgian|erman[ay]?|ilaki)|h(ayeren|ebrew|indi|rvatski|ungar(y|ian))|i(celandic|ndian?|ndonesian?|ngl[e\\u00ea]se?|ngilizce|tali(ano?|en(isch)?))|ja(pan(ese)?|vanese)|k(a(nn?ada|zakh)|hmer|o(rean?|sova)|urd[i\\u00ee])|l(at(in[ao]?|vi(an?|e[s\\u0161]u))|ietuvi[u\\u0173]|ithuanian?)|m(a[ck]edon(ian?|ski)|agyar|alay(alam?|sian?)?|altese|andarin|arathi|elayu|ontenegro|ongol(ian?)|yanmar)|n(e(d|th)erlands?|epali|orw(ay|egian)|orsk( bokm[a\\u00e5]l)?|ynorsk)|o(landese|dia)|p(ashto|ersi?an?|ol(n?isc?h|ski)|or?tugu?[e\\u00ea]se?(( d[eo])? brasil(eiro)?| ?\\(brasil\\))?|unjabi)|r(om[a\\u00e2i]ni?[a\\u0103]n?|um(ano|\\u00e4nisch)|ussi([ao]n?|sch))|s(anskrit|erbian|imple english|inha?la|lov(ak(ian?)?|en\\u0161?[c\\u010d]ina|en(e|ij?an?)|uomi)|erbisch|pagnolo?|panisc?h|rbeska|rpski|venska|c?wedisc?h|hqip)|t(a(galog|mil)|elugu|hai(land)?|i[e\\u1ebf]ng vi[e\\u1ec7]t|[u\\u00fc]rk([c\\u00e7]e|isc?h|i\\u015f|ey))|u(rdu|zbek)|v(alencia(no?)?|ietnamese)|welsh|(\\u0430\\u043d\\u0433\\u043b\\u0438\\u0438\\u0441|[k\\u043a]\\u0430\\u043b\\u043c\\u044b\\u043a\\u0441|[k\\u043a]\\u0430\\u0437\\u0430\\u0445\\u0441|\\u043d\\u0435\\u043c\\u0435\\u0446|[p\\u0440]\\u0443\\u0441\\u0441|[y\\u0443]\\u0437\\u0431\\u0435\\u043a\\u0441)\\u043a\\u0438\\u0439( \\u044f\\u0437\\u044b\\u043a)??|\\u05e2\\u05d1\\u05e8\\u05d9\\u05ea|[k\\u043a\\u049b](\\u0430\\u0437\\u0430[\\u043a\\u049b]\\u0448\\u0430|\\u044b\\u0440\\u0433\\u044b\\u0437\\u0447\\u0430|\\u0438\\u0440\\u0438\\u043b\\u043b)|\\u0443\\u043a\\u0440\\u0430\\u0457\\u043d\\u0441\\u044c\\u043a(\\u0430|\\u043e\\u044e)|\\u0431(\\u0435\\u043b\\u0430\\u0440\\u0443\\u0441\\u043a\\u0430\\u044f|\\u044a\\u043b\\u0433\\u0430\\u0440\\u0441\\u043a\\u0438( \\u0435\\u0437\\u0438\\u043a)?)|\\u03b5\\u03bb\\u03bb[\\u03b7\\u03b9]\\u03bd\\u03b9\\u03ba(\\u03ac|\\u03b1)|\\u10e5\\u10d0\\u10e0\\u10d7\\u10e3\\u10da\\u10d8|\\u0939\\u093f\\u0928\\u094d\\u0926\\u0940|\\u0e44\\u0e17\\u0e22|[m\\u043c]\\u043e\\u043d\\u0433\\u043e\\u043b(\\u0438\\u0430)?|([c\\u0441]\\u0440\\u043f|[m\\u043c]\\u0430\\u043a\\u0435\\u0434\\u043e\\u043d)\\u0441\\u043a\\u0438|\\u0627\\u0644\\u0639\\u0631\\u0628\\u064a\\u0629|\\u65e5\\u672c\\u8a9e|\\ud55c\\uad6d(\\ub9d0|\\uc5b4)|\\u200c\\u0939\\u093f\\u0928\\u0926\\u093c\\u093f|\\u09ac\\u09be\\u0982\\u09b2\\u09be|\\u0a2a\\u0a70\\u0a1c\\u0a3e\\u0a2c\\u0a40|\\u092e\\u0930\\u093e\\u0920\\u0940|\\u0c95\\u0ca8\\u0ccd\\u0ca8\\u0ca1|\\u0627\\u064f\\u0631\\u062f\\u064f\\u0648|\\u0ba4\\u0bae\\u0bbf\\u0bb4\\u0bcd|\\u0c24\\u0c46\\u0c32\\u0c41\\u0c17\\u0c41|\\u0a97\\u0ac1\\u0a9c\\u0ab0\\u0abe\\u0aa4\\u0ac0|\\u0641\\u0627\\u0631\\u0633\\u06cc|\\u067e\\u0627\\u0631\\u0633\\u06cc|\\u0d2e\\u0d32\\u0d2f\\u0d3e\\u0d33\\u0d02|\\u067e\\u069a\\u062a\\u0648|\\u1019\\u103c\\u1014\\u103a\\u1019\\u102c\\u1018\\u102c\\u101e\\u102c|\\u4e2d\\u6587(\\u7b80\\u4f53|\\u7e41\\u9ad4)?|\\u4e2d\\u6587\\uff08(\\u7b80\\u4f53?|\\u7e41\\u9ad4)\\uff09|\\u7b80\\u4f53|\\u7e41\\u9ad4)( language)??($|\\n)";
+  // 2. Contain language word :
+  val regex_ContainLanguageWord: String = """(^|\\n)([ei]n )??(a(frikaa?ns|lbanian?|lemanha|ng(lais|ol)|ra?b(e?|[ei]c|ian?|isc?h)
+    |rmenian?|ssamese|azeri|z[e\\u0259]rba(ijani?|ycan(ca)?|yjan)|\\u043d\\u0433\\u043b\\u0438\\u0439\\u0441\\u043a\\u0438\\u0439)
+    |b(ahasa( (indonesia|jawa|malaysia|melayu))?|angla|as(k|qu)e|[aeo]ng[ao]?li|elarusian?|okm\\u00e5l|osanski|ra[sz]il(ian?)?
+    |ritish( kannada)?|ulgarian?)|c(ebuano|hina|hinese( simplified)?|zech|roat([eo]|ian?)|atal[a\\u00e0]n?|\\u0440\\u043f\\u0441\\u043a\\u0438|antonese)
+    |[c\\u010d](esky|e[s\\u0161]tina)\r\n|d(an(isc?h|sk)|e?uts?ch)|e(esti|ll[hi]nika|ng(els|le(ski|za)|lisc?h)|spa(g?[n\\u00f1]h?i?ol|nisc?h)
+    |speranto|stonian|usk[ae]ra)|f(ilipino|innish|ran[c\\u00e7](ais|e|ez[ao])|ren[cs]h|arsi|rancese)|g(al(ego|ician)|uja?rati|ree(ce|k)
+    |eorgian|erman[ay]?|ilaki)|h(ayeren|ebrew|indi|rvatski|ungar(y|ian))|i(celandic|ndian?|ndonesian?|ngl[e\\u00ea]se?|ngilizce|tali(ano?|en(isch)?))
+    |ja(pan(ese)?|vanese)|k(a(nn?ada|zakh)|hmer|o(rean?|sova)|urd[i\\u00ee])|l(at(in[ao]?|vi(an?|e[s\\u0161]u))|ietuvi[u\\u0173]|ithuanian?)|m(a[ck]edon(ian?
+    |ski)|agyar|alay(alam?|sian?)?|altese|andarin|arathi|elayu|ontenegro|ongol(ian?)|yanmar)|n(e(d|th)erlands?|epali|orw(ay|egian)|orsk( bokm[a\\u00e5]l)?|ynorsk)|o(landese|dia)
+    |p(ashto|ersi?an?|ol(n?isc?h|ski)|or?tugu?[e\\u00ea]se?(( d[eo])? brasil(eiro)?| ?\\(brasil\\))?|unjabi)|r(om[a\\u00e2i]ni?[a\\u0103]n?|um(ano|\\u00e4nisch)
+    |ussi([ao]n?|sch))|s(anskrit|erbian|imple english|inha?la|lov(ak(ian?)?|en\\u0161?[c\\u010d]ina|en(e|ij?an?)|uomi)|erbisch|pagnolo?|panisc?h|rbeska|rpski|venska|c?wedisc?h|hqip)
+    |t(a(galog|mil)|elugu|hai(land)?|i[e\\u1ebf]ng vi[e\\u1ec7]t|[u\\u00fc]rk([c\\u00e7]e|isc?h|i\\u015f|ey))|u(rdu|zbek)|v(alencia(no?)?|ietnamese)
+    |welsh|(\\u0430\\u043d\\u0433\\u043b\\u0438\\u0438\\u0441|[k\\u043a]\\u0430\\u043b\\u043c\\u044b\\u043a\\u0441|[k\\u043a]\\u0430\\u0437\\u0430\\u0445\\u0441|\\u043d\\u0435\\u043c\\u0435\\u0446
+    |[p\\u0440]\\u0443\\u0441\\u0441|[y\\u0443]\\u0437\\u0431\\u0435\\u043a\\u0441)\\u043a\\u0438\\u0439( \\u044f\\u0437\\u044b\\u043a)??|\\u05e2\\u05d1\\u05e8\\u05d9\\u05ea
+    |[k\\u043a\\u049b](\\u0430\\u0437\\u0430[\\u043a\\u049b]\\u0448\\u0430|\\u044b\\u0440\\u0433\\u044b\\u0437\\u0447\\u0430|\\u0438\\u0440\\u0438\\u043b\\u043b)
+    |\\u0443\\u043a\\u0440\\u0430\\u0457\\u043d\\u0441\\u044c\\u043a(\\u0430|\\u043e\\u044e)|\\u0431(\\u0435\\u043b\\u0430\\u0440\\u0443\\u0441\\u043a\\u0430\\u044f
+    |\\u044a\\u043b\\u0433\\u0430\\u0440\\u0441\\u043a\\u0438( \\u0435\\u0437\\u0438\\u043a)?)|\\u03b5\\u03bb\\u03bb[\\u03b7\\u03b9]\\u03bd\\u03b9\\u03ba(\\u03ac|\\u03b1)
+    |\\u10e5\\u10d0\\u10e0\\u10d7\\u10e3\\u10da\\u10d8|\\u0939\\u093f\\u0928\\u094d\\u0926\\u0940|\\u0e44\\u0e17\\u0e22|[m\\u043c]\\u043e\\u043d\\u0433\\u043e\\u043b(\\u0438\\u0430)?|([c\\u0441]\\u0440\\u043f
+    |[m\\u043c]\\u0430\\u043a\\u0435\\u0434\\u043e\\u043d)\\u0441\\u043a\\u0438|\\u0627\\u0644\\u0639\\u0631\\u0628\\u064a\\u0629|\\u65e5\\u672c\\u8a9e|\\ud55c\\uad6d(\\ub9d0
+    |\\uc5b4)|\\u200c\\u0939\\u093f\\u0928\\u0926\\u093c\\u093f|\\u09ac\\u09be\\u0982\\u09b2\\u09be|\\u0a2a\\u0a70\\u0a1c\\u0a3e\\u0a2c\\u0a40|\\u092e\\u0930\\u093e\\u0920\\u0940
+    |\\u0c95\\u0ca8\\u0ccd\\u0ca8\\u0ca1|\\u0627\\u064f\\u0631\\u062f\\u064f\\u0648|\\u0ba4\\u0bae\\u0bbf\\u0bb4\\u0bcd|\\u0c24\\u0c46\\u0c32\\u0c41\\u0c17\\u0c41
+    |\\u0a97\\u0ac1\\u0a9c\\u0ab0\\u0abe\\u0aa4\\u0ac0|\\u0641\\u0627\\u0631\\u0633\\u06cc|\\u067e\\u0627\\u0631\\u0633\\u06cc|\\u0d2e\\u0d32\\u0d2f\\u0d3e\\u0d33\\u0d02
+    |\\u067e\\u069a\\u062a\\u0648|\\u1019\\u103c\\u1014\\u103a\\u1019\\u102c\\u1018\\u102c\\u101e\\u102c|\\u4e2d\\u6587(\\u7b80\\u4f53|\\u7e41\\u9ad4)?|\\u4e2d\\u6587\\uff08(\\u7b80\\u4f53?
+    |\\u7e41\\u9ad4)\\uff09|\\u7b80\\u4f53|\\u7e41\\u9ad4)( language)??($|\\n)"""
   val pattern_ContainLanguageWord: Pattern = Pattern.compile(regex_ContainLanguageWord);
   val matcher_ContainLanguageWord: Matcher = pattern_ContainLanguageWord.matcher("");
   def ContainLanguageWord(str: String): Boolean = {
@@ -149,20 +201,20 @@ class WordsFeatures extends Serializable {
     result
   }
-  //3. Upper case word Ratio:
+  // 3. Upper case word Ratio:
   def UppercaseWordRation(str: String): Double = {
     val pattern: Pattern = Pattern.compile("\\p{Lu}.*")
     val result: Double = WordRatio(str, pattern)
     result
   }
-  //4. Lower case word Ratio:
+  // 4. Lower case word Ratio:
   def LowercaseWordRation(str: String): Double = {
     val pattern: Pattern = Pattern.compile("[\\p{L}&&[^\\p{Lu}]].*")
     val result: Double = WordRatio(str, pattern)
     result
  }
-  //5.word Contain URL :
+  // 5.word Contain URL :
   val pattern_WordContainURL: Pattern = Pattern.compile("\\b(https?:\\/\\/|www\\.)\\S{10}.*",
     Pattern.CASE_INSENSITIVE | Pattern.UNICODE_CASE | Pattern.DOTALL | Pattern.CANON_EQ)
   val matcher_WordContainURL: Matcher = pattern_WordContainURL.matcher("");
@@ -179,7 +231,7 @@ class WordsFeatures extends Serializable {
     result
   }
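The pre-built Matcher vals above (matcher_WordContainURL, and matcher_ContainLanguageWord earlier) suggest reuse via reset() rather than allocating a fresh Matcher per input. A sketch of that pattern, with an illustrative URL check:

  import java.util.regex.Pattern

  // Sketch: one compiled Pattern, one reusable Matcher, reset per input.
  val urlPattern = Pattern.compile("\\b(https?://|www\\.)\\S{10}.*", Pattern.CASE_INSENSITIVE)
  val urlMatcher = urlPattern.matcher("")
  def looksLikeUrl(s: String): Boolean = urlMatcher.reset(s).find()

One caveat worth keeping in mind: Matcher is not thread-safe, so sharing a single instance across Spark tasks or threads is unsafe; per-call or per-partition matchers are the safer default.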
-  //6. Longest Word
+  // 6. Longest Word
   val pattern_longestWord: Pattern = Pattern.compile("\\p{IsAlphabetic}+", Pattern.CASE_INSENSITIVE | Pattern.UNICODE_CASE | Pattern.DOTALL | Pattern.CANON_EQ);
   val matcher_longestWord: Matcher = pattern_WordContainURL.matcher("");
@@ -203,7 +255,7 @@ class WordsFeatures extends Serializable {
     max
   }
-  //7. Bad Word : It is Ok
+  // 7. Bad Word : It is Ok
   val luisVonAhnWordlist: Array[String] = Array("abbo", "abo", "abortion", "abuse",
     "addict", "addicts", "adult", "africa",
@@ -465,7 +517,7 @@ class WordsFeatures extends Serializable {
   }
-  //8. Contain Bad Word:It is ok
+  // 8. Contain Bad Word:It is ok
   val tokens_containbadword: List[String] = new ArrayList[String](Arrays.asList(luisVonAhnWordlist: _*))
   val patternString_containBadword: String = ".*\\b(" + StringUtils.join(tokens_containbadword, "|") + ")\\b.*"
   val pattern_containBadword: Pattern = Pattern.compile(patternString, Pattern.CASE_INSENSITIVE | Pattern.UNICODE_CASE | Pattern.DOTALL | Pattern.CANON_EQ)
@@ -481,7 +533,7 @@ class WordsFeatures extends Serializable {
     results
   }
-  //9.Ban Builder Word:It is OK
+  // 9.Ban Builder Word:It is OK
   val BanBuilderWordlist: Array[String] = Array("$#!+", "$1ut", "$h1t", "$hit", "$lut", "'ho",
     "'hobag", "a$$", "anal", "anus", "ass", "assmunch", "b1tch", "ballsack", "bastard", "beaner",
@@ -629,7 +681,7 @@ class WordsFeatures extends Serializable {
     results
   }
-  //10 Ban word Ratio:
+  // 10 Ban word Ratio:
   val tokens_ban: List[String] = new ArrayList[String](Arrays.asList(BanBuilderWordlist: _*))
   val patternString_ban: String = StringUtils.join(tokens_ban, "|")
   val pattern_banWord: Pattern = Pattern.compile(patternString_ban)
@@ -645,8 +697,33 @@ class WordsFeatures extends Serializable {
   }
-  //11.Contain language word:It is ok
-  val regex_containLanguageWord: String = ".*(a(frikaa?ns|lbanian?|lemanha|ng(lais|ol)|ra?b(e?|[ei]c|ian?|isc?h)|rmenian?|ssamese|azeri|z[e\\u0259]rba(ijani?|ycan(ca)?|yjan)|\\u043d\\u0433\\u043b\\u0438\\u0439\\u0441\\u043a\\u0438\\u0439)|b(ahasa( (indonesia|jawa|malaysia|melayu))?|angla|as(k|qu)e|[aeo]ng[ao]?li|elarusian?|okm\\u00e5l|osanski|ra[sz]il(ian?)?|ritish( kannada)?|ulgarian?)|c(ebuano|hina|hinese( simplified)?|zech|roat([eo]|ian?)|atal[a\\u00e0]n?|\\u0440\\u043f\\u0441\\u043a\\u0438|antonese)|[c\\u010d](esky|e[s\\u0161]tina)\r\n|d(an(isc?h|sk)|e?uts?ch)|e(esti|ll[hi]nika|ng(els|le(ski|za)|lisc?h)|spa(g?[n\\u00f1]h?i?ol|nisc?h)|speranto|stonian|usk[ae]ra)|f(ilipino|innish|ran[c\\u00e7](ais|e|ez[ao])|ren[cs]h|arsi|rancese)|g(al(ego|ician)|uja?rati|ree(ce|k)|eorgian|erman[ay]?|ilaki)|h(ayeren|ebrew|indi|rvatski|ungar(y|ian))|i(celandic|ndian?|ndonesian?|ngl[e\\u00ea]se?|ngilizce|tali(ano?|en(isch)?))|ja(pan(ese)?|vanese)|k(a(nn?ada|zakh)|hmer|o(rean?|sova)|urd[i\\u00ee])|l(at(in[ao]?|vi(an?|e[s\\u0161]u))|ietuvi[u\\u0173]|ithuanian?)|m(a[ck]edon(ian?|ski)|agyar|alay(alam?|sian?)?|altese|andarin|arathi|elayu|ontenegro|ongol(ian?)|yanmar)|n(e(d|th)erlands?|epali|orw(ay|egian)|orsk( bokm[a\\u00e5]l)?|ynorsk)|o(landese|dia)|p(ashto|ersi?an?|ol(n?isc?h|ski)|or?tugu?[e\\u00ea]se?(( d[eo])? brasil(eiro)?| ?\\(brasil\\))?|unjabi)|r(om[a\\u00e2i]ni?[a\\u0103]n?|um(ano|\\u00e4nisch)|ussi([ao]n?|sch))|s(anskrit|erbian|imple english|inha?la|lov(ak(ian?)?|en\\u0161?[c\\u010d]ina|en(e|ij?an?)|uomi)|erbisch|pagnolo?|panisc?h|rbeska|rpski|venska|c?wedisc?h|hqip)|t(a(galog|mil)|elugu|hai(land)?|i[e\\u1ebf]ng vi[e\\u1ec7]t|[u\\u00fc]rk([c\\u00e7]e|isc?h|i\\u015f|ey))|u(rdu|zbek)|v(alencia(no?)?|ietnamese)|welsh|(\\u0430\\u043d\\u0433\\u043b\\u0438\\u0438\\u0441|[k\\u043a]\\u0430\\u043b\\u043c\\u044b\\u043a\\u0441|[k\\u043a]\\u0430\\u0437\\u0430\\u0445\\u0441|\\u043d\\u0435\\u043c\\u0435\\u0446|[p\\u0440]\\u0443\\u0441\\u0441|[y\\u0443]\\u0437\\u0431\\u0435\\u043a\\u0441)\\u043a\\u0438\\u0439( \\u044f\\u0437\\u044b\\u043a)??|\\u05e2\\u05d1\\u05e8\\u05d9\\u05ea|[k\\u043a\\u049b](\\u0430\\u0437\\u0430[\\u043a\\u049b]\\u0448\\u0430|\\u044b\\u0440\\u0433\\u044b\\u0437\\u0447\\u0430|\\u0438\\u0440\\u0438\\u043b\\u043b)|\\u0443\\u043a\\u0440\\u0430\\u0457\\u043d\\u0441\\u044c\\u043a(\\u0430|\\u043e\\u044e)|\\u0431(\\u0435\\u043b\\u0430\\u0440\\u0443\\u0441\\u043a\\u0430\\u044f|\\u044a\\u043b\\u0433\\u0430\\u0440\\u0441\\u043a\\u0438( \\u0435\\u0437\\u0438\\u043a)?)|\\u03b5\\u03bb\\u03bb[\\u03b7\\u03b9]\\u03bd\\u03b9\\u03ba(\\u03ac|\\u03b1)|\\u10e5\\u10d0\\u10e0\\u10d7\\u10e3\\u10da\\u10d8|\\u0939\\u093f\\u0928\\u094d\\u0926\\u0940|\\u0e44\\u0e17\\u0e22|[m\\u043c]\\u043e\\u043d\\u0433\\u043e\\u043b(\\u0438\\u0430)?|([c\\u0441]\\u0440\\u043f|[m\\u043c]\\u0430\\u043a\\u0435\\u0434\\u043e\\u043d)\\u0441\\u043a\\u0438|\\u0627\\u0644\\u0639\\u0631\\u0628\\u064a\\u0629|\\u65e5\\u672c\\u8a9e|\\ud55c\\uad6d(\\ub9d0|\\uc5b4)|\\u200c\\u0939\\u093f\\u0928\\u0926\\u093c\\u093f|\\u09ac\\u09be\\u0982\\u09b2\\u09be|\\u0a2a\\u0a70\\u0a1c\\u0a3e\\u0a2c\\u0a40|\\u092e\\u0930\\u093e\\u0920\\u0940|\\u0c95\\u0ca8\\u0ccd\\u0ca8\\u0ca1|\\u0627\\u064f\\u0631\\u062f\\u064f\\u0648|\\u0ba4\\u0bae\\u0bbf\\u0bb4\\u0bcd|\\u0c24\\u0c46\\u0c32\\u0c41\\u0c17\\u0c41|\\u0a97\\u0ac1\\u0a9c\\u0ab0\\u0abe\\u0aa4\\u0ac0|\\u0641\\u0627\\u0631\\u0633\\u06cc|\\u067e\\u0627\\u0631\\u0633\\u06cc|\\u0d2e\\u0d32\\u0d2f\\u0d3e\\u0d33\\u0d02|\\u067e\\u069a\\u062a\\u0648|\\u1019\\u103c\\u1014\\u103a\\u1019\\u102c\\u1018\\u102c\\u101e\\u102c|\\u4e2d\\u6587(\\u7b80\\u4f53|\\u7e41\\u9ad4)?|\\u4e2d\\u6587\\uff08(\\u7b80\\u4f53?|\\u7e41\\u9ad4)\\uff09|\\u7b80\\u4f53|\\u7e41\\u9ad4).*";
+  // 11.Contain language word:It is ok
+  val regex_containLanguageWord: String = """.*(a(frikaa?ns|lbanian?|lemanha|ng(lais|ol)|ra?b(e?|[ei]c|ian?|isc?h)|rmenian?
+    |ssamese|azeri|z[e\\u0259]rba(ijani?|ycan(ca)?|yjan)|\\u043d\\u0433\\u043b\\u0438\\u0439\\u0441\\u043a\\u0438\\u0439)
+    |b(ahasa( (indonesia|jawa|malaysia|melayu))?|angla|as(k|qu)e|[aeo]ng[ao]?li|elarusian?|okm\\u00e5l|osanski|ra[sz]il(ian?)?
+    |ritish( kannada)?|ulgarian?)|c(ebuano|hina|hinese( simplified)?|zech|roat([eo]|ian?)|atal[a\\u00e0]n?|\\u0440\\u043f\\u0441\\u043a\\u0438|antonese)
+    |[c\\u010d](esky|e[s\\u0161]tina)\r\n|d(an(isc?h|sk)|e?uts?ch)|e(esti|ll[hi]nika|ng(els|le(ski|za)|lisc?h)|spa(g?[n\\u00f1]h?i?ol|nisc?h)|speranto|stonian|usk[ae]ra)
+    |f(ilipino|innish|ran[c\\u00e7](ais|e|ez[ao])|ren[cs]h|arsi|rancese)|g(al(ego|ician)|uja?rati|ree(ce|k)|eorgian|erman[ay]?|ilaki)|h(ayeren|ebrew|indi|rvatski|ungar(y|ian))
+    |i(celandic|ndian?|ndonesian?|ngl[e\\u00ea]se?|ngilizce|tali(ano?|en(isch)?))|ja(pan(ese)?|vanese)|k(a(nn?ada|zakh)|hmer|o(rean?|sova)|urd[i\\u00ee])
+    |l(at(in[ao]?|vi(an?|e[s\\u0161]u))|ietuvi[u\\u0173]|ithuanian?)|m(a[ck]edon(ian?|ski)|agyar|alay(alam?|sian?)?|altese|andarin|arathi|elayu|ontenegro
+    |ongol(ian?)|yanmar)|n(e(d|th)erlands?|epali|orw(ay|egian)|orsk( bokm[a\\u00e5]l)?|ynorsk)|o(landese|dia)|p(ashto|ersi?an?|ol(n?isc?h|ski)|or?tugu?[e\\u00ea]se?(( d[eo])? brasil(eiro)?
+    | ?\\(brasil\\))?|unjabi)|r(om[a\\u00e2i]ni?[a\\u0103]n?|um(ano|\\u00e4nisch)|ussi([ao]n?|sch))|s(anskrit|erbian|imple english|inha?la|lov(ak(ian?)?|en\\u0161?[c\\u010d]ina|en(e|ij?an?)|uomi)|erbisch|pagnolo?
+    |panisc?h|rbeska|rpski|venska|c?wedisc?h|hqip)|t(a(galog|mil)|elugu|hai(land)?|i[e\\u1ebf]ng vi[e\\u1ec7]t|[u\\u00fc]rk([c\\u00e7]e|isc?h|i\\u015f|ey))|u(rdu|zbek)|v(alencia(no?)?|ietnamese)|welsh
+    |(\\u0430\\u043d\\u0433\\u043b\\u0438\\u0438\\u0441|[k\\u043a]\\u0430\\u043b\\u043c\\u044b\\u043a\\u0441|[k\\u043a]\\u0430\\u0437\\u0430\\u0445\\u0441|\\u043d\\u0435\\u043c\\u0435\\u0446
+    |[p\\u0440]\\u0443\\u0441\\u0441|[y\\u0443]\\u0437\\u0431\\u0435\\u043a\\u0441)\\u043a\\u0438\\u0439( \\u044f\\u0437\\u044b\\u043a)??|\\u05e2\\u05d1\\u05e8\\u05d9\\u05ea
+    |[k\\u043a\\u049b](\\u0430\\u0437\\u0430[\\u043a\\u049b]\\u0448\\u0430|\\u044b\\u0440\\u0433\\u044b\\u0437\\u0447\\u0430|\\u0438\\u0440\\u0438\\u043b\\u043b)
+    |\\u0443\\u043a\\u0440\\u0430\\u0457\\u043d\\u0441\\u044c\\u043a(\\u0430|\\u043e\\u044e)|\\u0431(\\u0435\\u043b\\u0430\\u0440\\u0443\\u0441\\u043a\\u0430\\u044f
+    |\\u044a\\u043b\\u0433\\u0430\\u0440\\u0441\\u043a\\u0438( \\u0435\\u0437\\u0438\\u043a)?)|\\u03b5\\u03bb\\u03bb[\\u03b7\\u03b9]\\u03bd\\u03b9\\u03ba(\\u03ac|\\u03b1)
+    |\\u10e5\\u10d0\\u10e0\\u10d7\\u10e3\\u10da\\u10d8
+    |\\u0939\\u093f\\u0928\\u094d\\u0926\\u0940|\\u0e44\\u0e17\\u0e22|[m\\u043c]\\u043e\\u043d\\u0433\\u043e\\u043b(\\u0438\\u0430)?|([c\\u0441]\\u0440\\u043f
+    |[m\\u043c]\\u0430\\u043a\\u0435\\u0434\\u043e\\u043d)\\u0441\\u043a\\u0438
+    |\\u0627\\u0644\\u0639\\u0631\\u0628\\u064a\\u0629|\\u65e5\\u672c\\u8a9e|\\ud55c\\uad6d(\\ub9d0|\\uc5b4)
+    |\\u200c\\u0939\\u093f\\u0928\\u0926\\u093c\\u093f|\\u09ac\\u09be\\u0982\\u09b2\\u09be|\\u0a2a\\u0a70\\u0a1c\\u0a3e\\u0a2c\\u0a40
+    |\\u092e\\u0930\\u093e\\u0920\\u0940|\\u0c95\\u0ca8\\u0ccd\\u0ca8\\u0ca1|\\u0627\\u064f\\u0631\\u062f\\u064f\\u0648
+    |\\u0ba4\\u0bae\\u0bbf\\u0bb4\\u0bcd|\\u0c24\\u0c46\\u0c32\\u0c41\\u0c17\\u0c41|\\u0a97\\u0ac1\\u0a9c\\u0ab0\\u0abe\\u0aa4\\u0ac0
+    |\\u0641\\u0627\\u0631\\u0633\\u06cc|\\u067e\\u0627\\u0631\\u0633\\u06cc|\\u0d2e\\u0d32\\u0d2f\\u0d3e\\u0d33\\u0d02
+    |\\u067e\\u069a\\u062a\\u0648|\\u1019\\u103c\\u1014\\u103a\\u1019\\u102c\\u1018\\u102c\\u101e\\u102c|\\u4e2d\\u6587(\\u7b80\\u4f53
+    |\\u7e41\\u9ad4)?|\\u4e2d\\u6587\\uff08(\\u7b80\\u4f53?|\\u7e41\\u9ad4)\\uff09|\\u7b80\\u4f53|\\u7e41\\u9ad4).*""".stripMargin
   val pattern_forContainLanguageWord: Pattern = Pattern.compile(regex_containLanguageWord);
   val matcher_containLanguageWord: Matcher = pattern_forContainLanguageWord.matcher("");
   def containLanguageBadWord_word(str: String): Boolean = {
@@ -660,7 +737,7 @@ class WordsFeatures extends Serializable {
     results
   }
-  //12. Male Names: It is ok
+  // 12. Male Names: It is ok
   val MaleNames: Array[String] = Array("AARON", "ADAM", "ADRIAN", "ALAN", "ALBERT", "ALBERTO", "ALEX", "ALEXANDER",
     "ALFRED", "ALFREDO", "ALLAN", "ALLEN", "ALVIN", "ANDRE", "ANDREW", "ANDY",
@@ -725,7 +802,7 @@ class WordsFeatures extends Serializable {
   }
-  //13. Female Names: It is ok
+  // 13. Female Names: It is ok
   val FemaleNames: Array[String] = Array("AGNES", "ALICE", "ALICIA", "ALLISON", "ALMA", "AMANDA",
     "AMBER", "AMY", "ANA", "ANDREA", "ANGELA", "ANITA", "ANN", "ANNA", "ANNE", "ANNETTE",
@@ -934,10 +1011,8 @@ class WordsFeatures extends Serializable {
     }
   }
-
   results
 }
-
 def GetNumberofLinks(str: String): Double = {
   val input: String = str
@@ -971,5 +1046,4 @@ class WordsFeatures extends Serializable {
   result.toFloat
 } // Words features: ------ End calculation the Ratio for Words:
-
-}
\ No newline at end of file
+}
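Taken together, each extractor in this file returns a fixed-length Array[Double]. A hypothetical driver sketch, assuming the array is fed to Spark ML as a dense vector (the input string here is illustrative):

  import org.apache.spark.ml.linalg.{ Vector, Vectors }

  // Usage sketch: word features for one revision comment, packed into a
  // dense vector for a downstream classifier.
  val wordsFeatures = new WordsFeatures()
  val raw: Array[Double] = wordsFeatures.Vector_Words_Feature("some revision comment")
  val featureVector: Vector = Vectors.dense(raw)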