In [1]:
%%init_spark
launcher.packages= ["graphframes:graphframes:0.8.2-spark3.2-s_2.12"]

# Problem 1

In [1]:
import org.apache.spark.sql.DataFrame
val doc1 = ("doc 1","""
Columbia University is a large university in New York.
It has many schools including Columbia College, Engineering School, Law School, and Business School.
It was established in 1754
""")
val doc2 = ("doc 2","""
Operations Research is a department in the Engineering School of Columbia University.
Operations Research was established in 1919.
Operations Research has a BS major and offers many MS degrees.
Graduates of Operations Research get good jobs and have a very happy life.
""")

def both_uc(w1: String,w2: String): Boolean = 
{
    w1(0).isUpper & w2(0).isUpper
}

def split_data(a: String): Array[String] = a.split("\\s+")

def clean_data(a: String): String = 
{
//     a.filter(a=>a != ','  & a != '.' & a != '\n').trim() //not works because filter \n will concate words together
    a.replace("\n"," ").replace("."," ").replace(","," ").replace("  "," ").trim()
}


def replace_entities(a: Array[String]):Array[String] = {
    val indices = 0 to a.length-1
    indices.slice(0,indices.length-1) //not include indices.length-1
    .flatMap(i => 
         if (both_uc(a(i),a(i+1))) Some(a(i)+a(i+1))
         else None)
    .toArray
}

val clean_data_udf = udf(clean_data _)
val split_data_udf = udf(split_data _)
val replace_entities_udf = udf(replace_entities _)

def make_df(a: Seq[(String,String)]): DataFrame = {
    sc.parallelize(a)
        .toDF("document_id","document_text")
        .withColumn("cleaned_text",clean_data_udf($"document_text"))
        .withColumn("document_terms",split_data_udf($"cleaned_text"))
        .withColumn("entity_terms",replace_entities_udf($"document_terms"))
}

val df = make_df(Array(doc1,doc2))

//Using CountVectorizer, generate term_freqs column
import org.apache.spark.ml.feature.CountVectorizer
val countVectorizer = new CountVectorizer()
    .setInputCol("entity_terms")
    .setOutputCol("term_freqs")
    .setVocabSize(20)

val vocabModel = countVectorizer.fit(df)
val freqs = vocabModel.transform(df)

//Using IDF get the tfidfVec
import org.apache.spark.ml.feature.IDF

val idf = new IDF().setInputCol("term_freqs").setOutputCol("tfidfVec")
val idfModel = idf.fit(freqs)
val idfMatrix = idfModel.transform(freqs).select("document_id","tfidfVec")

idfMatrix.show(false) //The Result

Intitializing Scala interpreter ...

Spark Web UI available at http://192.168.0.149:4043
SparkContext available as 'sc' (version = 3.3.0, master = local[*], app id = local-1671562624389)
SparkSession available as 'spark'


+-----------+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|document_id|tfidfVec                                                                                                                                                                                     |
+-----------+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|doc 1      |(12,[1,2,3,4,5,6,7,8,9,11],[0.0,0.0,0.4054651081081644,0.4054651081081644,0.4054651081081644,0.4054651081081644,0.4054651081081644,0.4054651081081644,0.4054651081081644,0.4054651081081644])|
|doc 2      |(12,[0,1,2,10],[1.6218604324326575,0.0,0.0,0.4054651081081644])                                                                                                            

import org.apache.spark.sql.DataFrame
doc1: (String, String) =
(doc 1,"
Columbia University is a large university in New York.
It has many schools including Columbia College, Engineering School, Law School, and Business School.
It was established in 1754
")
doc2: (String, String) =
(doc 2,"
Operations Research is a department in the Engineering School of Columbia University.
Operations Research was established in 1919.
Operations Research has a BS major and offers many MS degrees.
Graduates of Operations Research get good jobs and have a very happy life.
")
both_uc: (w1: String, w2: String)Boolean
split_data: (a: String)Array[String]
clean_data: (a: String)String
replace_entities: (a: Array[String])Array[String]
clean_data_udf: org.apache.spark.sql.expressions.UserDefinedFunction = Spar...


# Problem 2

# note: GraphFrame, bfs, CONS!, mapValues, getOrElse

In [4]:
import org.apache.spark.sql._
import org.apache.spark.sql.functions._
import org.graphframes._


val vertexArray = Array(
  (1,1),
  (2,2),
  (3,3),
  (4,4),
  (5,5),
  (6,6),
    (7,7),
    (8,8)
)


val edgeArray = Array(
  (1, 3),
  (2, 3),
  (2, 4),
  (4, 5),
  (3, 5),
  (5, 6),
  (6, 7),
  (6, 8),
  (7, 8)
)

val vertex_df = spark.createDataFrame(vertexArray).toDF("id","v_desc")
val edge_df = spark.createDataFrame(edgeArray).toDF("src","dst")

val g = GraphFrame(vertex_df, edge_df)

//Function to get all vertex pairs. This is written for you
def getAllVertexPairs(g: GraphFrame): Array[(Int,Int)] = {
    def getAllPairs(nums: Seq[Int]) =
        nums.flatMap(x => nums.map(y => (x,y))).filter(p=>p._1 != p._2)

    val col_vals = g.vertices.select("id").map(_.getInt(0)).collect.toSeq.toArray
    val all_vertex_pairs = getAllPairs(col_vals).toArray
    all_vertex_pairs
}

//Function to get the shortest path between two vertices. This is also already written
//for you
//Note that this uses the bfs algorithm. So it will take some time to run and should
//not be run on large graphs!

//:_* a special notation that tells the compiler to pass each element as its own argument, rather than all of it as a single argument.

def getShortestPath(g: GraphFrame,i: Int, j: Int) = {
    val path_df = g.bfs.fromExpr(s"id=$i").toExpr(s"id=$j").run() 
    if (path_df.count > 0) {
        val cols = path_df.columns.filter(n=>n.contains("v")).map(n=>col(n+".id"))
        val a = path_df.select(cols:_*).rdd.collect()(0).toSeq.toArray.map(e => e.toString.toInt)
        a
    }
    else Array[Int]()
}

def getAllShortestPaths(g: GraphFrame):List[Array[Int]]  = {
    def loop(a: Array[(Int,Int)]):List[Array[Int]] = {
        if (a.length == 0) List[Array[Int]]()
        else {
            val sp = getShortestPath(g,a(0)._1,a(0)._2)
            if (a.length == 1)
                List(sp)
            else sp ::loop(a.slice(1,a.length))
        }
    }
    val all_vertex_pairs = getAllVertexPairs(g)
    loop(all_vertex_pairs)
}

def getBetweenessCentrality(g: GraphFrame) = {
    //get all shortest paths removing empty paths
    val all_shortest_paths = getAllShortestPaths(g).filter(p=>p.length>0)
    val vertices = g.vertices.select("id").rdd.map(v=>v(0).toString.toInt)
    val denominator = vertices.count * (vertices.count - 1)
    vertices.map(v => all_shortest_paths.flatten.groupBy(identity).mapValues(_.size).getOrElse(v,0)*1.0/denominator)
}

//Result
val b = getBetweenessCentrality(g)
b.collect

import org.apache.spark.sql._
import org.apache.spark.sql.functions._
import org.graphframes._
vertexArray: Array[(Int, Int)] = Array((1,1), (2,2), (3,3), (4,4), (5,5), (6,6), (7,7), (8,8))
edgeArray: Array[(Int, Int)] = Array((1,3), (2,3), (2,4), (4,5), (3,5), (5,6), (6,7), (6,8), (7,8))
vertex_df: org.apache.spark.sql.DataFrame = [id: int, v_desc: int]
edge_df: org.apache.spark.sql.DataFrame = [src: int, dst: int]
g: org.graphframes.GraphFrame = GraphFrame(v:[id: int, v_desc: int], e:[src: int, dst: int])
getAllVertexPairs: (g: org.graphframes.GraphFrame)Array[(Int, Int)]
getShortestPath: (g: org.graphframes.GraphFrame, i: Int, j: Int)Array[Int]
getAllShortestPaths: (g: org.graphframes.GraphFrame)List[Array[Int]]
getBetweenessCentrality: (g: org.graphframes.GraphFrame)org.apache.spark...


In [5]:
b.collect

res3: Array[Double] = Array(0.0, 0.0, 0.10714285714285714, 0.03571428571428571, 0.21428571428571427, 0.17857142857142858, 0.0, 0.0)


In [6]:
getAllShortestPaths(g).filter(p=>p.length>0).flatten

res4: List[Int] = List(3, 3, 5, 3, 5, 6, 3, 5, 6, 3, 4, 5, 4, 5, 6, 4, 5, 6, 5, 5, 6, 5, 6, 5, 5, 6, 5, 6, 6, 6)


In [8]:
val x = Array(1,1,3,2,1,4,4,4)
x.groupBy(identity)

x: Array[Int] = Array(1, 1, 3, 2, 1, 4, 4, 4)
res6: scala.collection.immutable.Map[Int,Array[Int]] = Map(1 -> Array(1, 1, 1), 3 -> Array(3), 2 -> Array(2), 4 -> Array(4, 4, 4))


# Problem 3

# GraphX, case class, aggregateMessages

In [9]:
import org.apache.spark.graphx._ 
import org.apache.spark.rdd.RDD

case class Demographics(age: Int,gender: Char, income: Double) 
case class Person(name: String,demographics: Demographics) 
case class Connection(strength: Int,msgProbability: Double)

val users = Array(
(1L, Person("Alice",Demographics(28,'F',150000.0))),
(2L, Person("Bob",Demographics(27,'M',50000.0))),
(3L, Person("Charlie",Demographics(65,'M',250000.0))),
(4L, Person("David",Demographics(42,'M',750000.0))),
(5L, Person("Ed",Demographics(55,'M',25000.0))),
(6L, Person("Fran",Demographics(50,'F',3150000.0))),
(7L, Person("Jack",Demographics(17,'M',5000.0))),
(8L, Person("Jill",Demographics(16,'F',1000.0)))
)

val connections = Array(
Edge(2L, 1L, Connection(7,.2)),
Edge(2L, 4L, Connection(2,.7)),
Edge(3L, 2L, Connection(4,.31)),
Edge(3L, 6L, Connection(3,.22)),
Edge(4L, 1L, Connection(1,.12)),
Edge(5L, 2L, Connection(2,.45)),
Edge(5L, 3L, Connection(8,.91))
)

val vertexRDD: RDD[(Long, Person)] = sc.parallelize(users) 
val edgeRDD: RDD[Edge[Connection]] = sc.parallelize(connections)

val social_graph = Graph(vertexRDD,edgeRDD)

val over21_friends = social_graph.aggregateMessages[Int](
    triplet => {
        if (triplet.dstAttr.demographics.age >= 21)
        triplet.sendToSrc(1)
    },
    (a, b) => (a+b)).map(t =>t._1).collect

val result = social_graph.aggregateMessages[Int](
triplet => {
    if (triplet.srcAttr.demographics.age > 21
       && triplet.srcAttr.demographics.income > 20000
       && triplet.attr.strength >= 3
       && (over21_friends contains triplet.srcId) )
    triplet.sendToDst(1)
}, (a, b) => (a+b)
)


import org.apache.spark.graphx._
import org.apache.spark.rdd.RDD
defined class Demographics
defined class Person
defined class Connection
users: Array[(Long, Person)] = Array((1,Person(Alice,Demographics(28,F,150000.0))), (2,Person(Bob,Demographics(27,M,50000.0))), (3,Person(Charlie,Demographics(65,M,250000.0))), (4,Person(David,Demographics(42,M,750000.0))), (5,Person(Ed,Demographics(55,M,25000.0))), (6,Person(Fran,Demographics(50,F,3150000.0))), (7,Person(Jack,Demographics(17,M,5000.0))), (8,Person(Jill,Demographics(16,F,1000.0))))
connections: Array[org.apache.spark.graphx.Edge[Connection]] = Array(Edge(2,1,Connection(7,0.2)), Edge(2,4,Connection(2,0.7)), Edge(3,2,Connection(4,0.31)), Edge(3,6,Connection(3,0.22)), Edge(4,1,Connection(1,0.12)), Edge(5,2,Connection(2,0.45)), Edge(5,3,C...


In [10]:
result.collect

res7: Array[(org.apache.spark.graphx.VertexId, Int)] = Array((1,1), (2,1), (3,1), (6,1))


# Problem 4

# Pregel

In [2]:
import org.apache.spark.graphx._
import org.apache.spark.rdd.RDD

case class PersonData(age: Int,gender: Char, income: Double)
case class Client(name: String,data: PersonData)
case class Relationship(strength: Int,msgProbability: Double)

Intitializing Scala interpreter ...

Spark Web UI available at http://192.168.0.149:4041
SparkContext available as 'sc' (version = 3.3.0, master = local[*], app id = local-1671401582401)
SparkSession available as 'spark'


import org.apache.spark.graphx._
import org.apache.spark.rdd.RDD
defined class PersonData
defined class Client
defined class Relationship


In [3]:
val people = Array(
  (1L, Client("Alice",PersonData(28,'F',150000.0))),
  (2L, Client("Bob",PersonData(27,'M',50000.0))),
  (3L, Client("Charlie",PersonData(65,'M',250000.0))),
  (4L, Client("David",PersonData(42,'M',750000.0))),
  (5L, Client("Ed",PersonData(55,'M',25000.0))),
  (6L, Client("Fran",PersonData(50,'F',3150000.0))),
  (7L, Client("Jack",PersonData(17,'M',5000.0))),
  (8L, Client("Jill",PersonData(16,'F',1000.0)))
)

val relationships = Array(
  Edge(2L, 1L, Relationship(7,.2)),
  Edge(2L, 4L, Relationship(2,.7)),
  Edge(3L, 2L, Relationship(4,.31)),
  Edge(3L, 6L, Relationship(3,.22)),
  Edge(4L, 1L, Relationship(1,.12)),
  Edge(5L, 2L, Relationship(2,.45)),
  Edge(5L, 3L, Relationship(8,.91))
)

val vertexRDD: RDD[(Long, Client)] = sc.parallelize(people)
val edgeRDD: RDD[Edge[Relationship]] = sc.parallelize(relationships)

val social_graph = Graph(vertexRDD,edgeRDD)

people: Array[(Long, Client)] = Array((1,Client(Alice,PersonData(28,F,150000.0))), (2,Client(Bob,PersonData(27,M,50000.0))), (3,Client(Charlie,PersonData(65,M,250000.0))), (4,Client(David,PersonData(42,M,750000.0))), (5,Client(Ed,PersonData(55,M,25000.0))), (6,Client(Fran,PersonData(50,F,3150000.0))), (7,Client(Jack,PersonData(17,M,5000.0))), (8,Client(Jill,PersonData(16,F,1000.0))))
relationships: Array[org.apache.spark.graphx.Edge[Relationship]] = Array(Edge(2,1,Relationship(7,0.2)), Edge(2,4,Relationship(2,0.7)), Edge(3,2,Relationship(4,0.31)), Edge(3,6,Relationship(3,0.22)), Edge(4,1,Relationship(1,0.12)), Edge(5,2,Relationship(2,0.45)), Edge(5,3,Relationship(8,0.91)))
vertexRDD: org.apache.spark.rdd.RDD[(Long, Client)] = ParallelCollectionRDD[0] at parallelize at <console>:55
edgeR...


In [6]:
//Fill in the types for social_graph and sourceId
def get_max_path(social_graph: Graph[Client,Relationship] ,sourceId: VertexId) = {
    val initialGraph = social_graph.mapVertices((id,_) => if (id == sourceId) 1.0 else 0.0)
    val vertexProgram = (id: VertexId, prob: Double, newProb: Double) => math.max(prob, newProb)
    val sendMsg = (triplet:EdgeTriplet[Double,Relationship]) => {
        val edgeProb = triplet.attr.msgProbability
        if (triplet.srcAttr == 0.0) {
            Iterator.empty
        } else if (triplet.srcAttr*edgeProb > triplet.dstAttr) {
            Iterator((triplet.dstId,triplet.srcAttr*edgeProb))
        } else {
            Iterator.empty
        }
    }
    val mrgMsg = (a:Double, b:Double) => math.max(a, b)
    //Add the arguments for pregel
    val maxPath = initialGraph.pregel(0.0,3)(vertexProgram,sendMsg,mrgMsg)
    //return the vertices
    maxPath.vertices
    
}

get_max_path: (social_graph: org.apache.spark.graphx.Graph[Client,Relationship], sourceId: org.apache.spark.graphx.VertexId)org.apache.spark.graphx.VertexRDD[Double]


In [7]:
get_max_path(social_graph,5).collect

res1: Array[(org.apache.spark.graphx.VertexId, Double)] = Array((8,0.0), (1,0.09000000000000001), (2,0.45), (3,0.91), (4,0.315), (5,1.0), (6,0.20020000000000002), (7,0.0))


# Problem 5

# dataframe, ml

In [15]:
val df = spark.createDataFrame(Seq(
(1,1,3.2),
(1,2,4.3),
(2,4,1.9),
(2,2,3.3),
(2,1,4.1),
(3,15,4.5),
(3,2,4.3)))
.toDF("user_id","movie_id","rating")
val train_df = df
val test_df = df
val avg_df = train_df.groupBy("movie_id").avg("rating")
val new_df = test_df.join(avg_df, Seq("movie_id")).withColumnRenamed("avg(rating)","prediction")

import org.apache.spark.ml.evaluation.RegressionEvaluator
val evaluator = new RegressionEvaluator()
.setMetricName("rmse")
.setLabelCol("rating")
.setPredictionCol("prediction")

val rmse = evaluator.evaluate(new_df)

df: org.apache.spark.sql.DataFrame = [user_id: int, movie_id: int ... 1 more field]
train_df: org.apache.spark.sql.DataFrame = [user_id: int, movie_id: int ... 1 more field]
test_df: org.apache.spark.sql.DataFrame = [user_id: int, movie_id: int ... 1 more field]
avg_df: org.apache.spark.sql.DataFrame = [movie_id: int, avg(rating): double]
new_df: org.apache.spark.sql.DataFrame = [movie_id: int, user_id: int ... 2 more fields]
import org.apache.spark.ml.evaluation.RegressionEvaluator
evaluator: org.apache.spark.ml.evaluation.RegressionEvaluator = RegressionEvaluator: uid=regEval_fae5d8f0b3c6, metricName=rmse, throughOrigin=false
rmse: Double = 0.39127386584748797


In [17]:
new_df.show

+--------+-------+------+------------------+
|movie_id|user_id|rating|        prediction|
+--------+-------+------+------------------+
|       1|      2|   4.1|              3.65|
|       1|      1|   3.2|              3.65|
|       2|      3|   4.3|3.9666666666666663|
|       2|      2|   3.3|3.9666666666666663|
|       2|      1|   4.3|3.9666666666666663|
|       4|      2|   1.9|               1.9|
|      15|      3|   4.5|               4.5|
+--------+-------+------+------------------+



# Problem 6

# udf, dataframe

In [18]:
val dataRDD = spark.sparkContext.makeRDD(
"""[{"name":"Le Monde","reviews":{"count":14,"rating":3.2},"serves":{"alcohol":true,"vegetarian":false}} ,
{"name":"Junzi Kitchen","reviews":{"count":7,"rating":4.5},"serves":{"alcohol":false,"vegetarian":true}},
{"name":"Atlas Kitchen","reviews":{"count":9,"rating":2.9},"serves":{"alcohol":true,"vegetarian":true}}]""":: Nil)
val df = spark.read.json(dataRDD)

dataRDD: org.apache.spark.rdd.RDD[String] = ParallelCollectionRDD[24309] at makeRDD at <console>:40
df: org.apache.spark.sql.DataFrame = [name: string, reviews: struct<count: bigint, rating: double> ... 1 more field]


In [19]:
dataRDD.collect

res11: Array[String] =
Array([{"name":"Le Monde","reviews":{"count":14,"rating":3.2},"serves":{"alcohol":true,"vegetarian":false}} ,
{"name":"Junzi Kitchen","reviews":{"count":7,"rating":4.5},"serves":{"alcohol":false,"vegetarian":true}},
{"name":"Atlas Kitchen","reviews":{"count":9,"rating":2.9},"serves":{"alcohol":true,"vegetarian":true}}])


In [48]:
df.show

+-------------+---------+-------------+
|         name|  reviews|       serves|
+-------------+---------+-------------+
|     Le Monde|{14, 3.2}|{true, false}|
|Junzi Kitchen| {7, 4.5}|{false, true}|
|Atlas Kitchen| {9, 2.9}| {true, true}|
+-------------+---------+-------------+



In [49]:
import org.apache.spark.sql.functions.udf
def score(alc:Boolean, vg: Boolean, ra: Double) = {
    var score = 0.0
    if (alc) score =1
    if (vg) score=score+1
    score+ra/2.0
}
val score_udf = udf(score _)
val df2 = df.withColumn("score",score_udf($"serves.alcohol",$"serves.vegetarian",$"reviews.rating"))

import org.apache.spark.sql.functions.udf
score: (alc: Boolean, vg: Boolean, ra: Double)Double
score_udf: org.apache.spark.sql.expressions.UserDefinedFunction = SparkUserDefinedFunction($Lambda$5874/0x0000000801c31840@3c23ca66,DoubleType,List(Some(class[value[0]: boolean]), Some(class[value[0]: boolean]), Some(class[value[0]: double])),Some(class[value[0]: double]),None,false,true)
df2: org.apache.spark.sql.DataFrame = [name: string, reviews: struct<count: bigint, rating: double> ... 2 more fields]


In [50]:
df2.show

+-------------+---------+-------------+-----+
|         name|  reviews|       serves|score|
+-------------+---------+-------------+-----+
|     Le Monde|{14, 3.2}|{true, false}|  2.6|
|Junzi Kitchen| {7, 4.5}|{false, true}| 3.25|
|Atlas Kitchen| {9, 2.9}| {true, true}| 3.45|
+-------------+---------+-------------+-----+



# Problem 7

# read, fit & transform

In [54]:
import org.apache.spark.sql.expressions.Window
val df = sc.parallelize(Array((1,2,3),(2,3,4),(3,4,5),
                              (4,5,6),(5,6,7),(6,7,8),
                             (7,8,9),(9,10,11))).toDF("c1","c2","c3")
//For row 5, value in df_ma will be the average of rows 2,3,4.                            
val df_ma = df.withColumn("ma",avg(df("c1")).over(Window.orderBy("c2").rowsBetween(-3,-1)))
df_ma.show

22/12/15 17:01:47 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
22/12/15 17:01:47 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
22/12/15 17:01:47 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
22/12/15 17:01:47 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
22/12/15 17:01:47 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
+---+---+---+----+
| c1| c2| c3|  ma|
+---+---+---+----+
|  1|  2|  3|null|
|  2|  3|  4| 1.0|
|  3|  4|  5| 1.5|
|  4|  5|  6| 2.0|
|  5|  6|  7| 3.0|
|  6|  7|  8| 4.0|
|  7

import org.apache.spark.sql.expressions.Window
df: org.apache.spark.sql.DataFrame = [c1: int, c2: int ... 1 more field]
df_ma: org.apache.spark.sql.DataFrame = [c1: int, c2: int ... 2 more fields]


In [53]:
val df = spark.read.option("header","true").option("inferschema","true").csv("AAPL.csv")

df: org.apache.spark.sql.DataFrame = [date: timestamp, price: double]


In [57]:
import org.apache.spark.sql.expressions.Window
import org.apache.spark.sql.functions.udf

val df = spark.read.option("header","true").option("inferschema","true").csv("AAPL.csv")
def set_label(v: Double):Double = if (v>0) 1 else 0
val label_udf = udf(set_label _)
val df_new = df.withColumn("ma8", avg(df("price")).over(Window.orderBy("date").rowsBetween(-7,0)))
    .withColumn("ma13", avg(df("price")).over(Window.orderBy("date").rowsBetween(-12,0)))
    .withColumn("diff", $"ma8" - $"ma13")
    .withColumn("label", label_udf($"diff"))
import org.apache.spark.ml.feature.QuantileDiscretizer

val discretizer = new QuantileDiscretizer()
    .setInputCol("diff")
    .setOutputCol("deciles")
    .setNumBuckets(10)

val result = discretizer.fit(df_new).transform(df_new)
result.show(100,false)

22/12/15 17:10:17 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
22/12/15 17:10:17 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
22/12/15 17:10:17 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
22/12/15 17:10:17 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
22/12/15 17:10:17 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
22/12/15 17:10:18 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
22/12/15 1

|1981-03-27 00:00:00|0.086604|0.090485625        |0.08630092307692308|0.0041847019230769195 |1.0  |6.0    |
|1981-03-30 00:00:00|0.086604|0.09004825000000001|0.08714207692307692|0.002906173076923091  |1.0  |5.0    |
|1981-03-31 00:00:00|0.085729|0.089610875        |0.08768046153846153|0.00193041346153848   |1.0  |5.0    |
|1981-04-01 00:00:00|0.084854|0.08895475         |0.08821876923076924|7.359807692307596E-4  |1.0  |4.0    |
|1981-04-02 00:00:00|0.09229 |0.08879075         |0.08909353846153847|-3.027884615384724E-4 |0.0  |4.0    |
|1981-04-03 00:00:00|0.092727|0.088736           |0.08969915384615385|-9.631538461538497E-4 |0.0  |3.0    |
|1981-04-06 00:00:00|0.090977|0.08868125         |0.08976638461538464|-0.0010851346153846336|0.0  |3.0    |
|1981-04-07 00:00:00|0.090103|0.088736           |0.08983369230769232|-0.0010976923076923273|0.0  |3.0    |
|1981-04-08 00:00:00|0.094477|0.08972012500000001|0.09017015384615383|-4.500288461538188E-4 |0.0  |4.0    |
|1981-04-09 00:00:00|0.09622

import org.apache.spark.sql.expressions.Window
import org.apache.spark.sql.functions.udf
df: org.apache.spark.sql.DataFrame = [date: timestamp, price: double]
set_label: (v: Double)Double
label_udf: org.apache.spark.sql.expressions.UserDefinedFunction = SparkUserDefinedFunction($Lambda$6407/0x0000000801ba5040@60c61238,DoubleType,List(Some(class[value[0]: double])),Some(class[value[0]: double]),None,false,true)
df_new: org.apache.spark.sql.DataFrame = [date: timestamp, price: double ... 4 more fields]
import org.apache.spark.ml.feature.QuantileDiscretizer
discretizer: org.apache.spark.ml.feature.QuantileDiscretizer = quantileDiscretizer_5aa5150f166b
result: org.apache.spark.sql.DataFrame = [date: timestamp, price: double ... 5 more fields]


In [58]:
values = [(0.1,), (0.4,), (1.2,), (1.5,), (float("nan"),), (float("nan"),)]
df1 = spark.createDataFrame(values, ["values"])
qds1 = QuantileDiscretizer(inputCol="values", outputCol="buckets")
qds1.setNumBuckets(2)

qds1.setRelativeError(0.01)

qds1.setHandleInvalid("error")

qds1.getRelativeError()

bucketizer = qds1.fit(df1)
qds1.setHandleInvalid("keep").fit(df1).transform(df1).count()

<console>: 2: error: illegal start of simple expression