In [1]:
%%init_spark
launcher.packages= ["graphframes:graphframes:0.8.2-spark3.2-s_2.12"]

# Problem 1

In [41]:
import org.apache.spark.sql.DataFrame
// Sample corpus: each document is a (document_id, document_text) pair.
// The text is kept as a raw triple-quoted literal, so it contains newlines
// and punctuation that the cleaning step has to deal with.
val doc1 = ("doc 1","""
Columbia University is a large university in New York.
It has many schools including Columbia College, Engineering School, Law School, and Business School.
It was established in 1754
""")
// Second sample document, same (document_id, document_text) layout as doc1.
val doc2 = ("doc 2","""
Operations Research is a department in the Engineering School of Columbia University.
Operations Research was established in 1919.
Operations Research has a BS major and offers many MS degrees.
Graduates of Operations Research get good jobs and have a very happy life.
""")

/**
 * Tells whether both words start with an upper-case character.
 *
 * An empty word has no first character and is treated as "not capitalised",
 * so the function returns false for it instead of throwing
 * StringIndexOutOfBoundsException.
 *
 * @param w1 first word
 * @param w2 second word
 * @return true only when the first character of both words is upper-case
 */
def both_uc(w1: String,w2: String): Boolean = 
{
    // headOption avoids indexing into an empty string.
    def startsUpper(w: String): Boolean = w.headOption.exists(_.isUpper)
    startsUpper(w1) && startsUpper(w2)
}

/** Breaks the text into tokens, using every run of whitespace as a separator. */
def split_data(a: String): Array[String] = {
    val whitespace_run = "\\s+"
    a.split(whitespace_run)
}

/**
 * Normalises raw document text before tokenisation.
 *
 * Newlines, full stops and commas are each replaced by a space, every run of
 * two or more spaces is then collapsed to a single space, and leading and
 * trailing whitespace is trimmed.
 *
 * @param a raw document text
 * @return the cleaned text, with words separated by single spaces
 */
def clean_data(a: String): String = 
{
    // Separators are replaced by spaces rather than filtered out: deleting
    // '\n' would glue the last word of a line to the first word of the next.
    val separators_as_spaces = a.replace("\n"," ").replace("."," ").replace(","," ")
    // A single replace("  "," ") pass leaves a double space behind whenever
    // three or more separators are adjacent, so collapse whole runs instead.
    separators_as_spaces.replaceAll(" {2,}"," ").trim()
}


/**
 * Builds the entity terms of a token sequence.
 *
 * Every pair of neighbouring tokens is examined in order; when both_uc
 * accepts the pair, the two tokens are concatenated (no separator) and
 * emitted. Pairs that are rejected contribute nothing, so the result holds
 * only the concatenated pairs.
 *
 * @param a tokens of one document, in reading order
 * @return concatenated neighbouring pairs accepted by both_uc
 */
def replace_entities(a: Array[String]):Array[String] = {
    // Pair each token with its successor; the last token has none.
    val neighbours = a.zip(a.drop(1))
    neighbours.collect {
        case (left, right) if both_uc(left, right) => left + right
    }
}

// Column-level wrappers so the plain Scala helpers can be applied to DataFrame columns.
val clean_data_udf = udf((text: String) => clean_data(text))
val split_data_udf = udf((text: String) => split_data(text))
val replace_entities_udf = udf((terms: Array[String]) => replace_entities(terms))

/**
 * Turns (document_id, document_text) pairs into a DataFrame and derives,
 * step by step, the columns cleaned_text, document_terms and entity_terms.
 *
 * @param a documents as (id, text) pairs
 * @return DataFrame with columns document_id, document_text, cleaned_text,
 *         document_terms and entity_terms
 */
def make_df(a: Seq[(String,String)]): DataFrame = {
    val raw_docs = sc.parallelize(a).toDF("document_id","document_text")
    val with_cleaned = raw_docs.withColumn("cleaned_text",clean_data_udf($"document_text"))
    val with_terms = with_cleaned.withColumn("document_terms",split_data_udf($"cleaned_text"))
    with_terms.withColumn("entity_terms",replace_entities_udf($"document_terms"))
}

// Build the working DataFrame from the two sample documents.
val df = make_df(Array(doc1,doc2))

// Term frequencies: CountVectorizer learns a vocabulary from entity_terms
// and writes the per-document counts to the term_freqs column.
import org.apache.spark.ml.feature.CountVectorizer
val countVectorizer = new CountVectorizer()
    .setInputCol("entity_terms")
    .setOutputCol("term_freqs")
    .setVocabSize(20) // upper bound on the vocabulary size

// fit learns the vocabulary from df; transform adds term_freqs to it.
val vocabModel = countVectorizer.fit(df)
val freqs = vocabModel.transform(df)

// TF-IDF: IDF is fitted on the term-frequency vectors and writes the
// rescaled vectors to the tfidfVec column.
import org.apache.spark.ml.feature.IDF

val idf = new IDF().setInputCol("term_freqs").setOutputCol("tfidfVec")
val idfModel = idf.fit(freqs)
// Keep only the document id and its TF-IDF vector for display.
val idfMatrix = idfModel.transform(freqs).select("document_id","tfidfVec")

idfMatrix.show(false) // final result; false disables truncation of long cell values

+-----------+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|document_id|tfidfVec                                                                                                                                                                                     |
+-----------+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|doc 1      |(12,[1,2,3,4,5,6,7,8,9,11],[0.0,0.0,0.4054651081081644,0.4054651081081644,0.4054651081081644,0.4054651081081644,0.4054651081081644,0.4054651081081644,0.4054651081081644,0.4054651081081644])|
|doc 2      |(12,[0,1,2,10],[1.6218604324326575,0.0,0.0,0.4054651081081644])                                                                                                            

import org.apache.spark.sql.DataFrame
doc1: (String, String) =
(doc 1,"
Columbia University is a large university in New York.
It has many schools including Columbia College, Engineering School, Law School, and Business School.
It was established in 1754
")
doc2: (String, String) =
(doc 2,"
Operations Research is a department in the Engineering School of Columbia University.
Operations Research was established in 1919.
Operations Research has a BS major and offers many MS degrees.
Graduates of Operations Research get good jobs and have a very happy life.
")
both_uc: (w1: String, w2: String)Boolean
split_data: (a: String)Array[String]
clean_data: (a: String)String
replace_entities: (a: Array[String])Array[String]
clean_data_udf: org.apache.spark.sql.expressions.UserDefinedFunction = Spar...


# Problem 2

# note: GraphFrame, bfs, CONS! mapValues

In [136]:
import org.apache.spark.sql._
import org.apache.spark.sql.functions._
import org.graphframes._


// Vertices 1 through 8 as (id, description) pairs; the description simply repeats the id.
val vertexArray = Array.tabulate(8)(k => (k + 1, k + 1))


// Edges as (source id, destination id) pairs.
val edgeArray = Array(
  (1, 3),
  (2, 3),
  (2, 4),
  (4, 5),
  (3, 5),
  (5, 6),
  (6, 7),
  (6, 8),
  (7, 8)
)

// Name the columns "id" / "src" / "dst" before handing the frames to GraphFrame.
val vertex_df = spark.createDataFrame(vertexArray).toDF("id","v_desc")
val edge_df = spark.createDataFrame(edgeArray).toDF("src","dst")

// The graph used by the path and centrality functions in this cell.
val g = GraphFrame(vertex_df, edge_df)

//Function to get all vertex pairs. This is written for you
/**
 * Lists every ordered pair of distinct vertex ids of the graph.
 *
 * The "id" column of g.vertices is read as Int and collected to the driver;
 * both (x, y) and (y, x) are produced, while (x, x) is left out.
 *
 * @param g graph whose vertex ids are paired up
 * @return all ordered pairs (x, y) with x != y
 */
def getAllVertexPairs(g: GraphFrame): Array[(Int,Int)] = {
    val vertex_ids = g.vertices.select("id").map(_.getInt(0)).collect
    for {
        x <- vertex_ids
        y <- vertex_ids
        if x != y
    } yield (x, y)
}

//Function to get the shortest path between two vertices. This is also already written
//for you
//Note that this uses the bfs algorithm. So it will take some time to run and should
//not be run on large graphs!

// `cols: _*` is Scala's varargs-expansion syntax: it tells the compiler to pass each element of the sequence as a separate argument instead of passing the whole sequence as one argument.

/**
 * Runs a BFS from vertex i to vertex j and returns the vertex ids found in
 * the "v*" columns of the first result row.
 *
 * Only columns whose name contains "v" are read, so the values come from
 * the BFS result's intermediate-vertex columns; the array is empty when the
 * result has no such column or when BFS returns no rows at all.
 *
 * Only one row is fetched (take(1)), so the BFS result is evaluated once and
 * the remaining paths are never shipped to the driver.
 *
 * @param g graph to search
 * @param i id of the start vertex
 * @param j id of the target vertex
 * @return vertex ids taken from the "v*" columns of the first path, or an
 *         empty array
 */
def getShortestPath(g: GraphFrame,i: Int, j: Int): Array[Int] = {
    val path_df = g.bfs.fromExpr(s"id=$i").toExpr(s"id=$j").run()
    // The columns are resolved only once a row exists: an empty result is not
    // guaranteed to carry the BFS column layout, so it must not be queried.
    path_df.take(1).headOption
        .map { first_path =>
            path_df.columns
                .filter(n => n.contains("v"))
                .map(n => first_path.getAs[Row](n).getAs[Any]("id").toString.toInt)
        }
        .getOrElse(Array.empty[Int])
}

/**
 * Computes getShortestPath for every ordered pair of distinct vertices.
 *
 * The result has one entry per pair, in the order produced by
 * getAllVertexPairs; pairs for which getShortestPath yields nothing
 * contribute an empty array.
 *
 * A plain map replaces the earlier hand-written recursion, which was not
 * tail-recursive and copied the remaining pairs on every step.
 *
 * @param g graph to analyse
 * @return one path array per ordered vertex pair
 */
def getAllShortestPaths(g: GraphFrame):List[Array[Int]]  = {
    getAllVertexPairs(g)
        .map { case (from_id, to_id) => getShortestPath(g, from_id, to_id) }
        .toList
}

/**
 * Computes a betweenness score for every vertex of the graph.
 *
 * For each vertex the score is the number of times its id occurs in the
 * non-empty arrays returned by getAllShortestPaths, divided by n * (n - 1)
 * where n is the number of vertices.
 *
 * @param g graph to analyse
 * @return one score per vertex, in the order of g.vertices
 */
def getBetweenessCentrality(g: GraphFrame) = {
    //get all shortest paths removing empty paths
    val all_shortest_paths = getAllShortestPaths(g).filter(p=>p.length>0)
    val vertices = g.vertices.select("id").rdd.map(v=>v(0).toString.toInt)
    // Count once: every call to count launches a Spark job.
    val vertex_count = vertices.count
    val denominator = vertex_count * (vertex_count - 1)
    // Build the occurrence table once on the driver instead of once per
    // vertex. A strict map is used rather than mapValues, whose lazy view is
    // not serializable in Scala 2.12 and could not be captured by the closure.
    val occurrences = all_shortest_paths.flatten
        .groupBy(identity)
        .map { case (id, hits) => id -> hits.size }
    vertices.map(v => occurrences.getOrElse(v,0)*1.0/denominator)
}

//Result
// Compute the per-vertex scores for g, then bring them to the driver for display.
val b = getBetweenessCentrality(g)
b.collect

import org.apache.spark.sql._
import org.apache.spark.sql.functions._
import org.graphframes._
vertexArray: Array[(Int, Int)] = Array((1,1), (2,2), (3,3), (4,4), (5,5), (6,6), (7,7), (8,8))
edgeArray: Array[(Int, Int)] = Array((1,3), (2,3), (2,4), (4,5), (3,5), (5,6), (6,7), (6,8), (7,8))
vertex_df: org.apache.spark.sql.DataFrame = [id: int, v_desc: int]
edge_df: org.apache.spark.sql.DataFrame = [src: int, dst: int]
g: org.graphframes.GraphFrame = GraphFrame(v:[id: int, v_desc: int], e:[src: int, dst: int])
getAllVertexPairs: (g: org.graphframes.GraphFrame)Array[(Int, Int)]
getShortestPath: (g: org.graphframes.GraphFrame, i: Int, j: Int)Array[Int]
getAllShortestPaths: (g: org.graphframes.GraphFrame)List[Array[Int]]
getBetweenessCentrality: (g: org.graphframes.GraphFrame)org.apache.spark...


In [137]:
// Collect the per-vertex scores again so they are printed as this cell's result.
b.collect

res91: Array[Double] = Array(0.0, 0.0, 0.10714285714285714, 0.03571428571428571, 0.21428571428571427, 0.17857142857142858, 0.0, 0.0)
