In [5]:
%%init_spark
launcher.packages= ["graphframes:graphframes:0.8.2-spark3.2-s_2.12"]

In [15]:
import org.apache.spark.sql.DataFrame
// Sample document 1 as a (document id, raw text) pair; the text is a
// triple-quoted literal, so it begins and ends with a newline.
val doc1 = ("doc 1","""
Columbia University is a large university in New York.
It has many schools including Columbia College, Engineering School, Law School, and Business School.
It was established in 1754
""")
// Sample document 2, same (document id, raw text) layout as doc1.
val doc2 = ("doc 2","""
Operations Research is a department in the Engineering School of Columbia University.
Operations Research was established in 1919.
Operations Research has a BS major and offers many MS degrees.
Graduates of Operations Research get good jobs and have a very happy life.
""")

/**
 * Tells whether both words begin with an upper-case character.
 *
 * An empty word has no first character and is therefore treated as
 * "not capitalized" instead of raising StringIndexOutOfBoundsException.
 *
 * @param w1 first word
 * @param w2 second word
 * @return true only when both words are non-empty and each starts with an
 *         upper-case character
 */
def both_uc(w1: String, w2: String): Boolean =
    w1.headOption.exists(_.isUpper) && w2.headOption.exists(_.isUpper)
// Quick check of both_uc: "columbia" starts with a lower-case letter, so this evaluates to false.
both_uc("columbia","University")

/**
 * Tokenizes a string by cutting it at every run of whitespace.
 *
 * @param a text to tokenize
 * @return the pieces between whitespace runs, in their original order
 */
def split_data(a: String): Array[String] = {
    val whitespaceRun = "\\s+"
    a.split(whitespaceRun)
}
/**
 * Normalizes raw document text before tokenization.
 *
 * Newlines, periods and commas are each turned into a single space, every
 * resulting run of two or more spaces is collapsed to one space, and
 * leading/trailing whitespace is trimmed.
 *
 * @param a raw text
 * @return the cleaned text with words separated by single spaces
 */
def clean_data(a: String): String =
    a.replace("\n", " ")
        .replace(".", " ")
        .replace(",", " ")
        // A single replace("  ", " ") pass only halves each run, so three or
        // more adjacent spaces (e.g. from ".\n\n") would leave a double space.
        .replaceAll(" {2,}", " ")
        .trim()


/**
 * Extracts candidate entity names from a token sequence.
 *
 * Each token is paired with its immediate successor; whenever both_uc
 * accepts the pair, the two tokens are concatenated (no separator) and
 * emitted. Tokens that are not part of an accepted pair do not appear in
 * the result.
 *
 * @param a tokens in document order
 * @return the concatenated accepted pairs, in document order
 */
def replace_entities(a: Array[String]):Array[String] =
    a.zip(a.drop(1)).collect {
        case (left, right) if both_uc(left, right) => left + right
    }

// Spark SQL UDF wrappers so the plain Scala helpers can be applied to DataFrame columns.
val clean_data_udf = udf((text: String) => clean_data(text))
val split_data_udf = udf((text: String) => split_data(text))
val replace_entities_udf = udf((terms: Array[String]) => replace_entities(terms))

/**
 * Builds the document DataFrame and derives its text-processing columns.
 *
 * Resulting columns: document_id, document_text, cleaned_string
 * (clean_data_udf of the text), document_terms (split_data_udf of the
 * cleaned string) and entity_terms (replace_entities_udf of the terms).
 *
 * @param s (document id, document text) pairs
 * @return a DataFrame with the five columns listed above
 */
def make_df(s: Seq[(String,String)]): DataFrame = {
    val rawDocs = sc.parallelize(s).toDF("document_id","document_text")
    val withCleaned = rawDocs.withColumn("cleaned_string", clean_data_udf($"document_text"))
    val withTerms = withCleaned.withColumn("document_terms", split_data_udf($"cleaned_string"))
    withTerms.withColumn("entity_terms", replace_entities_udf($"document_terms"))
}


// Feature DataFrame for the two sample documents; input to the CountVectorizer below.
val df = make_df(Array(doc1,doc2))

import org.apache.spark.ml.feature.CountVectorizer
// Term-count stage: reads the entity_terms array column and writes a
// count vector to term_freqs, with the vocabulary capped at 20 terms.
val countVectorizer = new CountVectorizer()
    .setInputCol("entity_terms")
    .setOutputCol("term_freqs")
    .setVocabSize(20)

// Learn the vocabulary from df, then append the term_freqs column to it.
val vocabModel = countVectorizer.fit(df)
val freqs = vocabModel.transform(df)

import org.apache.spark.ml.feature.IDF

// IDF stage: reads the term_freqs count vectors and writes the
// reweighted vectors to tfidfVec.
val idf = new IDF()
    .setInputCol("term_freqs")
    .setOutputCol("tfidfVec")
// Fit the IDF weights on the count vectors, apply them, and keep only
// the document id and the resulting vector.
val idfModel = idf.fit(freqs)
val idfMatrix = idfModel
                .transform(freqs)
                .select("document_id", "tfidfVec")

// show(false): print without truncating the long vector strings.
idfMatrix.show(false)

+-----------+----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|document_id|tfidfVec                                                                                                                                                                                      |
+-----------+----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|doc 1      |(12,[1,2,4,5,6,7,8,9,10,11],[0.0,0.0,0.4054651081081644,0.4054651081081644,0.4054651081081644,0.4054651081081644,0.4054651081081644,0.4054651081081644,0.4054651081081644,0.4054651081081644])|
|doc 2      |(12,[0,1,2,3],[1.6218604324326575,0.0,0.0,0.4054651081081644])                                                                                                         

import org.apache.spark.sql.DataFrame
doc1: (String, String) =
(doc 1,"
Columbia University is a large university in New York.
It has many schools including Columbia College, Engineering School, Law School, and Business School.
It was established in 1754
")
doc2: (String, String) =
(doc 2,"
Operations Research is a department in the Engineering School of Columbia University.
Operations Research was established in 1919.
Operations Research has a BS major and offers many MS degrees.
Graduates of Operations Research get good jobs and have a very happy life.
")
both_uc: (w1: String, w2: String)Boolean
split_data: (a: String)Array[String]
clean_data: (a: String)String
replace_entities: (a: Array[String])Array[String]
clean_data_udf: org.apache.spark.sql.expressions.UserDefinedFunction = Spar...


In [3]:
// Evaluate make_df on the sample documents without binding the result; the REPL echoes the DataFrame's schema summary.
make_df(Array(doc1,doc2))

res0: org.apache.spark.sql.DataFrame = [document_id: string, document_text: string ... 3 more fields]


In [7]:
// Print df as a table; the default show truncates long cell values.
df.show

+-----------+--------------------+--------------------+--------------------+--------------------+
|document_id|       document_text|      cleaned_string|      document_terms|        entity_terms|
+-----------+--------------------+--------------------+--------------------+--------------------+
|      doc 1|\nColumbia Univer...|Columbia Universi...|[Columbia, Univer...|[ColumbiaUniversi...|
|      doc 2|\nOperations Rese...|Operations Resear...|[Operations, Rese...|[OperationsResear...|
+-----------+--------------------+--------------------+--------------------+--------------------+



In [9]:
// Print freqs (df plus the term_freqs column) as a table; long cell values are truncated.
freqs.show

+-----------+--------------------+--------------------+--------------------+--------------------+--------------------+
|document_id|       document_text|      cleaned_string|      document_terms|        entity_terms|          term_freqs|
+-----------+--------------------+--------------------+--------------------+--------------------+--------------------+
|      doc 1|\nColumbia Univer...|Columbia Universi...|[Columbia, Univer...|[ColumbiaUniversi...|(12,[1,2,3,4,5,6,...|
|      doc 2|\nOperations Rese...|Operations Resear...|[Operations, Rese...|[OperationsResear...|(12,[0,1,2,9],[4....|
+-----------+--------------------+--------------------+--------------------+--------------------+--------------------+



In [14]:
// Print only the term_freqs column, untruncated, to see the full sparse count vectors.
freqs.select($"term_freqs").show(false)

+----------------------------------------------------------------------+
|term_freqs                                                            |
+----------------------------------------------------------------------+
|(12,[1,2,3,4,5,6,7,8,10,11],[1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0])|
|(12,[0,1,2,9],[4.0,1.0,1.0,1.0])                                      |
+----------------------------------------------------------------------+

