Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
1 parent
47a9f89
commit 928322a
Showing
5 changed files
with
136 additions
and
7 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,9 @@ | ||
1 hello 2 | ||
1 twitter 1 | ||
2 conversation 1 | ||
2 celebrities 1 | ||
2 twitter 1 | ||
3 elections 1 | ||
3 debate 1 | ||
3 twitter 1 | ||
3 political 1 |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,4 @@ | ||
1 2 1 | ||
1 3 1 | ||
3 2 1 | ||
4 2 2 |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,46 @@ | ||
import com.twitter.scalding._ | ||
import com.twitter.scalding.mathematics.Matrix | ||
|
||
/* | ||
* MatrixJaccardSimilarity9.scala | ||
* | ||
* Adapted from "MatrixTutorial5" in the tutorials that come with Scalding. | ||
* | ||
* Loads a directed graph adjacency matrix where a[i,j] = 1 if there is an edge from a[i] to b[j] | ||
* and computes the Jaccard similarity between every pair of its row vectors | ||
* | ||
* You invoke the script like this: | ||
* run.rb scripts/MatrixJaccardSimilarity9.scala \ | ||
* --input data/matrix/graph.tsv \ | ||
* --output output/jaccardSim.tsv | ||
* | ||
*/ | ||
|
||
/**
 * Scalding job computing the Jaccard similarity between the rows of a
 * directed-graph adjacency matrix.
 *
 * Reads ('user1, 'user2, 'rel) triples from the TSV given by `--input`,
 * binarizes the resulting matrix, and writes to the TSV given by `--output`
 * the value |row_i n row_j| / |row_i u row_j| for the (i, j) entries of the
 * intersection matrix.
 *
 * @param args job arguments; `input` and `output` are required
 */
class MatrixJaccardSimilarity9(args : Args) extends Job(args) {

  import Matrix._

  val adjacencyMatrix = Tsv(args("input"), ('user1, 'user2, 'rel))
    .read
    .toMatrix[Long,Long,Double]('user1, 'user2, 'rel)

  val aBinary = adjacencyMatrix.binarizeAs[Double]

  // intersectMat holds the size of the intersection of row(a)_i n row(a)_j.
  // Both of its axes are indexed by 'user1 (the row keys of aBinary).
  val intersectMat = aBinary * aBinary.transpose

  // Set size of every row, as a column vector keyed by 'user1.
  val aSumVct = aBinary.sumColVectors
  // The same row set sizes, as a row vector keyed by 'user1.
  // This must NOT be aBinary.sumRowVectors: that yields per-column totals keyed
  // by 'user2, which do not line up with the columns of intersectMat and give a
  // wrong union size whenever the graph is not symmetric.
  val bSumVct = aBinary.transpose.sumRowVectors

  // Using zip to repeat the row and column vector values on the right hand
  // for all non-zeroes on the left hand matrix
  val xMat = intersectMat.zip(aSumVct).mapValues( pair => pair._2 )
  val yMat = intersectMat.zip(bSumVct).mapValues( pair => pair._2 )

  // |A u B| = |A| + |B| - |A n B|
  val unionMat = xMat + yMat - intersectMat
  // We are guaranteed to have Double both in the intersection and in the union matrix
  intersectMat.zip(unionMat)
    .mapValues(pair => pair._1 / pair._2)
    .write(Tsv(args("output")))

}
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,42 @@ | ||
import com.twitter.scalding._ | ||
import com.twitter.scalding.mathematics.Matrix | ||
|
||
/* | ||
* TfIdf10.scala | ||
* | ||
* Adapted from "MatrixTutorial6" in the tutorials that come with Scalding. | ||
* | ||
* Loads a document to word matrix where a[i,j] = freq of the word j in the document i | ||
* computes the Tf-Idf score of each word w.r.t. each document and keeps the top nWords in each document | ||
* (see http://en.wikipedia.org/wiki/Tf*idf for more info) | ||
* | ||
* You invoke the script like this: | ||
* run.rb scripts/TfIdf10.scala \ | ||
* --input data/matrix/docBOW.tsv \ | ||
* --output output/featSelectedMatrix.tsv \ | ||
* --nWords 300 | ||
*/ | ||
/**
 * Scalding job that weights a document-by-word count matrix with an
 * inverse-frequency factor and keeps the highest-scoring words per document.
 *
 * Reads ('doc, 'word, 'count) triples from the TSV given by `--input` and
 * writes the retained entries to the TSV given by `--output`. `--nWords` is
 * the number of elements kept in each row.
 *
 * @param args job arguments; `input`, `output` and `nWords` are required
 */
class TfIdf10(args : Args) extends Job(args) {

  import Matrix._

  // Rows are documents, columns are words, values are counts.
  val docWordMatrix =
    Tsv(args("input"), ('doc, 'word, 'count))
      .read
      .toMatrix[Long, String, Double]('doc, 'word, 'count)

  // Sum of all row vectors: one total per word (column).
  val docFreq = docWordMatrix.sumRowVectors

  // Lift the totals into a one-row matrix (row key 1), L1-normalize that row,
  // then take log2 of the reciprocal of each normalized value.
  val invDocFreqVct =
    docFreq
      .toMatrix(1)
      .rowL1Normalize
      .mapValues(share => log2(1 / share))

  // Pair every entry of the document-word matrix with the inverse-frequency
  // value of its word, then keep only that value.
  val invDocFreqMat =
    docWordMatrix
      .zip(invDocFreqVct.getRow(1))
      .mapValues(_._2)

  // Element-wise product of counts and inverse frequencies, truncated to the
  // top `nWords` elements of each row.
  docWordMatrix
    .hProd(invDocFreqMat)
    .topRowElems(args("nWords").toInt)
    .write(Tsv(args("output")))

  /** Base-2 logarithm of `x`. */
  def log2(x : Double) = scala.math.log(x) / scala.math.log(2.0)

}
|