forked from trananh/word2vec-scala
-
Notifications
You must be signed in to change notification settings - Fork 1
Expand file tree
/
Copy pathReader.scala
More file actions
130 lines (102 loc) · 3.66 KB
/
Reader.scala
File metadata and controls
130 lines (102 loc) · 3.66 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
package word2vec
import java.io._
import scala.Array
/** A vocabulary of word vectors keyed by word.
 * @param vectors Map from each word to its vector.
 * @param size The vector size; `VecBinaryReader.load` passes the vector size read from the file header.
 */
case class Vocab(vectors: Map[String, Array[Float]], size: Int)
/** Type class describing how to decode a value of type `A` from a binary stream.
 * @tparam A The type of value produced by the reader.
 */
trait TypeReader[A] {
  /** Decode the next value of type `A`, consuming bytes from the stream.
   * @param dis The stream to read from.
   * @return The decoded value.
   */
  def read(dis: DataInputStream): A
}

/** Implicit [[TypeReader]] instances for the primitive pieces of a binary vector file. */
object TypeReader {

  /** Summon the implicit reader for `A`. */
  def apply[A: TypeReader]: TypeReader[A] = implicitly[TypeReader[A]]

  /** Reads a single raw byte. */
  implicit object ByteReader extends TypeReader[Byte] {
    def read(dis: DataInputStream): Byte = dis.readByte()
  }

  /** Lifts a reader for `A` into a reader for an endless stream of `A`.
   * The head element is read immediately; every further element is read only
   * when the stream is forced that far.
   */
  implicit def readerToStream[A: TypeReader]: TypeReader[Stream[A]] = new TypeReader[Stream[A]] {
    def read(dis: DataInputStream): Stream[A] = TypeReader[A].read(dis) #:: read(dis)
  }

  /** Reads a token terminated by a space or a line feed. */
  implicit object StringReader extends TypeReader[String] {
    /** ASCII values for common delimiter characters */
    private val Space = 32
    private val LineFeed = 10
    private val Delimiters = Set(Space, LineFeed)

    /** Read bytes up to the next delimiter and decode them as UTF-8.
     * The delimiter itself is consumed but is not part of the result.
     * @param dis The stream to read from.
     * @return The decoded token (empty if the first byte is a delimiter).
     */
    def read(dis: DataInputStream): String = {
      // Collect raw bytes in a plain buffer instead of a boxed, memoised Stream[Byte].
      val token = new ByteArrayOutputStream()
      var next = dis.readByte()
      while (!Delimiters.contains(next.toInt)) {
        token.write(next.toInt)
        next = dis.readByte()
      }
      // Decode with a fixed charset so results do not depend on the platform default.
      new String(token.toByteArray, java.nio.charset.StandardCharsets.UTF_8)
    }
  }

  /** Reads a 4-byte float stored in little-endian byte order. */
  implicit object FloatReader extends TypeReader[Float] {
    def read(dis: DataInputStream): Float = {
      // readInt assembles the bytes big-endian, so swap them before reinterpreting.
      val bits = java.lang.Integer.reverseBytes(dis.readInt())
      java.lang.Float.intBitsToFloat(bits)
    }
  }

  /** Reads an integer written as delimiter-terminated decimal text.
   * Throws NumberFormatException if the token is not a valid integer.
   */
  implicit object IntReader extends TypeReader[Int] {
    def read(dis: DataInputStream): Int = TypeReader[String].read(dis).toInt
  }
}
/** Reader that decodes typed values from a binary file.
 * @constructor Open the given file for buffered binary reading.
 * @param file The binary file to be read.
 *
 * @author trananh
 */
class VecBinaryReader(file: File) {

  /** Buffered data stream layered directly over the file. */
  private val dis = new DataInputStream(new BufferedInputStream(new FileInputStream(file)))

  /** Close the stream; closing the outer stream also closes the streams it wraps. */
  def close(): Unit = dis.close()

  /** Decode the next value of type `A` using its implicit [[TypeReader]]. */
  def read[A: TypeReader]: A = implicitly[TypeReader[A]].read(dis)
}
/** Factory methods and vector-file loading helpers for [[VecBinaryReader]]. */
object VecBinaryReader {

  /** Open a reader over the given file. */
  def apply(file: File): VecBinaryReader = new VecBinaryReader(file)

  /** Open a reader over the file at the given path. */
  def apply(filename: String): VecBinaryReader = apply(new File(filename))

  /** Resolve a path to a file handle.
   * @param filename Path of the file.
   * @return The file, or None when nothing exists at that path.
   */
  def loadFile(filename: String): Option[File] = {
    val file = new File(filename)
    if (file.exists()) Some(file) else None
  }

  /** Run `f` against a reader over `file`, closing the reader afterwards
   * whether `f` returns normally or throws.
   * @param file The file to open.
   * @param f The computation to run with the open reader.
   * @return The result of `f`.
   */
  def withReader[A](file: File)(f: VecBinaryReader => A): A = {
    val reader = VecBinaryReader(file)
    try f(reader) finally reader.close()
  }

  /** Compute the magnitude of the vector.
   * @param vec The vector.
   * @return The magnitude of the vector.
   */
  def magnitude(vec: Array[Float]): Double =
    // Accumulate in Double so long vectors do not lose precision in a Float sum.
    math.sqrt(vec.foldLeft(0.0)((acc, a) => acc + a.toDouble * a.toDouble))

  /** Scale the vector to unit length.
   * @param vec The vector.
   * @return A new array holding the normalized vector; an all-zero vector is
   *         returned as an unchanged copy instead of being turned into NaNs.
   */
  def normVector(vec: Array[Float]): Array[Float] = {
    val norm = magnitude(vec)
    if (norm == 0.0) vec.clone() else vec.map(a => (a / norm).toFloat)
  }

  /** Read one record: a word, `vecSize` floats, and one trailing delimiter byte.
   * @param reader The reader positioned at the start of a record.
   * @param vecSize Number of floats in the vector.
   * @param normalize Whether to normalize the vector to unit length.
   * @return The word paired with its (optionally normalized) vector.
   */
  def readVector(reader: VecBinaryReader, vecSize: Int, normalize: Boolean): (String, Array[Float]) = {
    // Read the word
    val word = reader.read[String]
    // Read exactly vecSize floats; nothing is consumed when vecSize is 0.
    val vector = new Array[Float](vecSize)
    vector.indices.foreach(i => vector(i) = reader.read[Float])
    // Eat up the next delimiter character
    reader.read[Byte]
    // Pair the word with its (optionally normalized) vector representation
    word -> (if (normalize) normVector(vector) else vector)
  }

  /** Load a vocabulary from a binary vector file.
   * The file starts with two text integers (word count, vector size) followed
   * by that many records in the layout read by `readVector`.
   * @param filename Path of the file to load.
   * @param limit Maximum number of words to load.
   * @param normalize Whether to normalize each vector to unit length.
   * @return The loaded vocabulary, or None when the file does not exist.
   */
  def load(filename: String, limit: Integer = Int.MaxValue, normalize: Boolean = true): Option[Vocab] =
    loadFile(filename).map { file =>
      withReader(file) { reader =>
        // Read header info
        val numWords = reader.read[Int]
        val vecSize = reader.read[Int]
        println("\nFile contains " + numWords + " words with vector size " + vecSize)
        val wordCount = numWords.min(limit)
        // An Iterator reads a record only when asked for one, so a word count
        // of zero consumes nothing past the header.
        val vectors = Iterator.continually(readVector(reader, vecSize, normalize)).take(wordCount).toMap
        println("Loaded " + wordCount + " words.\n")
        Vocab(vectors, vecSize)
      }
    }
}