/
Library.scala
154 lines (125 loc) · 3.98 KB
/
Library.scala
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
package info.kwarc.mmt.oeis
import java.io.File
import java.net.URL
import java.util.Calendar
import scala.io.Source
import scala.util.Random
import scala.xml.{Elem, XML}
import parser.{DocumentParser}
/**
 * Utilities for fetching OEIS entries (over HTTP or from a local mirror) and
 * running them through a [[parser.DocumentParser]], plus a `main` entry point
 * that processes a random sample of locally stored entries.
 *
 * All input/output locations are hard-coded paths; see the individual members.
 */
object Library {
  //store everything, check before crawling
  //hardcoded for now
  /** Trimmed lines of the dictionary file, loaded once when the object is initialised. */
  val dictionary: Set[String] = {
    val source = Source.fromFile("../../mmt-oeis/resources/dictionary")
    // `toSet` consumes the iterator eagerly, so the source can be closed right away.
    try source.getLines().map(_.trim).toSet
    finally source.close()
  }

  /** Parser shared by all crawl methods; constructed from [[dictionary]]. */
  val docParser: DocumentParser = new DocumentParser(dictionary)

  /** Builds the OEIS text-format search URL for the given entry id. */
  private def getURL(entryID: String): URL =
    new URL("""http://oeis.org/search?q=id:""" + entryID + """&fmt=text""")

  /**
   * Builds the id of the number-th OEIS entry: "A" followed by the number,
   * left-padded with zeros to at least six digits (e.g. "45" -> "A000045").
   *
   * Numbers longer than six digits are used unpadded instead of failing with a
   * `StringIndexOutOfBoundsException`.
   */
  private def createID(number: String): String = {
    val padding = "0" * math.max(0, 6 - number.length)
    "A" + padding + number
  }

  /**
   * Rejects ranges that start below the first entry.
   *
   * @throws java.lang.Error if `from` is smaller than 1
   */
  private def requireValidStart(from: Int): Unit =
    if (from < 1) {
      throw new Error("There is no entry " + from + " in OEIS!")
    }

  /**
   * Applies `use` to `source` and closes the source afterwards, even when `use` throws.
   * The result of `use` must not read from the source after this method returns.
   */
  private def withSource[S <: Source, A](source: S)(use: S => A): A =
    try use(source)
    finally source.close()

  /** Prints a progress line for every `every`-th entry index. */
  private def reportProgress(index: Int, every: Int, theory: String): Unit =
    if (index % every == 0) {
      println("Fetching entry " + theory)
    }

  /**
   * Downloads the entries `from` to `to` (inclusive) and stores each one verbatim
   * in the file `resources/<id>`.
   *
   * @throws java.lang.Error if `from` is smaller than 1
   */
  def crawlDocuments(from: Int, to: Int): Unit = {
    requireValidStart(from)
    (from to to).foreach { i =>
      val theory = createID(i.toString)
      withSource(Source.fromURL(getURL(theory))) { remote =>
        printToFile(new File("resources/" + theory)) { p =>
          remote.getLines().foreach(p.println)
        }
      }
      reportProgress(i, 10, theory)
    }
  }

  /**
   * Downloads the entries `from` to `to` (inclusive), converts each one to XML with
   * [[docParser]] and saves it via [[writeXML]].
   *
   * @throws java.lang.Error if `from` is smaller than 1
   */
  def crawlXML(from: Int, to: Int): Unit = {
    requireValidStart(from)
    (from to to).foreach { i =>
      val theory = createID(i.toString)
      val xml = withSource(Source.fromURL(getURL(theory)))(remote => docParser.fromReaderToXML(remote))
      reportProgress(i, 10, theory)
      writeXML(xml, theory)
    }
  }

  /**
   * Same as [[crawlXML]], but reads each entry from the hard-coded local directory
   * (`<id>.txt`) instead of the network. Entries whose file does not exist are
   * reported on standard output and skipped.
   *
   * @throws java.lang.Error if `from` is smaller than 1
   */
  def crawlXMLLocal(from: Int, to: Int): Unit = {
    requireValidStart(from)
    (from to to).foreach { i =>
      val theory = createID(i.toString)
      val fileLoc = "/home/enxhi/github/OEISLAB/oeis/source/oeis_source/" + theory + ".txt"
      val ioFile = new File(fileLoc)
      if (ioFile.exists) {
        val xml = withSource(Source.fromFile(ioFile))(local => docParser.fromReaderToXML(local))
        reportProgress(i, 1000, theory)
        writeXML(xml, theory)
      } else {
        println("File doesn't exists: " + theory)
      }
    }
  }

  /**
   * Downloads the entries `from` to `to` (inclusive), extracts their formulas with
   * [[docParser]] and writes them via [[writeFormula]].
   *
   * @throws java.lang.Error if `from` is smaller than 1
   */
  def crawlText(from: Int, to: Int): Unit = {
    requireValidStart(from)
    (from to to).foreach { i =>
      val theory = createID(i.toString)
      // The remote source is now closed after parsing; it used to be left open.
      val doc: List[String] =
        withSource(Source.fromURL(getURL(theory)))(remote => docParser.getFormulas(remote))
      reportProgress(i, 10, theory)
      writeFormula(doc, theory)
    }
  }

  /**
   * Downloads a single entry and returns its XML representation.
   *
   * @param entry number of the OEIS entry (without the "A" prefix and padding)
   */
  def getXML(entry: Int): Elem = {
    val id = createID(entry.toString)
    // TODO (from the original code): consult a local storage/cache for `id`
    // before going to the network.
    withSource(Source.fromURL(getURL(id)))(remote => docParser.fromReaderToXML(remote))
  }

  /**
   * Opens a `PrintWriter` on `f`, hands it to `op` and closes it afterwards,
   * even when `op` throws. An existing file is overwritten.
   */
  def printToFile(f: java.io.File)(op: java.io.PrintWriter => Unit): Unit = {
    val writer = new java.io.PrintWriter(f)
    try {
      op(writer)
    } finally {
      writer.close()
    }
  }

  /** Saves `xml` as `xml_out/<theory>.omdoc`, UTF-8 encoded, with an XML declaration. */
  def writeXML(xml: Elem, theory: String): Unit = {
    XML.save("xml_out/" + theory + ".omdoc", xml, "UTF-8", true, null)
  }

  /** Writes one formula per line to the file `xml_out/<theory>`. */
  def writeFormula(formulas: List[String], theory: String): Unit = {
    printToFile(new File("xml_out/" + theory)) { p =>
      formulas.foreach(p.println)
    }
  }

  /**
   * Converts a random sample of locally stored entries and writes one `cp` command
   * per sampled entry to the script file, then prints the text parser's counters.
   * Start and end times are printed to standard output.
   */
  def main(args: Array[String]): Unit = {
    //    crawlXMLLocal(1, 3000)
    println(Calendar.getInstance().getTime())
    val documents = 5000
    val max = 255000
    val scriptPath = "/home/enxhi/github/logs/oeis_sc"
    println(1 to 1) // NOTE(review): looks like leftover debug output; kept so the output is unchanged
    val rndm = new Random()

    // The script file is opened once for the whole run. Previously it was reopened
    // (and thereby truncated) for every sample and the command string was built
    // but never written, so the file always ended up empty.
    printToFile(new File(scriptPath)) { script =>
      (1 to documents).foreach { _ =>
        val n = rndm.nextInt(max) + 1
        val theory = createID(n.toString)
        script.println("cp ../../oeis/source/oeis_omdoc/" + theory + " ../source/")
        crawlXMLLocal(n, n)
      }
    }

    println(docParser.textParser.succeded)
    println(docParser.textParser.calls)
    println(docParser.textParser.exceptions)
    println(Calendar.getInstance().getTime())
  }
}