-
Notifications
You must be signed in to change notification settings - Fork 0
/
Tokenizer.java
129 lines (119 loc) · 3.51 KB
/
Tokenizer.java
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
/*
* To change this license header, choose License Headers in Project Properties.
* To change this template file, choose Tools | Templates
* and open the template in the editor.
*/
package Analyzing;
import Indexing.Dictionary;
import java.io.File;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Scanner;
import java.util.regex.Pattern;
/**
 * Splits text into lower-cased terms with punctuation and digits removed,
 * filters out stop words, and passes the remaining terms to a
 * {@link Dictionary}.
 *
 * @author rl07bebb
 */
public class Tokenizer {

    /**
     * Normalised stop words. Remains {@code null} when the tokenizer was
     * created without a stop list, in which case no term is filtered.
     */
    ArrayList<String> stopWords;

    /**
     * Matches every character that is not a letter or underscore, i.e. all
     * non-word characters ({@code \W}) and all digits ({@code \d}).
     */
    Pattern pattern = Pattern.compile("[\\W\\d]");

    /**
     * Creates a tokenizer without a stop list; every non-empty term is kept.
     */
    public Tokenizer() {
    }

    /**
     * Creates a tokenizer whose stop words are read from the given file.
     * Each whitespace-separated token is normalised exactly like document
     * terms (lower-cased, then stripped of non-letters) so that it can match
     * them; tokens that normalise to the empty string are skipped.
     * If the file cannot be opened the error is reported on
     * {@code System.err} and the stop list stays empty.
     *
     * @param stopList file containing the stop words
     */
    public Tokenizer(File stopList) {
        stopWords = new ArrayList<>();
        try (Scanner scanner = new Scanner(stopList)) {
            while (scanner.hasNext()) {
                String stopWord = normalize(scanner.next());
                if (!stopWord.isEmpty()) {
                    stopWords.add(stopWord);
                }
            }
        } catch (IOException e) {
            System.err.println("Tokenizer.Constructor:Error");
            // Report the cause instead of silently discarding it.
            System.err.println(e);
        }
    }

    /**
     * Reads the document's file token by token and passes every term that is
     * not a stop word to {@code dictionary.checkNewTerm} together with the
     * document and the term's position. Positions are 1-based and count every
     * non-empty normalised token, stop words included.
     * If the file cannot be opened the error is reported on
     * {@code System.err}.
     *
     * @param doc the document to be read from
     * @param dictionary the dictionary that receives the terms
     */
    public void readFile(Document doc, Dictionary dictionary) {
        int count = 0;
        try (Scanner scanner = new Scanner(doc.getFile())) {
            while (scanner.hasNext()) {
                String word = normalize(scanner.next());
                if (word.isEmpty()) {
                    continue;
                }
                // Stop words still advance the position counter.
                count++;
                if (!isStopWord(word)) {
                    dictionary.checkNewTerm(word, doc, count);
                }
            }
        } catch (IOException e) {
            System.err.println("Tokenizer.readFile():Error");
            // Report the cause instead of silently discarding it.
            System.err.println(e);
        }
    }

    /**
     * Tokenizes a query string the same way {@link #readFile} tokenizes a
     * file, passing {@code null} as the document to
     * {@code dictionary.checkNewTerm}. A {@code null} or empty query is
     * ignored.
     *
     * @param doc the query text
     * @param dictionary the dictionary that receives the query terms
     */
    public void readQuery(String doc, Dictionary dictionary) {
        if (doc == null || doc.isEmpty()) {
            return;
        }
        int count = 0;
        // try-with-resources closes the scanner even if checkNewTerm throws.
        try (Scanner scanner = new Scanner(doc)) {
            while (scanner.hasNext()) {
                String word = normalize(scanner.next());
                if (word.isEmpty()) {
                    continue;
                }
                // Stop words still advance the position counter.
                count++;
                if (!isStopWord(word)) {
                    dictionary.checkNewTerm(word, null, count);
                }
            }
        }
    }

    /**
     * Returns all stop words concatenated without a separator, or the text
     * {@code "stopWords = null"} when no stop list was loaded.
     *
     * @return the concatenated stop words
     */
    public String toStringAll() {
        if (stopWords == null) {
            return "stopWords = null";
        }
        StringBuilder all = new StringBuilder();
        for (String s : stopWords) {
            all.append(s);
        }
        return all.toString();
    }

    /**
     * Prints the result of {@link #toStringAll()} to {@code System.out}.
     */
    public void toPrintAll() {
        System.out.println(toStringAll());
    }

    /** Lower-cases the token and removes every character matched by {@link #pattern}. */
    private String normalize(String token) {
        return pattern.matcher(token.toLowerCase()).replaceAll("");
    }

    /** Tells whether the normalised word is in the stop list; false when no list was loaded. */
    private boolean isStopWord(String word) {
        return stopWords != null && stopWords.contains(word);
    }
}