diff --git a/src/main/java/focusedCrawler/util/vsm/VSMVector.java b/src/main/java/focusedCrawler/util/vsm/VSMVector.java index 19a2d7e81..183629543 100644 --- a/src/main/java/focusedCrawler/util/vsm/VSMVector.java +++ b/src/main/java/focusedCrawler/util/vsm/VSMVector.java @@ -1,33 +1,5 @@ -/* -############################################################################ -## -## Copyright (C) 2006-2009 University of Utah. All rights reserved. -## -## This file is part of DeepPeep. -## -## This file may be used under the terms of the GNU General Public -## License version 2.0 as published by the Free Software Foundation -## and appearing in the file LICENSE.GPL included in the packaging of -## this file. Please review the following to ensure GNU General Public -## Licensing requirements will be met: -## http://www.opensource.org/licenses/gpl-license.php -## -## If you are unsure which license is appropriate for your use (for -## instance, you are interested in developing a commercial derivative -## of DeepPeep), please contact us at deeppeep@sci.utah.edu. -## -## This file is provided AS IS with NO WARRANTY OF ANY KIND, INCLUDING THE -## WARRANTY OF DESIGN, MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE. -## -############################################################################ -*/ package focusedCrawler.util.vsm; -import java.io.BufferedReader; -import java.io.File; -import java.io.FileReader; -import java.io.IOException; -import java.io.StringReader; import java.net.MalformedURLException; import java.net.URL; import java.util.Collections; @@ -36,27 +8,10 @@ import java.util.Map; import java.util.Vector; -import org.cyberneko.html.parsers.DOMParser; -import org.w3c.dom.Document; -import org.w3c.dom.NamedNodeMap; -import org.w3c.dom.Node; -import org.w3c.dom.NodeList; -import org.xml.sax.InputSource; -import org.xml.sax.SAXException; - import focusedCrawler.util.parser.PaginaURL; import focusedCrawler.util.string.PorterStemmer; import focusedCrawler.util.string.StopList; -/** - *

Title:

- * - *

Description:

- * - * @author not attributable - * @version 1.0 - */ - public class VSMVector { private HashMap elems; @@ -67,96 +22,10 @@ public class VSMVector { private String id; - public VSMVector() { - this.elems = new HashMap<>(); - } - public VSMVector(StopList stoplist) { this.elems = new HashMap<>(); this.stoplist = stoplist; } - - public VSMVector(String file, boolean isForm, StopList stoplist) throws MalformedURLException, IOException, SAXException { - this.stoplist = stoplist; - this.elems = new HashMap<>(); - if(isForm){ - DOMParser parser = new DOMParser(); - if((file.toLowerCase()).indexOf("
2 ){ - word = "meta" + word; - VSMElement vsmElem = this.getElement(word); - if(vsmElem == null){ - this.addElement(new VSMElement(word,metaOccurrencies[i])); - }else{ - double weight = vsmElem.getWeight(); - this.addElement(new VSMElement(word,weight+1)); - } - } - } - - for (int i = 0; i < titleWords.length; i++) { - String word = titleWords[i].toLowerCase(); - - word = stemmer.stem(word); - if(word.indexOf("No term") != -1){ - continue; - } - if(word.length() > 2 ){ - word = "title" + word; - VSMElement vsmElem = this.getElement(word); - if(vsmElem == null){ - this.addElement(new VSMElement(word,1)); - }else{ - double weight = vsmElem.getWeight(); - this.addElement(new VSMElement(word,weight+1)); - } - } - } - } public VSMVector(String document, StopList stoplist, boolean stem) throws MalformedURLException { if(!document.contains("")){ @@ -176,7 +45,6 @@ public VSMVector(String document, StopList stoplist, boolean stem) throws Malf if(frequencies[i] == 0){ continue; } -// if(words[i].length() > 2 ){ VSMElement vsmElem = this.getElement(words[i]); if(vsmElem == null){ this.addElement(new VSMElement(words[i],1)); @@ -184,57 +52,58 @@ public VSMVector(String document, StopList stoplist, boolean stem) throws Malf double weight = vsmElem.getWeight(); this.addElement(new VSMElement(words[i],1+weight)); } -// } } } } - public String getId(){ - return this.id; - } - - public VSMVector(String id, String document, StopList stoplist) throws - MalformedURLException { - this(document,stoplist); - this.id = id; - } - - public VSMVector(String document, StopList stoplist) throws MalformedURLException { - - if(!document.contains("")){ - document = " " + document + " "; - } - this.stoplist = stoplist; - this.elems = new HashMap<>(); - PaginaURL page = new PaginaURL(new URL("http://www"), document, stoplist); - addTitle(page, stoplist); - stemPage(page, false); + private void addTitle(PaginaURL page, StopList stoplist) throws MalformedURLException { + this.stoplist = stoplist; + PaginaURL title = new PaginaURL(new URL("http://www"), page.titulo(), stoplist); + String[] titleWords = title.palavras(); + String[] metaTerms = page.palavrasMeta(); + int[] metaOccurrencies = page.ocorrenciasMeta(); + for (int i = 0; i < metaTerms.length; i++) { + String word = metaTerms[i].toLowerCase(); + word = stemmer.stem(word); + if (word.indexOf("No term") != -1) { + continue; + } + if (word.length() > 2) { + word = "meta" + word; + VSMElement vsmElem = this.getElement(word); + if (vsmElem == null) { + this.addElement(new VSMElement(word, metaOccurrencies[i])); + } else { + double weight = vsmElem.getWeight(); + this.addElement(new VSMElement(word, weight + 1)); + } + } + } - } + for (int i = 0; i < titleWords.length; i++) { + String word = titleWords[i].toLowerCase(); - public VSMVector(PaginaURL page, StopList stoplist) throws MalformedURLException { - this.stoplist = stoplist; - this.elems = new HashMap<>(); - stemPage(page, false); - } + word = stemmer.stem(word); + if (word.indexOf("No term") != -1) { + continue; + } + if (word.length() > 2) { + word = "title" + word; + VSMElement vsmElem = this.getElement(word); + if (vsmElem == null) { + this.addElement(new VSMElement(word, 1)); + } else { + double weight = vsmElem.getWeight(); + this.addElement(new VSMElement(word, weight + 1)); + } + } + } + } - - public VSMVector(String []words, StopList stoplist) throws MalformedURLException, IOException, SAXException { - this.stoplist = stoplist; - String word; - - for (int i = 0; i < words.length; i++) { - word = stemmer.stem(words[i]); - VSMElement vsmElem = this.getElement(word); - if(vsmElem == null){ - this.addElement(new VSMElement(word, 1)); - }else{ - double weight = vsmElem.getWeight(); - this.addElement(new VSMElement(word, 1+weight)); - } - } + public String getId(){ + return this.id; } - + public void addElements(String []words) { for (int i = 0; i < words.length; i++) { this.addElement(words[i]); @@ -247,8 +116,6 @@ public void addElement(String word) { public void addElement(VSMElement elem){ -// if(!stoplist.eIrrelevante(elem.getWord())){ -// word = stemmer.stem(word); VSMElement vsmElem = this.getElement(elem.getWord()); if(vsmElem == null){ elems.put(elem.getWord(), elem); @@ -256,7 +123,6 @@ public void addElement(VSMElement elem){ double weight = vsmElem.getWeight(); elems.put(elem.getWord(),new VSMElement(elem.getWord(), elem.getWeight()+weight)); } -// } } public VSMElement getElement(String word){ @@ -360,13 +226,10 @@ public double jaccardSimilarity(VSMVector vectorB){ VSMElement elemA = iterA.next(); VSMElement elemB = vectorB.getElement(elemA.getWord()); if( elemB != null){ //overlap -// numerator = numerator + elemA.getWeight()*elemB.getWeight(); numerator = numerator + 1; } } double denominator = vectorA.size() + vectorB.size() - numerator; -// System.out.println("NUMERATOR:"+numerator); -// System.out.println("NUMERATOR:"+denominator); return numerator/denominator; } @@ -378,7 +241,6 @@ public double intersection(VSMVector vectorB){ VSMElement elemA = iterA.next(); VSMElement elemB = vectorB.getElement(elemA.getWord()); if( elemB != null){ //overlap -// numerator = numerator + elemA.getWeight()*elemB.getWeight(); numerator = numerator + 1; } } @@ -435,16 +297,6 @@ public double vectorSpaceSimilarity(VSMVector vectorB){ double den = (Math.sqrt(denominatorA)*Math.sqrt(denominatorB)); double weight = numerator/den; -// if(weight > 0.52 && weight < 0.53){ -// System.out.println("A:" + vectorA.toString()); -// System.out.println("B:" + vectorB.toString()); -// System.out.println("NUMERATOR:"+numerator); -// System.out.println("DENOMINA:"+denominatorA); -// System.out.println("DENOMINB:"+denominatorB); -// System.out.println("DENOMIN:"+den); -// System.out.println("SIM:"+weight); -// } - return weight; } @@ -479,8 +331,7 @@ public void negativeVector(){ } } - public static HashMap calculateIDFs(VSMVector[] vectors) throws IOException, - SAXException { + public static HashMap calculateIDFs(VSMVector[] vectors) { HashMap idfs = new HashMap<>(); @@ -505,8 +356,7 @@ public static HashMap calculateIDFs(VSMVector[] vectors) throws return idfs; } - public HashMap calculateWordOccurence(VSMVector[] vectors) throws IOException, - SAXException { + public HashMap calculateWordOccurence(VSMVector[] vectors) { HashMap idfs = new HashMap<>(); @@ -664,92 +514,4 @@ public void remove(String word){ elems.remove(word); } - private void parse(Node node, StringBuffer source, StringBuffer sourceTemp, String father,StopList stoplist) { -// System.out.println(node.getClass().getName()); -// System.out.println("Name "+ node.getNodeName()); -// System.out.println("Type "+ node.getNodeType()); -// System.out.println("Value "+ node.getNodeValue()); - if(node == null){ - return; - } - String value = node.getNodeValue() + " of"; - if(Node.TEXT_NODE == node.getNodeType()){ - - if(value.trim().indexOf("<") == -1){ - - PaginaURL pageTemp = null; - String[] words = null; - try { - pageTemp = new PaginaURL(new URL("http://www"),value, stoplist); - words = pageTemp.palavras(); - } - catch (MalformedURLException ex) { - - } - for(int i = 0; words != null && i < words.length; i++){ - -// String stem = stemmer.stem(words[i]); -// if(stem.equals("Invalid term")){ -// stem = words[i]; -// } -// if(stem.indexOf("check") != -1){ -// stem = "check"; -// } -// -// if(!father.equals("OPTION")){ -// source.append("body"); -// source.append(stem); -// source.append(" "); -// }else{ -// source.append(stem); -// source.append(" "); -// } - String stem = words[i]; - try{ - stem = stemmer.stem(words[i]); - }catch(Exception e){ - } - if(!father.equals("OPTION")){ - - if(stem.equals("Invalid term")){ - stem = words[i]; - } - if(stem.indexOf("check") != -1){ - stem = "check"; - } - source.append("body"); - source.append(stem); - source.append(" "); - }else{ - source.append(stem); - source.append(" "); - } - } - - - } - return; - } - if(node.getNodeName().equals("INPUT")){ - NamedNodeMap attrs = node.getAttributes(); - for (int i = 0; i < attrs.getLength(); i++) { - Node attr = attrs.item(i); - String attrName = ((attr.getNodeName().trim()).toLowerCase()); - String attrValue = ((attr.getNodeValue().trim()).toLowerCase()); - if(attrName.equals("type") && !attrValue.equals("hidden")){ - source = source.append(sourceTemp); - sourceTemp.delete(0,sourceTemp.length()); - } - } - } - father = node.getNodeName(); - NodeList children = node.getChildNodes(); - if (children != null) { - int len = children.getLength(); - for (int i = 0; i < len; i++){ - parse(children.item(i),source,sourceTemp, father, stoplist); - } - } - } - }