Permalink
Switch branches/tags
Nothing to show
Find file Copy path
Fetching contributors…
Cannot retrieve contributors at this time
189 lines (172 sloc) 6.53 KB
/*
* WPCleaner: A tool to help on Wikipedia maintenance tasks.
* Copyright (C) 2013 Nicolas Vervelle
*
* See README.txt file for licensing information.
*/
package org.wikipediacleaner.api.check.algorithm;
import java.util.Collection;
import java.util.HashMap;
import java.util.List;
import org.wikipediacleaner.api.check.CheckErrorResult;
import org.wikipediacleaner.api.check.CheckErrorResult.ErrorLevel;
import org.wikipediacleaner.api.data.PageAnalysis;
import org.wikipediacleaner.api.data.PageElementTitle;
/**
* Algorithm for analyzing error 92 of check wikipedia project.
* Error 92: Headline double
*/
public class CheckErrorAlgorithm092 extends CheckErrorAlgorithmBase {
public CheckErrorAlgorithm092() {
super("Headline double");
}
/**
* Analyze a page to check if errors are present.
*
* @param analysis Page analysis.
* @param errors Errors found in the page.
* @param onlyAutomatic True if analysis could be restricted to errors automatically fixed.
* @return Flag indicating if the error was found.
*/
@Override
public boolean analyze(
PageAnalysis analysis,
Collection<CheckErrorResult> errors, boolean onlyAutomatic) {
if ((analysis == null) || (analysis.getPage() == null)) {
return false;
}
if (!analysis.getPage().isArticle()) {
return false;
}
boolean result = false;
int previousTitleLevel = 0;
HashMap<Integer, HashMap<String, PageElementTitle>> titles =
new HashMap<Integer, HashMap<String, PageElementTitle>>();
for (PageElementTitle title : analysis.getTitles()) {
// Clean up titles with a lower level
int titleLevel = title.getLevel();
if (titleLevel < previousTitleLevel) {
for (int i = previousTitleLevel; i > titleLevel; i--) {
titles.remove(Integer.valueOf(i));
}
}
// Analyze current level
HashMap<String, PageElementTitle> knownTitles = titles.get(Integer.valueOf(titleLevel));
String titleValue = title.getTitle();
if (knownTitles == null) {
knownTitles = new HashMap<String, PageElementTitle>();
knownTitles.put(titleValue, title);
titles.put(Integer.valueOf(titleLevel), knownTitles);
} else if (!knownTitles.containsKey(titleValue)) {
knownTitles.put(titleValue, title);
} else {
if (errors == null) {
return true;
}
result = true;
PageElementTitle previousTitle = knownTitles.get(titleValue);
if (previousTitle != null) {
CheckErrorResult errorResult = createCheckErrorResult(
analysis,
previousTitle.getBeginIndex(), previousTitle.getEndIndex(),
ErrorLevel.CORRECT);
errors.add(errorResult);
knownTitles.put(titleValue, null);
}
CheckErrorResult errorResult = createCheckErrorResult(
analysis,
title.getBeginIndex(), title.getEndIndex());
errorResult.addEditTocAction(title);
errors.add(errorResult);
}
previousTitleLevel = titleLevel;
}
return result;
}
/**
* Automatic fixing of all the errors in the page.
*
* @param analysis Page analysis.
* @return Page contents after fix.
*/
@Override
protected String internalAutomaticFix(PageAnalysis analysis) {
String contents = analysis.getContents();
if ((!analysis.getPage().isArticle()) ||
(!analysis.getPage().isInMainNamespace()) ||
(!analysis.areTitlesReliable())) {
return contents;
}
List<PageElementTitle> titles = analysis.getTitles();
if ((titles == null) || (titles.size() < 2)) {
return contents;
}
// Fix double headlines
int lastIndex = 0;
StringBuilder buffer = new StringBuilder();
for (int i = 1; i < titles.size(); i++) {
PageElementTitle previousTitle = titles.get(i - 1);
PageElementTitle currentTitle = titles.get(i);
if ((previousTitle.getLevel() == currentTitle.getLevel()) &&
(previousTitle.getTitle().equals(currentTitle.getTitle()))) {
// Analyze if first title can be removed
String betweenTitles = contents.substring(
previousTitle.getEndIndex(), currentTitle.getBeginIndex()).trim();
boolean shouldRemove = false;
if (betweenTitles.length() == 0) {
shouldRemove = true;
} else {
int tmpIndex = currentTitle.getEndIndex();
while ((tmpIndex < contents.length()) &&
(Character.isWhitespace(contents.charAt(tmpIndex)))) {
tmpIndex++;
}
if ((tmpIndex < contents.length()) &&
(contents.startsWith(betweenTitles, tmpIndex))) {
shouldRemove = true;
}
}
if (shouldRemove) {
if (previousTitle.getBeginIndex() > lastIndex) {
buffer.append(contents.substring(lastIndex, previousTitle.getBeginIndex()));
lastIndex = previousTitle.getBeginIndex();
}
lastIndex = currentTitle.getBeginIndex();
} else {
// Analyze if second title can be removed
PageElementTitle nextTitle = (i + 1 < titles.size()) ? titles.get(i + 1) : null;
String afterTitle = contents.substring(
currentTitle.getEndIndex(),
(nextTitle != null) ? nextTitle.getBeginIndex() : contents.length()).trim();
if (afterTitle.length() == 0) {
shouldRemove = true;
} else {
int tmpIndex = previousTitle.getEndIndex();
while ((tmpIndex < contents.length()) &&
(Character.isWhitespace(contents.charAt(tmpIndex)))) {
tmpIndex++;
}
if ((tmpIndex < contents.length()) &&
(contents.startsWith(afterTitle, tmpIndex))) {
shouldRemove = true;
}
}
if (shouldRemove) {
if (currentTitle.getBeginIndex() > lastIndex) {
buffer.append(contents.substring(lastIndex, currentTitle.getBeginIndex()));
lastIndex = currentTitle.getBeginIndex();
}
lastIndex = (nextTitle != null) ? nextTitle.getBeginIndex() : contents.length();
}
}
}
}
if (lastIndex == 0) {
return contents;
}
if (lastIndex < contents.length()) {
buffer.append(contents.substring(lastIndex));
}
return buffer.toString();
}
}