Skip to content

Commit

Permalink
T256312: same reference used twice in the same place
Browse files Browse the repository at this point in the history
  • Loading branch information
Nicolas authored and Nicolas committed Nov 4, 2020
1 parent 9520408 commit 71d184b
Show file tree
Hide file tree
Showing 11 changed files with 356 additions and 26 deletions.
2 changes: 1 addition & 1 deletion WikipediaCleaner/resources/tasks/enwiki/ListCheckWiki.txt
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,6 @@ DoTasks ListCheckWiki_Before.txt
Set Namespaces 0 6
Set Prefix [[User:WikiCleanerBot#T1|Bot_T1]]
ListCheckWiki C:\Users\Nicolas\Downloads\enwiki-$-pages-articles.xml.bz2 wiki:Wikipedia:CHECKWIKI/WPC_{0}_dump 1 2 3 4 5 7 8 9 13 14 15 16 17 19 20 23 24 25 26 28 29 38 42 46 48 54 55 64 69 70 71 72 73
ListCheckWiki C:\Users\Nicolas\Downloads\enwiki-$-pages-articles.xml.bz2 wiki:Wikipedia:CHECKWIKI/WPC_{0}_dump 83 85 88 90 91 92 98 99 100 101 102 103 104 105 106 107 108 109 111 504 513 543 547 548 549 550 551 552 553 554 557
ListCheckWiki C:\Users\Nicolas\Downloads\enwiki-$-pages-articles.xml.bz2 wiki:Wikipedia:CHECKWIKI/WPC_{0}_dump 83 85 88 90 91 92 98 99 100 101 102 103 104 105 106 107 108 109 111 504 513 543 547 548 549 550 551 552 553 554 557 558

DoTasks ListCheckWiki_After.txt
2 changes: 1 addition & 1 deletion WikipediaCleaner/resources/tasks/frwiki/ListCheckWiki.txt
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,6 @@ DoTasks _Common.txt
DoTasks ListCheckWiki_Before.txt

Set Namespaces 0 6
ListCheckWiki C:\Users\Nicolas\Downloads\frwiki-$-pages-articles.xml.bz2 wiki:Projet:Correction_syntaxique/Analyse_{0} 1 2 3 4 5 7 8 9 10 13 14 15 16 17 18 19 20 21 22 23 24 25 26 28 38 42 46 48 51 52 54 55 64 69 70 71 72 73 83 85 88 90 92 98 99 102 104 105 106 107 108 109 111 112 513 526 542 543 544 546 547 548 549 550 551 552 553 554 555 557
ListCheckWiki C:\Users\Nicolas\Downloads\frwiki-$-pages-articles.xml.bz2 wiki:Projet:Correction_syntaxique/Analyse_{0} 1 2 3 4 5 7 8 9 10 13 14 15 16 17 18 19 20 21 22 23 24 25 26 28 38 42 46 48 51 52 54 55 64 69 70 71 72 73 83 85 88 90 92 98 99 102 104 105 106 107 108 109 111 112 513 526 542 543 544 546 547 548 549 550 551 552 553 554 555 557 558

DoTasks ListCheckWiki_After.txt
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,8 @@ FixListCheckWiki Projet:Correction_syntaxique/Analyse_551 551
FixListCheckWiki Projet:Correction_syntaxique/Analyse_553 553
FixListCheckWiki Projet:Correction_syntaxique/Analyse_554 554
FixListCheckWiki Projet:Correction_syntaxique/Analyse_555 555
FixListCheckWiki Projet:Correction_syntaxique/Analyse_557 557
FixListCheckWiki Projet:Correction_syntaxique/Analyse_558 558

# File namespace
DoTasks _Common_Other.txt
Expand Down
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
DoTasks _Common.txt
Set Prefix [[Utilisateur:WikiCleanerBot#T4|Bot_T4]]
ListCheckWiki -onlyRecheck C:\Users\Nicolas\Downloads\frwiki-$-pages-articles.xml.bz2 wiki:Projet:Correction_syntaxique/Analyse_{0} 1 2 3 4 5 7 8 9 13 14 15 16 17 18 19 20 21 22 23 24 25 26 28 38 42 48 51 52 54 55 64 69 70 71 72 73 83 88 90 98 99 102 104 105 106 107 108 109 111 112 513 526 542 543 544 546 547 548 549 550 551
ListCheckWiki -onlyRecheck C:\Users\Nicolas\Downloads\frwiki-$-pages-articles.xml.bz2 wiki:Projet:Correction_syntaxique/Analyse_{0} 1 2 3 4 5 7 8 9 10 13 14 15 16 17 18 19 20 21 22 23 24 25 26 28 38 42 46 48 51 52 54 55 64 69 70 71 72 73 83 85 88 90 92 98 99 102 104 105 106 107 108 109 111 112 513 526 542 543 544 546 547 548 549 550 551 552 553 554 555 557 558
DoTasks ISBN_ISSN.txt
2 changes: 1 addition & 1 deletion WikipediaCleaner/resources/tasks/frwiki/_Common.txt
Original file line number Diff line number Diff line change
@@ -1,3 +1,3 @@
Set AdditionalAlgorithms 1 2 7 9 10 16 17 18 19 22 25 46 48 52 55 64 83 85 88 90 91 92 104 106 513 524 526 532 533 534 537 538 539 540 541 542 543 547 548 549 550 551 553 554 555 557
Set AdditionalAlgorithms 1 2 7 9 10 16 17 18 19 22 25 46 48 52 55 64 83 85 88 90 91 92 104 106 513 524 526 532 533 534 537 538 539 540 541 542 543 547 548 549 550 551 553 554 555 557 558
Set Configuration TimeBetweenEdit 5
Set Configuration MaxEditsPerMinute 10
Original file line number Diff line number Diff line change
Expand Up @@ -72,7 +72,7 @@ public boolean analyze(
while (refIndex < maxRefs) {

// Group references separated only by punctuation characters
int lastRefIndex = PageElement.groupElements(refs, refIndex, contents, ",;.\'", separator);
int lastRefIndex = PageElement.groupElements(refs, refIndex, contents, ",;.\'", separators);
result |= analyzeGroupOfTags(analysis, contents, errors, refs, refIndex, lastRefIndex);
refIndex = lastRefIndex + 1;
}
Expand Down Expand Up @@ -437,9 +437,12 @@ protected String internalAutomaticFix(PageAnalysis analysis) {
*/
@Override
protected void initializeSettings() {
separators.clear();
separator = getSpecificProperty(PARAMETER_SEPARATOR, true, false, false);
if (separator == null) {
separator = "";
} else {
separators.add(separator);
}

String tmp = getSpecificProperty(PARAMETER_TEMPLATES, true, true, false);
Expand All @@ -455,9 +458,12 @@ protected void initializeSettings() {
forceSeparator = Boolean.valueOf(tmp);
}

/** Separator between consecutive tags */
/** Valid separator between consecutive tags */
private String separator = "";

/** List of separators between consecutive tags */
private final List<String> separators = new ArrayList<>();

/** Force usage of separator between consecutive tags */
private boolean forceSeparator = false;

Expand All @@ -470,6 +476,12 @@ protected void initializeSettings() {
@Override
protected void addParameters() {
super.addParameters();
addParameter(new AlgorithmParameter(
PARAMETER_FORCE_SEPARATOR,
GT._T("To force the usage of the separator between consecutive {0} tags", "l&t;ref;gt;"),
new AlgorithmParameterElement(
"true/false",
GT._T("To force the usage of the separator between consecutive {0} tags", "l&t;ref;gt;"))));
addParameter(new AlgorithmParameter(
PARAMETER_SEPARATOR,
GT._T("Used as a separator between consecutive {0} tags", "&lt;ref&gt;"),
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,292 @@
/*
* WPCleaner: A tool to help on Wikipedia maintenance tasks.
* Copyright (C) 2013 Nicolas Vervelle
*
* See README.txt file for licensing information.
*/

package org.wikipediacleaner.api.check.algorithm;

import java.util.ArrayList;
import java.util.Collection;
import java.util.Collections;
import java.util.HashSet;
import java.util.List;
import java.util.Set;

import org.apache.commons.lang3.StringUtils;
import org.wikipediacleaner.api.algorithm.AlgorithmParameter;
import org.wikipediacleaner.api.algorithm.AlgorithmParameterElement;
import org.wikipediacleaner.api.check.CheckErrorResult;
import org.wikipediacleaner.api.configuration.WPCConfiguration;
import org.wikipediacleaner.api.configuration.WPCConfigurationString;
import org.wikipediacleaner.api.configuration.WPCConfigurationStringList;
import org.wikipediacleaner.api.data.Page;
import org.wikipediacleaner.api.data.PageElement;
import org.wikipediacleaner.api.data.PageElementFullTag;
import org.wikipediacleaner.api.data.PageElementTag;
import org.wikipediacleaner.api.data.PageElementTag.Parameter;
import org.wikipediacleaner.api.data.PageElementTemplate;
import org.wikipediacleaner.api.data.analysis.PageAnalysis;
import org.wikipediacleaner.api.data.contents.IntervalComparator;
import org.wikipediacleaner.i18n.GT;


/**
 * Algorithm for analyzing error 558 of check wikipedia project.
 * Error 558: Duplicated reference
 * <p>
 * References (tags and configured templates) are grouped when they are
 * separated only by punctuation characters or configured separators.
 * Inside a group, an error is reported when two references have exactly
 * the same text, or when two reference tags share the same <code>name</code>
 * (and the same <code>group</code>, or no <code>group</code> at all).
 */
public class CheckErrorAlgorithm558 extends CheckErrorAlgorithmBase {

  /** Punctuation characters allowed between two references of the same group. */
  private static final String PUNCTUATION = ",;.\'";

  public CheckErrorAlgorithm558() {
    super("Duplicated reference");
  }

  /**
   * Analyze a page to check if errors are present.
   *
   * @param analysis Page analysis.
   * @param errors Errors found in the page (may be null if only detection is needed).
   * @param onlyAutomatic True if analysis could be restricted to errors automatically fixed.
   * @return Flag indicating if the error was found.
   */
  @Override
  public boolean analyze(
      PageAnalysis analysis,
      Collection<CheckErrorResult> errors, boolean onlyAutomatic) {
    if (analysis == null) {
      return false;
    }

    // Nothing to do without references
    List<PageElement> refs = getRefs(analysis);
    if (refs.isEmpty()) {
      return false;
    }

    // Analyze from the beginning, one group of references at a time
    boolean result = false;
    String contents = analysis.getContents();
    int maxRefs = refs.size();
    int refIndex = 0;
    while (refIndex < maxRefs) {

      // Group references separated only by punctuation characters
      int lastRefIndex = PageElement.groupElements(refs, refIndex, contents, PUNCTUATION, separators);
      result |= analyzeGroupOfTags(analysis, contents, errors, refs, refIndex, lastRefIndex);
      refIndex = lastRefIndex + 1;
    }
    return result;
  }

  /**
   * Analyze a group of tags.
   * <p>
   * At most one error is reported for the group: the analysis stops at the
   * first pair of duplicated references.
   *
   * @param analysis Page analysis.
   * @param contents Page contents.
   * @param errors Errors found in the page (may be null if only detection is needed).
   * @param refs List of references.
   * @param firstRefIndex Index of the first reference of the group.
   * @param lastRefIndex Index of the last reference of the group.
   * @return True if the error was found in the group of tags.
   */
  private boolean analyzeGroupOfTags(
      PageAnalysis analysis, String contents,
      Collection<CheckErrorResult> errors,
      List<PageElement> refs,
      int firstRefIndex, int lastRefIndex) {

    // A group with a single reference can't contain a duplicate
    if (lastRefIndex == firstRefIndex) {
      return false;
    }

    for (int firstIndex = firstRefIndex; firstIndex < lastRefIndex; firstIndex++) {
      PageElement firstRef = refs.get(firstIndex);
      // NOTE(review): getRefs() wraps tags in PageElementFullTag; if that type
      // is not a subtype of PageElementTag, this test never succeeds and the
      // comparison on name/group below is never reached -- to be confirmed.
      PageElementTag firstRefTag = (firstRef instanceof PageElementTag) ? (PageElementTag) firstRef : null;
      String firstContent = contents.substring(firstRef.getBeginIndex(), firstRef.getEndIndex());

      for (int secondIndex = firstIndex + 1; secondIndex <= lastRefIndex; secondIndex++) {
        PageElement secondRef = refs.get(secondIndex);
        PageElement beforeSecondRef = refs.get(secondIndex - 1);
        String secondContent = contents.substring(secondRef.getBeginIndex(), secondRef.getEndIndex());

        // Case 1: both references have exactly the same text
        if (firstContent.equals(secondContent)) {
          if (errors == null) {
            return true;
          }
          CheckErrorResult errorResult = createCheckErrorResult(
              analysis, firstRef.getBeginIndex(), secondRef.getEndIndex());
          // Suggest to drop the second reference and the text just before it
          errorResult.addReplacement(
              contents.substring(firstRef.getBeginIndex(), beforeSecondRef.getEndIndex()),
              canRemoveBetween(contents, beforeSecondRef, secondRef));
          errors.add(errorResult);
          return true;
        }

        // Case 2: both references are tags with the same name and group
        if ((firstRefTag == null) || !(secondRef instanceof PageElementTag)) {
          continue;
        }
        PageElementTag secondRefTag = (PageElementTag) secondRef;
        if (!hasSameNameAndGroup(firstRefTag, secondRefTag)) {
          continue;
        }
        // Detection only: without this guard, errors.add() below would fail
        // with a NullPointerException when no collection of errors is provided
        if (errors == null) {
          return true;
        }
        CheckErrorResult errorResult = createCheckErrorResult(
            analysis, firstRef.getBeginIndex(), secondRef.getEndIndex());
        if (secondRefTag.isFullTag()) {
          // Suggest to drop the second reference and the text just before it
          errorResult.addReplacement(
              contents.substring(firstRef.getBeginIndex(), beforeSecondRef.getEndIndex()),
              canRemoveBetween(contents, beforeSecondRef, secondRef));
        } else if (firstRefTag.isFullTag()) {
          // Suggest to drop the first reference and the text just after it
          PageElement afterFirstRef = refs.get(firstIndex + 1);
          errorResult.addReplacement(
              contents.substring(afterFirstRef.getBeginIndex(), secondRef.getEndIndex()),
              canRemoveBetween(contents, firstRef, afterFirstRef));
        }
        // When none of the two tags is a full tag, the error is reported without suggestion
        errors.add(errorResult);
        return true;
      }
    }

    return false;
  }

  /**
   * Check if two reference tags designate the same named reference.
   *
   * @param firstTag First reference tag.
   * @param secondTag Second reference tag.
   * @return True if both tags have a <code>name</code> parameter with the same value,
   *         and either the same <code>group</code> value or no <code>group</code> parameter at all.
   */
  private static boolean hasSameNameAndGroup(PageElementTag firstTag, PageElementTag secondTag) {
    Parameter firstName = firstTag.getParameter("name");
    Parameter secondName = secondTag.getParameter("name");
    if ((firstName == null) ||
        (secondName == null) ||
        !StringUtils.equals(firstName.getValue(), secondName.getValue())) {
      return false;
    }

    Parameter firstGroup = firstTag.getParameter("group");
    Parameter secondGroup = secondTag.getParameter("group");
    if ((firstGroup == null) || (secondGroup == null)) {
      // A tag with a group never matches a tag without a group
      return (firstGroup == null) && (secondGroup == null);
    }
    return StringUtils.equals(firstGroup.getValue(), secondGroup.getValue());
  }

  /**
   * Check if text can be removed between references.
   *
   * @param contents Page contents.
   * @param previousRef Previous reference.
   * @param nextRef Next reference.
   * @return True if the text between the two references can be safely removed.
   */
  private boolean canRemoveBetween(
      String contents,
      PageElement previousRef,
      PageElement nextRef) {
    String text = contents.substring(previousRef.getEndIndex(), nextRef.getBeginIndex());
    // Text containing '' is kept (presumably wiki formatting that removal could unbalance)
    return !text.contains("''");
  }

  /**
   * @param analysis Page analysis.
   * @return List of references (tags, templates, ...), never null, sorted with {@link IntervalComparator}.
   */
  private List<PageElement> getRefs(PageAnalysis analysis) {
    List<PageElement> refs = new ArrayList<>();

    // Retrieve references defined by tags
    List<PageElementTag> refTags = analysis.getCompleteTags(PageElementTag.TAG_WIKI_REF);
    if (refTags != null) {
      for (PageElementTag refTag : refTags) {
        refs.add(new PageElementFullTag(refTag));
      }
    }

    // Retrieve references defined by templates
    if (!templatesName.isEmpty()) {
      List<PageElementTemplate> templates = analysis.getTemplates();
      for (PageElementTemplate template : templates) {
        if (templatesName.contains(template.getTemplateName())) {
          refs.add(template);
        }
      }
    }

    Collections.sort(refs, new IntervalComparator());
    return refs;
  }

  /**
   * Automatic fixing of all the errors in the page.
   * <p>
   * Pages that are not articles in the main namespace are returned unchanged.
   *
   * @param analysis Page analysis.
   * @return Page contents after fix.
   */
  @Override
  protected String internalAutomaticFix(PageAnalysis analysis) {
    if (!analysis.getPage().isArticle() ||
        !analysis.getPage().isInMainNamespace()) {
      return analysis.getContents();
    }
    return fixUsingAutomaticReplacement(analysis);
  }

  /* ====================================================================== */
  /* PARAMETERS                                                             */
  /* ====================================================================== */

  /** Separator between consecutive tags */
  private static final String PARAMETER_SEPARATOR = "separator";

  /** Templates that can replace a tag */
  private static final String PARAMETER_TEMPLATES = "templates";

  /**
   * Initialize settings for the algorithm.
   * <p>
   * The list of separators starts with the separator configured for this
   * algorithm (or, if none, the general reference separator), followed by
   * the other general separators not already in the list.
   *
   * @see org.wikipediacleaner.api.check.algorithm.CheckErrorAlgorithmBase#initializeSettings()
   */
  @Override
  protected void initializeSettings() {
    separators.clear();

    // Preferred separator: specific to the algorithm, otherwise general
    String separator = getSpecificProperty(PARAMETER_SEPARATOR, true, false, false);
    if (separator == null) {
      separator = getWPCConfiguration().getString(WPCConfigurationString.REF_SEPARATOR);
    }
    if (separator != null) {
      separators.add(separator);
    }

    // Other accepted separators, without duplicates
    List<String> otherSeparators = getWPCConfiguration().getStringList(WPCConfigurationStringList.REF_OTHER_SEPARATORS);
    if (otherSeparators != null) {
      for (String otherSeparator : otherSeparators) {
        if (!separators.contains(otherSeparator)) {
          separators.add(otherSeparator);
        }
      }
    }

    // Templates that can be used instead of reference tags
    String tmp = getSpecificProperty(PARAMETER_TEMPLATES, true, true, false);
    templatesName.clear();
    if (tmp != null) {
      List<String> names = WPCConfiguration.convertPropertyToStringList(tmp);
      if (names != null) {
        for (String name : names) {
          templatesName.add(Page.normalizeTitle(name));
        }
      }
    }
  }

  /** Separators between consecutive tags */
  private final List<String> separators = new ArrayList<>();

  /** Templates that can replace a tag */
  private final Set<String> templatesName = new HashSet<>();

  /**
   * Build the list of parameters for this algorithm.
   */
  @Override
  protected void addParameters() {
    super.addParameters();
    addParameter(new AlgorithmParameter(
        PARAMETER_SEPARATOR,
        GT._T("Used as a separator between consecutive {0} tags", "&lt;ref&gt;"),
        new AlgorithmParameterElement(
            "text",
            GT._T("Used as a separator between consecutive {0} tags", "&lt;ref&gt;"))));
    addParameter(new AlgorithmParameter(
        PARAMETER_TEMPLATES,
        GT._T("Templates that can be used to replace {0} tags", "&lt;ref&gt;"),
        new AlgorithmParameterElement(
            "template name",
            GT._T("Template that can be used to replace {0} tags", "&lt;ref&gt;")),
        true));
  }
}
Original file line number Diff line number Diff line change
Expand Up @@ -131,6 +131,8 @@ public enum WPCConfigurationString {
REDIRECT_TEMPLATES_COMMENT("redirect_templates_comment", null, false, true, true),
// Warning when replacing links to redirect pages by direct links
REDIRECT_WARNING_BEFORE_REPLACEMENT("redirect_warning_before_replacement", null, true, true, true),
// Preferred separator between consecutive <ref/> tags
REF_SEPARATOR("general_ref_separator", null, true, true, true),
// WPCleaner tag for modifications
TAG("general_wpcleaner_tag", null, false, true, true),
// "To do" sub-page
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -104,6 +104,8 @@ public enum WPCConfigurationStringList {
REDIRECT_TEMPLATES("redirect_templates", true, true, true),
// Templates that can be used instead of <references/>
REFERENCES_TEMPLATES("general_references_templates", true, true, true),
// Other separators between consecutive <ref/> tags
REF_OTHER_SEPARATORS("general_ref_other_separators", true, true, true),
// Templates to ask for help about RFC
RFC_HELP_NEEDED_TEMPLATES("general_rfc_help_needed_templates", true, true, true),
// Templates to ignore for RFC
Expand Down
Loading

0 comments on commit 71d184b

Please sign in to comment.