Skip to content

Commit

Permalink
Adds script to remove duplicates, refs 2883 (#2932)
Browse files Browse the repository at this point in the history
  • Loading branch information
mwjames committed Jan 6, 2018
1 parent c6a27a3 commit fcc1755
Show file tree
Hide file tree
Showing 11 changed files with 428 additions and 6 deletions.
2 changes: 1 addition & 1 deletion includes/storage/SQLStore/SMW_Sql3SmwIds.php
Expand Up @@ -501,7 +501,7 @@ protected function getDatabaseIdAndSort( $title, $namespace, $iw, $subobjectName
*
* @return []
*/
public function findDuplicateEntries() {
public function findDuplicateEntityRecords() {

$connection = $this->store->getConnection( 'mw.db' );

Expand Down
84 changes: 84 additions & 0 deletions maintenance/removeDuplicateEntities.php
@@ -0,0 +1,84 @@
<?php

namespace SMW\Maintenance;

use SMW\ApplicationFactory;

// Locate the MediaWiki installation: an explicitly set MW_INSTALL_PATH
// environment variable wins, otherwise fall back to the directory three
// levels above this script.
if ( getenv( 'MW_INSTALL_PATH' ) !== false ) {
	$basePath = getenv( 'MW_INSTALL_PATH' );
} else {
	$basePath = __DIR__ . '/../../..';
}

// Provides the \Maintenance base class this script extends.
require_once $basePath . '/maintenance/Maintenance.php';

/**
 * Maintenance script that looks up duplicate entity records and disposes of
 * those that have no active references. Records that are still referenced are
 * left alone and listed in the 'untouched' section of the printed log.
 *
 * @license GNU GPL v2+
 * @since 3.0
 *
 * @author mwjames
 */
class RemoveDuplicateEntities extends \Maintenance {

	/**
	 * Registers the script description and its command line option.
	 *
	 * @since 3.0
	 */
	public function __construct() {
		$this->mDescription = 'Remove duplicates entities without active references.';

		// NOTE(review): execute() never reads the 's' option; confirm whether
		// it is still needed before relying on it.
		$this->addOption( 's', 'ID starting point', false, true );

		parent::__construct();
	}

	/**
	 * @see Maintenance::addDefaultParams
	 *
	 * @since 3.0
	 */
	protected function addDefaultParams() {
		parent::addDefaultParams();
	}

	/**
	 * Prints an explanatory notice, then searches for duplicate entity records
	 * and hands them to the disposer.
	 *
	 * @see Maintenance::execute
	 *
	 * @return boolean true once the run has completed
	 */
	public function execute() {

		if ( !defined( 'SMW_VERSION' ) ) {
			$this->output( "You need to have SMW enabled in order to use this maintenance script!\n\n" );

			// Terminate with a non-zero status so that shells, cron wrappers
			// and CI jobs can tell the script did not do its work.
			exit( 1 );
		}

		$this->reportMessage(
			"\nThe script will only dispose of those duplicate entities that have no active\n" .
			"references. The log section 'untouched' contains IDs that have not been\n" .
			"removed and the user is asked to verify the content and manually remove\n" .
			"those listed entities.\n\n"
		);

		$applicationFactory = ApplicationFactory::getInstance();
		$maintenanceFactory = $applicationFactory->newMaintenanceFactory();

		// The disposer reports its progress through reportMessage()
		$duplicateEntitiesDisposer = $maintenanceFactory->newDuplicateEntitiesDisposer(
			$applicationFactory->getStore( 'SMW\SQLStore\SQLStore' ),
			[ $this, 'reportMessage' ]
		);

		$duplicateEntityRecords = $duplicateEntitiesDisposer->findDuplicateEntityRecords();
		$duplicateEntitiesDisposer->verifyAndDispose( $duplicateEntityRecords );

		return true;
	}

	/**
	 * Reporter callback that forwards a message to the script output.
	 *
	 * @since 3.0
	 *
	 * @param string $message
	 */
	public function reportMessage( $message ) {
		$this->output( $message );
	}

}

// Name the class to run, then include the runner file referenced by the
// RUN_MAINTENANCE_IF_MAIN constant.
$maintClass = 'SMW\Maintenance\RemoveDuplicateEntities';
require_once RUN_MAINTENANCE_IF_MAIN;
111 changes: 111 additions & 0 deletions src/Maintenance/DuplicateEntitiesDisposer.php
@@ -0,0 +1,111 @@
<?php

namespace SMW\Maintenance;

use Onoi\MessageReporter\MessageReporterAwareTrait;
use SMW\Store;
use SMW\SQLStore\SQLStore;
use SMW\SQLStore\PropertyTableIdReferenceDisposer;

/**
 * Looks up duplicate entity records and removes those rows of the ID table
 * that have no residual references, reporting progress and a final log via
 * the injected message reporter.
 *
 * @license GNU GPL v2+
 * @since 3.0
 *
 * @author mwjames
 */
class DuplicateEntitiesDisposer {

	use MessageReporterAwareTrait;

	/**
	 * @var Store
	 */
	private $store = null;

	/**
	 * @since 3.0
	 *
	 * @param Store $store
	 */
	public function __construct( Store $store ) {
		$this->store = $store;
	}

	/**
	 * Returns the duplicate records as reported by the store's ID handler.
	 *
	 * @since 3.0
	 *
	 * @return array
	 */
	public function findDuplicateEntityRecords() {
		return $this->store->getObjectIds()->findDuplicateEntityRecords();
	}

	/**
	 * Reports the number of duplicates and, when there are any, disposes of
	 * the matching ID rows that are no longer referenced.
	 *
	 * @since 3.0
	 *
	 * @param array $duplicateEntityRecords records with the keys smw_title,
	 *  smw_namespace, smw_iw and smw_subobject (a 'count' key is ignored)
	 */
	public function verifyAndDispose( array $duplicateEntityRecords ) {

		$count = count( $duplicateEntityRecords );
		$this->messageReporter->reportMessage( "Found: $count duplicates\n" );

		if ( $count === 0 ) {
			return;
		}

		// The constructor accepts any Store, yet the disposal relies on
		// PropertyTableIdReferenceDisposer which only accepts a SQLStore.
		// Report the mismatch instead of failing with a type error.
		if ( !$this->store instanceof SQLStore ) {
			$this->messageReporter->reportMessage(
				"The disposal requires a SQLStore instance, no entities were removed.\n"
			);

			return;
		}

		$this->doDispose( $duplicateEntityRecords );
	}

	/**
	 * Iterates over the duplicate records, removes every matching ID row
	 * without residual references and prints a JSON log of what was
	 * 'disposed' and what was left 'untouched'.
	 *
	 * NOTE(review): each matching row is checked on its own, so when none of
	 * the copies is referenced all of them are removed - confirm this is the
	 * intended outcome.
	 *
	 * @param array $duplicateEntityRecords
	 */
	private function doDispose( array $duplicateEntityRecords ) {

		$propertyTableIdReferenceDisposer = new PropertyTableIdReferenceDisposer(
			$this->store
		);

		// Also allow rows that are marked as redirect to be removed
		$propertyTableIdReferenceDisposer->setRedirectRemoval( true );
		$connection = $this->store->getConnection( 'mw.db' );

		$log = [
			'disposed' => [],
			'untouched' => []
		];

		$i = 0;

		foreach ( $duplicateEntityRecords as $entityRecord ) {

			// The count is of no interest for the log
			unset( $entityRecord['count'] );

			// Progress indicator, start a new line every 60 records
			if ( $i % 60 === 0 ) {
				$this->messageReporter->reportMessage( "\n" );
			}

			$this->messageReporter->reportMessage( '.' );

			// Find all IDs that share the same title, namespace, interwiki,
			// and subobject signature
			$res = $connection->select(
				SQLStore::ID_TABLE,
				[
					'smw_id',
				],
				[
					'smw_title' => $entityRecord['smw_title'],
					'smw_namespace' => $entityRecord['smw_namespace'],
					'smw_iw' => $entityRecord['smw_iw'],
					'smw_subobject' => $entityRecord['smw_subobject']
				],
				__METHOD__
			);

			foreach ( $res as $row ) {
				if ( $propertyTableIdReferenceDisposer->isDisposable( $row->smw_id ) ) {
					$propertyTableIdReferenceDisposer->cleanUpTableEntriesById( $row->smw_id );
					$log['disposed'][$row->smw_id] = $entityRecord;
				} else {
					$log['untouched'][$row->smw_id] = $entityRecord;
				}
			}

			$i++;
		}

		$json = json_encode( $log, JSON_PRETTY_PRINT | JSON_UNESCAPED_SLASHES | JSON_UNESCAPED_UNICODE );

		// json_encode returns false on failure (e.g. malformed UTF-8), which
		// would otherwise silently produce an empty log section
		if ( $json === false ) {
			$json = 'Unable to encode the log: ' . json_last_error_msg();
		}

		$this->messageReporter->reportMessage(
			"\n\nLog\n\n" . $json . "\n"
		);
	}

}
22 changes: 22 additions & 0 deletions src/Maintenance/MaintenanceFactory.php
Expand Up @@ -6,6 +6,7 @@
use SMW\ApplicationFactory;
use SMW\MediaWiki\ManualEntryLogger;
use SMW\SQLStore\PropertyStatisticsStore;
use SMW\Maintenance\DuplicateEntitiesDisposer;
use SMW\SQLStore\SQLStore;
use SMW\Store;

Expand Down Expand Up @@ -108,6 +109,27 @@ public function newRebuildPropertyStatistics() {
return new RebuildPropertyStatistics();
}

/**
 * Builds a DuplicateEntitiesDisposer for the given store and attaches an
 * observable message reporter with the callback registered on it.
 *
 * @since 3.0
 *
 * @param Store $store
 * @param callable|null $reporterCallback
 *
 * @return DuplicateEntitiesDisposer
 */
public function newDuplicateEntitiesDisposer( Store $store, $reporterCallback = null ) {

	$reporter = MessageReporterFactory::getInstance()->newObservableMessageReporter();
	$reporter->registerReporterCallback( $reporterCallback );

	$disposer = new DuplicateEntitiesDisposer( $store );
	$disposer->setMessageReporter( $reporter );

	return $disposer;
}

/**
* @since 2.4
*
Expand Down
2 changes: 1 addition & 1 deletion src/MediaWiki/Api/Task.php
Expand Up @@ -75,7 +75,7 @@ private function callDupLookupTask( $parameters ) {
return $result + ['isFromCache' => true ];
}

$rows = $applicationFactory->getStore()->getObjectIds()->findDuplicateEntries();
$rows = $applicationFactory->getStore()->getObjectIds()->findDuplicateEntityRecords();

$result = [
'list' => $rows,
Expand Down
27 changes: 26 additions & 1 deletion src/SQLStore/PropertyTableIdReferenceDisposer.php
Expand Up @@ -36,6 +36,11 @@ class PropertyTableIdReferenceDisposer {
*/
private $onTransactionIdle = false;

/**
* @var boolean
*/
private $redirectRemoval = false;

/**
* @since 2.4
*
Expand All @@ -46,6 +51,15 @@ public function __construct( SQLStore $store ) {
$this->connection = $this->store->getConnection( 'mw.db' );
}

/**
* Sets the flag that doRemoveEntityReferencesById() checks before deleting
* an ID table row that is marked as redirect; the flag is false by default.
*
* @since 3.0
*
* @param boolean $redirectRemoval
*/
public function setRedirectRemoval( $redirectRemoval ) {
$this->redirectRemoval = $redirectRemoval;
}

/**
* @note MW 1.29+ showed transaction collisions when executed using the
* JobQueue in connection with purging the BagOStuff cache, use
Expand All @@ -57,6 +71,17 @@ public function waitOnTransactionIdle() {
$this->onTransactionIdle = true;
}

/**
 * Tells whether an ID can be disposed of, which is the case when the
 * reference finder reports no residual reference for it.
 *
 * @since 3.0
 *
 * @param integer $id
 *
 * @return boolean
 */
public function isDisposable( $id ) {
	$referenceFinder = $this->store->getPropertyTableIdReferenceFinder();

	return $referenceFinder->hasResidualReferenceForId( $id ) === false;
}

/**
* Use case: After a property changed its type (_wpg -> _txt), object values in the
* ID table are not removed at the time of the conversion process.
Expand Down Expand Up @@ -176,7 +201,7 @@ public function cleanUpTableEntriesById( $id ) {
private function doRemoveEntityReferencesById( $id, $isRedirect ) {

// When marked as redirect, don't remove the reference unless redirect removal has been enabled
if ( $isRedirect === false ) {
if ( $isRedirect === false || ( $isRedirect && $this->redirectRemoval ) ) {
$this->connection->delete(
SQLStore::ID_TABLE,
array( 'smw_id' => $id ),
Expand Down

0 comments on commit fcc1755

Please sign in to comment.