Skip to content

Commit

Permalink
synonyms cleanup [AVOF-1782]
Browse files Browse the repository at this point in the history
  • Loading branch information
Timur committed Jul 2, 2019
1 parent de396bd commit 805cc50
Show file tree
Hide file tree
Showing 2 changed files with 80 additions and 2 deletions.
33 changes: 31 additions & 2 deletions working/generic_update.sql
Expand Up @@ -22,8 +22,9 @@ BEGIN
ANALYSE concept_relationship_stage;
ANALYSE concept_synonym_stage;

-- 1. clearing the concept_name
-- 1. Clearing

-- 1.1 Clearing the concept_name
--remove double spaces, carriage return, newline, vertical tab and form feed
UPDATE concept_stage
SET concept_name = REGEXP_REPLACE(concept_name, '[[:cntrl:]]+', ' ')
Expand All @@ -36,13 +37,41 @@ BEGIN
--remove leading and trailing spaces
UPDATE concept_stage
SET concept_name = TRIM(concept_name)
WHERE concept_name <> TRIM(concept_name);
WHERE concept_name <> TRIM(concept_name)
AND NOT (
concept_name = ' '
AND vocabulary_id = 'GPI'
);--exclude GPI empty names

--replace long dashes with a plain ASCII hyphen
--the search literal has to be the same character the WHERE clause matches:
--with an empty search string REPLACE returns its input unchanged
UPDATE concept_stage
SET concept_name = REPLACE(concept_name, '–', '-')
WHERE concept_name LIKE '%–%';

-- 1.2 Clearing the synonym_name
--collapse every run of control characters (carriage return, newline, vertical tab, form feed, ...) into one space
UPDATE concept_synonym_stage
SET synonym_name = REGEXP_REPLACE(synonym_name, '[[:cntrl:]]+', ' ')
WHERE synonym_name ~ '[[:cntrl:]]';

--collapse runs of two or more spaces into one space
UPDATE concept_synonym_stage
SET synonym_name = REGEXP_REPLACE(synonym_name, ' {2,}', ' ')
WHERE synonym_name ~ ' {2,}';

--remove leading and trailing spaces
--a GPI synonym consisting of exactly one space is left as is, otherwise TRIM would turn it into an empty string
UPDATE concept_synonym_stage
SET synonym_name = TRIM(synonym_name)
WHERE synonym_name <> TRIM(synonym_name)
AND NOT (
synonym_name = ' '
AND synonym_vocabulary_id = 'GPI'
);--exclude GPI empty names

--replace long dashes with a plain ASCII hyphen
--the search literal has to be the same character the WHERE clause matches:
--with an empty search string REPLACE returns its input unchanged
UPDATE concept_synonym_stage
SET synonym_name = REPLACE(synonym_name, '–', '-')
WHERE synonym_name LIKE '%–%';

/***************************
* Update the concept table *
****************************/
Expand Down
49 changes: 49 additions & 0 deletions working/manual_changes/2019/manual_changes_01-July-2019.sql
@@ -0,0 +1,49 @@
--fix name duplicates in concept_synonym [AVOF-1782]

DO $$
BEGIN
--One-off cleanup of concept_synonym.concept_synonym_name:
--first drop rows that would become exact duplicates of an existing synonym
--(same concept_id and language_concept_id) once normalized, then normalize the remaining names in place.

--drop synonyms with repeated spaces when the single-spaced form already exists
DELETE
FROM concept_synonym cs
WHERE concept_synonym_name ~ ' {2,}'
AND EXISTS (
SELECT 1
FROM concept_synonym cs_int
WHERE cs_int.concept_id = cs.concept_id
AND cs_int.concept_synonym_name = REGEXP_REPLACE(cs.concept_synonym_name, ' {2,}', ' ')
AND cs.language_concept_id = cs_int.language_concept_id
);

--drop synonyms with leading/trailing spaces when the trimmed form already exists
DELETE
FROM concept_synonym cs
WHERE (
concept_synonym_name LIKE ' %'
OR concept_synonym_name LIKE '% '
)
AND EXISTS (
SELECT 1
FROM concept_synonym cs_int
WHERE cs_int.concept_id = cs.concept_id
AND cs_int.concept_synonym_name = TRIM(cs.concept_synonym_name)
AND cs.language_concept_id = cs_int.language_concept_id
);

--NOTE(review): the two DELETEs above only cover names that differ by a single normalization step;
--names that collide after control-character or dash replacement, or after several steps combined, are not removed here - confirm this is acceptable

--collapse every run of control characters (carriage return, newline, vertical tab, form feed, ...) into one space
UPDATE concept_synonym
SET concept_synonym_name = REGEXP_REPLACE(concept_synonym_name, '[[:cntrl:]]+', ' ')
WHERE concept_synonym_name ~ '[[:cntrl:]]';

--collapse runs of two or more spaces into one space
UPDATE concept_synonym
SET concept_synonym_name = REGEXP_REPLACE(concept_synonym_name, ' {2,}', ' ')
WHERE concept_synonym_name ~ ' {2,}';

--remove leading and trailing spaces
UPDATE concept_synonym
SET concept_synonym_name = TRIM(concept_synonym_name)
WHERE concept_synonym_name <> TRIM(concept_synonym_name)
AND concept_synonym_name <> ' ';--keep single-space names (GPI empty names); no vocabulary filter is applied here, so every single-space name is skipped

--replace long dashes with a plain ASCII hyphen
--the search literal has to be the same character the WHERE clause matches:
--with an empty search string REPLACE returns its input unchanged
UPDATE concept_synonym
SET concept_synonym_name = REPLACE(concept_synonym_name, '–', '-')
WHERE concept_synonym_name LIKE '%–%';
END $$;

0 comments on commit 805cc50

Please sign in to comment.