implemented "datanest.organizations.batch_size" which will make

OrganizationsDatanestHarvester push data into back-end(s) in smaller
batches thus avoiding OutOfMemoryError
note: This forced out calls like 'clear()' or 'deleteByQuery("*:*")'
which served as substitute for missing update procedure. Thus, as of
now, records which "vanish" from the source will simply accumulate in
ODN. Updated records should get updated properly also in ODN. On the
other hand, it's sort of archiving feature, insurance against data
vanishing both from the source and ODN copy. But anyway, roper update
mechanism is needed.
commit 7de3508b217726e628fe21445ca4eec7dde8f376 (1 parent: 4ad887d)
Author: Peter Hanecak (hanecak)
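
For orientation, here is a minimal, self-contained sketch of the batch-flush pattern the commit message describes: accumulate records, push a full batch to the back-end, then clear the in-memory buffer. The class name, the record type and the final-batch flush are illustrative only; the actual harvester code follows in the diff below.

import java.util.ArrayList;
import java.util.List;

// Illustrative sketch (not project code) of the batching approach:
// flush to the back-end every BATCH_SIZE records so the full dump
// never has to sit in memory at once.
public class BatchingSketch {

    static final int BATCH_SIZE = 100000; // mirrors datanest.organizations.batch_size

    public static void main(String[] args) {
        List<String> records = new ArrayList<String>();

        for (int i = 0; i < 250000; i++) {
            records.add("record-" + i);

            // a full batch: store it and free the buffer
            if (records.size() >= BATCH_SIZE) {
                store(records);
                records.clear();
            }
        }

        // the last, partially filled batch still has to be stored
        if (!records.isEmpty()) {
            store(records);
        }
    }

    static void store(List<String> batch) {
        System.out.println("storing batch of " + batch.size() + " records");
    }
}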
src/main/java/sk/opendata/odn/harvester/datanest/organizations/OrganizationsDatanestHarvester.java (14 changed lines)
@@ -56,6 +56,7 @@
 
     public final static String KEY_DATANEST_ORGANIZATIONS_URL = "datanest.organizations.url";
     public final static String KEY_DATANEST_ORGANIZATIONS_SEZAME_REPO_NAME = "datanest.organizations.sesame_repo_name";
+    public final static String KEY_DATANEST_ORGANIZATIONS_BATCH_SIZE = "datanest.organizations.batch_size";
 
     private final static int ATTR_INDEX_ID = 0;
     protected final static int ATTR_INDEX_NAME = 1;
@@ -134,6 +135,8 @@ public void update() throws OdnHarvesterException,
 
         // read the rows
         String[] row;
+        int batchSize = Integer.valueOf(datanestProperties.getProperty(KEY_DATANEST_ORGANIZATIONS_BATCH_SIZE));
+        int itemCount = 0;
         int debugProcessOnlyNItems = Integer.valueOf(datanestProperties.getProperty(KEY_DEBUG_PROCESS_ONLY_N_ITEMS));
         while ((row = csvReader.readNext()) != null) {
             try {
@@ -144,9 +147,14 @@ public void update() throws OdnHarvesterException,
                 logger.warn("skipping following record: "
                         + Arrays.deepToString(row));
             }
-
-            if (debugProcessOnlyNItems > 0 &&
-                    records.size() > debugProcessOnlyNItems)
+
+            if (records.size() >= batchSize) {
+                store(records);
+                records.clear();
+            }
+
+            if (debugProcessOnlyNItems > 0 &&
+                    ++itemCount > debugProcessOnlyNItems)
                 break;
         }
 
src/main/java/sk/opendata/odn/repository/sesame/SesameBackend.java (31 changed lines)
@@ -195,18 +195,18 @@ public void store(RdfData records)
 
         connection = repo.getConnection();
 
-        // As of now, the "update" consists of a fresh "whole at once" copy of
-        // the new data loaded into the repository. Thus, we need to remove
-        // existing data from the repository before loading the new data so
-        // as to prevent old, stale data from being left in the repository (like
-        // items which were valid yesterday, but then deemed "bad" or
-        // whatever and deleted).
-        // Note: Yes, that is costly and we want to fix that later on.
         // FIXME: Implement proper "update" procedure.
+        // As of now, we're not clearing old records, only replacing old
+        // copies with fresh copies (assuming the "ID" was not changed). If we
+        // want a clean-up, we need to manually clean the back-end and rerun
+        // harvesting.
+        // Note of caution: 'store()' can be called for "a batch" (at least
+        // for 'OrganizationsDatanestHarvester' it is), which means that a
+        // simple "DELETE all" here won't have the desired effect, as it would
+        // remove all the "new" items from previous batches and leave the
+        // back-end with only the content of the last batch.
         if (contexts != null && contexts.length > 0) {
-            connection.clear(convertedContexts);
-
-            // why we duplicate the 'clear()' and 'add()' statements:
+            // why we duplicate the 'add()' statements:
             // 'getStatements(null, null, null, true);' is not the same as
             // 'getStatements(null, null, null, true, (Resource)null);' -
             // see
@@ -215,17 +215,6 @@ public void store(RdfData records)
                     records.getRdfBaseURI(), RDFFormat.RDFXML,
                     convertedContexts);
         } else {
-            // CRUDE HACK, FIXME: If we use contexts for the "all"
-            // repository to distinguish statements in terms of where they
-            // came from so that we can do a proper clean-up before
-            // "update", I'm then not yet able to make a proper query on top
-            // of statements from different contexts. Thus for now I'm not
-            // using contexts, and for the "all" repository I'm not doing the
-            // automatic clean-up, which means that "Clean" needs to be done
-            // on the repo manually!!!
-            if (!repoName.equals("all"))
-                connection.clear();
-
             connection.add(new StringReader(records.getRdfData()),
                     records.getRdfBaseURI(), RDFFormat.RDFXML);
         }
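
The "Note of caution" above is the crux of the change: once store() is called once per batch, any blanket delete inside store() throws away the batches stored before it. A tiny illustration in plain Java (not project code), using a hypothetical storeWithClear() that mimics the removed "clear then add" behaviour:

import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;

// Hypothetical "clear then add" store(), the way the removed code behaved
// when the whole dump arrived in a single call. With per-batch calls it
// keeps only the most recent batch.
public class ClearPerBatchProblem {

    static final List<String> backend = new ArrayList<String>();

    static void storeWithClear(List<String> batch) {
        backend.clear();        // "DELETE all" before every call
        backend.addAll(batch);
    }

    public static void main(String[] args) {
        storeWithClear(Arrays.asList("org-1", "org-2")); // batch 1
        storeWithClear(Arrays.asList("org-3"));          // batch 2
        System.out.println(backend); // prints [org-3] -- batch 1 is gone
    }
}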
src/main/java/sk/opendata/odn/repository/solr/SolrBackend.java (12 changed lines)
@@ -127,17 +127,13 @@ public void store(List<SolrItem> records)
         OdnRepositoryException odnRepoException = null;
 
         try {
-            // As of now, the "update" consists of a fresh "whole at once" copy of
-            // the new data loaded into the repository. Thus, we need to remove
-            // existing data from the repository before loading the new data so
-            // as to prevent old, stale data from being left in the repository (like
-            // items which were valid yesterday, but then deemed "bad" or
-            // whatever and deleted).
-            // Note: Yes, that is costly and we want to fix that later on.
             // FIXME: Implement proper "update" procedure. For now disabled, as
             // we're pushing multiple data sets into one index, meaning that if
             // we left this here, insertion of the 2nd data set would mean deletion
-            // of the 1st, etc. Workaround: Clean the index manually if necessary.
+            // of the 1st, etc. Plus, 'store()' might be called for multiple batches
+            // and we do not want to be left with only the last batch in the
+            // back-end.
+            // Workaround: Clean the index manually if necessary.
             //solrServer.deleteByQuery("*:*"); // CAUTION: deletes everything!
 
             solrServer.addBeans(records);
src/main/resources/datanest.properties (3 changed lines)
@@ -4,8 +4,11 @@ datanest.api.key = xxx
 # code, I'm going to use local copy of the dump:
 datanest.organizations.url = file:///tmp/organisations-dump.csv
 datanest.organizations.sesame_repo_name=organizations
+datanest.organizations.batch_size = 100000
+
 datanest.procurements.url = file:///tmp/procurements-dump.csv
 datanest.procurements.sesame_repo_name=procurements
+
 datanest.political_party_donors.url = file:///tmp/sponzori_stran-dump.csv
 datanest.political_party_donors.sesame_repo_name=political_party_donors
 
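
To connect this configuration with the harvester change above: the new key is read via Properties.getProperty() and parsed with Integer.valueOf(), as the first hunk shows. A small stand-alone sketch of that lookup (the file name and the manual loading are assumptions for illustration, not taken from the project):

import java.io.FileInputStream;
import java.io.IOException;
import java.util.Properties;

// Stand-alone sketch of reading the new batch-size key; the real harvester
// obtains its Properties instance through its own configuration loading.
public class BatchSizeConfigSketch {

    public static void main(String[] args) throws IOException {
        Properties datanestProperties = new Properties();
        FileInputStream in = new FileInputStream("datanest.properties");
        try {
            datanestProperties.load(in);
        } finally {
            in.close();
        }

        // mirrors the lookup added to OrganizationsDatanestHarvester
        int batchSize = Integer.valueOf(
                datanestProperties.getProperty("datanest.organizations.batch_size"));
        System.out.println("batch size: " + batchSize);
    }
}

Note that Integer.valueOf() throws NumberFormatException if the key is missing or not a number, so the property has to be present whenever the harvester runs.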
