Skip to content

Commit

Permalink
Merge 0e02ae0 into ecc5cbf
Browse files Browse the repository at this point in the history
  • Loading branch information
surilindur committed Aug 7, 2024
2 parents ecc5cbf + 0e02ae0 commit e2e1d75
Show file tree
Hide file tree
Showing 3 changed files with 199 additions and 171 deletions.
22 changes: 11 additions & 11 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -53,8 +53,8 @@ The config file that should be passed to the command line tool has the following
"log": true,
"outputFormat": "application/n-quads",
"iriToPath": {
"http://example.org/base/": "output/base/",
"http://example.org/other/": "output/other/"
"^http://example.org/base/": "output/base/",
"^http://example.org/other/": "output/other/"
}
}
}
Expand Down Expand Up @@ -285,7 +285,7 @@ A quad sink is able to direct a stream of quads as output from the fragmentation

#### File Quad Sink

A quad sink that writes to files using an IRI to local file system path mapping.
A quad sink that writes to files, using regular expressions to map IRIs to local file system paths.

```json
{
Expand All @@ -295,8 +295,8 @@ A quad sink that writes to files using an IRI to local file system path mapping.
"outputFormat": "application/n-quads",
"fileExtension": "$.nq",
"iriToPath": {
"http://example.org/base/": "output/base/",
"http://example.org/other/": "output/other/"
"^http://example.org/base/": "output/base/",
"^http://example.org/other/": "output/other/"
}
}
}
Expand Down Expand Up @@ -359,8 +359,8 @@ A quad sink that combines multiple quad sinks.
"outputFormat": "application/n-quads",
"fileExtension": "$.nq",
"iriToPath": {
"http://example.org/base/": "output/base/",
"http://example.org/other/": "output/other/"
"^http://example.org/base/": "output/base/",
"^http://example.org/other/": "output/other/"
}
},
{
Expand All @@ -369,8 +369,8 @@ A quad sink that combines multiple quad sinks.
"outputFormat": "application/n-quads",
"fileExtension": "$.nq2",
"iriToPath": {
"http://example.org/base/": "output-2/base/",
"http://example.org/other/": "output-2/other/"
"^http://example.org/base/": "output-2/base/",
"^http://example.org/other/": "output-2/other/"
}
}
]
Expand Down Expand Up @@ -401,8 +401,8 @@ A quad sink that wraps over another quad sink and only passes the quads through
"outputFormat": "application/n-quads",
"fileExtension": "$.nq",
"iriToPath": {
"http://example.org/base/": "output/base/",
"http://example.org/other/": "output/other/"
"^http://example.org/base/": "output/base/",
"^http://example.org/other/": "output/other/"
}
}
]
Expand Down
43 changes: 29 additions & 14 deletions lib/io/QuadSinkFile.ts
Original file line number Diff line number Diff line change
@@ -1,4 +1,3 @@
import * as readline from 'node:readline';
import type { Writable } from 'node:stream';
import type * as RDF from '@rdfjs/types';
import type { IQuadSink } from './IQuadSink';
Expand All @@ -9,7 +8,7 @@ import { ParallelFileWriter } from './ParallelFileWriter';
*/
export class QuadSinkFile implements IQuadSink {
private readonly outputFormat: string;
private readonly iriToPath: Record<string, string>;
private readonly iriToPath: Map<RegExp, string>;
private readonly fileWriter: ParallelFileWriter;
protected readonly log: boolean;
protected readonly fileExtension?: string;
Expand All @@ -18,19 +17,18 @@ export class QuadSinkFile implements IQuadSink {

public constructor(options: IQuadSinkFileOptions) {
this.outputFormat = options.outputFormat;
this.iriToPath = options.iriToPath;
this.iriToPath = new Map(Object.entries(options.iriToPath).map(([ exp, sub ]) => [
new RegExp(exp, 'u'),
sub,
]));
this.log = Boolean(options.log);
this.fileExtension = options.fileExtension;

this.fileWriter = new ParallelFileWriter({ streams: 128 });

this.attemptLog();
}

protected attemptLog(newLine = false): void {
if (this.log && (this.counter % 1_000 === 0 || newLine)) {
readline.clearLine(process.stdout, 0);
readline.cursorTo(process.stdout, 0);
process.stdout.write(`\rHandled quads: ${this.counter / 1_000}K`);
if (newLine) {
process.stdout.write(`\n`);
Expand All @@ -46,20 +44,26 @@ export class QuadSinkFile implements IQuadSink {
}

// Find the mapping whose expression produces the longest match against the IRI
let path: string | undefined;
for (const [ baseIRI, basePath ] of Object.entries(this.iriToPath)) {
if (iri.startsWith(baseIRI)) {
path = basePath + iri.slice(baseIRI.length);
break;
let bestMatch: RegExpExecArray | undefined;
let bestRegex: RegExp | undefined;

for (const exp of this.iriToPath.keys()) {
const match = exp.exec(iri);
if (match && (bestMatch === undefined || match[0].length > bestMatch[0].length)) {
bestMatch = match;
bestRegex = exp;
}
}

// Crash if we did not find a matching baseIRI
if (!path) {
if (!bestRegex) {
throw new Error(`No IRI mapping found for ${iri}`);
}

// Escape illegal directory names
// Apply the substitution of the best-matching expression to obtain the file path
let path = iri.replace(bestRegex, this.iriToPath.get(bestRegex)!);

// Replace illegal directory names
path = path.replaceAll(/[*|"<>?:]/ug, '_');

// Add file extension if we don't have one yet
Expand Down Expand Up @@ -90,11 +94,22 @@ export class QuadSinkFile implements IQuadSink {
}

/**
* Options accepted by the {@link QuadSinkFile} constructor.
*/
export interface IQuadSinkFileOptions {
/**
* The RDF format to output, expressed as a MIME type,
* for example `application/n-quads`.
*/
outputFormat: string;
/**
* Mapping of regular expressions to their replacements,
* for determining the file path from a given IRI.
* Each key is compiled as a regular expression with the `u` (unicode) flag,
* and each value is used as the replacement string for that expression.
* When several expressions match an IRI, the one with the longest matched text is used;
* an error is thrown when no expression matches.
* @range {json}
*/
iriToPath: Record<string, string>;
/**
* Whether to log quad handling progress to standard output.
* Treated as `false` when omitted.
*/
log?: boolean;
/**
* Optional file extension to use, for example `$.nq`.
* NOTE(review): presumably only added when the resolved path has no extension yet — confirm against the path-building code.
*/
fileExtension?: string;
}
Loading

0 comments on commit e2e1d75

Please sign in to comment.