Skip to content

Commit

Permalink
fix(ingester): use cdtn filtering for fiche vdd (#167)
Browse files Browse the repository at this point in the history
  • Loading branch information
lionelB committed Nov 9, 2020
1 parent 644d351 commit 1863826
Show file tree
Hide file tree
Showing 5 changed files with 225 additions and 40 deletions.
16 changes: 16 additions & 0 deletions targets/ingester/@types/fiches-vdd.d.ts
Expand Up @@ -6,5 +6,21 @@ declare module "@socialgouv/fiches-vdd" {
attributes: { ID: string, URL: string }
children: RawJson[]
}

/**
 * One entry of the package's `data/index.json` listing (the ingester loads
 * that file as `FicheIndex[]`).
 */
export type FicheIndex = {
// Fiche identifier; the ingester only keeps entries whose id starts with "F".
id: string
// Audience the fiche belongs to; also used by the ingester as the data
// sub-directory name when loading `data/<type>/<id>.json`.
type: "particuliers" | "associations" | "professionnels"
title: string
subject: string
theme: string
// Breadcrumb trail; its ids are matched against the ingester's
// include/exclude id lists.
breadcrumbs: FicheVddBreadcrumbs[]
// NOTE(review): date format is not visible from this declaration — confirm.
date: string
}

/**
 * One element of a fiche's breadcrumb trail (see `FicheIndex.breadcrumbs`).
 */
export type FicheVddBreadcrumbs = {
// Identifier of the breadcrumb node; this is the value the ingester compares
// against its theme / dossier / fiche id lists.
id: string
// presumably the human-readable label of the node — TODO confirm
text: string
}

export function getFiche(type: string, id: string): RawJson
}
7 changes: 5 additions & 2 deletions targets/ingester/src/cli.js
Expand Up @@ -89,12 +89,12 @@ async function download(pkgName, url) {

/** @type {[string, (pkgName:string)=>Promise<import("./index.js").CdtnDocument[]>|void][]} */
const dataPackages = [
["@socialgouv/datafiller-data", () => {}],
["@socialgouv/contributions-data", getContributionsDocuments],
["@socialgouv/kali-data", getAgreementDocuments],
["@socialgouv/legi-data", getCdtDocuments],
["@socialgouv/fiches-vdd", getFichesServicePublic],
["@socialgouv/fiches-travail-data", getFicheTravailEmploi],
["@socialgouv/datafiller-data", () => {}],
];
/**
*
Expand All @@ -108,6 +108,7 @@ async function getPackage(pkgName, pkgVersion = "latest") {
const url = pkgInfo.dist.tarball;
const latest = pkgInfo.version;
if (await isPkgOutdated(pkgName, latest)) {
console.log(`download ${pkgName}@${latest}`);
await download(pkgName, url);
}
}
Expand Down Expand Up @@ -155,7 +156,9 @@ async function main() {
if (args.dryRun || !documents) {
continue;
}
console.log(`ready to ingest ${pkgName}`);
console.log(
`ready to ingest ${documents.length} documents from ${pkgName}`
);
const inserts = await batchPromises(documents, insertDocument, 10);
ids = ids.concat(inserts);
}
Expand Down
188 changes: 188 additions & 0 deletions targets/ingester/src/transform/fichesServicePublic/filter.js
@@ -0,0 +1,188 @@
/**
* Return a filtered set of FicheIndex
* @param {import("@socialgouv/fiches-vdd").FicheIndex[]} fiches
*/
export function filter(fiches) {
const filteredFiches = fiches.filter((fiche) => {
const arianeIds = fiche.breadcrumbs.map((item) => item.id);
if (!fiche.id.startsWith("F")) {
return false;
}
/** @param {string} id */
const matchFilDAriane = (id) => arianeIds.includes(id);

if (excludeFicheId.some(matchFilDAriane)) {
return false;
}
if (excludeDossierId.some(matchFilDAriane)) {
// Il existe des fiches que l'on souhaite garder, alors que
// l'on ne souhaite pas garder son dossier parent
return includeFicheId.some(matchFilDAriane);
}

const includeList = includeThemeId.concat(includeDossierId, includeFicheId);
if (includeList.some(matchFilDAriane)) {
return true;
}

// Par défaut, on exclue
return false;
});
const particuliers = filteredFiches.filter(
({ type }) => type === "particuliers"
);
const professionnels = filteredFiches
.filter(({ type }) => type === "professionnels")
.filter(({ id }) =>
particuliers.every(({ id: particulierId }) => particulierId !== id)
);

const associations = filteredFiches
.filter(({ type }) => type === "associations")
.filter(({ id }) =>
particuliers.every(({ id: particulierId }) => particulierId !== id)
)
.filter(({ id }) =>
professionnels.every(({ id: professionnelId }) => professionnelId !== id)
);

return particuliers.concat(professionnels, associations);
}

// List provided by @jrduscher
// Dossier ids to exclude: a fiche whose breadcrumb contains one of these ids
// is rejected by filter(), unless its breadcrumb also contains an id listed
// in includeFicheId.
const excludeDossierId = [
"N500",
"N511",
"N505",
"N31057",
"N19978",
"N186",
"N431",
"N512",
"N503",
"N102",
"N20276",
"N515",
"N379",
];
// Fiche ids to exclude: a fiche whose breadcrumb contains one of these ids is
// rejected by filter(); this list is checked before the other id lists.
// NOTE(review): "F1234" is also present in includeFicheId; because this list
// is checked first, the exclusion wins — confirm that this is intended.
const excludeFicheId = [
"F1234",
"F3059",
"F10027",
"F12416",
"F13375",
"F20314",
"F20678",
"F22290",
"F22295",
"F22316",
"F22327",
"F22335",
"F22352",
"F22354",
"F22356",
"F22358",
"F22359",
"F22424",
"F22532",
"F22553",
"F22726",
"F23369",
"F23459",
"F23460",
"F23507",
"F23670",
"F23744",
"F23756",
"F23891",
"F23992",
"F23994",
"F23997",
"F24005",
"F24013",
"F31195",
"F31204",
"F31233",
"F31263",
"F31406",
"F31409",
"F31422",
"F31427",
"F31479",
"F31670",
"F31712",
"F31713",
"F31837",
"F31926",
"F32090",
"F32095",
"F32234",
"F32258",
"F32307",
"F32308",
"F32581",
"F32703",
"F32965",
"F33843",
"F34629",
"F34631",
"F34633",
"F34900",
];
// Dossier ids to include: a fiche whose breadcrumb contains one of these ids
// is kept by filter(), provided no exclusion list matched first.
const includeDossierId = [
"N20286",
"N31477",
"N107",
"N31143",
"N16594",
"N24267",
"N31775",
"N22781",
"N31391",
"N31392",
];
// Theme ids to include: a fiche whose breadcrumb contains one of these ids is
// kept by filter(), provided no exclusion list matched first.
const includeThemeId = [
"N19806", // individuals ("particuliers") / work ("travail")
];
// Fiche ids to include explicitly. A breadcrumb match on this list keeps the
// fiche, and is also the only way for a fiche under an excluded dossier
// (excludeDossierId) to be kept. It does not override excludeFicheId, which
// filter() checks first.
const includeFicheId = [
"F92",
"F153",
"F174",
"F1043",
"F1190",
"F1226",
"F1234",
"F1642",
"F1691",
"F1928",
"F2064",
"F2140",
"F2141",
"F2142",
"F2309",
"F2354",
"F2517",
"F2642",
"F2742",
"F10029",
"F10041",
"F12382",
"F14809",
"F14860",
"F14868",
"F15132",
"F15813",
"F19087",
"F21000",
"F22606",
"F23106",
"F23425",
"F23633",
"F31982",
"F32329",
"F32709",
"F33050",
"F34059",
"F34705",
"F34902",
];
8 changes: 7 additions & 1 deletion targets/ingester/src/transform/fichesServicePublic/format.js
Expand Up @@ -33,7 +33,7 @@ function getText(element) {
* @param {import("@socialgouv/fiches-vdd").RawJson} fiche
* @param {ingester.referenceResolver} resolveCdtReference
* @param {import("@socialgouv/kali-data").IndexedAgreement[]} agreements
* @returns {Pick<ingester.FicheServicePublic, Exclude<keyof ingester.FicheServicePublic, keyof {slug, url:string, excludeFromSearch: string}>> }
* @returns {Pick<ingester.FicheServicePublic, Exclude<keyof ingester.FicheServicePublic, keyof {slug, excludeFromSearch: string}>> }
*/
export function format(fiche, resolveCdtReference, agreements) {
const publication = fiche.children[0];
Expand All @@ -50,6 +50,11 @@ export function format(fiche, resolveCdtReference, agreements) {
const [year, month, day] = dateRaw.split(" ")[1].split("-");
const date = `${day}/${month}/${year}`;

const audience = getText(getChild(publication, "Audience"));
const urlSlug =
audience === "Particuliers" ? "particuliers" : "professionnels-entreprises";
const url = `https://www.service-public.fr/${urlSlug}/vosdroits/${id}`;

const intro = getText(getChild(publication, "Introduction"));
const texte = getText(getChild(publication, "Texte"));
const listeSituations = getText(getChild(publication, "ListeSituations"));
Expand All @@ -75,5 +80,6 @@ export function format(fiche, resolveCdtReference, agreements) {
source: SOURCES.SHEET_SP,
text,
title,
url,
};
}
46 changes: 9 additions & 37 deletions targets/ingester/src/transform/fichesServicePublic/index.js
Expand Up @@ -2,6 +2,7 @@ import slugify from "@socialgouv/cdtn-slugify";

import { getJson } from "../../lib/getJson.js";
import { referenceResolver } from "../../lib/referenceResolver";
import { filter } from "./filter.js";
import { format } from "./format.js";
// Extract external content url from Content tag markdown
/**
Expand All @@ -19,22 +20,16 @@ function extractMdxContentUrl(markdown) {
return matchUrl ? matchUrl[0] : null;
}

const slugMap = {
associations: "associations",
particuliers: "particuliers",
"professionnels-entreprises": "professionnels",
};

/**
*
* @param {string} pkgName
*/
export default async function getFichesServicePublic(pkgName) {
const [contributions, externals, agreements, cdt] = await Promise.all([
const [contributions, ficheVddIndex, agreements, cdt] = await Promise.all([
/** @type {Promise<import("@socialgouv/contributions-data").Question[]>} */
(getJson("@socialgouv/contributions-data/data/contributions.json")),
/** @type {Promise<import("@socialgouv/datafiller-data").ExternalDoc[]>} */
(getJson("@socialgouv/datafiller-data/data/externals.json")),
/** @type {Promise<import("@socialgouv/fiches-vdd").FicheIndex[]>} */
(getJson("@socialgouv/fiches-vdd/data/index.json")),
/** @type {Promise<import("@socialgouv/kali-data").IndexedAgreement[]>} */
(getJson("@socialgouv/kali-data/data/index.json")),
/** @type {Promise<import("@socialgouv/legi-data").Code>} */
Expand All @@ -43,6 +38,8 @@ export default async function getFichesServicePublic(pkgName) {

const resolveCdtReference = referenceResolver(cdt);

const listFicheVdd = filter(ficheVddIndex);

const fichesIdFromContrib = contributions
.map(({ answers }) => extractMdxContentUrl(answers.generic.markdown))
.filter(Boolean)
Expand All @@ -51,46 +48,21 @@ export default async function getFichesServicePublic(pkgName) {
return id;
});

const vddExternals = externals.find(
({ title }) => title === "service-public.fr"
);

if (!vddExternals || !vddExternals.urls.length) {
throw new Error("fiches sp urls not found");
}
/** @type {ingester.FicheServicePublic[]} */
const fiches = [];
for (const url of vddExternals.urls) {
const [, slugType, idFiche] =
url.match(/([a-z-]+)\/vosdroits\/(F[0-9]+)$/) || [];
if (!Object.prototype.hasOwnProperty.call(slugMap, slugType) || !idFiche) {
// throw new Error(`Unknown fiche ${url}`);
console.error(`[getFichesServicePublic] - error | Unknown fiche ${url}`);
continue;
}
for (const { id: idFiche, type } of listFicheVdd) {
let fiche;
try {
fiche = await getJson(
`${pkgName}/data/${
slugMap[/** @type { keyof slugMap } */ (slugType)]
}/${idFiche}.json`
);
fiche = await getJson(`${pkgName}/data/${type}/${idFiche}.json`);
} catch (err) {
console.error(
">",
`${pkgName}/data/${
slugMap[/** @type { keyof slugMap } */ (slugType)]
}/${idFiche}.json`,
url
);
console.error(">", `${pkgName}/data/${type}/${idFiche}.json`);
continue;
}
const ficheSp = format(fiche, resolveCdtReference, agreements);
fiches.push({
...ficheSp,
excludeFromSearch: fichesIdFromContrib.includes(ficheSp.id),
slug: slugify(ficheSp.title),
url,
});
}

Expand Down

0 comments on commit 1863826

Please sign in to comment.