From 0ebf8bfe7c46990efd5d932db9b415076fc68829 Mon Sep 17 00:00:00 2001 From: Martial Maillot Date: Tue, 1 Feb 2022 19:05:29 +0100 Subject: [PATCH] fix: add a token to bypass the bot protection (#368) --- .github/workflows/release.yml | 3 ++- src/fetch-data/__tests__/scrapUrl.test.js | 27 +++++++++++++++++++++++ src/fetch-data/generateHeaders.js | 14 ++++++++++++ src/fetch-data/index.js | 16 ++++---------- src/fetch-data/scrapUrl.js | 4 ++++ 5 files changed, 51 insertions(+), 13 deletions(-) create mode 100644 src/fetch-data/generateHeaders.js diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml index 53063fde..05390bc6 100644 --- a/.github/workflows/release.yml +++ b/.github/workflows/release.yml @@ -6,7 +6,7 @@ on: repository_dispatch: types: manual_release workflow_dispatch: - + jobs: release: name: Release @@ -45,6 +45,7 @@ jobs: - name: Start env: + TOKEN_MT: ${{ secrets.TOKEN_MT }} DATAFILLER_URL: ${{ secrets.DATAFILLER_URL }} run: yarn start diff --git a/src/fetch-data/__tests__/scrapUrl.test.js b/src/fetch-data/__tests__/scrapUrl.test.js index a1f2a3c0..9e3db5ed 100644 --- a/src/fetch-data/__tests__/scrapUrl.test.js +++ b/src/fetch-data/__tests__/scrapUrl.test.js @@ -41,22 +41,49 @@ got.mockImplementation((url) => { parseDom.mockImplementation(() => ({ title: "Yo" })); +const OLD_ENV = process.env; + +beforeEach(() => { + jest.resetModules(); // Most important - it clears the cache + process.env = { ...OLD_ENV }; // Make a copy +}); + +afterAll(() => { + process.env = OLD_ENV; // Restore old environment +}); + +test("scrapUrl should throw an error if no TOKEN variable has been set", async () => { + expect.assertions(1); + try { + await scrapUrl("id", "http://url.ok"); + } catch (e) { + // eslint-disable-next-line jest/no-conditional-expect + expect(e.message).toBe( + "Token (cgtoken) is required to fetch the data. This token is provided by the travail-emploi.gouv.fr team."
+ ); + } +}); + +test("scrapUrl should return formated data", async () => { + process.env.TOKEN_MT = "TOKEN"; const result = await scrapUrl("id", "http://url.ok"); expect(result).toEqual({ title: "Yo" }); }); test("scrapUrl should throw if redirected url failed", async () => { + process.env.TOKEN_MT = "TOKEN"; await expect(scrapUrl("id", "url.wrong-redirect")).rejects.toThrow( /Wrong redirectUrl/ ); }); test("scrapUrl should throw if url failed", async () => { + process.env.TOKEN_MT = "TOKEN"; await expect(scrapUrl("id", "url.http.fail")).rejects.toThrow(/HTTP Error/); }); test("scrap should throw if parse fail", async () => { + process.env.TOKEN_MT = "TOKEN"; await expect(scrapUrl("id", "url.parse.fail")).rejects.toThrow( /Parsing Error/ ); diff --git a/src/fetch-data/generateHeaders.js b/src/fetch-data/generateHeaders.js new file mode 100644 index 00000000..e6bc3da4 --- /dev/null +++ b/src/fetch-data/generateHeaders.js @@ -0,0 +1,14 @@ +/** + * Build the header for request with a specific token to bypass bot protection + */ +export function generateHeaders(extras) { + if (!process.env.TOKEN_MT) { + throw Error( + "Token (cgtoken) is required to fetch the data. This token is provided by the travail-emploi.gouv.fr team."
+ ); + } + return { + ...extras, + Cookie: `cgtoken=${process.env.TOKEN_MT};`, + }; +} diff --git a/src/fetch-data/index.js b/src/fetch-data/index.js index cdc2b432..32ec822c 100644 --- a/src/fetch-data/index.js +++ b/src/fetch-data/index.js @@ -3,7 +3,7 @@ import got from "got"; import pLimit from "p-limit"; import path from "path"; -import data from "./api.data.json"; +import { generateHeaders } from "./generateHeaders"; import { scrapUrl } from "./scrapUrl"; const FEED_URL = "https://travail-emploi.gouv.fr/?page=oseo_json"; @@ -12,21 +12,15 @@ const limit = pLimit(10); export async function fetchFeed(url) { const response = await got.post(url, { - headers: { + headers: generateHeaders({ "Content-Type": "application/json", - }, + }), http2: true, retry: 3, }); const { fiches: feed } = JSON.parse(response.body); return feed; } - -export async function loadFeed() { - const { fiches: feed } = data; - return Promise.resolve(feed); -} - export async function scrap(urls) { const inputs = urls.map(({ id, url }) => limit(() => scrapUrl(id, url))); const results = await Promise.allSettled(inputs); @@ -68,9 +62,7 @@ export async function scrap(urls) { if (module === require.main) { const t0 = Date.now(); - // cf issue https://github.com/SocialGouv/cdtn-admin/issues/707 - // fetchFeed(FEED_URL) - loadFeed() + fetchFeed(FEED_URL) .then(scrap) .then((fiches) => { console.log(`done in ${Math.round((Date.now() - t0) / 1000)} sec`); diff --git a/src/fetch-data/scrapUrl.js b/src/fetch-data/scrapUrl.js index 282cd806..03423c97 100644 --- a/src/fetch-data/scrapUrl.js +++ b/src/fetch-data/scrapUrl.js @@ -1,12 +1,15 @@ import got from "got"; import { JSDOM } from "jsdom"; +import { generateHeaders } from "./generateHeaders"; import { parseDom } from "./parseDom"; export async function scrapUrl(id, url) { + const headers = generateHeaders(); try { let response = await got(url, { followRedirect: true, + headers, http2: true, retry: 3, }); @@ -15,6 +18,7 @@ export async function
scrapUrl(id, url) { try { response = await got(redirectUrl, { followRedirect: true, + headers, http2: true, retry: 3, });