Skip to content

Commit

Permalink
fix: add a token to bypass the bot protection (#368)
Browse files Browse the repository at this point in the history
  • Loading branch information
m-maillot committed Feb 1, 2022
1 parent 76a7924 commit 0ebf8bf
Show file tree
Hide file tree
Showing 5 changed files with 51 additions and 13 deletions.
3 changes: 2 additions & 1 deletion .github/workflows/release.yml
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@ on:
repository_dispatch:
types: manual_release
workflow_dispatch:

jobs:
release:
name: Release
Expand Down Expand Up @@ -45,6 +45,7 @@ jobs:

- name: Start
env:
TOKEN_MT: ${{ secrets.TOKEN_MT }}
DATAFILLER_URL: ${{ secrets.DATAFILLER_URL }}
run: yarn start

Expand Down
27 changes: 27 additions & 0 deletions src/fetch-data/__tests__/scrapUrl.test.js
Original file line number Diff line number Diff line change
Expand Up @@ -41,22 +41,49 @@ got.mockImplementation((url) => {

parseDom.mockImplementation(() => ({ title: "Yo" }));

// Reference to the real environment object, kept so it can be put back later.
const OLD_ENV = process.env;

// Before each test: swap in a disposable copy of the environment, so that
// assignments such as process.env.TOKEN_MT do not leak between tests, and
// reset Jest's module registry.
beforeEach(() => {
  process.env = Object.assign({}, OLD_ENV);
  jest.resetModules();
});

// Once the whole file has run, reinstate the original environment object.
afterAll(() => {
  process.env = OLD_ENV;
});

test("scrapUrl should throw an error if no TOKEN variable has been set", async () => {
  // The environment copied in beforeEach may already define TOKEN_MT (CI
  // secret, developer shell); remove it so the missing-token path is the one
  // actually exercised.
  delete process.env.TOKEN_MT;
  // Given an Error instance, toThrow compares the complete message, which
  // keeps the exact-match strictness without a try/catch and a conditional expect.
  await expect(scrapUrl("id", "http://url.ok")).rejects.toThrow(
    new Error(
      "Token (cgtoken) is required to fetch the data. This token is provided by the travail-emploi.gouv.fr team."
    )
  );
});

// Happy path: with a token set, scrapUrl resolves with whatever parseDom returned.
test("scrapUrl should return formated data", async () => {
  process.env.TOKEN_MT = "TOKEN";
  await expect(scrapUrl("id", "http://url.ok")).resolves.toEqual({
    title: "Yo",
  });
});

// A failure on the redirect target must surface as a "Wrong redirectUrl" rejection.
test("scrapUrl should throw if redirected url failed", async () => {
  process.env.TOKEN_MT = "TOKEN";
  const pending = scrapUrl("id", "url.wrong-redirect");
  await expect(pending).rejects.toThrow(/Wrong redirectUrl/);
});

// A failing request must surface as an "HTTP Error" rejection.
test("scrapUrl should throw if url failed", async () => {
  process.env.TOKEN_MT = "TOKEN";
  const pending = scrapUrl("id", "url.http.fail");
  await expect(pending).rejects.toThrow(/HTTP Error/);
});

test("scrap should throw if parse fail", async () => {
process.env.TOKEN_MT = "TOKEN";
await expect(scrapUrl("id", "url.parse.fail")).rejects.toThrow(
/Parsing Error/
);
Expand Down
14 changes: 14 additions & 0 deletions src/fetch-data/generateHeaders.js
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
/**
 * Build the headers for a request, adding the token cookie used to bypass
 * the bot protection.
 *
 * The token is read from the TOKEN_MT environment variable on every call.
 *
 * @param {Object} [extras] - Additional headers to include. A `Cookie` key in
 *   `extras` is overridden by the token cookie.
 * @returns {Object} A new object holding `extras` plus the `Cookie` header.
 * @throws {Error} If TOKEN_MT is unset, empty, or contains only whitespace.
 */
export function generateHeaders(extras) {
  // Secrets pasted into CI settings often carry a trailing newline, which is
  // not a legal header character; strip surrounding whitespace up front and
  // treat a blank value the same as a missing one.
  const token = (process.env.TOKEN_MT || "").trim();
  if (!token) {
    throw new Error(
      "Token (cgtoken) is required to fetch the data. This token is provided by the travail-emploi.gouv.fr team."
    );
  }
  return {
    ...extras,
    Cookie: `cgtoken=${token};`,
  };
}
16 changes: 4 additions & 12 deletions src/fetch-data/index.js
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@ import got from "got";
import pLimit from "p-limit";
import path from "path";

import data from "./api.data.json";
import { generateHeaders } from "./generateHeaders";
import { scrapUrl } from "./scrapUrl";

const FEED_URL = "https://travail-emploi.gouv.fr/?page=oseo_json";
Expand All @@ -12,21 +12,15 @@ const limit = pLimit(10);

export async function fetchFeed(url) {
const response = await got.post(url, {
headers: {
headers: generateHeaders({
"Content-Type": "application/json",
},
}),
http2: true,
retry: 3,
});
const { fiches: feed } = JSON.parse(response.body);
return feed;
}

/**
 * Provide the feed from the imported `data` object instead of fetching it.
 *
 * @returns {Promise<*>} Resolves with the `fiches` entry of `data`.
 */
export async function loadFeed() {
  // An async function already wraps its return value in a promise.
  return data.fiches;
}

export async function scrap(urls) {
const inputs = urls.map(({ id, url }) => limit(() => scrapUrl(id, url)));
const results = await Promise.allSettled(inputs);
Expand Down Expand Up @@ -68,9 +62,7 @@ export async function scrap(urls) {

if (module === require.main) {
const t0 = Date.now();
// cf issue https://github.com/SocialGouv/cdtn-admin/issues/707
// fetchFeed(FEED_URL)
loadFeed()
fetchFeed(FEED_URL)
.then(scrap)
.then((fiches) => {
console.log(`done in ${Math.round((Date.now() - t0) / 1000)} sec`);
Expand Down
4 changes: 4 additions & 0 deletions src/fetch-data/scrapUrl.js
Original file line number Diff line number Diff line change
@@ -1,12 +1,15 @@
import got from "got";
import { JSDOM } from "jsdom";

import { generateHeaders } from "./generateHeaders";
import { parseDom } from "./parseDom";

export async function scrapUrl(id, url) {
const headers = generateHeaders();
try {
let response = await got(url, {
followRedirect: true,
headers,
http2: true,
retry: 3,
});
Expand All @@ -15,6 +18,7 @@ export async function scrapUrl(id, url) {
try {
response = await got(redirectUrl, {
followRedirect: true,
headers,
http2: true,
retry: 3,
});
Expand Down

0 comments on commit 0ebf8bf

Please sign in to comment.