Skip to content

Commit

Permalink
Merge pull request #1 from SidorenkovIvan/image-store
Browse files Browse the repository at this point in the history
fetch and store images to database
  • Loading branch information
SidorenkovIvan committed Apr 3, 2020
2 parents ff5f550 + 529bf94 commit 111b712
Show file tree
Hide file tree
Showing 2 changed files with 77 additions and 12 deletions.
4 changes: 3 additions & 1 deletion package.json
Expand Up @@ -5,7 +5,7 @@
"main": "scraper.js",
"scripts": {
"test": "echo \"Error: no test specified\" && exit 1",
"start" : "node scraper.js"
"start": "node scraper.js"
},
"keywords": [
"node",
Expand All @@ -16,9 +16,11 @@
"dependencies": {
"chalk": "^3.0.0",
"cheerio": "^1.0.0-rc.3",
"file-type": "^14.1.4",
"http": "0.0.0",
"iconv-lite": "^0.5.1",
"jsdom": "^16.2.1",
"node-fetch": "^2.6.0",
"sqlite3": "^4.1.1",
"tress": "^1.1.4"
},
Expand Down
85 changes: 74 additions & 11 deletions scraper.js
Expand Up @@ -3,11 +3,16 @@ let cheerio = require("cheerio");
let tress = require("tress");
let sqlite3 = require("sqlite3").verbose();
let fs = require("fs");
const fetch = require('node-fetch');
const FileType = require('file-type');

// Root URL of the site being scraped; start() issues its request against it.
let siteUrl = "https://tea4u.by";
// NOTE(review): nothing in the visible code appends to this array any more — confirm it is still needed.
let results = [];
// Scraped categories, looked up by category URL; entries are read for id, title, url and parentId.
let category = {};
// Scraped products, looked up by product code; each entry exposes a `prods` array of scraped records.
let product = {};
// NOTE(review): presumably a running counter of unique products — confirm against the queue worker.
let prodId = 0;
// SQLite database file opened by both q.drain and storeImages().
const DB_NAME = 'data.sqlite';
// When true, q.drain finishes by calling storeImages().
const GRAB_IMGS = true;


function start() {
needle.get(siteUrl, (err, res) => {
Expand Down Expand Up @@ -66,13 +71,6 @@ let q = tress(function(url, callback) {
let productUrl = a.attr("href");
let categoryURL = productUrl.substr(0, productUrl.lastIndexOf('/'));
let categoryTitle = category[categoryURL].title;
results.push([
imgUrl,
productTitle,
productUrl,
categoryURL,
categoryTitle
]);
let productCode = $(el).find(".additional .code > span").text();
let p = {imgUrl, productTitle, productUrl, categoryURL, categoryTitle};
if (!(productCode in product)) {
Expand All @@ -94,17 +92,18 @@ let q = tress(function(url, callback) {

// this function runs once the queue has run out of links
q.drain = function () {
let dataBase = new sqlite3.Database('data.sqlite');
let dataBase = new sqlite3.Database(DB_NAME);
dataBase.serialize(() => {
dataBase.run('DROP TABLE IF EXISTS category');
dataBase.run('CREATE TABLE category (' +
' "category_id" INTEGER NOT NULL UNIQUE, ' +
' "title" TEXT, ' +
' "url" TEXT UNIQUE, ' +
' "parent_id" INTEGER DEFAULT 0, ' +
' PRIMARY KEY("category_id") );');
let stmt = dataBase.prepare('INSERT INTO category VALUES (?, ?, ?)');
let stmt = dataBase.prepare('INSERT INTO category VALUES (?, ?, ?, ?)');
for (const u in category)
stmt.run(category[u].id, category[u].title, category[u].url);
stmt.run(category[u].id, category[u].title, category[u].url, category[u].parentId);
stmt.finalize();
dataBase.run('DROP TABLE IF EXISTS product');
dataBase.run('CREATE TABLE product (' +
Expand Down Expand Up @@ -146,7 +145,71 @@ q.drain = function () {
});
fs.writeFileSync('./data.json', JSON.stringify(product, null, 4));
//console.log("total unique products " + prodId);
if (GRAB_IMGS) storeImages();
};

// enqueue the category links taken from the site menu
start();

/**
 * Downloads an image and returns it encoded as a base64 data URI.
 *
 * The MIME type is detected from the downloaded bytes via FileType, not taken
 * from the URL.
 *
 * @param {string} url - URL of the image to download.
 * @returns {Promise<string>} A string of the form `data:<mime>;base64,<payload>`.
 * @throws {Error} If the server answers with a non-2xx status, or if the
 *     content type of the downloaded bytes cannot be detected.
 */
async function imgToBase64(url) {
    const response = await fetch(url);
    // Without this check an HTML error page would be processed as if it were image data.
    if (!response.ok) {
        throw new Error(`Failed to fetch ${url}: HTTP ${response.status}`);
    }

    const buf = await response.buffer();

    // FileType.fromBuffer resolves to undefined when no known signature matches,
    // so fail with a message naming the URL instead of a TypeError on `.mime`.
    const type = await FileType.fromBuffer(buf);
    if (!type) {
        throw new Error(`Cannot detect file type of ${url}`);
    }

    return `data:${type.mime};base64,${buf.toString("base64")}`;
}

// const u = "https://tea4u.by/image/cache/catalog/goods/S01735/IMG_9115-330x220.jpg";
// imgToBase64(u).then(function (base64) {
// console.log(base64);
// });

/**
 * Rebuilds the `image` table and fills it with one row per distinct image URL.
 *
 * For every entry in `product`, the image URL of its first record is used.
 * Each distinct URL is downloaded once through imgToBase64() and stored as
 * (url, base64 data URI). Downloads run one after another.
 *
 * A failed download is logged and skipped so that one bad image does not
 * abort the rest of the run. The prepared statement and the database handle
 * are always released.
 *
 * NOTE(review): q.drain appears to use its own connection to the same file and
 * calls this function right after queuing its writes — confirm the two
 * connections cannot collide (SQLITE_BUSY).
 *
 * @returns {Promise<void>} Resolves once every image has been processed and
 *     the close request has been queued.
 */
async function storeImages() {
    const db = new sqlite3.Database(DB_NAME);

    // serialize() with a callback only covers statements queued synchronously
    // inside it, i.e. before the first await. Calling it without a callback
    // makes serialized mode sticky for this connection.
    db.serialize();

    db.run('DROP TABLE IF EXISTS image');
    db.run('CREATE TABLE "image" (' +
        '"url" TEXT NOT NULL UNIQUE,' +
        '"base64" TEXT NOT NULL,' +
        'PRIMARY KEY("url") );');

    const stmt = db.prepare('INSERT INTO image VALUES (?, ?)');

    // url -> number of products referencing it; also serves as the "already seen" set.
    const imgMap = new Map();

    try {
        for (const code in product) {
            const prods = product[code].prods;
            const url = prods && prods[0] && prods[0].imgUrl;
            if (!url) {
                // Nothing to download for a product without an image URL.
                continue;
            }

            if (imgMap.has(url)) {
                imgMap.set(url, imgMap.get(url) + 1);
                continue;
            }

            imgMap.set(url, 1);
            console.log('fetch ' + url);
            try {
                const base64 = await imgToBase64(url);
                stmt.run(url, base64);
            } catch (err) {
                console.error('failed to store ' + url + ': ' + err.message);
            }
        }

        console.log(imgMap);
    } finally {
        stmt.finalize();
        db.close();
    }
}

0 comments on commit 111b712

Please sign in to comment.