Skip to content

Commit

Permalink
Last commit
Browse files Browse the repository at this point in the history
  • Loading branch information
SidorenkovIvan committed Apr 5, 2020
1 parent 64651b9 commit d0514b5
Show file tree
Hide file tree
Showing 2 changed files with 104 additions and 79 deletions.
1 change: 1 addition & 0 deletions package.json
Expand Up @@ -28,6 +28,7 @@
"tress": "^1.1.4"
},
"devDependencies": {
"md5": "^2.2.1",
"needle": "^2.3.3",
"nodemon": "^2.0.2",
"puppeteer": "^2.1.1",
Expand Down
182 changes: 103 additions & 79 deletions scraper.js
Expand Up @@ -5,6 +5,7 @@ let sqlite3 = require("sqlite3").verbose();
let fs = require("fs");
const fetch = require('node-fetch');
const FileType = require('file-type');
const md5 = require('md5');

let siteUrl = "https://tea4u.by";
let category = {};
Expand Down Expand Up @@ -88,107 +89,130 @@ let q = tress(function(url, callback) {
}
callback(); //вызываем callback в конце
});
}, 10); // запускаем 10 параллельных потоков
}, 1); // запускаем 10 параллельных потоков !!!

// This handler runs once the queue has run out of links.
// It rebuilds the category, product and category_product tables in the SQLite
// file DB_NAME from the in-memory `category` and `product` objects, then dumps
// `product` to ./data.json.
// NOTE(review): the body below appears twice - first with a plain serialize()
// callback, then with an async one that also builds an md5 hash. This looks
// like a rendered commit diff whose +/- markers were lost (removed lines
// followed by added lines) - confirm before treating it as runnable code.
q.drain = function () {
let dataBase = new sqlite3.Database(DB_NAME);
// --- first copy: plain serialize() callback, no hashing ---
dataBase.serialize(() => {
dataBase.run('DROP TABLE IF EXISTS category');
dataBase.run('CREATE TABLE category (' +
' "category_id" INTEGER NOT NULL UNIQUE, ' +
' "title" TEXT, ' +
' "url" TEXT UNIQUE, ' +
' "parent_id" INTEGER DEFAULT 0, ' +
' PRIMARY KEY("category_id") );');
let stmt = dataBase.prepare('INSERT INTO category VALUES (?, ?, ?, ?)');
// one row per key of the `category` object
for (const u in category)
stmt.run(category[u].id, category[u].title, category[u].url, category[u].parentId);
stmt.finalize();
dataBase.run('DROP TABLE IF EXISTS product');
dataBase.run('CREATE TABLE product (' +
' "product_id" INTEGER NOT NULL UNIQUE, ' +
' "imgUrl" TEXT, ' +
' "productTitle" TEXT, ' +
' "productUrl" TEXT, ' +
' "description" TEXT, ' +
' "images" TEXT, ' +
' "code" TEXT, ' +
' PRIMARY KEY("product_id") );');
let stmt1 = dataBase.prepare('INSERT INTO product VALUES (?, ?, ?, ?, ?, ?, ?)');
// [category_id, product_id] pairs collected here, inserted after the product rows
let foreign = [];
for (const c in product) {
// one pair per entry in prods; throws if p.categoryURL is not a key of `category`
for (let p of product[c].prods) {
foreign.push([category[p.categoryURL].id, product[c].id]);
}
// the product row takes imgUrl/title/url from the first prods entry;
// the object key `c` goes into the "code" column
stmt1.run(product[c].id,
product[c].prods[0].imgUrl,
product[c].prods[0].productTitle,
product[c].prods[0].productUrl,
product[c].description,
product[c].images.join('|'),
c);
}
stmt1.finalize();
dataBase.run('DROP TABLE IF EXISTS category_product');
dataBase.run('CREATE TABLE category_product (' +
' "category_id" INTEGER NOT NULL, ' +
' "product_id" INTEGER NOT NULL, ' +
' FOREIGN KEY("product_id") REFERENCES "product"("product_id"), ' +
' PRIMARY KEY("category_id","product_id"), ' +
' FOREIGN KEY("category_id") REFERENCES "category"("category_id") );');
let stmt2 = dataBase.prepare('INSERT INTO category_product VALUES (?, ?)');
for (const f of foreign)
stmt2.run(f[0], f[1]);
stmt2.finalize();
dataBase.close();
// --- second copy: async serialize() callback that also hashes the written data ---
// tableString accumulates the comma-joined values of every inserted row
// (join() default separator), with nothing added between rows
let tableString = '';
// NOTE(review): the callback is async, so serialize() gets control back at the
// first await below; presumably statements issued after that await are no longer
// covered by serialize mode - confirm against the node-sqlite3 documentation.
dataBase.serialize(async () => {
dataBase.run('DROP TABLE IF EXISTS category');
dataBase.run('CREATE TABLE category (' +
' "category_id" INTEGER NOT NULL UNIQUE, ' +
' "title" TEXT, ' +
' "url" TEXT UNIQUE, ' +
' "parent_id" INTEGER DEFAULT 0, ' +
' PRIMARY KEY("category_id") );');
let stmt = dataBase.prepare('INSERT INTO category VALUES (?, ?, ?, ?)');
for (const u in category) {
// the same array is appended to the hash input and passed to run() as the bind values
const stmtData = [category[u].id, category[u].title, category[u].url, category[u].parentId];
tableString += stmtData.join();
stmt.run(stmtData);
}
stmt.finalize();
dataBase.run('DROP TABLE IF EXISTS product');
dataBase.run('CREATE TABLE product (' +
' "product_id" INTEGER NOT NULL UNIQUE, ' +
' "imgUrl" TEXT, ' +
' "productTitle" TEXT, ' +
' "productUrl" TEXT, ' +
' "description" TEXT, ' +
' "images" TEXT, ' +
' "code" TEXT, ' +
' PRIMARY KEY("product_id") );');
let stmt1 = dataBase.prepare('INSERT INTO product VALUES (?, ?, ?, ?, ?, ?, ?)');
// [category_id, product_id] pairs collected here, inserted after the product rows
let foreign = [];
for (const c in product) {
// one pair per entry in prods; throws if p.categoryURL is not a key of `category`
for (let p of product[c].prods) {
foreign.push([category[p.categoryURL].id, product[c].id]);
}
// the product row takes imgUrl/title/url from the first prods entry;
// the object key `c` goes into the "code" column
const stmt1Data = [product[c].id,
product[c].prods[0].imgUrl,
product[c].prods[0].productTitle,
product[c].prods[0].productUrl,
product[c].description,
product[c].images.join('|'),
c];
tableString += stmt1Data.join();
stmt1.run(stmt1Data);
}
stmt1.finalize();
dataBase.run('DROP TABLE IF EXISTS category_product');
dataBase.run('CREATE TABLE category_product (' +
' "category_id" INTEGER NOT NULL, ' +
' "product_id" INTEGER NOT NULL, ' +
' FOREIGN KEY("product_id") REFERENCES "product"("product_id"), ' +
' PRIMARY KEY("category_id","product_id"), ' +
' FOREIGN KEY("category_id") REFERENCES "category"("category_id") );');
let stmt2 = dataBase.prepare('INSERT INTO category_product VALUES (?, ?)');
for (const f of foreign) {
const stmt2Data = [f[0], f[1]];
tableString += stmt2Data.join();
stmt2.run(stmt2Data);
}
stmt2.finalize();


// hash_table receives a single row: the md5 of everything accumulated in tableString
dataBase.run('DROP TABLE IF EXISTS hash_table');
dataBase.run('CREATE TABLE hash_table ("hash" TEXT NOT NULL, PRIMARY KEY("hash") );');
let stmt3 = dataBase.prepare('INSERT INTO hash_table VALUES (?)');
// remember the length before image data is appended, for the log lines below
let l = tableString.length;
// when GRAB_IMGS is set, storeImages() writes the image table on the same
// connection and returns tableString extended with the image rows
if (GRAB_IMGS) tableString = await storeImages(dataBase, tableString);
console.log('length before grab images ' + l);
console.log('length after grab images ' + tableString.length);
const hash = md5(tableString);
console.log(`md5 hash ${hash}`);
stmt3.run(hash);
stmt3.finalize();
dataBase.close();
});

// executed synchronously right after serialize() is called, outside its callback
fs.writeFileSync('./data.json', JSON.stringify(product, null, 4));
//console.log("total unique products " + prodId);
// NOTE(review): argument-less call; it matches the zero-parameter storeImages()
// variant and looks like the removed side of the diff - with the
// storeImages(db, tblStr) variant `db` would be undefined here. Confirm.
if (GRAB_IMGS) storeImages();
};

// Downloads `url` with fetch, reads the whole response body into a buffer and
// base64-encodes it.
// NOTE(review): two headers and two return statements are present
// (imgToBase64 returning the base64 string, imgToBase64BLOB returning
// [base64, buf]); this looks like the old and new side of a diff shown
// together - confirm which one is current.
// NOTE(review): the response status is never checked before the body is
// encoded, so a non-image error body would presumably be encoded as well.
async function imgToBase64(url) {
async function imgToBase64BLOB(url) {
let response = await fetch(url);
let buf = await response.buffer();
//let type = await FileType.fromBuffer(buf);
//let prefix = "data:" + type.mime + ";base64,";
let base64 = buf.toString("base64");
// first variant: the base64 string only
return base64;
// second variant: the base64 string together with the raw buffer
return [base64, buf];
}

// Recreates the "image" table and stores, for every product, the image found
// at prods[0].imgUrl as base64, fetching each distinct URL only once.
// NOTE(review): two variants are interleaved below, which looks like the old
// and new side of a diff shown together - confirm which one is current:
//   - storeImages(): opens its own connection to DB_NAME, runs inside
//     db.serialize() and closes the connection itself;
//   - storeImages(db, tblStr): uses the caller's connection, appends each
//     inserted row to tblStr and returns the extended string; it does not
//     close db.
function storeImages() {
let db = new sqlite3.Database(DB_NAME);
db.serialize(async () => {
db.run('DROP TABLE IF EXISTS image');
db.run('CREATE TABLE "image" (' +
'"url" TEXT NOT NULL UNIQUE,' +
'"base64" TEXT NOT NULL,' +
'PRIMARY KEY("url") );');
async function storeImages(db, tblStr) {
db.run('DROP TABLE IF EXISTS image');
db.run('CREATE TABLE "image" (' +
'"url" TEXT NOT NULL UNIQUE,' +
'"base64" TEXT NOT NULL,' +
'"raw" BLOB,' +
'PRIMARY KEY("url") );');

let stmt = db.prepare('INSERT INTO image VALUES (?, ?)');
let stmt = db.prepare('INSERT INTO image VALUES (?, ?, ?)');

// url -> number of products whose first image is that url
let imgMap = new Map();
let imgMap = new Map();

for (const code in product) {
const url = product[code].prods[0].imgUrl;
if (imgMap.has(url)) {
// already stored: only bump the usage counter
imgMap.set(url, imgMap.get(url) + 1);
}
else {
// first occurrence: download and insert
imgMap.set(url, 1);
console.log('fetch ' + url);
const base64 = await imgToBase64(url);
stmt.run(url, base64);
}
for (const code in product) {
const url = product[code].prods[0].imgUrl;
if (imgMap.has(url)) {
// already stored: only bump the usage counter
imgMap.set(url, imgMap.get(url) + 1);
}
else {
// first occurrence: download and insert
imgMap.set(url, 1);
console.log('fetch ' + url);
const data = await imgToBase64BLOB(url);
// NOTE(review): the "raw" column is always written as null and data[1] is
// never used - confirm whether storing the buffer was intended.
// join() renders the null as an empty string in tblStr.
const stmtData = [url, data[0], null];
tblStr += stmtData.join();
stmt.run(stmtData);
}
}

console.log(imgMap);
//console.log(imgMap);

stmt.finalize();
db.close();
});
stmt.finalize();

return tblStr;
}

// add the links to the categories from the menu to the queue
// NOTE(review): start() is defined outside this view; the call appears twice,
// presumably the old and new side of the diff for the file's last line - confirm.
start();
start();

0 comments on commit d0514b5

Please sign in to comment.