In [1]:
import multiprocessing as mp
import os
import time
from multiprocessing.pool import ThreadPool
from typing import Dict, List, Optional, Tuple

import jsonlines
import pandas as pd
import requests
import torch
import torch.nn as nn
from torch.utils.data import Dataset
from tqdm import tqdm

In [6]:
class Sent2textDataset(Dataset):
    """Text table plus the images it references, downloaded at construction time.

    Building an instance reads the image links from a JSON-lines file,
    downloads every image referenced by the text table into a local folder
    and drops the text rows whose image could not be fetched.

    Attributes:
        text_data: pandas DataFrame with the texts; must have an ``id_img`` column.
        img_links: mapping ``id_img -> (image, url)`` read from the link file.
        path_to_img: folder the downloaded images are written to.
        clastering_mode, transform: stored as given; not used by this class yet.
    """

    # Seconds to wait for one HTTP request, so a single dead host cannot
    # block the whole download indefinitely.
    REQUEST_TIMEOUT = 10

    def __init__(self, path_t_csv, path_i_json, path_i_folder=None,
                 clastering_mode=False, transform=None, n_workers=16):
        """
        path_t_csv - path to the csv file with the texts, or an already
                     loaded DataFrame with the same content
        path_i_json - path to the JSON-lines file with the image links
        path_i_folder - folder the downloaded images are saved to; when it is
                        not given, the folder containing ``path_i_json`` is used
        clastering_mode - the texts are split into clusters
        transform - optional transform (stored only)
        n_workers - number of parallel download workers
        """
        # Accept both a path (as the parameter name suggests) and a DataFrame.
        if isinstance(path_t_csv, (str, os.PathLike)):
            self.text_data = pd.read_csv(path_t_csv)
        else:
            self.text_data = path_t_csv

        self.clastering_mode = clastering_mode
        self.transform = transform

        # Needs self.text_data: only links referenced by the table are kept.
        self.img_links = self._load_json_links(path_i_json)

        if path_i_folder:
            self.path_to_img = path_i_folder
        else:
            # The link file itself is not a directory; fall back to its folder.
            self.path_to_img = os.path.dirname(os.fspath(path_i_json)) or "."

        self._load_imgs(list(self.img_links.items()), n_workers=n_workers)

    def __len__(self):
        """Return the number of distinct ``id_img`` values left in ``text_data``."""
        return len(self.text_data.id_img.unique())

    def __getitem__(self, idx):
        """Item access is not implemented yet."""
        raise NotImplementedError("Sent2textDataset.__getitem__ is not implemented yet")

    def _load_json_links(self, data_path: str, only_i_from_csv=True) -> Dict[int, Tuple[str, str]]:
        """Read ``(image, url)`` pairs from a JSON-lines file.

        Args:
            data_path: path to the JSON-lines file; every object must have the
                keys ``'image'`` and ``'url'``.
            only_i_from_csv: when true, keep only the entries whose position in
                the file equals an ``id_img`` value of ``text_data``.

        Returns:
            A dict mapping the position of an entry in the file to its
            ``(image, url)`` pair.

        Raises:
            IndexError: an ``id_img`` value is beyond the end of the file.
        """
        with jsonlines.open(data_path) as reader:
            pairs = [(obj['image'], obj['url']) for obj in tqdm(reader)]

        if only_i_from_csv:
            # Download only the images that belong to the csv.
            return {int(idx): pairs[idx] for idx in self.text_data.id_img.unique()}

        # Same dict shape as the filtered branch, so callers can use .items().
        return dict(enumerate(pairs))

    def _worker(self, task):
        """Pool entry point: download one image, return its id or ``None``."""
        return self._load_img(task)

    def _load_img(self, links: Tuple[int, Tuple[str, str]]) -> Optional[int]:
        """Download one image and store it as ``<path_to_img>/<image>.jpg``.

        Args:
            links: ``(id, (image, url))`` as produced by ``img_links.items()``.

        Returns:
            The id on success; ``None`` when the request failed (connection
            error, timeout, bad URL or a non-success HTTP status).
        """
        img_id, (img_name, url) = links
        try:
            response = requests.get(url, timeout=self.REQUEST_TIMEOUT)
            # Without this an error page would be saved as if it were an image.
            response.raise_for_status()
        except requests.exceptions.RequestException:
            print(f"Oyy, miss {img_id}")
            return None

        with open(os.path.join(self.path_to_img, f"{img_name}.jpg"), "wb") as img:
            img.write(response.content)
        return img_id

    def _load_imgs(self, links: List[Tuple[int, Tuple[str, str]]], n_workers=1) -> bool:
        """Download all images and drop the text rows whose image is missing.

        Args:
            links: ``(id, (image, url))`` items to download.
            n_workers: size of the worker pool.

        Returns:
            Always ``True``. ``text_data`` is replaced by a filtered frame that
            contains only rows whose ``id_img`` was downloaded.
        """
        wanted_ids = set(self.text_data.id_img.unique())
        total = len(wanted_ids)

        # The target folder must exist before any worker opens a file in it.
        os.makedirs(self.path_to_img, exist_ok=True)

        # Downloads are I/O-bound, so threads are enough; unlike a process pool
        # they do not need to pickle the instance, which is fragile for classes
        # defined in an interactive session.
        with ThreadPool(n_workers) as pool:
            fetched_ids = set(pool.map(self._worker, links))

        # Failed downloads show up as None in fetched_ids and are ignored here.
        missing_ids = wanted_ids - fetched_ids
        if missing_ids:
            keep = ~self.text_data.id_img.isin(list(missing_ids))
            self.text_data = self.text_data[keep]

        downloaded = total - len(missing_ids)
        assert downloaded == len(self.text_data.id_img.unique())

        print(f"Download photo {downloaded} with {total} finish")
        return True

In [None]:
# Link file and the folder that receives the downloaded images.
path_i_json = "data/images.json"
path_i_folder = "data/images"

# Slice 1000:2000 of the preprocessed text table.
# Despite the "path" prefix, this name holds a DataFrame, not a path.
path_t_csv_no_clastr = pd.read_csv("data/preproc_text700.csv")[1000:2000]

# Constructing the dataset starts the image download.
ds = Sent2textDataset(
    path_t_csv=path_t_csv_no_clastr,
    path_i_json=path_i_json,
    path_i_folder=path_i_folder,
)

In [14]:
# Show the text table kept by the dataset; the constructor drops the rows
# whose image id was not returned by the download step.
ds.text_data

Unnamed: 0,text_proc,id_img
1000,одноэтажный дом до м,941193
1001,соколов промокод на подвеску,941193
1002,диета при белок теряющей энтеропатии после опе...,941193
1003,чудо природа с овощ,1370697
1004,сапоги молочного цвет с ч рный колготки,1370697
...,...,...
1995,схема акпп тойота камри грация,4731110
1996,структура занятия в детский сад по чтение худо...,4731110
1997,теплый пол в ванной фотобанк,4731110
1998,тотемы животное этный тату,1444968


In [15]:
# Look up the (image, url) pair stored for id_img 941193.
ds.img_links[941193]

(1095883,
 'https://club.season.ru/uploads/post/33693/274/post-33693-1351611274.jpg')

In [7]:
# Delete the image folder together with everything in it.
# NOTE(review): this cell is placed below the dataset-construction cell but has
# an earlier execution count than the cells that inspect `ds` — presumably it
# is meant to run before the download; confirm and move it up.
!rm -rf  data/images

In [8]:
# List the data directory, then create the empty image folder that the
# dataset writes the downloaded files into ("data/images").
!ls data/
!mkdir data/images

images.json  metadata.json  preproc_text700.csv
