# Scraping car images
 
As a side effect of the data scraping we collected links to car images, so now we can download the images and use them in CV tasks.

In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime
import requests
import os
import errno
import time
import glob

In [5]:
# List the entries of the source_data directory.
# NOTE(review): this path ('../../data/1/source_data/') differs from the
# './source_data/' prefix used by the read/write calls further down -- confirm
# both point at the same directory.
glob.glob('../../data/1/source_data/*')

['../../data/1/source_data/source_frame_all_offer2020-01-22 12:37.pckl',
 '../../data/1/source_data/total_frame_valid_links_2020-01-23 11:07.pckl',
 '../../data/1/source_data/broken_index_df_2020-01-29 06:19.csv',
 '../../data/1/source_data/broken_index_df_2020-01-29 17:56.csv',
 '../../data/1/source_data/broken_index_df_2020-01-28 18:57.csv',
 '../../data/1/source_data/clean_data_offer_used 2020-09-04 13:17.csv',
 '../../data/1/source_data/clean_data_offer_used 2020-03-02 20:04.csv',
 '../../data/1/source_data/broken_index_df_2020-01-30 12:32.csv',
 '../../data/1/source_data/page_2020-01-22 09:48.csv',
 '../../data/1/source_data/broken_index_df_2020-01-30 13:21.csv',
 '../../data/1/source_data/region_data_ 2020-03-03 17:08.csv',
 '../../data/1/source_data/clean_data_offer_used 2020-03-03 17:14.csv',
 '../../data/1/source_data/clean_offer_frame_all2020-01-22 12:40.pckl',
 '../../data/1/source_data/region_data_ 2020-03-03 16:57.csv',
 '../../data/1/source_data/broken_index_df_2020-01-30

In [2]:
# Load the pickled offers frame; the cells below read its `img_links` column.
# NOTE: pd.read_pickle can execute arbitrary code while unpickling, so only
# load files that come from a trusted source.
offer_data = pd.read_pickle('./source_data/clean_offer_frame_all2020-01-22 12:40.pckl')

In [16]:
# check valid links
def validate_links(img_links):
    """Return the usable image URLs contained in ``img_links``.

    Links that contain the ``'_crpd'`` marker are skipped. Every remaining
    link is prefixed with ``'https:'``. The input order is preserved.

    Parameters
    ----------
    img_links : iterable of str
        Scheme-less image links of one offer.

    Returns
    -------
    list of str
        The kept links, each starting with ``'https:'``.
    """
    return ['https:' + link for link in img_links if '_crpd' not in link]


# Work on a copy of the link column so the derived columns never touch offer_data.
links_frame = offer_data[['img_links']].copy()

# Per-offer link statistics: number of scraped links, the links that pass
# validate_links, and how many of those there are.
links_frame['total_img_count'] = links_frame['img_links'].apply(len)
links_frame['valid_img_links'] = links_frame['img_links'].apply(validate_links)
links_frame['valid_img_count'] = links_frame['valid_img_links'].apply(len)

# offer_data already carries `img_links`. Concatenating links_frame as-is would
# give total_frame two columns with that same name (so total_frame['img_links']
# would return a DataFrame instead of a Series). Drop the duplicate from the
# right-hand operand only; links_frame itself keeps all four columns.
total_frame = pd.concat(
    [offer_data, links_frame.drop(columns=['img_links'])],
    axis=1,
)

# Persist a timestamped snapshot of the combined frame.
snapshot_stamp = datetime.today().strftime('%Y-%m-%d %H:%M')
total_frame.to_pickle('./source_data/total_frame_valid_links_' + snapshot_stamp + '.pckl')

In [11]:
# Number of valid image links summed over all offers.
total_link = total_frame.valid_img_count.sum()
total_link

566760

## Download images
Web scraping is not a stable process, so sometimes an image fails to download. We record the indexes of the offers whose downloads failed and retry them iteratively.

In [75]:
# Index values of the cars for which an image download failed.
# The download loop below appends an index once per failed image, so the list
# may contain duplicates; it is de-duplicated before being saved.
brocken_index = []

In [139]:
%%time

# Seconds to wait on a single HTTP request; without a timeout requests.get can
# block forever on a stalled connection.
REQUEST_TIMEOUT = 60

# Iterate over offers that have at least one valid link. The slice skips the
# first 232120 of them (presumably already fetched by an earlier run -- confirm).
for i in links_frame[links_frame['valid_img_count'] != 0].index[232120:]:
    dir_path = './img/' + str(i) + '/'
    # exist_ok=True makes this a no-op when the directory already exists.
    os.makedirs(dir_path, exist_ok=True)
    # `i` is an index *label*, so the row must be looked up by label (.loc);
    # positional .iloc only matches when the index is the default 0..n-1 range.
    for j, link in enumerate(links_frame['valid_img_links'].loc[i]):
        try:
            img = requests.get(link, timeout=REQUEST_TIMEOUT)
            # Treat HTTP error responses as failures instead of silently
            # saving the error page under a .webp name.
            img.raise_for_status()
            # The context manager guarantees the file is flushed and closed.
            with open(dir_path + str(j) + '.webp', 'wb') as img_file:
                img_file.write(img.content)
        except Exception as e:
            # Best effort: report the failure, remember the offer for a later
            # retry, and continue with the next image.
            print('Error!', link)
            brocken_index.append(i)
            print(e)

CPU times: user 4min 9s, sys: 11.8 s, total: 4min 21s
Wall time: 31min 28s


In [141]:
# De-duplicate the failed indexes (first-seen order) and store them in a
# timestamped CSV so they can be retried later.
unique_broken = pd.Series(brocken_index).unique()
broken_index_df = pd.DataFrame(unique_broken, columns=['broken_index'])
csv_stamp = datetime.today().strftime('%Y-%m-%d %H:%M')
broken_index_df.to_csv('./source_data/broken_index_df_' + csv_stamp + '.csv')

In [160]:
%%time
# download broken img

# Failures of THIS pass only. Appending straight to `brocken_index` would keep
# every previously failed index in that list even after its retry succeeded,
# so the retry set could never shrink.
still_broken = []

for i in broken_index_df.broken_index:
    dir_path = './img/' + str(i) + '/'
    print(dir_path)
    # exist_ok=True makes this a no-op when the directory already exists.
    os.makedirs(dir_path, exist_ok=True)
    # `i` is an index *label*, so the row must be looked up by label (.loc);
    # positional .iloc only matches when the index is the default 0..n-1 range.
    for j, link in enumerate(links_frame['valid_img_links'].loc[i]):
        try:
            print(link)
            # A timeout keeps one stalled connection from hanging the whole pass.
            img = requests.get(link, timeout=60)
            # Treat HTTP error responses as failures instead of silently
            # saving the error page under a .webp name.
            img.raise_for_status()
            # The context manager guarantees the file is flushed and closed.
            with open(dir_path + str(j) + '.webp', 'wb') as img_file:
                img_file.write(img.content)
        except Exception as e:
            # Best effort: report the failure, remember the offer, keep going.
            print('Error!', link)
            still_broken.append(i)
            print(e)

# Rebind only after the pass has finished: if the loop is interrupted,
# `brocken_index` still holds the previous (superset) list and nothing is lost.
# Re-running the "save broken index" cell now stores just the remaining failures.
brocken_index = still_broken

In [19]:
%%time
# check total downloading 
# One sub-directory per offer; count the .webp files inside each and add them up.
offer_dirs = glob.glob('./img/*')
total_img = sum(len(glob.glob(offer_dir + '/*.webp')) for offer_dir in offer_dirs)
# Expected number of images (valid links) followed by the number found on disk.
print(total_link)
print(total_img)


566760
566760
CPU times: user 42.2 s, sys: 31.6 s, total: 1min 13s
Wall time: 2h 56min 44s


## All images downloaded!
The path structure of the image dataset is `./img/<directory index>/<picture number>.webp`.


## Tasks that can be solved with this dataset
1. Classification - predict the producer, body type, etc.
2. Regression - predict mileage, price