Run spider, copy data to local folder.

Some files might be empty: this is not a problem here, but it must be taken into account in the next steps.

    - READ : *.py   - utils/functions.py
    - READ : *.jl   - scraped data from another folder on the computer (scraping projects)
    - READ : *.json - data/processed_data (ID of already stored data in json files => keep only new ones)
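    - WRITE: *.jl   - data/raw_data (main and history copies of the scraped files)
    - WRITE: *.csv  - data/new_tmp_data (rows whose ID is not already stored)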

# 0. General

In [49]:
import os
import json
import datetime
import numpy as np
import pandas as pd
from utils import functions as f

In [50]:
# Switch: when True, the spiders are re-run before the scraped files are copied.
scrapping = False

# Timestamp label of this run, as produced by the project helper.
now = f.get_now()
print(now)

2019-05-31_10-28-34


# 1. Scraping

In [51]:
from shutil import copyfile

In [52]:
def run_spider(link, spider_name, file_name, max_page=None, start_url=None):
    """Run a scrapy spider from its project folder and print whether it succeeded.

    Parameters
    ----------
    link : str
        Folder of the scrapy project; the crawl is launched with this folder
        as working directory.
    spider_name : str
        Name of the spider handed to ``scrapy crawl``.
    file_name : str
        Output feed handed to ``scrapy crawl -o`` (relative to ``link``).
    max_page : optional
        Forwarded to the spider as ``-a max_page=...`` when truthy.
    start_url : str, optional
        Forwarded to the spider as ``-a start_url=...`` when truthy.

    Returns
    -------
    None
        The outcome is only reported through a printed message.
    """
    # Local import so this cell does not depend on another cell's imports.
    import subprocess

    # NOTE(review): a previously existing output file is not removed here;
    # check how `scrapy crawl -o` treats an existing feed before relying on it.

    # Build the command as an argument list: no shell is involved, so spaces,
    # quotes, `&`, `|`, `$` ... in the folder or in the URL need no escaping.
    cmd = ['scrapy', 'crawl', str(spider_name), '-o', str(file_name)]
    if max_page:
        cmd += ['-a', 'max_page={}'.format(max_page)]
    if start_url:
        cmd += ['-a', 'start_url={}'.format(start_url)]

    # Run spider from its project folder
    try:
        rep = subprocess.run(cmd, cwd=link).returncode
    except OSError:
        # Missing folder or missing `scrapy` executable: report it as a failed
        # run, like the non-zero exit status of the former shell command.
        rep = -1

    # Display result
    if rep == 0:
        print("Spider {} successfully run".format(spider_name))
    else:
        print("Couldn't run spider {}".format(spider_name))

In [53]:
def get_immo_data(source, source_link, dest_folder='data/raw_data', now=None):
    """Copy the scraped file of one source into the local data folders.

    The file ``<source_link>/<source>_immo.jl`` is copied twice:
    to ``<dest_folder>/history/<source>_<now>.jl`` (history copy) and to
    ``<dest_folder>/raw_<source>.jl`` (working copy, overwritten at each run).

    Parameters
    ----------
    source : str
        Short name of the source, used to build the file names.
    source_link : str
        Folder containing the scraped ``.jl`` file.
    dest_folder : str
        Local destination folder.
    now : str, optional
        Label used in the history file name. When omitted it is computed with
        ``f.get_now()`` at call time.

    Raises
    ------
    OSError
        If the source file cannot be read or a destination cannot be written.
    """
    # Resolve the timestamp at call time: a default written in the signature
    # is evaluated only once, when the function is defined, and would then be
    # reused (stale) by every later call.
    if now is None:
        now = f.get_now()

    # Compute source path
    path_source = '{}/{}_immo.jl'.format(source_link, source)

    # Make sure the destination folders exist (also creates dest_folder)
    history_folder = '{}/history'.format(dest_folder)
    os.makedirs(history_folder, exist_ok=True)

    # Save data for history
    path_dest_history = '{}/{}_{}.jl'.format(history_folder, source, now)
    copyfile(path_source, path_dest_history)

    # Save tmp data for processing pipeline
    path_dest_pipeline = '{}/raw_{}.jl'.format(dest_folder, source)
    copyfile(path_source, path_dest_pipeline)

    print('> Files {} saved.'.format(source))
    

In [54]:
# Value passed to each spider as its `max_page` argument.
max_page = 3

# Start URLs of the three spiders. Adjacent string literals are concatenated
# by Python, so each value is a single URL.
lbc_url = (
    'https://www.leboncoin.fr/recherche/?category=9'
    '&locations=Nantes,Rennes,Reims_51100,Bordeaux,Talence_33400,Pessac_33600,M%C3%A9rignac_33700'
    '&real_estate_type=2&immo_sell_type=old&price=75000-125000'
)
pv_url = (
    'https://www.paruvendu.fr/immobilier/annonceimmofo/liste/listeAnnonces'
    '?nbp=0&tt=1&tbApp=1&tbDup=1&tbChb=1&tbLof=1&at=1&nbp0=99'
    '&px0=90000&px1=111000&pa=FR&ddlFiltres=nofilter'
    '&codeINSEE=44XX0,35XX0,51454,33281,33318,33522,'
)
sl_url = (
    'https://www.seloger.com/list.htm?enterprise=0&natures=1'
    '&places=[{ci:330522}|{ci:330318}|{ci:330281}|{ci:510454}'
    '|{ci:350238}|{ci:440109}|{ci:330063}|{ci:330039}]'
    '&price=50000/150000&projects=2&proximity=0,10&qsversion=1.0&types=1'
)

In [55]:
# Folders of the scrapy projects. The root can be overridden with the
# SCRAPING_ROOT environment variable so the notebook is not tied to one
# machine; the default reproduces the original paths exactly.
scraping_root = os.environ.get(
    'SCRAPING_ROOT',
    '/Users/thibaudlamothe/OneDrive - Capgemini/Documents/Python_scripts/05_Scraping',
)
lbc_link = scraping_root + '/LBC/LBC'
sl_link = scraping_root + '/SL/SL/'
pv_link = scraping_root + '/ParuVendu/ParuVendu/'

# Optionally refresh the scraped files before copying them.
if scrapping:
    run_spider(lbc_link, 'spiderLBC', 'lbc_immo.jl', max_page=max_page, start_url=lbc_url)
    run_spider(pv_link, 'spiderPV', 'pv_immo.jl', max_page=max_page, start_url=pv_url)
    run_spider(sl_link, 'spiderSL', 'sl_immo.jl', max_page=max_page, start_url=sl_url)

# Pass the run timestamp explicitly so the three history copies carry the
# same label as the one printed at the top of the notebook.
get_immo_data('lbc', lbc_link, now=now)
get_immo_data('pv', pv_link, now=now)
get_immo_data('sl', sl_link, now=now)

# Local working copies read by the next section.
lbc_file = 'data/raw_data/raw_{}.jl'.format('lbc')
pv_file = 'data/raw_data/raw_{}.jl'.format('pv')
sl_file = 'data/raw_data/raw_{}.jl'.format('sl')

> Files lbc saved.
> Files pv saved.
> Files sl saved.


# 2. Selecting

In [56]:
def filter_ids(df, col_name, list_id):
    """Return the rows of ``df`` whose ``col_name`` value is listed in ``list_id``.

    The original index labels and all columns are kept.
    """
    keep = df[col_name].isin(list_id)
    return df.loc[keep]

In [57]:
def get_ids(source):
    """Return the ids stored for ``source`` as a list.

    Reads ``data/processed_data/list_<source>_id.json`` (a JSON object) and
    returns its values, in file order.
    """
    with open('data/processed_data/list_{}_id.json'.format(source), 'r') as handle:
        stored = json.load(handle)
    return [value for value in stored.values()]

In [58]:
# Load the three local working copies, one DataFrame per source.
df_lbc, df_pv, df_sl = (
    f.read_jl_file(path) for path in (lbc_file, pv_file, sl_file)
)

# Ids found in the freshly scraped data (the id column differs per source).
new_lbc_ids, new_pv_ids, new_sl_ids = (
    df_lbc['id_'].values,
    df_pv['annonce'].values,
    df_sl['annonce'].values,
)

In [59]:
# The selection only applies once a processed dataset already exists.
processed_path = 'data/processed_data/process_data.csv'
is_processed = os.path.isfile(processed_path)

if is_processed:
    print('> is_processed')
    # Get old ids
    lbc_ids = get_ids('lbc')
    pv_ids = get_ids('pv')
    sl_ids = get_ids('sl')

    # Keep only the rows whose id is not already stored. A negated `isin`
    # does this in one vectorized pass instead of testing every new id
    # against the whole list of old ids.
    print('Avant', df_lbc.shape, df_pv.shape, df_sl.shape)
    df_lbc = df_lbc.loc[~df_lbc['id_'].isin(lbc_ids)]
    df_pv = df_pv.loc[~df_pv['annonce'].isin(pv_ids)]
    df_sl = df_sl.loc[~df_sl['annonce'].isin(sl_ids)]
    print('Après', df_lbc.shape, df_pv.shape, df_sl.shape)

print('> Selection ok.')

> is_processed
Avant (805, 14) (441, 10) (3183, 17)
Après (0, 14) (0, 10) (0, 17)
> Selection ok.


# 3. Saving

In [60]:
# Write the selected rows of each source to the temporary folder.
tmp_folder = 'data/new_tmp_data'
# `to_csv` does not create missing folders, so make sure the target exists.
os.makedirs(tmp_folder, exist_ok=True)
df_lbc.to_csv('{}/new_lbc.csv'.format(tmp_folder), header=True, index=False)
df_sl.to_csv('{}/new_sl.csv'.format(tmp_folder), header=True, index=False)
df_pv.to_csv('{}/new_pv.csv'.format(tmp_folder), header=True, index=False)
print('> New data saved.')

> New data saved.


In [62]:
# Preview only the first rows instead of rendering the whole frame.
df_lbc.head()

Unnamed: 0,auteur,categorie,code_postal,critere,date_absolue,description,id_,is_msg,is_num,nb_pict,prix,titre,url,ville
