# 0. General

In [1]:
import datetime
import json
import os
import subprocess

import pandas as pd

from utils import functions as f

In [2]:
now = f.get_now()
print(now)

2019-05-30_18-46-44


# 1. Scraping

In [3]:
from shutil import copyfile

In [4]:
def run_spider(link, spider_name, file_name):
    """Run a Scrapy spider from its project folder and report the outcome.

    Parameters
    ----------
    link : str
        Folder used as working directory for the ``scrapy crawl`` command.
    spider_name : str
        Name of the spider handed to ``scrapy crawl``.
    file_name : str
        Output file handed to scrapy's ``-o`` option.

    Returns
    -------
    bool
        True when the command exited with status 0, False otherwise
        (including when the folder or the ``scrapy`` executable is missing).
    """

    # Argument list + cwd instead of a shell string: folder, spider and file
    # names containing spaces or shell metacharacters are passed verbatim.
    cmd = ['scrapy', 'crawl', spider_name, '-o', file_name]

    # Run spider
    try:
        succeeded = subprocess.run(cmd, cwd=link).returncode == 0
    except OSError:
        # Raised when `link` does not exist or `scrapy` cannot be found
        succeeded = False

    # Display result
    if succeeded:
        print("Spider {} successfully run".format(spider_name))
    else:
        print("Couldn't run spider {}".format(spider_name))

    return succeeded

In [5]:
def get_immo_data(source, source_link, dest_folder='data/raw_data', now=None):
    """Copy a spider's ``<source>_immo.jl`` output into the destination folder.

    Two copies are written: a timestamped one kept for history and a
    ``raw_<source>.jl`` one that is overwritten on every call.

    Parameters
    ----------
    source : str
        Short source name used in the file names (e.g. ``'lbc'``).
    source_link : str
        Folder containing ``<source>_immo.jl``.
    dest_folder : str
        Folder receiving both copies; created if it does not exist.
    now : str, optional
        Timestamp used in the history file name. When omitted it is obtained
        from ``f.get_now()`` at call time.
    """

    # Resolve the timestamp per call: a default of f.get_now() in the
    # signature would be evaluated only once, when the function is defined.
    if now is None:
        now = f.get_now()

    # Compute source path (os.path.join tolerates a trailing slash in source_link)
    path_source = os.path.join(source_link, '{}_immo.jl'.format(source))

    # Make sure the destination exists before copying
    if dest_folder:
        os.makedirs(dest_folder, exist_ok=True)

    # Save data for history
    path_dest_history = os.path.join(dest_folder, '{}_{}.jl'.format(source, now))
    copyfile(path_source, path_dest_history)

    # Save tmp data for processing pipeline
    path_dest_pipeline = os.path.join(dest_folder, 'raw_{}.jl'.format(source))
    copyfile(path_source, path_dest_pipeline)

    print('> Files {} saved.'.format(source))
    

In [6]:
# Folders from which each spider is launched and where its .jl output is read.
# NOTE(review): these are machine-specific absolute paths.
lbc_link = '/Users/thibaudlamothe/OneDrive - Capgemini/Documents/Python_scripts/05_Scraping/LBC/LBC'
sl_link = '/Users/thibaudlamothe/OneDrive - Capgemini/Documents/Python_scripts/05_Scraping/SL/SL/'
pv_link = '/Users/thibaudlamothe/OneDrive - Capgemini/Documents/Python_scripts/05_Scraping/ParuVendu/ParuVendu/'

# When False the spiders are skipped; the copy step below still runs.
scrapping = True

# (source, folder, spider name, spider output file), in processing order
spider_jobs = [
    ('lbc', lbc_link, 'spiderLBC', 'lbc_immo.jl'),
    ('pv', pv_link, 'spiderPV', 'pv_immo.jl'),
    ('sl', sl_link, 'spiderSL', 'sl_immo.jl'),
]

if scrapping:
    for _source, _folder, _spider, _output in spider_jobs:
        run_spider(_folder, _spider, _output)

# Copy every spider output into the raw data folder
for _source, _folder, _spider, _output in spider_jobs:
    get_immo_data(_source, _folder)

# Pipeline copies read by the selection step
lbc_file = 'data/raw_data/raw_{}.jl'.format('lbc')
pv_file = 'data/raw_data/raw_{}.jl'.format('pv')
sl_file = 'data/raw_data/raw_{}.jl'.format('sl')

Spider spiderLBC successfully run


Spider spiderPV successfully run


Spider spiderSL successfully run
> Files lbc saved.
> Files pv saved.
> Files sl saved.


# 2. Selecting

In [7]:
def filter_ids(df, col_name, list_id):
    """Return the rows of ``df`` whose ``col_name`` value is in ``list_id``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to filter.
    col_name : str
        Column holding the identifiers.
    list_id : list-like
        Identifiers to keep.

    Returns
    -------
    pandas.DataFrame
        The matching rows, with their original index labels.
    """
    keep_mask = df[col_name].isin(list_id)
    return df.loc[keep_mask]

In [8]:
# Load the three raw .jl files, in the order lbc, pv, sl
df_lbc, df_pv, df_sl = (
    f.read_jl_file(raw_path) for raw_path in (lbc_file, pv_file, sl_file)
)

# Identifier column of each source: 'id_' for lbc, 'annonce' for pv and sl
new_lbc_ids = df_lbc['id_'].values
new_pv_ids = df_pv['annonce'].values
new_sl_ids = df_sl['annonce'].values

In [9]:
processed_path = 'data/processed_data/processed_data.csv'
is_processed = os.path.isfile(processed_path)

if is_processed:
    print('> is_processed')

    # Ids already handled by a previous run.
    # NOTE(review): get_lbc_ids / get_pv_ids / get_sl_ids are not defined in
    # the visible notebook ("functions to write"); this branch raises
    # NameError as soon as the processed file exists unless they are provided.
    lbc_ids = get_lbc_ids()
    pv_ids = get_pv_ids()
    sl_ids = get_sl_ids()

    # Keep only the rows whose id was not seen before
    unseen_lbc_ids = [id_ for id_ in new_lbc_ids if id_ not in lbc_ids]
    df_lbc = filter_ids(df_lbc, 'id_', unseen_lbc_ids)

    unseen_pv_ids = [id_ for id_ in new_pv_ids if id_ not in pv_ids]
    df_pv = filter_ids(df_pv, 'annonce', unseen_pv_ids)

    unseen_sl_ids = [id_ for id_ in new_sl_ids if id_ not in sl_ids]
    df_sl = filter_ids(df_sl, 'annonce', unseen_sl_ids)

print('> Selection ok.')

> Selection ok.


# 3. Saving

In [10]:
tmp_folder = 'data/new_tmp_data'

# (frame, path pattern) pairs, written in this order: lbc, sl, pv
csv_targets = [
    (df_lbc, '{}/new_lbc.csv'),
    (df_sl, '{}/new_sl.csv'),
    (df_pv, '{}/new_pv.csv'),
]

for _frame, _pattern in csv_targets:
    _frame.to_csv(_pattern.format(tmp_folder), header=True, index=False)

print('> New data saved.')

> New data saved.
