In [1]:
import pandas as pd
import numpy as np
import string
import os
import seaborn as sns
import json 
import matplotlib.pyplot as plt
import functions as f

In [None]:
# Root folder of the local data sets.
# An already-defined DATA_PATH environment variable takes precedence; the
# hard-coded location is only a fallback for the original author's machine.
# os.path.join(..., '') guarantees a trailing separator so that sub-paths can
# be appended to the value by plain string concatenation.
os.environ['DATA_PATH'] = os.path.join(
    os.environ.get('DATA_PATH') or '/Users/thibaud/Documents/data/', '')

In [2]:
def load_dpt(dept_list, year_file='2018'):
    """Load the raw per-department CSV files into a single DataFrame.

    Each file is read from
    ``$DATA_PATH/DVF/data_per_dept/raw_<year_file>/dept_<dept>.csv``.

    Parameters
    ----------
    dept_list : iterable
        Department codes; one CSV file is read per code.
    year_file : str, default '2018'
        Suffix of the ``raw_<year_file>`` folder to read from.

    Returns
    -------
    pandas.DataFrame
        The per-department frames concatenated in the order of ``dept_list``.
        Row indices of the individual files are kept, so index values may
        repeat across departments.

    Raises
    ------
    KeyError
        If the ``DATA_PATH`` environment variable is not set.
    FileNotFoundError
        If one of the department files does not exist.
    ValueError
        If ``dept_list`` is empty (there is nothing to concatenate).
    """
    print('> Loading...')
    # os.path.join rather than string concatenation: the path is correct
    # whether or not DATA_PATH ends with a separator. The folder does not
    # depend on the department, so it is computed once outside the loop.
    raw_dir = os.path.join(os.environ['DATA_PATH'], 'DVF', 'data_per_dept',
                           'raw_{}'.format(year_file))
    df_list = []
    for dept in dept_list:
        print(' - Loading dept {}'.format(dept))
        csv_path = os.path.join(raw_dir, 'dept_{}.csv'.format(dept))
        df_list.append(pd.read_csv(csv_path, low_memory=False))

    print('> Concatenating...')
    return pd.concat(df_list)

In [3]:
def cleaning(df, keeping_rate=0.66):
    """Drop mostly-empty columns and normalise the key columns.

    Steps, in order:

    1. drop every column whose number of null values is strictly greater
       than ``keeping_rate * len(df)``;
    2. drop the rows where ``'Valeur fonciere'`` is null;
    3. parse ``'Date mutation'`` as datetimes (unparseable values -> NaT);
    4. reduce ``'Valeur fonciere'`` to its integer part (text before the
       decimal comma), stored as int64;
    5. convert ``'No voie'`` to int64, null values becoming 0 (skipped when
       the column was removed in step 1).

    Parameters
    ----------
    df : pandas.DataFrame
        Raw frame containing at least ``'Valeur fonciere'`` and
        ``'Date mutation'``. It is not modified.
    keeping_rate : float, default 0.66
        Maximum tolerated fraction of null values for a column to be kept.

    Returns
    -------
    pandas.DataFrame
        A new, cleaned frame.

    Raises
    ------
    KeyError
        If ``'Valeur fonciere'`` or ``'Date mutation'`` is missing, including
        when it was dropped in step 1 for being mostly empty.
    ValueError
        If a ``'Valeur fonciere'`` or ``'No voie'`` value is not numeric.
    """
    print('> Cleaning...')
    empty_ser = df.isnull().sum()
    empty_max = df.shape[0] * keeping_rate
    empty_cols = empty_ser[empty_ser > empty_max].index.tolist()

    df_small = df.drop(empty_cols, axis=1)
    # .copy() so the column assignments below write to an independent frame
    # instead of a slice of `df` (avoids SettingWithCopyWarning).
    df_small = df_small[df_small['Valeur fonciere'].notnull()].copy()

    # NOTE(review): if the source dates are written DD/MM/YYYY, dayfirst=True
    # is needed here to avoid day/month swaps -- confirm against the raw files.
    df_small['Date mutation'] = pd.to_datetime(df_small['Date mutation'], errors='coerce')

    # Keep only the part before the decimal comma. Going through str makes this
    # work even when pandas already parsed the column as numeric.
    integer_part = df_small['Valeur fonciere'].astype(str).str.split(',').str[0]
    df_small['Valeur fonciere'] = pd.to_numeric(integer_part).astype('int64')

    # Vectorised replacement for the former np.int-based apply: np.int was
    # removed in NumPy 1.24. The column may itself have been dropped above.
    if 'No voie' in df_small.columns:
        df_small['No voie'] = pd.to_numeric(df_small['No voie']).fillna(0).astype('int64')

    print(' - Deleted : {} empty columns ({} % empty)'.format(len(empty_cols), int(keeping_rate*100)))
    return df_small

In [6]:
def save_clean(df, dept_number, year_file='2018'):
    """Write a cleaned department frame to the processed-data folder.

    The file is written to
    ``$DATA_PATH/DVF/data_per_dept/processed_<year_file>/dept_<dept_number>.csv``
    without the index column. The target folder is created when missing.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to save.
    dept_number : str or int
        Department code used in the file name.
    year_file : str, default '2018'
        Suffix of the ``processed_<year_file>`` folder.

    Returns
    -------
    bool
        Always ``True`` once the file has been written; failures surface as
        exceptions.

    Raises
    ------
    KeyError
        If the ``DATA_PATH`` environment variable is not set.
    """
    # os.path.join rather than string concatenation: the path is correct
    # whether or not DATA_PATH ends with a separator.
    out_dir = os.path.join(os.environ['DATA_PATH'], 'DVF', 'data_per_dept',
                           'processed_{}'.format(year_file))
    # to_csv does not create missing folders; do it here so a first run succeeds.
    os.makedirs(out_dir, exist_ok=True)
    df.to_csv(os.path.join(out_dir, 'dept_{}.csv'.format(dept_number)), index=False)
    return True

In [5]:
def change_col_names(df):
    """Return `df` with project-normalised, then shortened, column names.

    The frame first goes through ``f.renameDfCol`` (project helper; presumably
    it normalises the raw headers to snake_case -- confirm in ``functions``),
    then a fixed set of long names is replaced by short aliases. Columns not
    listed in the mapping keep the name produced by the helper.
    """
    short_names = {
        'nombre_pieces_principales': 'nb_piece',
        'surface_reelle_bati': 'surface',
        'valeur_fonciere': 'prix',
        'code_departement': 'dept',
        'commune': 'ville',
    }
    normalised = f.renameDfCol(df)
    return normalised.rename(columns=short_names)

In [7]:
# Department codes to process in this run; list other codes here
# (e.g. '33', '35', '44', '51') to process them as well.
dept_list = ['75', '92']
for dept in dept_list:
    # One department at a time: load -> clean -> rename columns -> save.
    # save_clean returns True once the CSV has been written.
    is_stored = (load_dpt([dept])
                 .pipe(cleaning)
                 .pipe(change_col_names)
                 .pipe(save_clean, dept)
                )
    if is_stored:
        print('> Dept {} correctly stored'.format(dept))
    
    

> Loading...
 - Loading dept 75
> Concatenating...
> Cleaning...
 - Deleted : 21 empty columns (66 % empty)
> Dept 75 correctly stored
> Loading...
 - Loading dept 92
> Concatenating...
> Cleaning...
 - Deleted : 23 empty columns (66 % empty)
> Dept 92 correctly stored
