In [1]:
import pandas as pd
import numpy as np

# Loading and reducing raw data

In [None]:
def process_raw_data_into_file(columns, filename='valeursfoncieres-', appendix='.txt', sep='|', years=range(2014,2019), output='main.csv'):
    """Merge the yearly raw files into one reduced comma-separated file.

    For every year the file ``filename + str(year) + appendix`` is read and
    cut down to ``columns``. The yearly pieces are stacked in the order given
    by ``years`` and written to ``output`` without the index.

    Parameters
    ----------
    columns : list of str
        Column labels kept from every yearly file.
    filename : str
        Path prefix placed before the year.
    appendix : str
        Suffix placed after the year.
    sep : str
        Field separator of the raw files.
    years : iterable of int
        Years to load; one file is read per element.
    output : str
        Destination path of the merged file.

    Raises
    ------
    KeyError
        If a yearly file does not contain every label in ``columns``.
    """
    yearly_frames = []
    for year in years:
        print(year)  # progress indicator, one line per file
        temp = pd.read_csv(filename + str(year) + appendix, sep=sep, low_memory=False)
        yearly_frames.append(temp[columns])

    # Concatenate once at the end: growing a DataFrame inside the loop copies
    # all previously loaded rows again on every iteration.
    if yearly_frames:
        df = pd.concat(yearly_frames, axis=0, join='outer', ignore_index=True)
    else:
        # pd.concat rejects an empty list; keep writing an empty frame instead.
        df = pd.DataFrame()
    df.to_csv(path_or_buf=output, sep=',', index=False)
    
# Fields kept from the raw files; the later cleaning steps work on these.
columns = [
    'Date mutation',
    'Nature mutation',
    'Valeur fonciere',
    'Code postal',
    'Code type local',
    'Surface reelle bati',
    'Nombre pieces principales',
]
process_raw_data_into_file(columns=columns)

# Loading main.csv

In [2]:
df = pd.read_csv('main.csv', sep=',', low_memory=False)
# 'Code commune' is not among the columns written by the reduction step above,
# so it is only present in a main.csv produced with a different column list.
# errors='ignore' keeps this cell runnable in both situations instead of
# raising KeyError when the column is absent.
df = df.drop(columns=['Code commune'], errors='ignore')
df.head(5)

Unnamed: 0,Date mutation,Nature mutation,Valeur fonciere,Code postal,Code type local,Surface reelle bati,Nombre pieces principales
0,09/01/2014,Vente,25150000,1310.0,1.0,147.0,5.0
1,09/01/2014,Vente,17450000,1000.0,2.0,80.0,2.0
2,09/01/2014,Vente,17450000,1000.0,3.0,0.0,0.0
3,02/01/2014,Vente,15750000,1440.0,,,
4,02/01/2014,Vente,15750000,1440.0,1.0,103.0,4.0


In [3]:
def enrich_datetime(df, column='Date mutation', date_format="%d/%m/%Y"):
    """Parse a date column and add ``year``, ``month`` and ``day`` columns.

    With the default format, 14/06/2019 becomes day = 14, month = 6,
    year = 2019. The frame is modified in place and also returned.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing ``column``.
    column : str
        Name of the column holding the dates; it is replaced by its parsed
        datetime version.
    date_format : str
        ``strftime``-style format used to parse ``column``.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object with the three extra columns.
    """
    # Honour the caller's format instead of a hard-coded one.
    df[column] = pd.to_datetime(df[column], format=date_format)

    # The .dt accessor extracts the components for the whole column at once.
    parsed = df[column].dt
    df['year'] = parsed.year
    df['month'] = parsed.month
    df['day'] = parsed.day
    return df

# Parse the sale dates and add the derived year / month / day columns.
df = enrich_datetime(df, column='Date mutation')
df.head()

Unnamed: 0,Date mutation,Nature mutation,Valeur fonciere,Code postal,Code type local,Surface reelle bati,Nombre pieces principales,year,month,day
0,2014-01-09,Vente,25150000,1310.0,1.0,147.0,5.0,2014,1,9
1,2014-01-09,Vente,17450000,1000.0,2.0,80.0,2.0,2014,1,9
2,2014-01-09,Vente,17450000,1000.0,3.0,0.0,0.0,2014,1,9
3,2014-01-02,Vente,15750000,1440.0,,,,2014,1,2
4,2014-01-02,Vente,15750000,1440.0,1.0,103.0,4.0,2014,1,2


In [4]:
def natur_mutation_vente_only(df):
    """Keep only sales and remove the 'Nature mutation' column.

    Rows whose 'Nature mutation' is anything other than 'Vente' (exchanges,
    auctions and such) are dropped. The frame is modified in place and also
    returned; the index is not reset.
    """
    not_a_sale = df['Nature mutation'] != 'Vente'
    labels_to_remove = df.index[not_a_sale]
    df.drop(index=labels_to_remove, inplace=True)
    # After filtering, the column has a single value and carries no information.
    df.drop(columns='Nature mutation', inplace=True)
    return df

# Keep sales only; the 'Nature mutation' column is removed by the same call.
df = natur_mutation_vente_only(df=df)
df.head()

Unnamed: 0,Date mutation,Valeur fonciere,Code postal,Code type local,Surface reelle bati,Nombre pieces principales,year,month,day
0,2014-01-09,25150000,1310.0,1.0,147.0,5.0,2014,1,9
1,2014-01-09,17450000,1000.0,2.0,80.0,2.0,2014,1,9
2,2014-01-09,17450000,1000.0,3.0,0.0,0.0,2014,1,9
3,2014-01-02,15750000,1440.0,,,,2014,1,2
4,2014-01-02,15750000,1440.0,1.0,103.0,4.0,2014,1,2


In [5]:
def only_houses_and_apartment_data(df):
    """Keep only rows whose 'Code type local' is 1 or 2.

    Rows with a missing 'Code type local' are dropped first, the column is
    cast to int64, and every row with a code other than 1 or 2 is removed.
    The frame is modified in place and also returned; the index is neither
    sorted nor reset here.
    """
    type_column = 'Code type local'
    df.dropna(subset=[type_column], inplace=True)
    df[type_column] = df[type_column].astype('int64')
    wanted = df[type_column].isin([1, 2])
    df.drop(index=df.index[~wanted], inplace=True)
    return df

# Restrict the data to the two building type codes kept by the filter (1 and 2).
df = only_houses_and_apartment_data(df=df)
df.head()

Unnamed: 0,Date mutation,Valeur fonciere,Code postal,Code type local,Surface reelle bati,Nombre pieces principales,year,month,day
0,2014-01-09,25150000,1310.0,1,147.0,5.0,2014,1,9
1,2014-01-09,17450000,1000.0,2,80.0,2.0,2014,1,9
4,2014-01-02,15750000,1440.0,1,103.0,4.0,2014,1,2
6,2014-01-07,9000000,1000.0,2,61.0,2.0,2014,1,7
8,2014-01-11,37000000,1250.0,1,220.0,8.0,2014,1,11


In [6]:
def clean_nan_from_essential_columns(df):
    """Drop rows with missing essential values, then sort and renumber.

    A row is removed when any of 'Nombre pieces principales',
    'Date mutation', 'Valeur fonciere' or 'Code postal' is missing. The
    remaining rows are sorted ascending by the frame's first column and the
    index is rebuilt from 0. The frame is modified in place and also returned.
    """
    essential = ['Nombre pieces principales', 'Date mutation', 'Valeur fonciere', 'Code postal']
    df.dropna(subset=essential, inplace=True)

    # The sort key is chosen by position, not by name.
    sort_key = df.columns[0]
    df.sort_values(by=sort_key, inplace=True)
    df.reset_index(drop=True, inplace=True)
    return df

# Remove rows lacking an essential value, then sort and renumber the rows.
df = clean_nan_from_essential_columns(df=df)
df.head(5)

Unnamed: 0,Date mutation,Valeur fonciere,Code postal,Code type local,Surface reelle bati,Nombre pieces principales,year,month,day
0,2014-01-01,24000000,83640.0,1,50.0,2.0,2014,1,1
1,2014-01-01,4300000,16700.0,1,87.0,3.0,2014,1,1
2,2014-01-01,24000000,83640.0,1,50.0,2.0,2014,1,1
3,2014-01-02,7300000,9220.0,1,74.0,4.0,2014,1,2
4,2014-01-02,15000000,80500.0,1,92.0,5.0,2014,1,2


In [7]:
def cut_of_coma(df, clmn='Valeur fonciere'):
    """Cut the decimal comma and everything after it from a column's values.

    Every value is converted to ``str`` and truncated at its first comma, so
    '240000,00' becomes '240000'. Values without a comma are kept as their
    string form. The column therefore holds strings afterwards. The frame is
    modified in place and also returned.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing ``clmn``.
    clmn : str
        Name of the column to clean.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object.
    """
    # Cut at the comma itself rather than removing a fixed three characters:
    # the fixed cut mangled values with a different number of decimals
    # (e.g. '1234,5' became '123').
    df[clmn] = [str(value).partition(',')[0] for value in df[clmn]]
    return df

def into_int(df, clmns=['Valeur fonciere', 'Code postal', 'Surface reelle bati', 'Nombre pieces principales']):
    """Cast the given numeric columns (strings or floats) to int64.

    Columns are converted one at a time with ``errors='ignore'``, so a column
    that cannot be converted is left as it was instead of raising. The frame
    is modified in place and also returned.
    """
    for column_name in clmns:
        as_integer = df[column_name].astype('int64', errors='ignore')
        df[column_name] = as_integer
    return df

# Strip the decimal part from the prices first, then cast the numeric columns to integers.
df = into_int(cut_of_coma(df))
df.head(5)

Unnamed: 0,Date mutation,Valeur fonciere,Code postal,Code type local,Surface reelle bati,Nombre pieces principales,year,month,day
0,2014-01-01,240000,83640,1,50,2,2014,1,1
1,2014-01-01,43000,16700,1,87,3,2014,1,1
2,2014-01-01,240000,83640,1,50,2,2014,1,1
3,2014-01-02,73000,9220,1,74,4,2014,1,2
4,2014-01-02,150000,80500,1,92,5,2014,1,2


In [9]:
# Columns of the notary database used to calculate additional variables for the model.
# .copy() keeps notary_data independent of later changes to df.
notary_data = df.loc[
    :,
    ['Code postal', 'year', 'Code type local', 'Nombre pieces principales', 'Valeur fonciere'],
].copy()
notary_data.head(5)

Unnamed: 0,Code postal,year,Code type local,Nombre pieces principales,Valeur fonciere
0,83640,2014,1,2,240000
1,16700,2014,1,3,43000
2,83640,2014,1,2,240000
3,9220,2014,1,4,73000
4,80500,2014,1,5,150000
