In [1]:
# Imports
## Import data preparation libraries
import pandas as pd 
import numpy as np 
from datetime import datetime

##Import data enrichment library
import requests 

## Import database libraries 
from sqlalchemy import create_engine, Table, Column, Integer, String, Float, Boolean, Date, MetaData, ForeignKey
from sqlalchemy.orm import relationship, backref, sessionmaker
from sqlalchemy.ext.declarative import declarative_base

In [2]:
# Open design question for the database layout: either a "transactions" model
# (one table for houses, one for flats) or an "addresses" model.

# Stream the pipe-separated export in 100 000-row chunks and keep only the rows
# whose 'Code type local' equals 1 before concatenating them.
iter_csv = pd.read_csv(
    "../data/valeursfoncieres-2021.txt",
    sep='|',
    iterator=True,
    chunksize=100000,
    low_memory=False,
)
df_maison = pd.concat(
    [chunk.loc[chunk['Code type local'].eq(1)] for chunk in iter_csv]
)
df_maison.info()

In [2]:
import pandas as pd


class get_data:
    """Load a delimited text file into a DataFrame, chunk by chunk.

    The constructor only records where and how to read the file (path,
    separator, chunk size). ``read_csv`` does the actual read and keeps the
    rows whose filtering column equals the requested value.
    """

    def __init__(self,
                 path="../data/valeursfoncieres-2021.txt",
                 sep="|",
                 chunksize=100000):
        self.path, self.sep, self.chunksize = path, sep, chunksize

    def read_csv(self, filtering_column='Code type local', filter=1):
        """Read the file in chunks and return the rows matching the filter.

        Args:
            filtering_column: label of the column the filter is applied to.
            filter: value a row must hold in ``filtering_column`` to be kept.

        Returns:
            The concatenation of the kept rows; it is also stored on
            ``self.df``.
        """
        reader = pd.read_csv(self.path,
                             sep=self.sep,
                             iterator=True,
                             chunksize=self.chunksize,
                             low_memory=False)
        kept_parts = []
        for part in reader:
            # Filter each chunk as it is read so that only the matching rows
            # are held until the final concatenation.
            matches = part[filtering_column] == filter
            kept_parts.append(part[matches])
        self.df = pd.concat(kept_parts)
        return self.df

    def enrichissement_coordinates(self, df):
        """Placeholder for coordinate enrichment: not implemented, returns None."""
        return None


In [3]:
# Load the file with the loader's defaults: default path, '|' separator,
# rows where 'Code type local' == 1.
df= get_data().read_csv()

In [4]:
# Display the filtered frame (the notebook renders a truncated preview).
df

Unnamed: 0,Code service CH,Reference document,1 Articles CGI,2 Articles CGI,3 Articles CGI,4 Articles CGI,5 Articles CGI,No disposition,Date mutation,Nature mutation,...,Surface Carrez du 5eme lot,Nombre de lots,Code type local,Type local,Identifiant local,Surface reelle bati,Nombre pieces principales,Nature culture,Nature culture speciale,Surface terrain
0,,,,,,,,1,05/01/2021,Vente,...,,0,1.0,Maison,,97.0,5.0,S,,2410.0
3,,,,,,,,1,04/01/2021,Vente,...,,0,1.0,Maison,,88.0,4.0,S,,866.0
4,,,,,,,,1,06/01/2021,Vente,...,,0,1.0,Maison,,168.0,5.0,S,,1426.0
14,,,,,,,,2,04/01/2021,Vente,...,,0,1.0,Maison,,96.0,3.0,,,
15,,,,,,,,1,08/01/2021,Vente,...,,0,1.0,Maison,,100.0,4.0,S,,703.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3378074,,,,,,,,1,22/02/2021,Vente,...,,1,1.0,Maison,,150.0,5.0,,,
3378134,,,,,,,,1,17/02/2021,Vente,...,,0,1.0,Maison,,69.0,2.0,S,,908.0
3378547,,,,,,,,1,18/03/2021,Vente,...,,0,1.0,Maison,,157.0,7.0,S,,77.0
3378563,,,,,,,,1,04/03/2021,Vente,...,,0,1.0,Maison,,103.0,6.0,S,,107.0


In [24]:
# Debug snippet (disabled): print sys.path to see how the Python path is populated
# import sys
# print(sys.path)

['/home/pierre/simplon/house_prediction/notebooks', '/home/pierre/.pyenv/versions/3.8.6/lib/python38.zip', '/home/pierre/.pyenv/versions/3.8.6/lib/python3.8', '/home/pierre/.pyenv/versions/3.8.6/lib/python3.8/lib-dynload', '', '/home/pierre/.pyenv/versions/3.8.6/envs/house_prediction/lib/python3.8/site-packages']


In [5]:
# from house_prediction_package.data import get_data
from more_itertools import chunked

class preprocessing :
    """Clean the raw frame and derive a cadastral parcel identifier.

    Every method rebinds ``self.df`` to a new frame; the frame handed to the
    constructor is never modified in place.
    """

    def __init__(self,df) :
        self.df = df

    def clean_columns(self,
                      columns=[
                          'Code service CH', 'Reference document',
                          '1 Articles CGI', '2 Articles CGI', '3 Articles CGI',
                          '4 Articles CGI', '5 Articles CGI', 'No Volume',
                          'Identifiant local'
                      ]):
        """Drop the columns that are not needed downstream.

        Args:
            columns: list of column labels to drop (the default list is only
                read, never mutated).

        Returns:
            ``self``, so that the next preprocessing step can be chained.

        Raises:
            KeyError: if one of the labels is not a column of the frame.
        """
        self.df = self.df.drop(columns,axis=1)
        return self

    @staticmethod
    def _as_code(value):
        """Render one cell as text, writing whole-number floats without '.0'."""
        if isinstance(value, float) and value.is_integer():
            return str(int(value))
        return str(value)

    def create_identifier(self) :
        """Build the zero-padded code columns and the parcel identifiers.

        Each of the five source columns is converted to text, left-padded with
        zeros to a fixed width, stored as ``clean_<name>`` and then dropped.
        The padded codes are concatenated into ``parcelle_cadastrale``; its
        first 10 characters (departement, commune, prefix, section) are stored
        in ``parcelle_cad_section``.

        Returns:
            The transformed DataFrame (also kept on ``self.df``).
        """
        widths = {
            "Code departement": 2,
            "Code commune": 3,
            "Prefixe de section": 3,
            "Section": 2,
            "No plan": 4,
        }
        # Missing section prefixes are encoded as '000'. This does not depend
        # on the loop below, so it is done once up front.
        self.df = self.df.fillna(value={"Prefixe de section": '000'})

        clean_names = []
        for column, width in widths.items():
            # A column holding missing values is read as float, so 5 arrives
            # as 5.0; _as_code turns it back into '5' so that padding yields
            # '005' rather than the unpadded '5.0'.
            codes = [self._as_code(value) for value in self.df[column]]
            if column == "Prefixe de section":
                codes = [code[:3] for code in codes]
            clean_name = f"clean_{column.replace(' ','_').lower()}"
            # dtype=object keeps the column text-typed even for an empty frame.
            self.df[clean_name] = pd.Series(
                [code.zfill(width) for code in codes],
                index=self.df.index,
                dtype=object,
            )
            self.df = self.df.drop([column], axis=1)
            clean_names.append(clean_name)

        # Column-wise string concatenation instead of a row-by-row apply.
        parcel = self.df[clean_names[0]]
        for clean_name in clean_names[1:]:
            parcel = parcel + self.df[clean_name]
        self.df["parcelle_cadastrale"] = parcel
        self.df["parcelle_cad_section"] = self.df["parcelle_cadastrale"].str[:10]
        return self.df


In [15]:
# Run the preprocessing chain: drop the default unused columns, then build the
# padded code columns and the parcel identifiers. The result is a DataFrame.
df2 = preprocessing(df).clean_columns().create_identifier()

In [16]:
# List the columns left after preprocessing.
df2.columns

Index(['No disposition', 'Date mutation', 'Nature mutation', 'Valeur fonciere',
       'No voie', 'B/T/Q', 'Type de voie', 'Code voie', 'Voie', 'Code postal',
       'Commune', '1er lot', 'Surface Carrez du 1er lot', '2eme lot',
       'Surface Carrez du 2eme lot', '3eme lot', 'Surface Carrez du 3eme lot',
       '4eme lot', 'Surface Carrez du 4eme lot', '5eme lot',
       'Surface Carrez du 5eme lot', 'Nombre de lots', 'Code type local',
       'Type local', 'Surface reelle bati', 'Nombre pieces principales',
       'Nature culture', 'Nature culture speciale', 'Surface terrain',
       'clean_code_departement', 'clean_code_commune',
       'clean_prefixe_de_section', 'clean_section', 'clean_no_plan',
       'parcelle_cadastrale', 'parcelle_cad_section'],
      dtype='object')

In [90]:
# Completeness report for df2, one row per column:
#   Complete        - number of non-null cells
#   Completion_taux - share of non-null cells, in percent
#   Vide            - number of null cells (NaN when the column has none)
# Counting nulls once for the whole frame avoids recomputing value_counts()
# three times per column, and an entirely empty column no longer raises.
null_counts = df2.isnull().sum()
row_count = df2.shape[0]
filled_counts = row_count - null_counts
df_analyse = pd.DataFrame({
    'Complete': filled_counts.astype(float),
    'Completion_taux': filled_counts / row_count * 100,
    'Vide': null_counts.where(null_counts > 0).astype(float),
})
df_analyse = df_analyse.round(2).sort_values(by=['Completion_taux'],ascending = False)

In [None]:
# Drop the columns that are not needed downstream.
# errors='ignore' keeps the cell re-runnable: once the columns are gone, a
# second run (or the identical drop further down the notebook) is a no-op
# instead of a KeyError.
df_maison=df_maison.drop(columns=['Code service CH', 'Reference document',
                                  '1 Articles CGI', '2 Articles CGI', '3 Articles CGI',
                                  '4 Articles CGI', '5 Articles CGI', 'No Volume',
                                  'Identifiant local'
                                  ], errors='ignore')

In [17]:
# Drop the columns that are almost entirely empty: those where fewer than
# MIN_COMPLETION_PCT percent of the rows hold a value.
# notnull().mean() gives the filled share of every column in one pass and is
# well defined even when a column has no filled (or no empty) cell.
MIN_COMPLETION_PCT = 2
completion_pct = df2.notnull().mean() * 100
columns_to_drop = completion_pct[completion_pct < MIN_COMPLETION_PCT].index.tolist()
df2= df2.drop(columns_to_drop,axis=1)

In [18]:
# List the columns left once the nearly empty ones have been dropped.
df2.columns

Index(['No disposition', 'Date mutation', 'Nature mutation', 'Valeur fonciere',
       'No voie', 'B/T/Q', 'Type de voie', 'Code voie', 'Voie', 'Code postal',
       'Commune', '1er lot', 'Nombre de lots', 'Code type local', 'Type local',
       'Surface reelle bati', 'Nombre pieces principales', 'Nature culture',
       'Nature culture speciale', 'Surface terrain', 'clean_code_departement',
       'clean_code_commune', 'clean_prefixe_de_section', 'clean_section',
       'clean_no_plan', 'parcelle_cadastrale', 'parcelle_cad_section'],
      dtype='object')

Unnamed: 0,Complete,Completion_taux,Vide
No disposition,624553.0,100.0,
Date mutation,624553.0,100.0,
parcelle_cadastrale,624553.0,100.0,
clean_no_plan,624553.0,100.0,
clean_section,624553.0,100.0,
clean_prefixe_de_section,624553.0,100.0,
clean_code_commune,624553.0,100.0,
clean_code_departement,624553.0,100.0,
Nombre pieces principales,624545.0,100.0,8.0
Surface reelle bati,624545.0,100.0,8.0


In [None]:
# Parse 'Date mutation' from day/month/year text (format %d/%m/%Y) into
# datetime values, replacing the column on df_maison.
df_maison['Date mutation']= pd.to_datetime(df_maison['Date mutation'],format= "%d/%m/%Y")
# Disabled alternative: drop the column instead of converting it.
#df_maison= df_maison.drop(['Date mutation'], axis = 1)

In [None]:
# List the columns of the house frame.
df_maison.columns

In [1]:
# Remove the unused columns before loading into the database.
# errors='ignore' makes this safe to run even when the same columns were
# already dropped earlier in the notebook. df_maison must have been loaded
# first (the recorded NameError came from running this cell on a fresh kernel).
df_maison=df_maison.drop(columns=['Code service CH', 'Reference document', '1 Articles CGI',
'2 Articles CGI', '3 Articles CGI', '4 Articles CGI', '5 Articles CGI','No Volume','Identifiant local'],
                         errors='ignore')

NameError: name 'df_maison' is not defined

## Performances

On a des problèmes de performance : 1 h pour ingérer 1/25 des données brutes. 2 millions de lignes et plus de 30 colonnes.
Avant d'envoyer en base, on va essayer de nettoyer le df (suppression des colonnes inutiles, des doublons, des lignes vides) ;
regrouper les lignes correspondant à une seule transaction (création d'un id et group by) ;
recentrer le projet sur les ventes de maisons.

In [None]:
## TODO: still to be checked that this works.
# SQLite file that stores the transactions; echo=True makes the engine log the
# SQL statements it emits.
DATABASE_URL = 'sqlite:///../data/transactions.sqlite'
engine = create_engine(DATABASE_URL, echo = True)
# Declarative base class for the ORM models, and the session factory bound to
# the engine.
Base = declarative_base()
Session = sessionmaker(bind=engine)

In [None]:
# TODO (author's note): to be corrected, following the pattern used in the football exercise.
class Transactions_maisons(Base):
        """ORM model mapped to the 'dvf_maisons' table.

        Apart from the integer primary key ``id``, the model declares 34
        columns, all of which are set by ``__init__``.

        NOTE(review): the ``__init__`` parameters are NOT in the same order as
        the column declarations below (the five ``surface_carrez_*`` parameters
        come before ``lot_2`` .. ``lot_5``). Callers passing values
        positionally must follow the parameter order; keyword arguments avoid
        the pitfall.
        """
        __tablename__ = 'dvf_maisons'

        # Primary key; no value is passed for it by __init__.
        id = Column(Integer, primary_key=True)
        no_disposition = Column(Integer)
        date_mutation = Column(Date)
        nature_mutation = Column(String)
        valeur_fonciere = Column(Integer)
        no_voie = Column(Integer)
        b_t_q = Column(String)
        type_voie = Column(String)
        code_voie = Column(String)
        voie = Column(String)
        code_postal = Column(Integer)
        commune = Column(String)
        code_departement = Column(String)
        code_commune = Column(String)
        prefixe_section = Column(String)
        section = Column(String)
        no_plan = Column(Integer)
        # Lot identifiers and their Carrez surfaces, declared pairwise.
        # NOTE(review): surfaces are declared as Integer - confirm the source
        # values are never fractional.
        lot_1 = Column(String)
        surface_carrez_1er_lot = Column(Integer)
        lot_2 = Column(String)
        surface_carrez_2eme_lot = Column(Integer)
        lot_3 = Column(String)
        surface_carrez_3eme_lot = Column(Integer)
        lot_4 = Column(String)
        surface_carrez_4eme_lot = Column(Integer)
        lot_5 = Column(String)
        surface_carrez_5eme_lot = Column(Integer)
        nombre_lots = Column(Integer)
        code_type_local = Column(Float)
        type_local = Column(String)
        surface_reelle_bati = Column(Integer)
        nombre_pieces_principales = Column(Integer)
        nature_culture = Column(String)
        nature_culture_speciale = Column(String)
        surface_terrain = Column(Integer)
        
        def __init__(self, no_disposition, date_mutation, nature_mutation, valeur_fonciere, no_voie,
                     b_t_q, type_voie, code_voie, voie, code_postal, commune, code_departement, 
                     code_commune, prefixe_section, section, no_plan, lot_1, surface_carrez_1er_lot, 
                     surface_carrez_2eme_lot, surface_carrez_3eme_lot, surface_carrez_4eme_lot, 
                     surface_carrez_5eme_lot, lot_2, lot_3, lot_4, lot_5, nombre_lots, code_type_local, 
                     type_local, surface_reelle_bati, nombre_pieces_principales, nature_culture, nature_culture_speciale,
                     surface_terrain ):
            """Store every argument on the attribute of the same name.

            All 34 parameters are required. Their order differs from the
            column declaration order: after ``surface_carrez_1er_lot`` come the
            remaining four surfaces, then ``lot_2`` .. ``lot_5``.
            """
            self.no_disposition = no_disposition 
            self.date_mutation = date_mutation 
            self.nature_mutation = nature_mutation
            self.valeur_fonciere = valeur_fonciere 
            self.no_voie = no_voie
            self.b_t_q = b_t_q 
            self.type_voie = type_voie 
            self.code_voie = code_voie 
            self.voie = voie 
            self.code_postal = code_postal
            self.commune = commune
            self.code_departement = code_departement
            self.code_commune = code_commune
            self.prefixe_section = prefixe_section
            self.section = section
            self.no_plan = no_plan
            self.lot_1 = lot_1
            self.surface_carrez_1er_lot = surface_carrez_1er_lot
            self.lot_2 = lot_2
            self.surface_carrez_2eme_lot = surface_carrez_2eme_lot
            self.lot_3 = lot_3
            self.surface_carrez_3eme_lot = surface_carrez_3eme_lot
            self.lot_4 = lot_4
            self.surface_carrez_4eme_lot = surface_carrez_4eme_lot
            self.lot_5 = lot_5
            self.surface_carrez_5eme_lot = surface_carrez_5eme_lot
            self.nombre_lots = nombre_lots
            self.code_type_local = code_type_local
            self.type_local = type_local
            self.surface_reelle_bati = surface_reelle_bati
            self.nombre_pieces_principales = nombre_pieces_principales
            self.nature_culture = nature_culture
            self.nature_culture_speciale = nature_culture_speciale
            self.surface_terrain =surface_terrain 

In [None]:
# Delete the table (disabled; uncomment to drop 'dvf_maisons' before recreating it)
#Transactions_maisons.__table__.drop(engine)

In [None]:
# 2 - generate the database schema: create the tables of every model
#     registered on Base, using the engine
Base.metadata.create_all(engine)
# 3 - create a new session from the session factory
session = Session()

In [None]:
n = 2000  # number of rows per chunk
# Cut df_maison into consecutive row slices of at most n rows each.
# Reassembling them with pd.concat would be possible, but it is not needed:
# the small frames are processed one by one (enrichment, then database insert).
chunk_starts = range(0, len(df_maison), n)
list_df = [df_maison.iloc[start:start + n] for start in chunk_starts]

In [None]:
# Source column -> model attribute. Building each object by keyword instead of
# by position matters: the model constructor does not take its lot/surface
# parameters in the order of the frame's columns, so positional passing stores
# lot numbers and Carrez surfaces in the wrong attributes.
column_to_attribute = {
    'No disposition': 'no_disposition',
    'Date mutation': 'date_mutation',
    'Nature mutation': 'nature_mutation',
    'Valeur fonciere': 'valeur_fonciere',
    'No voie': 'no_voie',
    'B/T/Q': 'b_t_q',
    'Type de voie': 'type_voie',
    'Code voie': 'code_voie',
    'Voie': 'voie',
    'Code postal': 'code_postal',
    'Commune': 'commune',
    'Code departement': 'code_departement',
    'Code commune': 'code_commune',
    'Prefixe de section': 'prefixe_section',
    'Section': 'section',
    'No plan': 'no_plan',
    '1er lot': 'lot_1',
    'Surface Carrez du 1er lot': 'surface_carrez_1er_lot',
    '2eme lot': 'lot_2',
    'Surface Carrez du 2eme lot': 'surface_carrez_2eme_lot',
    '3eme lot': 'lot_3',
    'Surface Carrez du 3eme lot': 'surface_carrez_3eme_lot',
    '4eme lot': 'lot_4',
    'Surface Carrez du 4eme lot': 'surface_carrez_4eme_lot',
    '5eme lot': 'lot_5',
    'Surface Carrez du 5eme lot': 'surface_carrez_5eme_lot',
    'Nombre de lots': 'nombre_lots',
    'Code type local': 'code_type_local',
    'Type local': 'type_local',
    'Surface reelle bati': 'surface_reelle_bati',
    'Nombre pieces principales': 'nombre_pieces_principales',
    'Nature culture': 'nature_culture',
    'Nature culture speciale': 'nature_culture_speciale',
    'Surface terrain': 'surface_terrain',
}

test = 'ok'
start_time = datetime.now()
for j, chunk in enumerate(list_df):
    # Stop as soon as the operator answers anything other than 'ok'.
    if test != 'ok':
        break
    # One session and one commit per chunk instead of per row: a chunk is now
    # written in a single transaction, and rolled back as a whole on failure.
    session = Session()
    try:
        session.add_all([
            Transactions_maisons(**{
                attribute: transaction[column]
                for column, attribute in column_to_attribute.items()
            })
            for _, transaction in chunk.iterrows()
        ])
        session.commit()
    except Exception:
        session.rollback()
        raise
    finally:
        # Always release the connection, even when the insert failed.
        session.close()
    test= input(f"iteration {j}, pour passer à l'itération {j+1} taper ok  : ")
end_time = datetime.now()
print('Duration: {}'.format(end_time - start_time))

In [None]:
# The commit is already done inside the insertion loop, so it stays disabled here.
#session.commit()
# Close the session that is still bound to the `session` name.
session.close()

In [None]:
# Open a fresh session for the read query that follows.
session = Session()

In [None]:
# Count the rows stored for the Transactions_maisons model and display the total.
rows = session.query(Transactions_maisons).count()
rows