In [1]:
import pandas as pd
import numpy as np
#from locale import atof, setlocale, LC_NUMERIC, LC_ALL
import matplotlib.pyplot as plt
import matplotlib_inline
import seaborn as sns
import scipy
from scipy.stats import norm
#setlocale(LC_ALL, 'fr_FR.UTF-8')

# GetData

In [None]:
import pandas as pd

class GetData:
    """ Read data from csv and load it in a dataframe
    accepted arguments : path to file , separator, chunksize and filter
    option to load csv by filtering on house type
    """

    def __init__(self,path ="../data/valeursfoncieres-2021.txt",sep = "|", chunksize = 100000):
        self.path = path
        self.sep = sep
        self.chunksize = chunksize


    def read_csv(self, filtering_column='Code type local', filter=[1]):
        """ pass option on which column to filter and filter value
        if several filter value, pass the as a list"""
        iter_csv = pd.read_csv(self.path,
                               sep=self.sep,
                               iterator=True,
                               chunksize=self.chunksize,
                               low_memory=False)
        self.df = pd.concat([
            chunk[chunk[filtering_column].isin(filter)] for chunk in iter_csv
        ])
        return self.df

    def enrichissement_coordinates(self,df):
        pass

    def enrichissement_insee(self,df):
        pass

    def loading_data_db (self,df):
        pass


In [2]:
pd.set_option('display.max_columns', None)

# Preprocessing

Tout l'objectif de cette étape est de tirer le maximum du dataset, en reconstituant ce qui correspond réellement à une transaction. 
Nous avons réalisé cela en regroupant chaque transaction selon 3 clés : Parcelle / date / montant car ceux ci sont répété sur les lignes communes. 
Ensuite nous avons utilisé la methode apply qui pour chaque ligne du dataset aggrégé applique une fonction lambda dans laquelle on inséré une série dont les paramétres étaient la ligne aggrée. cela nous a permis de réaliser des fonctions plus complexes au niveau des aggrégats et de couvrir les différents cas ammenant à la mutltiplication des lignes 
Les cas plus complexes à retrouver sont par exemple la nature culture basée principale en se basant sur la superficie de celle ci (--- piste https://stackoverflow.com/questions/23394476/keep-other-columns-when-doing-groupby 
L'autre cas complexe était de compter le nombre de dépendances différentes ainsi que le nombre de maisons différentes au sein d'une même aggrégation -**ajouter cas ou on a plusieurs dépendances et plusieurs terrains -- trop complexe** : 
* #dépendance - count du terme dépendance valuecount sorted[0]  / nunique type local si nunique nat culture = 1 & len value count ==2 else count terme dependance valuecount sorted[0]   if len_value count ==2 else 0
* #maisons - count terme maison value count sorted [1] / nunqiue type local si nunique nat culture = 1 & len value count ==2 else count terme maison value count if len value_count ==2 else 0 

In [None]:
#from house_prediction_package.data import GetData
import pandas as pd
import numpy as np
from datetime import datetime
from more_itertools import chunked
from scipy import stats

from sklearn.model_selection import train_test_split
#sans doute à supprimer au lancement final du modele
#from locale import atof, setlocale, LC_NUMERIC, LC_ALL

#setlocale(LC_ALL, 'fr_FR.UTF-8')

class Preprocessing :

    def __init__(self,df) :
        # self.df = get_data().read_csv()
        self.df = df

    def clean_columns(self,
                      columns=[
                          'Code service CH', 'Reference document',
                          '1 Articles CGI', '2 Articles CGI', '3 Articles CGI',
                          '4 Articles CGI', '5 Articles CGI', 'No Volume',
                          'Identifiant local'
                      ]):
        """ drop useless columns
        Customisation of columns to drop must be entered as a list
        """
        # suppression of 100% empty columns - these columns are officially not completed in this db
        self.df = self.df.drop(columns,axis=1)
        # suppression of columns poorly completed
        columns_to_drop = [column for column in self.df.columns if ((self.df[column].isnull().value_counts().sort_index()[0]/self.df.shape[0])*100) < 2 ]
        self.df= self.df.drop(columns_to_drop,axis=1)
        # replacement of , by . in numerical variables & deletion of non numrical caracters in num columns : 
        columns_num = ['Valeur fonciere', 'Surface Carrez du 1er lot', 'Nombre de lots',
        'Surface reelle bati', 'Nombre pieces principales', 'Surface terrain']
        # transformation des , en . pour réaliser des opérations sur les nombres et suppressions des caracteres non numériques au sein de ces colonnes 
        for column in columns_num : 
            self.df[column]=self.df[column].apply(lambda s: s.replace(",",".") if isinstance(s,str) else s)
            self.df[column] = pd.to_numeric(self.df[column], errors = 'coerce')
        # suppression of nan value on target variable
        self.df= self.df.dropna(subset=['Valeur fonciere'])
        #self.df['Surface Carrez du 1er lot'] = self.df['Surface Carrez du 1er lot'].apply(
        #    lambda x: atof(x))        
        # pre processing avant groupby mais attention sortir valeures foncieres avant de mettre en POO
        ob_columns= self.df.dtypes[self.df.dtypes == 'O'].index
        num_columns = self.df.dtypes[(self.df.dtypes == 'int')
                                     | (self.df.dtypes == 'float')].index
        non_num_col = ['No disposition', 'No voie', 'Code postal', 'Code commune',
       'Prefixe de section', 'No plan','Code type local']
        num_columns = [value for value in num_columns if value not in non_num_col]
        for column in ob_columns :
            self.df[column]=self.df[column].replace(np.nan,'',regex=True)
        #à adapter in v2
        
        self.df[num_columns] = self.df[num_columns].apply(pd.to_numeric,
                                                              errors='coerce')
        
        #drop duplicates
        self.df = self.df.drop_duplicates().reset_index(drop= True)
        # by returning self, we can do method chaining like preprocessing(df).clean_columns().create_identifier()
        return self.df

    def create_identifier(self) :
        """ Create a 'unique' identifier allowing us to group several lines corresponding to a unique transaction
        """
        variables_to_clean = [
            "Code departement", "Code commune", "Prefixe de section",
            "Section", "No plan"
            ]
        size_variables= [2,3,3,2,4]
        for i,j in zip(variables_to_clean,size_variables):
            chunked_data = chunked(self.df[i], 10000, strict=False)
            values = {"Prefixe de section": '000'}
            self.df= self.df.fillna(value=values)
            if i == "Prefixe de section" :
                self.df[i] = self.df[i].apply(str).apply(lambda x: x[:3])
            new_variable = [
                str(value).replace(".","").zfill(j) for sublist in list(chunked_data)
                for value in sublist
            ]
            self.df[f"clean_{i.replace(' ','_').lower()}"] = new_variable
            self.df= self.df.drop([i],axis=1)
        self.df["parcelle_cadastrale"] = self.df[[
            "clean_code_departement", "clean_code_commune", "clean_prefixe_de_section",
            "clean_section", "clean_no_plan"]].apply(lambda x: "".join(x), axis=1)
        self.df["parcelle_cad_section"]=self.df["parcelle_cadastrale"].str[:10]
        self.df = self.df.drop([
            "clean_prefixe_de_section", "clean_section", "clean_no_plan"
        ], axis = 1)
        return self.df

    def aggregate_transactions(self):
        self.df = self.df.groupby(["parcelle_cad_section","Date mutation","Valeur fonciere"], as_index= False).apply(lambda x : pd.Series({
            "num_voie" : x["No voie"].max()
            ,"B_T_Q" : x["B/T/Q"].max()
            ,"type_de_voie": x["Type de voie"].max()
            ,"voie": x["Voie"].max()
            ,"code_postal": x["Code postal"].max()
            ,"commune": max(x["Commune"])
            ,"clean_code_departement": x["clean_code_departement"].max()
            ,"clean_code_commune": max(x["clean_code_commune"])
            ,"surface_carrez_lot_1" :  x["Surface Carrez du 1er lot"].sum()/((x["Surface reelle bati"].count()/x["Nature culture"].nunique()))
            ,"Nb_lots": x["Nombre de lots"].max()
            ,"surface_terrain" : ((x["Surface terrain"].sum()/x["Surface reelle bati"].count()) if (int(x["Surface terrain"].nunique()) ==1 and int(x["Nature culture"].nunique()) == 1 )else x["Surface terrain"].sum())
            ,"surface_reelle_bati" : (x["Surface reelle bati"].sum()/(x["Surface reelle bati"].count()/x["Type local"].nunique()) if (int(x["Nature culture"].nunique() > 1)) else x["Surface reelle bati"].sum())
            ,"nb_pieces_principales" : (x["Nombre pieces principales"].sum()/(x["Surface reelle bati"].count()/x["Type local"].nunique()) if int(x["Nature culture"].nunique()) > 1 else x["Nombre pieces principales"].sum())      
            ,"dependance" : x["Type local"].unique()
            ,"main_type_terrain" : x["Nature culture"].max()
            ,"parcelle_cadastrale": x["parcelle_cadastrale"].max()}))
        self.df = self.df.replace(np.inf, np.nan)
        #drop rows with only dependances transactions as we focus on houses
        self.df = self.df[self.df.dependance.apply(
            lambda x: x.all() != "Dépendance")].reset_index(drop=True)
        self.df["dependance"] = self.df.dependance.apply(sorted, 1)
        self.df[["Dependance",
                 "Maison"]] = pd.DataFrame(self.df.dependance.tolist(),
                                           index=self.df.index)
        self.df["Dependance"] = [1 if value =="Dépendance"else 0 for value in self.df["Dependance"]]
        self.df= self.df.drop(["dependance","Maison"],axis =1)
        return self.df

    # to do : function calling enrichissement from data


    def feature_generation (self):
        # convert the 'Date' column to datetime format
        self.df["month"] = pd.to_datetime(
            self.df["Date mutation"],format="%d/%m/%Y").dt.month
        self.df= self.df.drop(["Date mutation"], axis = 1)
        ## attention à ne faire qu'après avoir enrichi avec variables insee
        dict_type_voie = dict()
        for value in self.df["type_de_voie"].value_counts()[self.df["type_de_voie"].value_counts()<300 ].index.values :
            dict_type_voie[value] = "Autres"
        self.df=self.df.replace({"type_voie" : dict_type_voie})
        self.df["type_de_voie"]= self.df["type_de_voie"].replace(np.nan,'vide')
        return self.df

    def zscore (self) :
        # Calculate the z-score from scratch
        #self.df['Valeur fonciere']= df['Valeur fonciere'].apply(lambda x: atof(x))
        standard_deviation = self.df["Valeur fonciere"].std(ddof=0)
        mean_value = self.df["Valeur fonciere"].mean()
        zscores = [(value - mean_value) / standard_deviation
                for value in self.df["Valeur fonciere"]]
        self.df["zscores"]= zscores
        # absolute value of zscore and if sup x then 1  :
        self.df["outlier"] = [
            1 if (abs(value) > 3) else 0 for value in self.df["zscores"]
        ]
        self.df=self.df[self.df["outlier"] == 0].reset_index(drop=True)
        self.df = self.df.drop(["zscores","outlier"], axis = 1)
        return self.df

    def split_x_y (self):
        columns_model = ["type_de_voie",
            "clean_code_departement",
            "clean_code_commune",
            "code_postal",
            "surface_terrain",
            "surface_reelle_bati", "nb_pieces_principales",
            "main_type_terrain",  "Dependance",
            "month"]
        # Séparation des variables catégorielles et numériques
        categorical_features = [
            "type_de_voie", "clean_code_departement", "clean_code_commune",
            "code_postal", "main_type_terrain", "Dependance", "month"
        ]
        numerical_features = [
            "surface_terrain", "surface_reelle_bati", "nb_pieces_principales"
        ]
        for column in categorical_features:
            self.df[column] = self.df[column].replace(np.nan, "").apply(str)
        X = self.df[columns_model]
        y =self.df["Valeur fonciere"]
        # selection des variables
        X_train, X_test, y_train, y_test = train_test_split(X,
                                                            y,
                                                            test_size=0.33,
                                                            random_state=42)
        return self.df,categorical_features, numerical_features, X_train, X_test, y_train, y_test


# Pipeline 

In [None]:
import numpy as np
import pandas as pd
from sklearn.impute import KNNImputer

from sklearn.preprocessing import OrdinalEncoder
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import RobustScaler


from sklearn.model_selection import cross_val_score
from sklearn.model_selection import learning_curve

from sklearn.linear_model import LinearRegression

import warnings
warnings.filterwarnings('ignore')

from sklearn import pipeline
from sklearn.pipeline import make_pipeline
from sklearn.compose import make_column_transformer

#from house_prediction_package.preprocessing import Preprocessing
#from house_prediction_package.data import GetData


class Pipeline :

    def __init__(self, df):
        self.df = df
        # self.categorical_features = categorical_features
        # self.numerical_features = numerical_features
        # self.X_train = X_train
        # self.y_train = y_train
        # option 2
        #appeler les méthodes
        self.df, self.categorical_features, self.numerical_features, self.X_train, self.X_test, self.y_train, self.y_test = Preprocessing(
            df).feature_generation().zscore().split_x_y()

    def pipeline(self):
        # création des pipelines de pré-processing pour les variables numériques et catégorielles
        #ajout d'un parametre pour gerer les valeures non connues dans onehotencoder - il les passe à 0(autres options disponibles)
        numerical_pipeline = make_pipeline(KNNImputer(n_neighbors=3), MinMaxScaler())
        categorical_pipeline = make_pipeline(OneHotEncoder(handle_unknown="ignore"))
        preprocessor = make_column_transformer(
            (numerical_pipeline, self.numerical_features),
            (categorical_pipeline, self.categorical_features))
        model = make_pipeline(preprocessor, LinearRegression())
        fitted_model = model.fit(self.X_train, self.y_train)
        return fitted_model, self.X_train, self.y_train,self.X_test, self.y_test


# Zone de tests

## Test model sans aggrégation 

In [None]:
df = GetData().read_csv()

In [None]:
df,categorical_features, numerical_features, X_train, X_test, y_train, y_test = Preprocessing(df).split_x_y()

In [None]:
numerical_pipeline = make_pipeline(KNNImputer(n_neighbors=3), MinMaxScaler())
categorical_pipeline = make_pipeline(OneHotEncoder(handle_unknown="ignore"))
preprocessor = make_column_transformer(
            (numerical_pipeline, numerical_features),
            (categorical_pipeline, categorical_features))
model = make_pipeline(preprocessor, LinearRegression())
fitted_model = model.fit(X_train, y_train)

In [None]:
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score

test_y_hat = fitted_model.predict(X_test)
print('Score r²: ', fitted_model.score(X_test, y_test))
print("Mean absolute error: %.2f" % np.mean(np.absolute(test_y_hat - y_test)))
print("Residual  of squares (MSE): %.2f" % np.mean((test_y_hat - y_test)**2))
print("R2-score: %.2f" % r2_score(test_y_hat, y_test))

In [None]:
import pickle
# save the model to disk
filename = 'house_model_wo_aggregations.sav'
pickle.dump(fitted_model, open(filename, 'wb'))
 
# some time later...
 
# load the model from disk
loaded_model = pickle.load(open(filename, 'rb'))
#result = loaded_model.score(X_test, Y_test)
#print(result)

## Test model maison avec aggrégation 

In [None]:
df= GetData().read_csv()

In [None]:
df = Preprocessing(df).clean_columns()

In [None]:
df= Preprocessing(df).create_identifier()

In [None]:
df = Preprocessing(df).aggregate_transactions()

In [None]:
df.to_csv("aggregatedfile_houses.csv", sep='|', encoding="utf-8") 

In [None]:
df = Preprocessing(df).feature_generation()

In [None]:
df = Preprocessing(df).zscore() 

In [None]:
df,categorical_features, numerical_features, X_train, X_test, y_train, y_test =  Preprocessing(df).split_x_y()

In [None]:
numerical_pipeline = make_pipeline(KNNImputer(n_neighbors=3), MinMaxScaler())
categorical_pipeline = make_pipeline(OneHotEncoder(handle_unknown="ignore"))
preprocessor = make_column_transformer(
            (numerical_pipeline, numerical_features),
            (categorical_pipeline, categorical_features))
model = make_pipeline(preprocessor, LinearRegression())
fitted_model = model.fit(X_train, y_train)

In [None]:
X_test[X_test.surface_terrain== np.inf]

In [None]:
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score
import math
X_test = X_test.replace(np.inf, np.nan)
test_y_hat = fitted_model.predict(X_test)
print('Score r²: ', fitted_model.score(X_test, y_test))
print("Mean absolute error: %.2f" % np.mean(np.absolute(test_y_hat - y_test)))
print("Residual  of squares (MSE): %.2f" % np.mean((test_y_hat - y_test)**2))
print("R(MSE): %.2f" % math.sqrt(np.mean((test_y_hat - y_test)**2)))
print("R2-score: %.2f" % r2_score(test_y_hat, y_test))

Score r²:  0.4806680640761509      
Mean absolute error: 83460.59     
Residual  of squares (MSE): 46023819610.40     
R(MSE): 214531.63    
R2-score: -0.02    

In [None]:
import pickle
# save the model to disk
filename = 'house_model_aggregations.sav'
pickle.dump(fitted_model, open(filename, 'wb'))
 
# some time later...
 
# load the model from disk
loaded_model = pickle.load(open(filename, 'rb'))
#result = loaded_model.score(X_test, Y_test)
#print(result)

## test model maison/dep with aggregations

In [None]:
df= GetData().read_csv(filter=[1,3])

In [None]:
df = Preprocessing(df).clean_columns()

In [None]:
df= Preprocessing(df).create_identifier()

In [None]:
df = Preprocessing(df).aggregate_transactions()

df.to_csv("aggregatedfile_houses_dep.csv", sep='|', encoding="utf-8")

In [None]:
df.info()

In [None]:
df = Preprocessing(df).feature_generation()

df = Preprocessing(df).zscore() 

df,categorical_features, numerical_features, X_train, X_test, y_train, y_test  = Preprocessing(df).split_x_y()

numerical_pipeline = make_pipeline(KNNImputer(n_neighbors=3), MinMaxScaler())
categorical_pipeline = make_pipeline(OneHotEncoder(handle_unknown="ignore"))
preprocessor = make_column_transformer(
            (numerical_pipeline, numerical_features),
            (categorical_pipeline, categorical_features))
model = make_pipeline(preprocessor, LinearRegression())

In [None]:
fitted_model = model.fit(X_train, y_train)

from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score
import math

test_y_hat = fitted_model.predict(X_test)
print('Score r²: ', fitted_model.score(X_test, y_test))
print("Mean absolute error: %.2f" % np.mean(np.absolute(test_y_hat - y_test)))
print("Residual  of squares (MSE): %.2f" % np.mean((test_y_hat - y_test)**2))
print("R(MSE): %.2f" % math.sqrt(np.mean((test_y_hat - y_test)**2)))
print("R2-score: %.2f" % r2_score(test_y_hat, y_test))

import pickle
# save the model to disk
filename = 'house_dep_model_aggregations.sav'
pickle.dump(fitted_model, open(filename, 'wb'))
 
# some time later...
 
# load the model from disk
loaded_model = pickle.load(open(filename, 'rb'))
#result = loaded_model.score(X_test, Y_test)
#print(result)

## Enrichissement lat long iris insee

In [None]:
import pandas as pd
import numpy as np

On lit le nouveau csv aggrégé qui nous a fait réduire le nombre de lignes de 1 M à 430 K 
Ensuite, on utilise l'api ban 
enrichissement 20 h trop long
test avec csv 

In [None]:
df = pd.read_csv("aggregatedfile_houses_dep.csv", sep = "|", index_col=0,dtype ={"parcelle_cad_section":  str  
,"Date mutation": object 
,"Valeur fonciere"  : np.float32
,"num_voie" : np.float32
,"B_T_Q" : object 
,"type_de_voie": object 
,"voie":  object 
,"code_postal": np.float64
,"commune": object 
,"clean_code_departement": object 
,"clean_code_commune": object  
,"surface_carrez_lot_1": np.float32
,"Nb_lots": np.int32  
,"surface_terrain":  np.float32
,"surface_reelle_bati":np.float32
,"nb_pieces_principales":np.float32
,"main_type_terrain":object 
,"parcelle_cadastrale": object})

In [None]:
# Using DataFrame.apply() and lambda function\n",
# df_adresses['No voie']= df_adresses['No voie'].astype(int)\n",
# on ne peut pas passer les num voies/code postaux  en int/string sans d'abord nettoyer les nan values \n",
df["voie"]=df["voie"].replace(" ","+")
df["adresse"] = df[["num_voie", "type_de_voie", "voie"]].apply(lambda x: "+".join(x.astype(str)), axis=1)
#df['clean_code_commune'] = df[["clean_code_departement","clean_code_commune"]].apply(lambda x: "".join(x.astype(str)), axis=1)


In [None]:
full_code_commun =[]
for dep,comm in zip(df["clean_code_departement"],df['clean_code_commune']):
    if len(dep) == 3: 
        full_code_commun.append(dep + comm[1:3])
    else : 
        full_code_commun.append(dep + comm)

In [None]:
df['clean_code_commune'] = full_code_commun

In [None]:
df["adresse"]= df[["adresse","clean_code_commune"]].apply(lambda x : "&citycode=".join(x.astype(str)),axis=1)

In [None]:
df.tail()

In [None]:
n = 2000  #chunk row size
list_df = [df[i:i+n] for i in range(0,df.shape[0],n)]
# reassemblage by pd.concat possible mais on s'en fiche car on va fonctionner sur des'petits df' 
#pour enrichissement puis insertion en bdd

In [None]:
import requests
from datetime import datetime

In [None]:
list_df[0]['adresse']

In [None]:
#long=[]
#lat= []

In [None]:
test = 'ok'
start_time = datetime.now()

for j in range(2,len(list_df)):
    if test == 'ok':
        for value in list_df[j]['adresse']:
            try : 
                long.append(requests.get(f'http://localhost:7878/search?q={value}').json()['features'][0]['geometry']['coordinates'][0])
                lat.append(requests.get(f'http://localhost:7878/search/?q={value}').json()['features'][0]['geometry']['coordinates'][1])
            except  : 
                lat.append('not found')
                long.append('not found')            
    test= input(f"iteration {j}, pour passer à l'itération {j+1} taper ok  : ")
    f=j
end_time = datetime.now()
print('Duration: {} et arret à la {}'.format(end_time - start_time, f))

iteration 2, pour passer à l'itération 3 taper ok  : ok    
iteration 3, pour passer à l'itération 4 taper ok  : ok    
iteration 4, pour passer à l'itération 5 taper ok  : ok    
iteration 5, pour passer à l'itération 6 taper ok  : ok    
iteration 6, pour passer à l'itération 7 taper ok  : ok    
iteration 7, pour passer à l'itération 8 taper ok  : ok    
iteration 8, pour passer à l'itération 9 taper ok  : ok    
iteration 9, pour passer à l'itération 10 taper ok  : ok    
iteration 10, pour passer à l'itération 11 taper ok  : ok    
iteration 12, pour passer à l'itération 13 taper ok  : ok    
iteration 13, pour passer à l'itération 14 taper ok  : ko    

In [None]:
import pickle
# save the model to disk
filename = 'lat.sav'
pickle.dump(lat, open(filename, 'wb'))
filename2= 'long.sav'
pickle.dump(lat, open(filename2, 'wb'))

In [None]:
requests.get("http://localhost:7878/search/?q=27.0+RUE+DES PINS&citycode=97424").json()

In [None]:
df

## Récupération CSV enrichi en masse et récupération IRIS

In [None]:
df = pd.read_csv('../data/geolocgeocoded.csv', sep='|', index_col=0, dtype= {
"num_voie" : np.float32
,"type_de_voie": object 
,"voie":  object 
,"commune": object 
,"clean_code_commune": object, "latitude": np.float32, "longitude": np.float32, "result_score" : np.float32 })

In [None]:
df = df[["num_voie","type_de_voie","voie","commune","clean_code_commune","latitude", "longitude","result_score"]]

In [None]:
df

In [3]:
import requests
from datetime import datetime

pour lancer pyris : 
1. d'abord activer lancer postgres sql 
 * sudo service postgresql start
2.  puis depuis le dossier  house pred /api / pyris : 
 * gunicorn -b 127.0.0.1:5555 pyris.api.run:app 

Qu'est ce qu'IRIS : "Afin de préparer la diffusion du recensement de la population de 1999, l'INSEE avait développé un découpage du territoire en mailles de taille homogène appelées IRIS2000. Un sigle qui signifiait « Ilots Regroupés pour l'Information Statistique » et qui faisait référence à la taille visée de 2 000 habitants par maille élémentaire." source : https://www.insee.fr/fr/metadonnees/definition/c1523    
API fonctionne avec Postgis ( extension de postgres pour les données géospatiales).    
Pour qu'elle fonctionne, j'ai donc télécharger les fichiers suivants : 
 * contours iris 
 * références iris 
 * divers statistiques INSEE à l'échelon IRIS 
     * statistiques activités -
     * statistiques démographiques - ménages et evol pop 
     * statistiques scolaires
     * statistiques logements 

In [None]:
df['coordinates'] = df[["latitude","longitude"]].apply(lambda x : "&lon=".join(x.astype(str)),axis=1)

In [None]:
df = df.drop_duplicates(subset=['coordinates'])

In [None]:
# api url 
url = "http://127.0.0.1:5555/api/coords?geojson=false&lat="

In [None]:
n = 50000  #chunk row size
list_df = [df[i:i+n] for i in range(0,df.shape[0],n)]
# reassemblage by pd.concat possible mais on s'en fiche car on va fonctionner sur des'petits df' 
#pour enrichissement puis insertion en bdd

In [None]:
IRIS = []

In [None]:
test = 'ok'
start_time = datetime.now()

for j in range(0,len(list_df)):
    if test == 'ok':
        for value in list_df[j]['coordinates']:
            try : 
                IRIS.append(requests.get(f'{url}{value}').json()['complete_code'])
            except  : 
                IRIS.append('not found')
    test= input(f"iteration {j}, pour passer à l'itération {j+1} taper ok  : ")
    f=j
end_time = datetime.now()
print('Duration: {} et arret à la {}'.format(end_time - start_time, f))

In [None]:
IRIS= pickle.load(open(filename, 'rb'))

In [None]:
df['IRIS']= IRIS

In [None]:
df[df['IRIS'] =='not found']

Recherche par ville sur les valeures non trouvées

In [None]:
for index, value in df[df['IRIS'] =='not found'].iterrows() : 
    print('######',index,'*****', value['clean_code_commune'])
    try :
        df.at[index,'IRIS'] = requests.get(f"http://127.0.0.1:5555/api/city/code/{value['clean_code_commune']}").json()[0]
    except : 
        continue

## Recherche api open data soft 


les dernieres valeures non trouvées appartiennent aux dom tom Martinique Guadeloupe Guyanne et Réunion

In [None]:
url ="https://data.opendatasoft.com/api/records/1.0/search/?dataset=iris-millesime-france%40lareunion&q="

In [None]:
/api/records/1.0/search/?dataset=iris-millesime-france&q=97101&sort=year&facet=com_arm_name 

## appel d'un api externe pour la réunion : 
** ajouter un prefiltre dans le code **

https://data.opendatasoft.com/explore/dataset/iris-millesime-france%40lareunion/api/?disjunctive.reg_name&disjunctive.dep_name&disjunctive.arrdep_name&disjunctive.ze2020_name&disjunctive.bv2012_name&disjunctive.epci_name&disjunctive.ept_name&disjunctive.com_name&disjunctive.com_arm_name&disjunctive.iris_name&sort=year&q=97424&geofilter.polygon=&geofilter.distance=

In [None]:
for index, value in df[df['IRIS'] =='not found'].iterrows() : 
    print('######',index,'*****', value['clean_code_commune'])
    try :
        df.at[index,'IRIS'] = requests.get(f"{url}{value['clean_code_commune']}&sort=year&facet=com_arm_name").json()['records'][0]['fields']['iris_code']
    except : 
        continue

## Appel d'un api externe pour la guadeloupe 


https://regionguadeloupe.opendatasoft.com/explore/dataset/iris-millesime-france/information/?disjunctive.reg_name&disjunctive.dep_name&disjunctive.arrdep_name&disjunctive.ze2020_name&disjunctive.bv2012_name&disjunctive.epci_name&disjunctive.ept_name&disjunctive.com_name&disjunctive.com_arm_name&disjunctive.iris_name&sort=year&q=97101&dataChart=eyJxdWVyaWVzIjpbeyJjb25maWciOnsiZGF0YXNldCI6ImlyaXMtbWlsbGVzaW1lLWZyYW5jZSIsIm9wdGlvbnMiOnsiZGlzanVuY3RpdmUucmVnX25hbWUiOnRydWUsImRpc2p1bmN0aXZlLmRlcF9uYW1lIjp0cnVlLCJkaXNqdW5jdGl2ZS5hcnJkZXBfbmFtZSI6dHJ1ZSwiZGlzanVuY3RpdmUuemUyMDIwX25hbWUiOnRydWUsImRpc2p1bmN0aXZlLmJ2MjAxMl9uYW1lIjp0cnVlLCJkaXNqdW5jdGl2ZS5lcGNpX25hbWUiOnRydWUsImRpc2p1bmN0aXZlLmVwdF9uYW1lIjp0cnVlLCJkaXNqdW5jdGl2ZS5jb21fbmFtZSI6dHJ1ZSwiZGlzanVuY3RpdmUuY29tX2FybV9uYW1lIjp0cnVlLCJkaXNqdW5jdGl2ZS5pcmlzX25hbWUiOnRydWUsInNvcnQiOiJ5ZWFyIiwicSI6Im1hcmllIGdhbGFudGUgZ3JhbmQgYm91cmciLCJyZWZpbmUuemUyMDIwX25hbWUiOiJNYXJpZS1HYWxhbnRlIiwicmVmaW5lLmNvbV9uYW1lIjoiR3JhbmQtQm91cmcifX0sImNoYXJ0cyI6W3siYWxpZ25Nb250aCI6dHJ1ZSwidHlwZSI6ImxpbmUiLCJmdW5jIjoiQ09VTlQiLCJzY2llbnRpZmljRGlzcGxheSI6dHJ1ZSwiY29sb3IiOiIjRUQ5QTlBIn1dLCJ4QXhpcyI6InllYXIiLCJtYXhwb2ludHMiOiIiLCJ0aW1lc2NhbGUiOiJ5ZWFyIiwic29ydCI6IiJ9XSwiZGlzcGxheUxlZ2VuZCI6dHJ1ZSwiYWxpZ25Nb250aCI6dHJ1ZX0%3D

In [None]:
url ="https://regionguadeloupe.opendatasoft.com/api/records/1.0/search/?dataset=iris-millesime-france&q="

In [None]:
for index, value in df[df['IRIS'] =='not found'].iterrows() : 
    print('######',index,'*****', value['clean_code_commune'])
    if value['clean_code_commune'][0:3] == '971' :
        try :
            df.at[index,'IRIS'] = requests.get(f"{url}{value['clean_code_commune']}&sort=year&facet=com_arm_name").json()['records'][0]['fields']['iris_code']
        except : 
            continue

## appel d'un autre api pour martinique et guyanne 


https://public.opendatasoft.com/explore/dataset/georef-france-iris/api/?disjunctive.reg_name&disjunctive.dep_name&disjunctive.arrdep_name&disjunctive.ze2020_name&disjunctive.bv2012_name&disjunctive.epci_name&disjunctive.ept_name&disjunctive.com_name&disjunctive.com_arm_name&disjunctive.iris_name&sort=year&q=97201

In [None]:
url="https://public.opendatasoft.com/api/records/1.0/search/?dataset=georef-france-iris&q="

In [None]:
for index, value in df[df['IRIS'] =='not found'].iterrows() : 
    print('######',index,'*****', value['clean_code_commune'])
    try :
        df.at[index,'IRIS'] = requests.get(f"{url}{value['clean_code_commune']}&sort=year&facet=com_arm_name").json()['records'][0]['fields']['iris_code']
    except : 
        continue

## reconstitution df

In [None]:
df_init = pd.read_csv('../data/geolocgeocoded.csv', sep='|', index_col=0, dtype= {
"num_voie" : np.float32
,"type_de_voie": object 
,"voie":  object 
,"commune": object 
,"clean_code_commune": object, "latitude": np.float32, "longitude": np.float32, "result_score" : np.float32 })

In [None]:
df_init= df_init[["num_voie","type_de_voie","voie","commune","clean_code_commune","latitude", "longitude","result_score"]]

Pour code final garder plutot l'id de résultat qui correspond à lat et long trouvé et dédoublonner selon cet id

In [None]:
df_init['coordinates'] = df_init[["latitude","longitude"]].apply(lambda x : "&lon=".join(x.astype(str)),axis=1)

In [None]:
df_init = df_init.merge(df[['coordinates','IRIS']], left_on='coordinates', right_on='coordinates',
          suffixes=('_left', '_right'),  how='left')

In [None]:
df_init.to_csv("aggregatedfile_houses_dep_WITH_IRIS.csv", sep='|', encoding="utf-8")

## Enrichissement variables INSEE sur CODES IRIS


In [None]:
import requests
from datetime import datetime

In [4]:
df= pd.read_csv('aggregatedfile_houses_dep_WITH_IRIS.csv', sep='|', index_col=0, dtype= {
"num_voie" : np.float32
,"type_de_voie": object 
,"voie":  object 
,"commune": object 
,"clean_code_commune": object, "latitude": np.float32, "longitude": np.float32, "result_score" : np.float32, 
'IRIS': object})

In [None]:
#url = "http://127.0.0.1:5555/api/insee/activite/distribution/"

In [5]:
#inititing lists : 
actif_15_24 = []
actif_25_54 = []
actif_55_64 = []
chomage_15_24 = []
chomage_25_54 = []
chomage_55_64 = []
taux_chomage_15_24 = []
taux_chomage_25_54 = []
taux_chomage_55_64 = []

In [6]:
for value in df.IRIS.unique() : 
    try:
        result = requests.get(f'http://127.0.0.1:5555/api/insee/activite/distribution/{value}?by=age').json()['data']
        for variable in result.keys():
            try: 
                if variable == 'taux_chomage_15_24':
                    taux_chomage_15_24.append(result[variable])
                elif variable == 'taux_chomage_25_54':
                    taux_chomage_25_54.append(result[variable])
                elif variable == 'taux_chomage_55_64':
                    taux_chomage_55_64.append(result[variable])
            except : 
                if variable == 'taux_chomage_15_24':
                    taux_chomage_15_24.append('not_found')
                elif variable == 'taux_chomage_25_54':
                    taux_chomage_25_54.append('not found')
                elif variable == 'taux_chomage_55_64':
                    taux_chomage_55_64.append('not_found')
    except: 
        taux_chomage_15_24.append('not_found')
        taux_chomage_25_54.append('not found')
        taux_chomage_55_64.append('not_found')

In [None]:
#requests.get(f'http://127.0.0.1:5555/api/insee/activite/distribution/010010000?by=age').json()['data'].keys()

In [7]:
df_stat=pd.DataFrame({'IRIS': df.IRIS.unique(),'taux_chomage_15_24':taux_chomage_15_24,"taux_chomage_25_54":taux_chomage_25_54,"taux_chomage_55_64":taux_chomage_55_64 })

In [8]:
df_stat

Unnamed: 0,IRIS,taux_chomage_15_24,taux_chomage_25_54,taux_chomage_55_64
0,010010000,30.781699,7.951426,3.349618
1,010020000,66.666667,2.083333,19.047619
2,010040101,15.546274,13.509962,8.700207
3,010040102,21.93555,19.040939,11.767701
4,010040201,31.624587,15.586252,7.263252
...,...,...,...,...
39229,974200601,57.169849,34.490463,37.954605
39230,974210301,57.009346,37.608319,33.858268
39231,974220901,63.18449,35.561451,33.546692
39232,974230103,66.666667,25.080608,0.0


In [9]:
main_residence_30m2= []
main_residence_30_40m2= []
main_residence_40_60m2= []
main_residence_60_80m2= []
main_residence_80_100m2= []
main_residence_100_120m2= []
main_residence_120m2= []

In [10]:
for value in df_stat['IRIS'] : 
    try:
        result= requests.get(f'http://127.0.0.1:5555/api/insee/logement/distribution/{value}?by=area').json()['data']
        for variable in result.keys():
            try: 
                if variable == 'main_residence_30m2':
                    main_residence_30m2.append(result[variable])
                elif variable == 'main_residence_30_40m2':
                    main_residence_30_40m2.append(result[variable])
                elif variable == 'main_residence_40_60m2':
                    main_residence_40_60m2.append(result[variable])
                elif variable == 'main_residence_60_80m2':
                    main_residence_60_80m2.append(result[variable])
                elif variable == 'main_residence_80_100m2':
                    main_residence_80_100m2.append(result[variable])
                elif variable == 'main_residence_100_120m2':
                    main_residence_100_120m2.append(result[variable])
                elif variable == 'main_residence_120m2':
                    main_residence_120m2.append(result[variable])
            except : 
                if variable == 'main_residence_30m2':
                    main_residence_30m2.append('not_found')
                elif variable == 'main_residence_30_40m2':
                    main_residence_30_40m2.append('not found')
                elif variable == 'main_residence_40_60m2':
                    main_residence_40_60m2.append('not_found')
                elif variable == 'main_residence_60_80m2':
                    main_residence_60_80m2.append('not found')
                elif variable == 'main_residence_80_100m2':
                    main_residence_80_100m2.append('not_found')
                elif variable == 'main_residence_30_40m2':
                    main_residence_100_120m2.append('not found')
                elif variable == 'main_residence_120m2':
                    main_residence_120m2.append('not_found')
    except: 
        main_residence_30m2.append('not_found')
        main_residence_30_40m2.append('not found')
        main_residence_40_60m2.append('not_found')
        main_residence_60_80m2.append('not_found')
        main_residence_80_100m2.append('not found')
        main_residence_100_120m2.append('not_found')
        main_residence_120m2.append('not_found')

39234

In [14]:
df_stat['main_residence_30m2'] = main_residence_30m2
df_stat['main_residence_30_40m2'] = main_residence_30_40m2
df_stat['main_residence_40_60m2'] = main_residence_40_60m2
df_stat['main_residence_60_80m2'] = main_residence_60_80m2
df_stat['main_residence_80_100m2'] = main_residence_80_100m2
df_stat['main_residence_100_120m2'] = main_residence_100_120m2
df_stat['main_residence_120m2'] = main_residence_120m2

In [15]:
df_stat.to_csv("IRIS_STATS_INSEE.csv", sep='|', encoding="utf-8")

In [16]:
df= pd.read_csv('IRIS_STATS_INSEE.csv', sep='|', index_col=0)

In [17]:
df

Unnamed: 0,IRIS,taux_chomage_15_24,taux_chomage_25_54,taux_chomage_55_64,main_residence_30m2,main_residence_30_40m2,main_residence_40_60m2,main_residence_60_80m2,main_residence_80_100m2,main_residence_100_120m2,main_residence_120m2
0,010010000,30.781699051803873,7.951425511099434,3.349618266127431,1.02007247489045,2.05513214994842,4.10277069981308,40.7963874586391,82.40487863695,75.1828077367137,105.437950843046
1,010020000,66.66666666666666,2.083333333333334,19.047619047619083,0.0,1.02057613168724,6.12345679012344,8.16460905349792,25.514403292181,20.4115226337448,41.8436213991768
2,010040101,15.546273592258824,13.509962289770527,8.700207036189525,124.799171710648,6.33147370971934,148.583062229812,222.922083457259,188.383238324276,74.6817491206856,48.2199720459297
3,010040102,21.93554992744209,19.040939419400164,11.767700885207525,42.3121687277957,129.171132275325,424.571110077357,551.162472853823,324.676306386888,131.723968000821,113.187770351294
4,010040201,31.624586878064985,15.586252108322359,7.263251703606728,7.66407096898548,87.1173128733012,334.270897358721,614.685637147138,505.869952728022,225.475226068816,161.070692431894
...,...,...,...,...,...,...,...,...,...,...,...
39229,974200601,57.16984861345612,34.49046270253541,37.954605146315664,10.2287040248056,10.1553538408883,10.3159153513555,56.3360670142889,86.9350394386021,43.1854582990727,22.9316626348459
39230,974210301,57.009345794392466,37.608318890814495,33.85826771653545,18.4441329034067,32.7895696060563,64.5544651619234,221.32959484088,298.180148605075,111.689471470629,50.2090284592737
39231,974220901,63.18448958064734,35.561451466448,33.54669191100054,12.8485249019488,39.7416934267193,165.483240641782,354.406422872675,383.773584321716,239.939976565863,114.554222584866
39232,974230103,66.66666666666679,25.08060771565701,0.0,0.0,0.0,0.0,0.967281765289865,1.98785681067681,1.04831286871385,0.967281765289865


## Vérification regroupement modalité code commune 

In [None]:
requests.get(f'http://127.0.0.1:5555/api/insee/logement/distribution/010040201?by=area').json()['data']

## cas de tests sur aggregations 

In [None]:
df= GetData().read_csv()


In [None]:
df = Preprocessing(df).clean_columns()

df= Preprocessing(df).create_identifier()

In [None]:
df2 = df[0:10000]

In [None]:
df2.groupby(["parcelle_cad_section","Date mutation","Valeur fonciere"], as_index= False).apply(lambda x : pd.Series({
"surface_terrain" : ((x["Surface terrain"].sum()/x["Surface reelle bati"].count()) if (int(x["Surface terrain"].nunique()) ==1 and int(x["Nature culture"].nunique()) == 1 )else x["Surface terrain"].sum())
    ,"Nature_culture" : x["Nature culture"].max()
    , "su terrain 2": x["Surface terrain"].sum()
    , "suterrainmax": x["Surface terrain"].max()
  #  , "su_bat": x["Surface reelle bati"]
    ,"nat_terrain_unique": x["Nature culture"].nunique()
    , "suterrain_count" : x["Surface terrain"].count()
    ,"suterrain_unique": x["Surface terrain"].nunique()
    ,"su_bat_unique" : x["Surface reelle bati"].nunique()
    ,"su_bat_count" : x["Surface reelle bati"].count()
            
})).tail(40)

In [None]:
df2.groupby(["parcelle_cad_section","Date mutation","Valeur fonciere"], as_index= False).apply(lambda x : pd.Series({
    "surface_reelle_bati" : (x["Surface reelle bati"].sum()/(x["Surface reelle bati"].count()/x["Type local"].nunique()) if (int(x["Nature culture"].nunique() > 1)) else x["Surface reelle bati"].sum())
 ,"nb_pieces_principales" : (x["Nombre pieces principales"].sum()/(x["Surface reelle bati"].count()/x["Type local"].nunique()) if int(x["Nature culture"].nunique()) > 1 else x["Nombre pieces principales"].sum())      
    ,"nb_piecemax" : x["Nombre pieces principales"].max()
    ,"Nature_culture" : x["Nature culture"].max()
    , "su bat 2": x["Surface reelle bati"].sum()
    , "sumax": x["Surface reelle bati"].max()
  #  , "su_bat": x["Surface reelle bati"]
    , "su_count" : x["Surface reelle bati"].count()
    ,"nat_cul_unique": x["Nature culture"].nunique()
    ,"subatiment_unique": x["Surface reelle bati"].nunique()
            
})).tail(40)

In [None]:
df_tot= GetData().read_csv([1,3])

df_tot = Preprocessing(df_tot).clean_columns()

df_tot= Preprocessing(df_tot).create_identifier()

In [None]:
# cas ou type local identique mais nature culture différente: 

df_tot[(df_tot['parcelle_cadastrale']== '01289000AC0176') | (df_tot['parcelle_cadastrale']== '013500000C1248')| (df_tot['parcelle_cadastrale']== '01195000AD0050')]
#actions possibles : 
# meme valeur fonciere 
# meme de surface reelle bati 
#pas d'info sur 1er lot
# pas d info Nombre de lots 
#meme nombre pieces principales 
# différente nature culture (variable texte )
# différente surface terrain 


In [None]:
# cas ou type local identique mais nature culture différente: 

df[(df['parcelle_cadastrale']== '01289000AC0176') | (df['parcelle_cadastrale']== '013500000C1248')| (df['parcelle_cadastrale']== '01195000AD0050')]
#actions possibles : 
# meme valeur fonciere 
# meme de surface reelle bati 
#pas d'info sur 1er lot
# pas d info Nombre de lots 
#meme nombre pieces principales 
# différente nature culture (variable texte )
# différente surface terrain 


In [None]:
df[df['parcelle_cad_section']=='01001000ZH']

In [None]:
# même maison surface reelle bati identique 
df[(df['parcelle_cadastrale'] == '013500000C1248')]
#actions possibles : 
# meme valeur fonciere 
# meêm type local 
# même surface relle bati
 #pas d'info sur 1er lot
# pas d info Nombre de lots 
# même nombre de pieces principales

# différence nature culture (variable texte )
# surface terrain différente en fonction de la parcelle cadastrale 


In [None]:
# cas ou 2 maisons , une dépendance et un terrain :
# repérable par section et date commune 

df[df.index.isin([71,72,73,74])]
#actions possibles : 
# meme valeur fonciere 
# code type lcoal différent pr dépendance absent pour terrain
# différence de surface reelle bati  (0 dépendance et nan pour terrain)
#pas d'info sur 1er lot
# pas d info Nombre de lots 
# différence sur nombre de pieces principales

# différence nature culture (variable texte )
# surface terrain différente en fonction de la parcelle cadastrale 
