In [1]:
import pandas as pd
import numpy as np
import multiprocessing
from joblib import Parallel, delayed

# Opérations de traitement


In [2]:
def clean_df(df:pd.DataFrame)->pd.DataFrame:
  """Return a cleaned copy of a raw yearly frame.

  The three ``ancien_*`` columns are dropped and missing values are replaced
  with fixed defaults:

  * 0 for the numeric-like columns (amounts, surfaces, lot numbers, address
    number, postal code, volume number, coordinates);
  * 3 / 'Dépendance' for ``code_type_local`` / ``type_local``;
  * '<EMPTY>' for the textual address and land-use columns.

  The input frame is left untouched: ``drop`` returns a new frame and every
  fill is applied to that new frame.

  Raises:
    KeyError: if any of the columns named below is absent from ``df``.
  """
  obsolete_columns=['ancien_code_commune','ancien_nom_commune','ancien_id_parcelle']
  lot_surface_columns=[f'lot{n}_surface_carrez' for n in range(1,6)]
  lot_number_columns=[f'lot{n}_numero' for n in range(1,6)]
  zero_filled_columns=[
    'valeur_fonciere','nombre_pieces_principales','surface_reelle_bati','surface_terrain',
    *lot_surface_columns,
    *lot_number_columns,
    'adresse_numero','code_postal','numero_volume','longitude','latitude',
  ]
  placeholder_columns=[
    'adresse_nom_voie','adresse_code_voie','code_nature_culture','adresse_suffixe',
    'nature_culture','code_nature_culture_speciale','nature_culture_speciale',
  ]
  single_column_defaults={'code_type_local':3,'type_local':'Dépendance'}

  cleaned=df.drop(columns=obsolete_columns)
  # Removal of industrial and commercial premises (disabled):
  # mutation_commerciale=df[df.code_type_local==4].id_mutation
  # df=df[~df.id_mutation.isin(mutation_commerciale)]
  cleaned[zero_filled_columns]=cleaned[zero_filled_columns].fillna(0)
  for column,default in single_column_defaults.items():
    cleaned[column]=cleaned[column].fillna(default)
  cleaned[placeholder_columns]=cleaned[placeholder_columns].fillna('<EMPTY>')
  return cleaned


# Fonctions de transformation des données en version linéarisée

In [81]:
# Ordered column schema of the linearised table (one row per mutation).
# The order is significant: values are matched to these names by position.
alt_df_cols=[
    # Mutation identity and date parts
    'id_mutation',
    'jour_mutation', 'mois_mutation', 'annee_mutation',
    'nature_mutation',
    'valeur_fonciere',
    # Address of the main row
    'adresse_numero', 'adresse_suffixe', 'adresse_nom_voie', 'adresse_code_voie',
    'code_postal', 'nom_commune', 'code_departement',
    'id_parcelle',
    # Totals over every row of the mutation
    'surface_carrez_total', 'surface_reelle_bati_total', 'surface_terrain_total',
    'nombre_lots',
    # Houses
    'nombre_maisons',
    'surface_carrez_maisons', 'surface_reelle_bati_maisons', 'surface_terrain_maisons',
    # Apartments
    'nombre_appartements',
    'surface_carrez_appartements', 'surface_reelle_bati_appartements', 'surface_terrain_appartements',
    # Dependencies
    'nombre_dependances',
    'surface_carrez_dependances', 'surface_reelle_bati_dependances', 'surface_terrain_dependances',
    # Remaining attributes
    'nombre_pieces_principales',
    'nature_culture', 'nature_culture_speciale',
    'longitude', 'latitude',
]

def reduce_get_id(df:pd.DataFrame)->int:
  """Return the position (usable with ``df.iloc``) of the main row of a mutation.

  Selection priority:

  1. among the house rows (``code_type_local == 1``), the one with the largest
     ``surface_reelle_bati``;
  2. otherwise, among the apartment rows (``code_type_local == 2``), the one
     with the largest ``surface_reelle_bati``;
  3. otherwise, the row with the largest ``surface_reelle_bati`` overall.

  Ties resolve to the first matching row. The result is computed from
  positional arrays, so it is always a plain integer position even when row
  labels repeat in ``df.index``.

  Args:
    df: rows of one mutation; must provide ``code_type_local`` and
      ``surface_reelle_bati``.

  Returns:
    Zero-based row position as a Python ``int``.

  Raises:
    ValueError: if ``df`` has no rows (argmax of an empty sequence).
  """
  local_types=df.code_type_local.to_numpy()
  surfaces=df.surface_reelle_bati.to_numpy(dtype=float)
  # Rank missing surfaces last so they can never beat a real value.
  surfaces=np.where(np.isnan(surfaces),-np.inf,surfaces)
  for type_code in (1,2):  # 1 = house, 2 = apartment
    candidates=np.flatnonzero(local_types==type_code)
    if candidates.size:
      # Restrict the argmax to rows of this type so a larger row of another
      # type cannot take precedence over the prioritised type.
      return int(candidates[np.argmax(surfaces[candidates])])
  return int(np.argmax(surfaces))


def fusion_data(df:pd.DataFrame)->pd.Series:
  """Collapse all rows of one mutation into a single record.

  The row picked by ``reduce_get_id`` supplies the identity, date, address,
  land-use and coordinate fields. Surfaces, lot counts and room counts are
  summed over every row of ``df``; per-type counts and surfaces are computed
  for ``code_type_local`` 1 (houses), 2 (apartments) and 3 (dependencies).

  Args:
    df: rows sharing one ``id_mutation``. ``date_mutation`` is split on '-'
      and read as year, month, day in that order.

  Returns:
    A Series indexed by ``alt_df_cols``; values are paired with those names
    by position. The three date parts are kept as strings.
  """
  carrez_columns=[f'lot{n}_surface_carrez' for n in range(1,6)]
  main_row=df.iloc[reduce_get_id(df)]
  date_parts=main_row.date_mutation.split('-')

  # id_mutation, jour_mutation, mois_mutation, annee_mutation
  record=[main_row.id_mutation,date_parts[2],date_parts[1],date_parts[0]]

  # Fields copied as-is from the main row, in schema order.
  copied_fields=('nature_mutation','valeur_fonciere','adresse_numero',
                 'adresse_suffixe','adresse_nom_voie','adresse_code_voie',
                 'code_postal','nom_commune','code_departement','id_parcelle')
  record.extend(main_row[field] for field in copied_fields)

  # surface_carrez_total, surface_reelle_bati_total, surface_terrain_total, nombre_lots
  record.append(df[carrez_columns].sum().sum())
  record.append(df.surface_reelle_bati.sum())
  record.append(df.surface_terrain.sum())
  record.append(df.nombre_lots.sum())

  # For houses (1), apartments (2) and dependencies (3):
  # count, Carrez surface, built surface, land surface.
  type_counts=df.code_type_local.value_counts()
  for type_code in (1,2,3):
    rows_of_type=df[df.code_type_local==type_code]
    record.append(type_counts[type_code] if type_code in type_counts else 0)
    record.append(rows_of_type[carrez_columns].sum().sum())
    record.append(rows_of_type.surface_reelle_bati.sum())
    record.append(rows_of_type.surface_terrain.sum())

  # nombre_pieces_principales
  record.append(df.nombre_pieces_principales.sum())
  # nature_culture, nature_culture_speciale, longitude, latitude
  record.extend(main_row[field] for field in
                ('nature_culture','nature_culture_speciale','longitude','latitude'))
  return pd.Series(dict(zip(alt_df_cols,record)),index=alt_df_cols)

def applyParallel(dfGrouped, func):
    """Apply ``func`` to every group in parallel and tabulate the results.

    Args:
        dfGrouped: iterable of ``(key, group)`` pairs, e.g. a pandas groupby;
            the key is ignored.
        func: callable invoked once per group; its return value becomes one
            row of the result.

    Returns:
        A DataFrame built from the collected results, with columns
        ``alt_df_cols``.
    """
    # One joblib worker per CPU reported by multiprocessing.
    pool = Parallel(n_jobs=multiprocessing.cpu_count())
    jobs = (delayed(func)(group) for _, group in dfGrouped)
    rows = pool(jobs)
    return pd.DataFrame(rows, columns=alt_df_cols)


# Traitements des CSV

In [190]:
# Load each yearly CSV, clean it, and stack all years into a single frame `df`.
yearly_frames=[]
for annee in range(2020,2022):
  # low_memory=False parses each column in one pass, so a column cannot end up
  # holding a mix of numbers and strings (the chunked default emits a
  # DtypeWarning when that happens).
  yearly_raw=pd.read_csv(f"original/{annee}.csv",encoding='utf8',low_memory=False)
  yearly_frames.append(clean_df(yearly_raw))
  print(annee)
# Concatenate once instead of growing the frame inside the loop. Row labels
# are kept as read from each file, so they repeat across years.
df=pd.concat(yearly_frames)

# Linearised export (one row per mutation), currently disabled.
# NOTE(review): decide whether each exported file should hold a single year or
# the cumulative frame before re-enabling.
# new_df=applyParallel(df.groupby("id_mutation"), fusion_data)
# new_df=df.groupby("id_mutation").apply(fusion_data)
# new_df=new_df.set_index("id_mutation")
# new_df.to_csv(f'clean_data/{annee}_cleaned.csv',encoding='utf8')
  

2020


  exec(code_obj, self.user_global_ns, self.user_ns)


2021


In [99]:
# List the columns of the combined frame (the three ancien_* columns are already dropped by clean_df).
df.columns

Index(['id_mutation', 'date_mutation', 'numero_disposition', 'nature_mutation',
       'valeur_fonciere', 'adresse_numero', 'adresse_suffixe',
       'adresse_nom_voie', 'adresse_code_voie', 'code_postal', 'code_commune',
       'nom_commune', 'code_departement', 'id_parcelle', 'numero_volume',
       'lot1_numero', 'lot1_surface_carrez', 'lot2_numero',
       'lot2_surface_carrez', 'lot3_numero', 'lot3_surface_carrez',
       'lot4_numero', 'lot4_surface_carrez', 'lot5_numero',
       'lot5_surface_carrez', 'nombre_lots', 'code_type_local', 'type_local',
       'surface_reelle_bati', 'nombre_pieces_principales',
       'code_nature_culture', 'nature_culture', 'code_nature_culture_speciale',
       'nature_culture_speciale', 'surface_terrain', 'longitude', 'latitude'],
      dtype='object')

In [200]:
# Price per built square metre for rows located in Nantes.
# Other filters tried during exploration, kept for reference:
#   df.code_type_local==4
#   data.id_parcelle.str.contains('MR')
#   data.code_postal!=44000
# A single combined mask followed by .copy() gives an independent frame, so
# adding prix_m2 below never writes into a slice of df
# (avoids SettingWithCopyWarning).
data=df[(df.nom_commune=='Nantes')&(df.surface_reelle_bati>0)].copy()
# Snapshot of the filtered rows before the derived column is added.
source=data.copy()
# Alternative per-mutation aggregation, currently disabled:
# data=data.groupby('id_mutation').agg({'valeur_fonciere':'mean','surface_reelle_bati':'sum'})
# NOTE(review): valeur_fonciere appears to be repeated on every row of a
# mutation, so this per-row ratio overstates the price of multi-row
# mutations — confirm this is intended.
data["prix_m2"]=data.valeur_fonciere/data.surface_reelle_bati
# Columns displayed by the inspection cells.
cols=['id_mutation','date_mutation','adresse_numero','adresse_nom_voie','valeur_fonciere','lot1_surface_carrez','lot2_surface_carrez','lot3_surface_carrez','surface_reelle_bati','code_postal','prix_m2']
# Median price per square metre over the selected rows (cell output).
data.prix_m2.median()

3911.5275142314995

In [194]:
# Most recent mutation date in the selection; dates are 'YYYY-MM-DD' strings,
# so the string maximum is also the latest date.
data.date_mutation.max()
# Ad-hoc lookup of a single mutation (disabled):
# df[df.id_mutation=='2020-553209']

'2021-06-30'

In [201]:
# Show the display columns for every selected row whose street name contains "GOGUET".
data.loc[data.adresse_nom_voie.str.contains("GOGUET"),cols]
# Alternative lookup by mutation id (disabled):
# data[data.id_mutation=='2021-193424'][cols]


Unnamed: 0,id_mutation,date_mutation,adresse_numero,adresse_nom_voie,valeur_fonciere,lot1_surface_carrez,lot2_surface_carrez,lot3_surface_carrez,surface_reelle_bati,code_postal,prix_m2
1364226,2020-555610,2020-05-11,2.0,RLE DU MONT GOGUET,370000.0,75.06,0.0,0.0,75.0,44000.0,4933.333333
1373748,2020-560975,2020-11-10,5.0,RUE DU MONT GOGUET,155000.0,32.77,0.0,0.0,35.0,44000.0,4428.571429
1376531,2020-562423,2020-12-16,5.0,RUE DU MONT GOGUET,90000.0,17.36,0.0,0.0,14.0,44000.0,6428.571429
1376819,2020-562591,2020-12-21,3.0,RUE DU MONT GOGUET,440000.0,0.0,0.0,0.0,88.0,44000.0,5000.0
1376990,2020-562679,2020-12-29,15.0,RUE DU MONT GOGUET,300000.0,0.0,0.0,0.0,20.0,44000.0,15000.0
1376991,2020-562679,2020-12-29,15.0,RUE DU MONT GOGUET,300000.0,0.0,0.0,0.0,23.0,44000.0,13043.478261
1376992,2020-562679,2020-12-29,15.0,RUE DU MONT GOGUET,300000.0,0.0,0.0,0.0,20.0,44000.0,15000.0
1376993,2020-562679,2020-12-29,15.0,RUE DU MONT GOGUET,300000.0,0.0,0.0,0.0,25.0,44000.0,12000.0
485340,2021-191840,2021-02-25,5.0,RUE DU MONT GOGUET,120000.0,26.73,0.0,0.0,28.0,44000.0,4285.714286
