# Proto-cluster prediction
In some cases, it can be useful to apply already-created personas (based on 2019 data) to other data sets. We did this in order to use the personas as an analysis and reweighting element for synthetic populations and simulations that were built with the 2015 census data. This workbook briefly describes the process. However, it is important to note that this always requires a certain amount of manual recoding of variable names and scales to work.

The census data from 2015 is available at: https://www.insee.fr/fr/statistiques/3625223?sommaire=3558417

In [1]:
# Notebook magic: autosave this notebook every 180 seconds.
%autosave 180

Autosaving every 180 seconds


In [74]:
#Load packages
import pandas as pd
import geopandas as gpd
import numpy as np
import dbf
import pickle
import os
from kmodes.kprototypes import KPrototypes

In [9]:
# Move the working directory one level up so that the relative 'data/...'
# paths used in the following cells resolve (presumably from the notebook
# folder to the project root — verify for your checkout).
os.chdir("..")

In [None]:
# Show the current working directory to check the chdir above.
# Relies on IPython automagic resolving the bare name to %pwd.
pwd

In [23]:
# Read the 2015 census extract (dBase file) into a pandas DataFrame.
# NOTE(review): `Dbf5` is not imported in the package cell above; it is
# presumably simpledbf's reader (`from simpledbf import Dbf5`) — confirm and
# add that import, otherwise this cell raises NameError.
# The reader is bound to `census_dbf` rather than `dbf` so that it does not
# shadow the `dbf` module imported in the package cell.
census_dbf = Dbf5('data/raw/FD_INDCVIZA_2015.dbf')
census = census_dbf.to_dataframe()

In [25]:
# Preview the first rows of the raw census table.
census.head()

Unnamed: 0,CANTVILLE,NUMMI,ACHLR,AEMMR,AGED,AGER20,AGEREV,AGEREVQ,ANAI,ANEMR,...,TP,TRANS,TRIRIS,TYPC,TYPFC,TYPL,TYPMC,TYPMR,VOIT,WC
0,75ZZ,1,3,7,67,79,66,65,1949,5,...,Z,Z,752621,3,Z,2,1,11,0,Z
1,75ZZ,2,2,9,74,79,73,70,1939,1,...,Z,Z,750781,3,Z,2,3,32,0,Z
2,75ZZ,2,2,9,42,54,41,40,1971,1,...,1,2,750781,3,1,2,3,32,0,Z
3,75ZZ,2,2,9,13,14,12,10,2000,1,...,Z,Z,750781,3,1,2,3,32,0,Z
4,75ZZ,2,2,9,15,17,15,15,1998,1,...,Z,Z,750781,3,1,2,3,32,0,Z


In [75]:
# Restore the previously fitted k-prototypes model from its pickle file.
# NOTE: unpickling executes code embedded in the file, so only load pickles
# that come from a trusted source.
with open('data/interim/kproto_100pct.pkl', 'rb') as model_file:
    kproto = pickle.load(model_file)

In [26]:
# Census variables that are not needed for the persona assignment;
# they are removed from the table before any further processing.
unused_columns = [
    'ACHLR', 'AEMMR', 'AGER20', 'AGEREV', 'AGEREVQ', 'ANAI', 'ANEMR',
    'APAF', 'ARM', 'ASCEN', 'BAIN', 'BATI', 'CANTVILLE', 'CATIRIS', 'CATL',
    'CATPC', 'CHAU', 'CHFL', 'CHOS', 'CLIM', 'CMBL', 'CUIS', 'DEPT', 'DEROU',
    'DNAI', 'EAU', 'EGOUL', 'ELEC', 'EMPL', 'HLML', 'INATC', 'INFAM', 'INPER',
    'INPERF', 'IRAN', 'LIENF', 'LPRF', 'LPRM', 'METRODOM', 'NA5', 'NAIDT',
    'NE24FR', 'NE3FR', 'NENFR', 'NUMF', 'NUMMI', 'ORIDT', 'RECH',
    'REGION', 'SANI', 'SANIDOM', 'SFM', 'TACTD16', 'TRIRIS', 'TYPC',
    'TYPFC', 'TYPMC', 'TYPMR', 'WC',
]
census = census.drop(columns=unused_columns)

In [56]:
# Build the clustering input as a separate table: everything in `census`
# except the four columns below, which remain available in `census` itself.
census4cluster = census.drop(columns=['IPONDI', 'AGED', 'IRIS', 'NE18FR'])

In [57]:
# The 2015 file names this variable 'DIPL_15'; rename it to 'DIPL'.
census4cluster = census4cluster.rename(columns={'DIPL_15': 'DIPL'})

# Recode table from the 2015 letter codes to the target codes
# (presumably the coding the personas were built on — verify against the
# 2019 data dictionary).
dipl_recode = {'A': 12, 'B': 13, 'C': 14, 'D': 18, 'Z': 'ZZ'}

# Series.map turns every value that is not a key of the table into NaN.
census4cluster['DIPL'] = census4cluster['DIPL'].map(dipl_recode)

In [58]:
# The column called 'NE6FR' in the 2015 file is used under the name 'NE5FR'
# in the rest of this notebook.
census4cluster = census4cluster.rename({'NE6FR': 'NE5FR'}, axis='columns')

In [59]:
# Recode table for STAT_CONJ: 2015 letter code -> target numeric code.
stat_conj_recode = {'A': 1, 'B': 6}

# Apply the table; any value other than 'A' or 'B' becomes NaN.
census4cluster['STAT_CONJ'] = census4cluster['STAT_CONJ'].map(stat_conj_recode)

In [60]:
# Recode table for TRANS: codes '1', '2' and 'Z' are kept, while '3', '4'
# and '5' are shifted up by one to '4', '5' and '6'.
trans_recode = {'1': '1', '2': '2', '3': '4', '4': '5', '5': '6', 'Z': 'Z'}

# Apply the table; values that are not keys of the table become NaN.
census4cluster['TRANS'] = census4cluster['TRANS'].map(trans_recode)

In [63]:
# Inspect the first ten rows to check the renamed and recoded columns.
census4cluster.head(10)

Unnamed: 0,COUPLE,CS1,DIPL,ETUD,GARL,ILETUD,ILT,IMMI,INAI,MOCO,...,SEXE,STAT_CONJ,STATR,STOCD,SURF,TACT,TP,TRANS,TYPL,VOIT
0,2,7,13,2,2,Z,Z,2,3,32,...,1,6,Z,10,3,21,Z,Z,2,0
1,2,7,18,2,2,Z,Z,2,4,31,...,1,6,Z,22,4,21,Z,Z,2,0
2,2,5,18,2,2,Z,1,2,4,23,...,2,6,1,22,4,11,1,2,2,0
3,2,8,ZZ,1,2,1,Z,2,2,12,...,2,6,Z,22,4,23,Z,Z,2,0
4,2,8,12,1,2,1,Z,2,2,12,...,2,6,Z,22,4,22,Z,Z,2,0
5,2,8,12,1,2,1,Z,2,2,12,...,1,6,Z,22,4,22,Z,Z,2,0
6,2,7,12,2,2,Z,Z,1,6,31,...,1,6,Z,21,3,21,Z,Z,2,0
7,2,3,12,2,2,Z,3,1,6,31,...,2,6,1,21,3,11,1,6,2,0
8,1,3,18,2,1,Z,1,2,3,22,...,1,6,2,10,3,11,1,6,2,0
9,2,8,ZZ,1,1,1,Z,2,1,11,...,2,6,Z,10,3,23,Z,Z,2,0


# Clustering process

### Normalisation of numeric variables NBPI, NE5FR, NPERR, VOIT

In [64]:
# Parse the four numeric variables as numbers; entries that cannot be parsed
# (errors='coerce') are turned into NA.
for numeric_col in ('NBPI', 'NE5FR', 'NPERR', 'VOIT'):
    census4cluster[numeric_col] = pd.to_numeric(census4cluster[numeric_col], errors='coerce')

In [65]:
# Min-max scale each numeric variable to the range [0, 1].
# NOTE(review): the bounds are taken from this (2015) data set itself; confirm
# that this matches the scaling applied when the model was fitted.
for numeric_col in ('NBPI', 'NE5FR', 'NPERR', 'VOIT'):
    values = census4cluster[numeric_col]
    lower = values.min()
    upper = values.max()
    census4cluster[numeric_col] = (values - lower) / (upper - lower)

In [66]:
# Replace NA with the mean of the respective column.
# The filled Series is assigned back to the column instead of calling
# `fillna(..., inplace=True)` on `census4cluster[col]`: that chained in-place
# form acts on an intermediate Series, is deprecated, and under pandas
# Copy-on-Write leaves the DataFrame unchanged.
for numeric_col in ('NBPI', 'NE5FR', 'NPERR', 'VOIT'):
    column_mean = census4cluster[numeric_col].mean()
    census4cluster[numeric_col] = census4cluster[numeric_col].fillna(column_mean)

## Prepare data

In [67]:
# Encode the ordinal variable SURF as an ordered categorical.
# The levels are the distinct string labels in lexicographic order, which
# defines the category order.
surf_labels = census4cluster['SURF'].astype(str)
surf_levels = sorted(surf_labels.unique())
census4cluster['SURF'] = pd.Categorical(surf_labels, categories=surf_levels, ordered=True)

In [68]:
# Preview the table after normalisation and SURF encoding.
census4cluster.head()

Unnamed: 0,COUPLE,CS1,DIPL,ETUD,GARL,ILETUD,ILT,IMMI,INAI,MOCO,...,SEXE,STAT_CONJ,STATR,STOCD,SURF,TACT,TP,TRANS,TYPL,VOIT
0,2,7,13,2,2,Z,Z,2,3,32,...,1,6,Z,10,3,21,Z,Z,2,0.0
1,2,7,18,2,2,Z,Z,2,4,31,...,1,6,Z,22,4,21,Z,Z,2,0.0
2,2,5,18,2,2,Z,1,2,4,23,...,2,6,1,22,4,11,1,2,2,0.0
3,2,8,ZZ,1,2,1,Z,2,2,12,...,2,6,Z,22,4,23,Z,Z,2,0.0
4,2,8,12,1,2,1,Z,2,2,12,...,2,6,Z,22,4,22,Z,Z,2,0.0


In [69]:
# Nominal (unordered) variables to be encoded as categoricals
NomVariables = ['COUPLE','CS1','DIPL','ETUD','GARL','ILETUD','ILT','IMMI','INAI','MOCO',
                'MODV','NA17','SEXE','STAT_CONJ','STATR','STOCD','TACT','TP','TRANS','TYPL']

# Cast each variable to string labels and store it as an unordered
# categorical whose levels are the sorted distinct labels.
for nominal_col in NomVariables:
    labels = census4cluster[nominal_col].astype(str)
    levels = sorted(labels.unique())
    census4cluster[nominal_col] = pd.Categorical(labels, categories=levels, ordered=False)

In [70]:
# define input dataframe (an alias of census4cluster, not a copy)
df = census4cluster
# specify ordered categorical values
cat_cols_ord = ['SURF']
# specify unordered categorical values
cat_cols_unord = ['COUPLE','CS1','DIPL','ETUD','GARL','ILETUD','ILT','IMMI','INAI','MOCO',
                'MODV','NA17','SEXE','STAT_CONJ','STATR','STOCD','TACT','TP','TRANS','TYPL']

# Re-cast every categorical column from its string labels, with the sorted
# distinct labels as levels. Only the columns in cat_cols_ord are flagged as
# ordered; the nominal columns are created with ordered=False (they were
# previously created with ordered=True, contradicting their definition).
for cols, is_ordered in ((cat_cols_ord, True), (cat_cols_unord, False)):
    for cat in cols:
        labels = df[cat].astype(str)
        categories = sorted(labels.unique())
        df[cat] = pd.Categorical(labels, categories=categories, ordered=is_ordered)

# redefine dataframe
census4cluster = df

In [78]:
# Assign each 2015 record to one of the clusters of the fitted model.
# Column positions treated as categorical: 0-11 and 15-23. The skipped
# positions 12-14 and 24 presumably hold the numeric variables NBPI, NE5FR,
# NPERR and VOIT — verify against census4cluster.columns.
categorical_positions = [*range(12), *range(15, 24)]
clusters2015 = kproto.predict(census4cluster, categorical=categorical_positions)

In [80]:
# Wrap the predicted labels in a single-column table named 'ID'
clusters_2015 = pd.DataFrame(data=clusters2015, columns=['ID'])

# Write the labels to disk without the row index
output_path = 'data/output/clusters_2015.csv'
clusters_2015.to_csv(output_path, index=False)