# Updated proto-clustering process

In [1]:
%autosave 180

Autosaving every 180 seconds


In [2]:
#Load packages
import os
import random
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from kmodes.kprototypes import KPrototypes

In [4]:
pwd

'/Users/tjark/Documents/Python/clustering.nosync/cluster process'

In [5]:
os.chdir("..")

In [5]:
# Downloaded data for area A - Ile de France
# https://www.insee.fr/fr/statistiques/6544333?sommaire=6456104

In [6]:
census = pd.read_csv('data/raw/FD_INDCVIZA_2019.csv', sep = ';') 

  census = pd.read_csv('data/raw/FD_INDCVIZA_2019.csv', sep = ';')


In [7]:
census.head()

Unnamed: 0,CANTVILLE,NUMMI,ACHLR,AEMMR,AGED,AGER20,AGEREV,AGEREVQ,ANAI,ANEMR,...,TP,TRANS,TRIRIS,TYPC,TYPFC,TYPL,TYPMC,TYPMR,VOIT,WC
0,75ZZ,1,1,9,72,79,72,70,1944,2,...,Z,Z,751501,3,Z,1,1,12,1,Z
1,75ZZ,2,2,9,59,64,58,55,1958,3,...,1,5,750901,3,2,6,4,41,0,Z
2,75ZZ,2,2,9,30,29,29,25,1987,3,...,1,6,750901,3,2,6,4,41,0,Z
3,75ZZ,3,1,7,82,80,81,80,1938,5,...,Z,Z,751431,3,2,2,4,44,1,Z
4,75ZZ,3,1,7,86,80,85,85,1934,5,...,Z,Z,751431,3,2,2,4,44,1,Z


In [8]:
#drop columns that aren't used
census = census.drop(labels=['ACHLR','AEMMR','AGER20','AGEREV','AGEREVQ','ANAI','ANEMR',
            'APAF','ARM','ASCEN','BAIN','BATI','CANTVILLE','CATIRIS','CATL','CATPC','CHAU',
            'CHFL','CHOS','CLIM','CMBL','CUIS','DEPT','DEROU','DNAI','EAU','EGOUL','ELEC','EMPL',
            'HLML','INATC','INFAM','INPER','INPERF','IRAN','LIENF','LPRF','LPRM','METRODOM','NA5','NAIDT',
            'NE24FR','NE3FR','NENFR','NUMF','NUMMI','ORIDT','RECH',
            'REGION','SANI','SANIDOM','SFM','TACTD16','TRIRIS','TYPC',
            'TYPFC','TYPMC','TYPMR','WC'], axis=1)

In [9]:
list(census.columns.values.tolist())

['AGED',
 'COUPLE',
 'CS1',
 'DIPL',
 'ETUD',
 'GARL',
 'ILETUD',
 'ILT',
 'IMMI',
 'INAI',
 'IPONDI',
 'IRIS',
 'MOCO',
 'MODV',
 'NA17',
 'NBPI',
 'NE17FR',
 'NE5FR',
 'NPERR',
 'SEXE',
 'STAT_CONJ',
 'STATR',
 'STOCD',
 'SURF',
 'TACT',
 'TP',
 'TRANS',
 'TYPL',
 'VOIT']

In [10]:
census4cluster = census.drop(labels=['AGED','IRIS','NE17FR'], axis=1)

## Convert weights 'IPONDI' to integrer multiplicators via stochastic rounding 

In [11]:
census4cluster['intweight'] = census4cluster['IPONDI'].astype(int)

In [12]:
census4cluster['fractweight'] = census4cluster['IPONDI'] - census4cluster['intweight']

In [13]:
u = random.random()
census4cluster['multiplicator'] = census4cluster['intweight'] + (u < census4cluster['fractweight'])

In [14]:
# delete columns no longer needed
census4cluster = census4cluster.drop(labels=['intweight','fractweight','IPONDI'], axis=1)

# Clustering process

## Normalisation of numeric variables NBPI, NE5FR, NPERR, VOIT

In [15]:
# Convert non numeric values to NA
census4cluster['NBPI'] = pd.to_numeric(census4cluster['NBPI'], errors='coerce')
census4cluster['NE5FR'] = pd.to_numeric(census4cluster['NE5FR'], errors='coerce')
census4cluster['NPERR'] = pd.to_numeric(census4cluster['NPERR'], errors='coerce')
census4cluster['VOIT'] = pd.to_numeric(census4cluster['VOIT'], errors='coerce')

In [16]:
# Calculate normalised values
census4cluster['NBPI'] = (census4cluster['NBPI'] - census4cluster['NBPI'].min()) / (census4cluster['NBPI'].max() - census4cluster['NBPI'].min())
census4cluster['NE5FR'] = (census4cluster['NE5FR'] - census4cluster['NE5FR'].min()) / (census4cluster['NE5FR'].max() - census4cluster['NE5FR'].min())
census4cluster['NPERR'] = (census4cluster['NPERR'] - census4cluster['NPERR'].min()) / (census4cluster['NPERR'].max() - census4cluster['NPERR'].min())
census4cluster['VOIT'] = (census4cluster['VOIT'] - census4cluster['VOIT'].min()) / (census4cluster['VOIT'].max() - census4cluster['VOIT'].min())

In [17]:
# Replace NA with mean values
census4cluster['NBPI'].fillna(census4cluster['NBPI'].mean(), inplace=True)
census4cluster['NE5FR'].fillna(census4cluster['NE5FR'].mean(), inplace=True)
census4cluster['NPERR'].fillna(census4cluster['NPERR'].mean(), inplace=True)
census4cluster['VOIT'].fillna(census4cluster['VOIT'].mean(), inplace=True)

## Prepare data

In [18]:
# Encoding ordinal variable SURF
census4cluster['SURF'] = census4cluster['SURF'].astype(str)
categories = sorted(census4cluster['SURF'].unique())
census4cluster['SURF'] = pd.Categorical(census4cluster['SURF'], categories=categories, ordered=True)

In [19]:
census4cluster.head()

Unnamed: 0,COUPLE,CS1,DIPL,ETUD,GARL,ILETUD,ILT,IMMI,INAI,MOCO,...,STAT_CONJ,STATR,STOCD,SURF,TACT,TP,TRANS,TYPL,VOIT,multiplicator
0,2,7,19,2,1,Z,Z,2,3,32,...,6,Z,10,4,21,Z,Z,1,0.333333,3
1,1,3,17,2,2,Z,1,1,6,21,...,3,2,21,1,11,1,5,6,0.0,3
2,1,3,16,2,2,Z,3,1,6,21,...,3,2,21,1,11,1,6,6,0.0,3
3,1,7,14,2,1,Z,Z,2,2,21,...,1,Z,10,4,21,Z,Z,2,0.333333,3
4,1,7,14,2,1,Z,Z,2,1,21,...,1,Z,10,4,21,Z,Z,2,0.333333,3


In [20]:
# Encoding nominal variables
NomVariables = ['COUPLE','CS1','DIPL','ETUD','GARL','ILETUD','ILT','IMMI','INAI','MOCO',
                'MODV','NA17','SEXE','STAT_CONJ','STATR','STOCD','TACT','TP','TRANS','TYPL']

# Iterate through the list using a for loop
for NomVar in NomVariables:
    census4cluster[NomVar] = census4cluster[NomVar].astype(str)
    categories = sorted(census4cluster[NomVar].unique())
    census4cluster[NomVar] = pd.Categorical(census4cluster[NomVar], categories=categories, ordered=False)

## Scale populations by multiplicator

In [21]:
census4cluster.insert(0, 'ID', range(0, len(census4cluster)))

In [22]:
def scale_rows(df_group):
    # calculate the number of times to repeat each row
    weight = df_group['multiplicator'].iloc[0]
    # repeat each row based on the weight column
    df_group = df_group.iloc[np.repeat(np.arange(len(df_group)), weight)]
    return df_group

In [23]:
# #create sample for efficiency
# sample_census4cluster = census4cluster.sample(n=1000)

In [24]:
# # apply the scaling function to each group of rows with the same ID
# census4cluster_scaled = sample_census4cluster.groupby('ID').apply(scale_rows)
# # reset the index of the new dataframe
# census4cluster_scaled.reset_index(drop=True, inplace=True)
# # delete ID and multiplocator columns no longer needed
# census4cluster_scaled = census4cluster_scaled.drop(labels=['multiplicator','ID'], axis=1)

## Elbow test

In [25]:
# census4cluster_scaled.head()

In [26]:
# # define min and max number of clusters
# min_clust = 2
# max_clust = 50
# k_range = list(range(min_clust,max_clust+1,2))
# # empty array for within of Within-cluster sum of squares WCSS
# wcss = []

# #verbose 0 = no process output, 1 = clusters per entry, 2 = full messages

# for k in k_range:
#     kproto = KPrototypes(n_clusters=k, init='Cao', verbose=0)
#     clusters = kproto.fit_predict(census4cluster_scaled,categorical=[0,1,2,3,4,5,6,7,8,9,10,11,15,16,17,18,19,20,21,22,23])
    
#     wcss.append(kproto.cost_)

In [27]:
# # plot the WCSS against k
# plt.plot(k_range, wcss)
# plt.xlabel('Number of clusters (k)')
# plt.ylabel('Within-cluster sum of squares (WCSS)')
# plt.title('Elbow plot for KPrototypes clustering')
# plt.show()

In [28]:
# wcss_file = pd.DataFrame (wcss, columns = ['wcss'])

In [29]:
# wcss_file.to_csv('data/interim/wcss.csv')

## Final clustering
(~2 days with Mac, 32GB Ram/2.3 GHz Quad-Core Intel Core i7)

In [30]:
# apply the scaling function to each group of rows with the same ID
census4cluster_scaled = census4cluster.groupby('ID').apply(scale_rows)
# reset the index of the new dataframe
census4cluster_scaled.reset_index(drop=True, inplace=True)
# delete ID and multiplocator columns no longer needed
census4cluster_scaled = census4cluster_scaled.drop(labels=['multiplicator','ID'], axis=1)

### Interim safety export 

In [31]:
# Write file
census4cluster_scaled.to_csv('data/interim/census4cluster_scaled.csv')

In [36]:
len(census4cluster_scaled)

10719909

In [35]:
census4cluster_scaled.head()

Unnamed: 0,COUPLE,CS1,DIPL,ETUD,GARL,ILETUD,ILT,IMMI,INAI,MOCO,...,SEXE,STAT_CONJ,STATR,STOCD,SURF,TACT,TP,TRANS,TYPL,VOIT
0,2,7,19,2,1,Z,Z,2,3,32,...,2,6,Z,10,4,21,Z,Z,1,0.333333
1,2,7,19,2,1,Z,Z,2,3,32,...,2,6,Z,10,4,21,Z,Z,1,0.333333
2,2,7,19,2,1,Z,Z,2,3,32,...,2,6,Z,10,4,21,Z,Z,1,0.333333
3,1,3,17,2,2,Z,1,1,6,21,...,1,3,2,21,1,11,1,5,6,0.0
4,1,3,17,2,2,Z,1,1,6,21,...,1,3,2,21,1,11,1,5,6,0.0


In [6]:
#Read file (if interrupted)
census4cluster_scaled = pd.read_csv('data/interim/census4cluster_scaled.csv')
census4cluster_scaled = census4cluster_scaled.drop(columns=['Unnamed: 0'])
census4cluster_scaled.head()

  census4cluster_scaled = pd.read_csv('data/interim/census4cluster_scaled.csv')


Unnamed: 0,COUPLE,CS1,DIPL,ETUD,GARL,ILETUD,ILT,IMMI,INAI,MOCO,...,SEXE,STAT_CONJ,STATR,STOCD,SURF,TACT,TP,TRANS,TYPL,VOIT
0,2,7,19,2,1,Z,Z,2,3,32,...,2,6,Z,10,4,21,Z,Z,1,0.333333
1,2,7,19,2,1,Z,Z,2,3,32,...,2,6,Z,10,4,21,Z,Z,1,0.333333
2,2,7,19,2,1,Z,Z,2,3,32,...,2,6,Z,10,4,21,Z,Z,1,0.333333
3,1,3,17,2,2,Z,1,1,6,21,...,1,3,2,21,1,11,1,5,6,0.0
4,1,3,17,2,2,Z,1,1,6,21,...,1,3,2,21,1,11,1,5,6,0.0


In [7]:
# 5% sample of overall population
five_per = int((len(census4cluster_scaled)/2000))

In [9]:
# #create sample for efficiency
census4cluster_scaled_five_per = census4cluster_scaled.sample(n=five_per)

In [10]:
# define input dataframe
df = census4cluster_scaled_five_per
# specify ordered categorical values
cat_cols_ord = ['SURF']
# specify unordered categorical values
cat_cols_unord = ['COUPLE','CS1','DIPL','ETUD','GARL','ILETUD','ILT','IMMI','INAI','MOCO',
                'MODV','NA17','SEXE','STAT_CONJ','STATR','STOCD','TACT','TP','TRANS','TYPL']
for cat in cat_cols_ord:
    df[cat] = df[cat].astype(str)
    categories = sorted(df[cat].unique())
    df[cat] = pd.Categorical(df[cat], categories=categories, ordered=True)
for cat in cat_cols_unord:
    df[cat] = df[cat].astype(str)
    categories = sorted(df[cat].unique())
    df[cat] = pd.Categorical(df[cat], categories=categories, ordered=True)

# redefine datafram
census4cluster_scaled_five_per = df

## Started 28 March, 10 PM

In [11]:
#define number of clusters
k = 16

kproto = KPrototypes(n_clusters=k, init='Cao', verbose=1)
clusters = kproto.fit_predict(census4cluster_scaled_five_per,categorical=[0,1,2,3,4,5,6,7,8,9,10,11,15,16,17,18,19,20,21,22,23])

In [34]:
# Add cluster IDs to initial file
# census4cluster_scaled['cluster'] = clusters

In [17]:
import pickle

# save the kproto object to a file
with open('kproto.pkl', 'wb') as f:
    pickle.dump(kproto, f)

In [21]:
# # load the saved kproto object from a file
# with open('kproto.pkl', 'rb') as f:
#     kproto = pickle.load(f)

In [27]:
# use existing kproto object to predict clusters for new data
clusters_full = kproto.predict(census4cluster_scaled, categorical=[0,1,2,3,4,5,6,7,8,9,10,11,15,16,17,18,19,20,21,22,23])