<a href="https://colab.research.google.com/github/TobiPrae/customer_segmentation/blob/main/CustomerSegmentation.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Import packages

In [3]:
# import libraries here; add more as necessary
import numpy as np
import pandas as pd
pd.set_option('display.max_columns', 500)
import matplotlib.pyplot as plt
import seaborn as sns

# magic word for producing visualizations in notebook
%matplotlib inline
print("Libraries successfully imported")

Libraries successfully imported


In [4]:
!pip install -U -q PyDrive

from pydrive.auth import GoogleAuth
from pydrive.drive import GoogleDrive
from google.colab import auth
from oauth2client.client import GoogleCredentials

auth.authenticate_user()
gauth = GoogleAuth()
gauth.credentials = GoogleCredentials.get_application_default()
drive = GoogleDrive(gauth)

print("Everything successfully set up")

Everything successfully set up


# Load data

In [None]:
# Get list of every available file in google drive
#file_list = drive.ListFile({'q': "'root' in parents and trashed=false"}).GetList()
#for file1 in file_list:
#  print('title: %s, id: %s' % (file1['title'], file1['id']))

In [5]:
# download demographic data
download_data = drive.CreateFile({'id': '135UcsbUyLcwS16QHa7lFcXHKhuy7HwWF'})
download_data.GetContentFile('azdias.csv')  
df_azdias = pd.read_csv('azdias.csv', engine="python", sep=';')

print(df_azdias.shape)

(891221, 366)


In [6]:
# download customer data
download_data = drive.CreateFile({'id': '1eDq76GDZLXeonowxPrbwF-EHZwvaENLK'})
download_data.GetContentFile('customers.csv')  
df_customers = pd.read_csv('customers.csv', engine="python", sep=';')

print(df_customers.shape)

(191652, 369)


In [34]:
# download attribute values and clean them
download_data = drive.CreateFile({'id': '1wFNhCQm7Cv9CGJ7Pm_RbtrJdnLb4zyuC'})
download_data.GetContentFile('attributes_values.xlsx')  
df_attributes_values = pd.read_excel('attributes_values.xlsx', names=["None", "attribute", "description", "value", "meaning"])
df_attributes_values = df_attributes_values.drop(columns=['None'])[1:]
print(df_attributes_values.shape)

(2258, 4)


In [35]:
# download attribute information levels and clean them
download_data = drive.CreateFile({'id': '1icPBxHTV0zysKvQSJhSDPDhyH8Qd3uqW'})
download_data.GetContentFile('information_levels_attributes.xlsx')  
df_information_levels = pd.read_excel('information_levels_attributes.xlsx', names=["None", "Information_Level", "attribute", "description", "additional_notes"])
df_information_levels = df_information_levels.drop(columns=["None"])[1:]
print(df_information_levels.shape)

(313, 4)


# Functions

In [9]:
def get_column_nans(df, sorting_order_ascending=False):
    '''
    Analyzes dataframe by counting nan values for each column.
    
    Args
    - df: Dataframe to be analyzed
    - sorting_order_ascending: Indicates how df will be ordered (default=False)
    
    Returns
    - df_nan: Df that contains na count and percentage for each column
    '''
    df_nan = pd.DataFrame(df.apply(lambda x: x.isna().sum()), columns=["count"]).sort_values(by="count", ascending=sorting_order_ascending)
    df_nan["na_percentage"] = df_nan["count"].apply(lambda x: x/df.shape[0])
    
    return df_nan

# Part 1: Customer Segmentation Report

### Analyze additional information

In [36]:
# Check df_information_levels
df_information_levels.head(5)

Unnamed: 0,Information_Level,attribute,description,additional_notes
1,,AGER_TYP,best-ager typology,in cooperation with Kantar TNS; the informatio...
2,Person,ALTERSKATEGORIE_GROB,age through prename analysis,modelled on millions of first name-age-referen...
3,,ANREDE_KZ,gender,
4,,CJT_GESAMTTYP,Customer-Journey-Typology relating to the pref...,"relating to the preferred information, marketi..."
5,,FINANZ_MINIMALIST,financial typology: low financial interest,Gfk-Typology based on a representative househo...


In [37]:
# Check df_attributes_values
df_attributes_values.head(5)

Unnamed: 0,attribute,description,value,meaning
1,AGER_TYP,best-ager typology,-1,unknown
2,,,0,no classification possible
3,,,1,passive elderly
4,,,2,cultural elderly
5,,,3,experience-driven elderly


In [38]:
# Forward filling the attribute values df for better filtering/slicing
df_attributes_values["attribute"] = df_attributes_values["attribute"].fillna(method="ffill")
df_attributes_values["description"] = df_attributes_values["description"].fillna(method="ffill")
df_attributes_values.head(5)

Unnamed: 0,attribute,description,value,meaning
1,AGER_TYP,best-ager typology,-1,unknown
2,AGER_TYP,best-ager typology,0,no classification possible
3,AGER_TYP,best-ager typology,1,passive elderly
4,AGER_TYP,best-ager typology,2,cultural elderly
5,AGER_TYP,best-ager typology,3,experience-driven elderly


In [57]:
# Comparing unique values in azdias and customers data as well as additional information
attributes_values_attributes = set(df_attributes_values["attribute"].tolist())
information_levels_attributes = set(df_information_levels["attribute"].tolist())
print(f"Length attributes_values_attributes:{len(attributes_values_attributes)}")
print(f"Length information_levels_attributes:{len(information_levels_attributes)}")
print(f"Length of columns in df_azdias:{len(df_azdias.columns.tolist())}")
print(f"Length of columns in df_customers:{len(df_customers.columns.tolist())}")

Length attributes_values_attributes:314
Length information_levels_attributes:313
Length of columns in df_azdias:360
Length of columns in df_customers:363


In [69]:
# No description available
no_description_available = set(df_azdias.columns.tolist()).difference(attributes_values_attributes)
description_available = set(df_azdias.columns.tolist()).intersection(attributes_values_attributes)
print(f"no_description_available: {len(no_description_available)}")
print(f"description_available: {len(description_available)}")

no_description_available: 88
description_available: 272


In [70]:
df_azdias[no_description_available].head(3)

Unnamed: 0,D19_BILDUNG,D19_BANKEN_REST,KBA13_ANTG1,D19_FREIZEIT,D19_DIGIT_SERV,D19_BANKEN_GROSS,EINGEFUEGT_AM,D19_HAUS_DEKO,CJT_TYP_3,D19_VERSI_OFFLINE_DATUM,D19_TIERARTIKEL,KBA13_HHZ,KBA13_ANTG2,D19_TELKO_ONLINE_QUOTE_12,D19_BUCH_CD,D19_SCHUHE,RT_SCHNAEPPCHEN,SOHO_KZ,CJT_TYP_2,GEMEINDETYP,D19_BEKLEIDUNG_REST,VHA,D19_SAMMELARTIKEL,STRUKTURTYP,CJT_TYP_4,D19_LETZTER_KAUF_BRANCHE,D19_DROGERIEARTIKEL,D19_KONSUMTYP_MAX,D19_TELKO_REST,UMFELD_ALT,CAMEO_INTL_2015,D19_HANDWERK,ANZ_KINDER,D19_ENERGIE,VK_DHT4A,RT_UEBERGROESSE,D19_TELKO_MOBILE,D19_VERSI_ONLINE_DATUM,D19_BANKEN_DIREKT,KBA13_GBZ,D19_TECHNIK,ALTERSKATEGORIE_FEIN,MOBI_RASTER,D19_RATGEBER,KBA13_ANTG4,KBA13_KMH_210,KBA13_BAUMAX,D19_BEKLEIDUNG_GEH,D19_LEBENSMITTEL,RT_KEIN_ANREIZ,D19_VOLLSORTIMENT,VK_DISTANZ,D19_BANKEN_LOKAL,D19_VERSI_ONLINE_QUOTE_12,VERDICHTUNGSRAUM,D19_BIO_OEKO,D19_WEIN_FEINKOST,VK_ZG11,D19_VERSICHERUNGEN,CJT_KATALOGNUTZER,ARBEIT,EINGEZOGENAM_HH_JAHR,UMFELD_JUNG,KONSUMZELLE,CJT_TYP_1,D19_KINDERARTIKEL,ANZ_STATISTISCHE_HAUSHALTE,D19_SONSTIGE,VHN,CJT_TYP_5,D19_VERSAND_REST,DSL_FLAG,LNR,KBA13_ANTG3,D19_LOTTO,FIRMENDICHTE,D19_GARTEN,CJT_TYP_6,KOMBIALTER,HH_DELTA_FLAG,KBA13_CCM_1401_2500,D19_KOSMETIK,D19_NAHRUNGSERGAENZUNG,AKT_DAT_KL,D19_REISEN,D19_VERSI_DATUM,D19_SOZIALES,UNGLEICHENN_FLAG
0,0,0,,0,0,0,,0,5.0,10,0,,,,0,0,4.0,,1.0,,0,,0,,5.0,,0,9,0,,,0,,0,,1.0,0,10,0,,0,,,0,,,,0,0,1.0,0,,0,,,0,0,,0,5.0,,,,,1.0,0,,0,,5.0,0,,910215,,,,0,5.0,9,,,0,0,,0,10,,
1,0,0,2.0,0,0,0,1992-02-10 00:00:00,0,2.0,10,0,5.0,4.0,,0,0,3.0,1.0,5.0,22.0,0,0.0,0,2.0,3.0,,0,9,0,3.0,51.0,0,0.0,0,8.0,5.0,0,10,0,4.0,0,21.0,1.0,0,1.0,4.0,2.0,0,0,5.0,0,11.0,0,,0.0,0,0,10.0,0,1.0,3.0,2004.0,3.0,1.0,5.0,0,12.0,0,4.0,1.0,0,1.0,910220,2.0,,2.0,0,1.0,1,0.0,3.0,0,0,9.0,0,10,,1.0
2,6,0,2.0,0,0,0,1992-02-12 00:00:00,0,1.0,10,0,4.0,3.0,0.0,0,0,4.0,0.0,4.0,22.0,0,0.0,0,3.0,3.0,D19_UNBEKANNT,0,8,0,2.0,24.0,0,0.0,0,9.0,5.0,0,10,0,4.0,6,17.0,2.0,0,0.0,4.0,1.0,0,0,5.0,7,9.0,0,0.0,1.0,0,0,6.0,0,2.0,3.0,2000.0,5.0,0.0,4.0,0,7.0,6,2.0,2.0,0,1.0,910225,1.0,0.0,4.0,0,2.0,2,0.0,3.0,6,0,9.0,0,10,0.0,0.0


In [71]:
df_azdias[description_available].head(3)

Unnamed: 0,KBA05_KW3,D19_VERSAND_ONLINE_DATUM,ANZ_PERSONEN,KBA13_MAZDA,ONLINE_AFFINITAET,CJT_GESAMTTYP,SEMIO_KRIT,D19_BANKEN_OFFLINE_DATUM,HEALTH_TYP,KBA13_SEG_UTILITIES,TITEL_KZ,KBA13_KW_60,KBA05_ZUL2,PLZ8_ANTG1,KBA13_FAB_ASIEN,KBA05_HERST3,KBA13_HALTER_66,KBA13_SEG_OBERKLASSE,KBA05_ALTER1,GEBAEUDETYP_RASTER,KBA05_KW2,NATIONALITAET_KZ,KBA13_FIAT,KBA13_VORB_2,KBA05_ANHANG,GEBURTSJAHR,KBA05_CCM2,KBA13_KRSSEG_KLEIN,KBA13_NISSAN,D19_VERSAND_ANZ_24,KBA05_SEG8,KBA05_ZUL4,KBA05_ALTER2,KBA13_HALTER_20,LP_LEBENSPHASE_GROB,KBA05_SEG3,KBA05_MOD2,GREEN_AVANTGARDE,KBA13_HERST_ASIEN,KBA13_KRSHERST_BMW_BENZ,INNENSTADT,KBA05_SEG9,KBA13_HALTER_40,KBA05_HERST4,KBA05_KW1,KBA05_HERST2,KONSUMNAEHE,KBA13_HERST_BMW_BENZ,KBA05_KRSKLEIN,KBA13_ALTERHALTER_61,KBA13_SEG_WOHNMOBILE,KBA05_VORB0,KBA13_SEG_SPORTWAGEN,KBA13_BJ_1999,KBA13_KW_90,KBA13_CCM_2000,KBA13_KW_120,SEMIO_KULT,KBA13_SEG_KOMPAKTKLASSE,SEMIO_REL,KBA13_VORB_1,KBA13_KRSHERST_AUDI_VW,ALTERSKATEGORIE_GROB,D19_VERSI_ANZ_24,KBA13_SEG_MINIVANS,KBA13_KRSZUL_NEU,KBA13_PEUGEOT,FINANZTYP,KBA05_AUTOQUOT,KBA05_KRSHERST2,KBA13_SEG_GROSSRAUMVANS,KBA05_SEG1,PLZ8_BAUMAX,KBA13_KW_0_60,GFK_URLAUBERTYP,D19_GESAMT_DATUM,KBA05_ANTG4,KBA05_HERSTTEMP,KBA05_SEG6,KBA13_CCM_2500,KBA13_SEG_KLEINWAGEN,LP_STATUS_GROB,CAMEO_DEU_2015,KBA13_HERST_AUDI_VW,KBA05_ALTER4,KBA05_MAXSEG,KBA13_HALTER_50,KBA13_SEG_VAN,FINANZ_HAUSBAUER,KBA05_MODTEMP,KBA13_OPEL,KBA13_SEG_MINIWAGEN,KBA05_MOD1,KBA13_KRSSEG_OBER,PLZ8_ANTG3,KBA13_SEG_MITTELKLASSE,KBA05_DIESEL,KBA05_ZUL3,KBA13_MERCEDES,ORTSGR_KLS9,KBA13_CCM_1600,KBA13_HERST_SONST,KBA13_KMH_251,D19_GESAMT_ONLINE_DATUM,KKK,OST_WEST_KZ,D19_VERSAND_DATUM,KBA05_CCM1,KBA13_HALTER_25,SEMIO_MAT,D19_BANKEN_ANZ_24,SEMIO_DOM,SEMIO_LUST,SHOPPER_TYP,KBA13_SEG_KLEINST,LP_FAMILIE_GROB,KBA13_ALTERHALTER_45,KBA13_KW_80,KBA13_KW_110,KBA05_BAUMAX,KBA13_KMH_250,KBA13_KW_30,KBA13_HALTER_35,WOHNDAUER_2008,KBA05_ANTG2,KBA05_SEG4,ANZ_HH_TITEL,D19_VERSI_ANZ_12,ANREDE_KZ,KBA05_FRAU,EWDICHTE,KBA13_HERST_EUROPA,KBA13_BJ_2004,KBA13_VORB_1_2,KBA13_SITZE_5,LP_FAMILIE_FEIN,RETOURTYP_BK_S,SEMIO_PFLICHT,KBA05_KRSHERST3,KBA13_HALTER_65,KBA13_SITZE_6,RELAT_AB,D19_BANKEN_DATUM,FINANZ_VORSORGER,KBA05_HERST5,D19_TELKO_ANZ_12,KBA13_ALTERHALTER_60,FINANZ_UNAUFFAELLIGER,KBA13_CCM_1800,KBA05_SEG5,KBA13_CCM_1400,BALLRAUM,KBA13_FAB_SONSTIGE,KBA13_KMH_140,FINANZ_ANLEGER,KBA13_TOYOTA,KBA13_BJ_2000,PLZ8_GBZ,WOHNLAGE,GEBAEUDETYP,KBA13_CCM_1000,KBA05_KRSZUL,KBA05_MAXBJ,KBA13_MOTOR,D19_BANKEN_ONLINE_DATUM,KBA05_CCM3,KBA13_ALTERHALTER_30,KBA13_KW_70,PLZ8_HHZ,ALTER_HH,KBA13_KMH_140_210,KBA13_HALTER_60,D19_VERSAND_ONLINE_QUOTE_12,D19_VERSAND_ANZ_12,KBA13_CCM_1200,D19_KONSUMTYP,REGIOTYP,SEMIO_VERT,KBA13_KW_121,D19_VERSAND_OFFLINE_DATUM,KBA13_CCM_3001,MIN_GEBAEUDEJAHR,KBA13_BMW,KBA05_GBZ,SEMIO_KAEM,KBA13_CCM_0_1400,KBA05_HERST1,KBA05_SEG2,KBA13_ANZAHL_PKW,KBA13_KMH_110,KBA05_KRSVAN,MOBI_REGIO,PRAEGENDE_JUGENDJAHRE,D19_GESAMT_OFFLINE_DATUM,KBA05_MOD3,KBA13_BJ_2008,SEMIO_ERL,SEMIO_FAM,KBA13_HALTER_30,VERS_TYP,KBA05_SEG10,KBA13_KMH_180,KBA05_KRSOBER,ZABEOTYP,KBA05_MOD8,KBA13_SEG_GELAENDEWAGEN,AGER_TYP,ANZ_TITEL,KBA13_CCM_1500,KBA13_KW_61_120,KBA13_RENAULT,KBA05_ANTG3,KBA05_ZUL1,KBA05_VORB1,KBA13_SEG_OBEREMITTELKLASSE,D19_GESAMT_ANZ_24,KBA13_VORB_3,KBA05_KRSAQUOT,KBA13_CCM_3000,KBA13_HALTER_45,SEMIO_SOZ,KBA05_MAXAH,SEMIO_TRADV,KBA13_KW_50,KBA05_ANTG1,D19_GESAMT_ONLINE_QUOTE_12,SEMIO_RAT,D19_GESAMT_ANZ_12,FINANZ_MINIMALIST,KBA05_CCM4,D19_TELKO_OFFLINE_DATUM,KBA05_MOTRAD,KBA13_KMH_0_140,KBA05_MOTOR,LP_LEBENSPHASE_FEIN,KBA13_HERST_FORD_OPEL,FINANZ_SPARER,KBA05_KRSHERST1,W_KEIT_KIND_HH,KBA05_MOD4,HH_EINKOMMEN_SCORE,D19_TELKO_ONLINE_DATUM,KBA05_MAXHERST,KBA05_SEG7,KBA05_VORB2,KBA13_AUDI,KBA13_VORB_0,PLZ8_ANTG2,D19_TELKO_ANZ_24,D19_TELKO_DATUM,KBA13_FORD,KBA13_KW_40,KBA13_BJ_2006,KBA13_CCM_2501,KBA13_BJ_2009,KBA13_HALTER_55,D19_BANKEN_ONLINE_QUOTE_12,PLZ8_ANTG4,KBA13_KRSSEG_VAN,KBA13_VW,KBA13_KRSAQUOT,KBA13_SITZE_4,CAMEO_DEUG_2015,ANZ_HAUSHALTE_AKTIV,KBA05_ALTER3,KBA13_KMH_211,KBA13_KRSHERST_FORD_OPEL,D19_BANKEN_ANZ_12,LP_STATUS_FEIN,KBA13_SEG_SONSTIGE,KBA13_AUTOQUOTE,KBA05_MAXVORB
0,,10,,,1.0,2.0,7,10,-1,,,,,,,,,,,,,0,,,,0,,,,0,,,,,4.0,,,0,,,,,,,,,,,,,,,,,,,,3,,7,,,2,0,,,,4,,,,,,,10.0,10,,,,,,1.0,,,,,,,3,,,,,,,,,,,,,,,10,,,10,,,5,0,6,5,-1,,2.0,,,,,,,,,,,,0,1,,,,,,,2.0,5.0,5,,,,,10,3,,0,,5,,,,,,,5,,,,,,,,,,10,,,,,,,,,0,,,,1,,10,,,,,6,,,,,,,,0,10,,,3,6,,-1,,,,3,,,-1,,,,,,,,,0,,,,,2,,3,,,,4,0,3,,10,,,,15.0,,4,,,,2.0,10,,,,,,,0,10,,,,,,,,,,,,,,,,,,0,1.0,,,
1,4.0,10,2.0,2.0,3.0,5.0,4,10,3,3.0,0.0,0.0,1.0,2.0,2.0,2.0,4.0,3.0,3.0,3.0,3.0,1,4.0,3.0,0.0,1996,5.0,2.0,2.0,0,3.0,2.0,4.0,3.0,6.0,2.0,2.0,0,1.0,3.0,8.0,0.0,3.0,2.0,1.0,5.0,1.0,4.0,1.0,4.0,2.0,1.0,3.0,3.0,3.0,5.0,4.0,3,5.0,4,3.0,4.0,1,0,4.0,1.0,4.0,1,1.0,4.0,3.0,0.0,1.0,3.0,10.0,10,2.0,4.0,1.0,3.0,2.0,1.0,8A,4.0,4.0,4.0,2.0,4.0,5,1.0,3.0,2.0,3.0,2.0,2.0,3.0,2.0,0.0,4.0,5.0,2.0,3.0,1.0,10,2.0,W,10,1.0,3.0,3,0,7,2,3,2.0,3.0,2.0,2.0,4.0,5.0,3.0,1.0,3.0,9.0,0.0,2.0,0.0,0,2,4.0,3.0,4.0,3.0,3.0,3.0,5.0,1.0,7,2.0,3.0,4.0,4.0,10,2,0.0,0,3.0,4,2.0,2.0,4.0,6.0,3.0,3.0,5,2.0,3.0,4.0,4.0,8.0,0.0,2.0,1.0,3.0,10,1.0,3.0,1.0,5.0,0.0,3.0,3.0,,0,0.0,,3.0,1,3.0,10,5.0,1992.0,3.0,1.0,4,2.0,5.0,1.0,963.0,1.0,1.0,1.0,14,10,2.0,3.0,2,4,3.0,2,4.0,2.0,2.0,5,0.0,2.0,-1,0.0,1.0,3.0,3.0,0.0,5.0,1.0,3.0,0,3.0,1.0,0.0,2.0,5,2.0,6,4.0,0.0,,6,0,1,4.0,10,0.0,3.0,3.0,21.0,2.0,5,5.0,3.0,0.0,6.0,10,2.0,3.0,5.0,4.0,3.0,3.0,0,10,2.0,2.0,3.0,3.0,2.0,3.0,,1.0,2.0,4.0,2.0,3.0,8.0,11.0,1.0,3.0,3.0,0,2.0,2.0,2.0,3.0
2,2.0,10,1.0,3.0,2.0,3.0,7,10,3,5.0,0.0,1.0,3.0,3.0,4.0,3.0,3.0,3.0,2.0,4.0,2.0,1,3.0,2.0,0.0,1979,2.0,2.0,3.0,0,0.0,4.0,3.0,3.0,1.0,3.0,2.0,1,3.0,3.0,4.0,1.0,2.0,2.0,3.0,2.0,5.0,4.0,3.0,3.0,2.0,4.0,4.0,2.0,2.0,3.0,4.0,3,1.0,3,4.0,3.0,3,0,3.0,1.0,3.0,1,3.0,2.0,3.0,2.0,1.0,1.0,10.0,10,0.0,4.0,0.0,3.0,3.0,2.0,4C,2.0,3.0,1.0,3.0,3.0,5,4.0,2.0,3.0,0.0,3.0,1.0,2.0,0.0,4.0,4.0,5.0,3.0,3.0,1.0,10,2.0,W,10,5.0,3.0,3,0,7,4,2,3.0,1.0,2.0,4.0,3.0,0.0,4.0,1.0,2.0,9.0,3.0,3.0,0.0,0,2,3.0,4.0,3.0,4.0,4.0,2.0,1.0,3.0,3,3.0,4.0,3.0,2.0,10,1,5.0,0,3.0,3,4.0,1.0,2.0,2.0,3.0,1.0,2,3.0,2.0,4.0,2.0,1.0,1.0,3.0,4.0,3.0,10,3.0,3.0,4.0,4.0,17.0,2.0,3.0,0.0,0,2.0,9.0,2.0,4,4.0,10,5.0,1992.0,4.0,3.0,7,1.0,2.0,5.0,712.0,1.0,2.0,3.0,15,10,5.0,3.0,6,1,2.0,1,1.0,2.0,2.0,5,1.0,5.0,-1,0.0,4.0,5.0,3.0,1.0,2.0,2.0,4.0,0,2.0,3.0,3.0,3.0,4,3.0,3,2.0,1.0,0.0,4,0,1,0.0,10,1.0,1.0,1.0,3.0,3.0,4,3.0,3.0,1.0,4.0,10,5.0,0.0,3.0,3.0,3.0,3.0,0,10,4.0,1.0,5.0,4.0,1.0,3.0,0.0,0.0,2.0,2.0,3.0,4.0,4.0,10.0,3.0,4.0,2.0,0,3.0,2.0,3.0,1.0


In [100]:
n_threshold = 15

more_than_n_unique_values, less_than_n_unique_values, error_list, numeric_list = [], [], [], []

for col in description_available:
  df_temp = df_attributes_values.loc[df_attributes_values["attribute"] == col]
  try:
    if(len(df_temp["value"].tolist()) == 1):
      numeric_list.append(col)
    elif(len(df_temp["value"].tolist()) > n_threshold):
      more_than_n_unique_values.append(col)
    else:
      less_than_n_unique_values.append(col)
  except:
    error_list.append(error)


In [101]:
df_attributes_values.loc[df_attributes_values["attribute"].isin(numeric_list)]

Unnamed: 0,attribute,description,value,meaning
37,ANZ_HAUSHALTE_AKTIV,number of households in the building,…,numeric value (typically coded from 1-10)
38,ANZ_HH_TITEL,number of academic title holder in building,…,numeric value (typically coded from 1-10)
39,ANZ_PERSONEN,number of adult persons in the household,…,numeric value (typically coded from 1-3)
40,ANZ_TITEL,number of professional title holder in household,…,numeric value (typically coded from 1-10)
712,GEBURTSJAHR,year of birth,…,numeric value
1168,KBA13_ANZAHL_PKW,number of cars in the PLZ8,…,numeric value
1987,MIN_GEBAEUDEJAHR,year the building was first mentioned in our d...,…,numeric value


In [104]:
df_attributes_values.loc[df_attributes_values["attribute"].isin(more_than_n_unique_values)]

Unnamed: 0,attribute,description,value,meaning
12,ALTER_HH,main age within the household,0,unknown / no main age detectable
13,ALTER_HH,main age within the household,1,01.01.1895 bis 31.12.1899
14,ALTER_HH,main age within the household,2,01.01.1900 bis 31.12.1904
15,ALTER_HH,main age within the household,3,01.01.1905 bis 31.12.1909
16,ALTER_HH,main age within the household,4,01.01.1910 bis 31.12.1914
...,...,...,...,...
2066,PRAEGENDE_JUGENDJAHRE,dominating movement in the person's youth (ava...,11,"80ies - ecological awareness (Avantgarde, W)"
2067,PRAEGENDE_JUGENDJAHRE,dominating movement in the person's youth (ava...,12,80ies - FDJ / communist party youth organisati...
2068,PRAEGENDE_JUGENDJAHRE,dominating movement in the person's youth (ava...,13,"80ies - Swords into ploughshares (Avantgarde, O)"
2069,PRAEGENDE_JUGENDJAHRE,dominating movement in the person's youth (ava...,14,"90ies - digital media kids (Mainstream, O+W)"


In [103]:
set(df_attributes_values.loc[df_attributes_values["attribute"].isin(more_than_n_unique_values)]["attribute"])

{'ALTER_HH', 'CAMEO_DEU_2015', 'LP_LEBENSPHASE_FEIN', 'PRAEGENDE_JUGENDJAHRE'}

In [107]:
df_attributes_values.loc[df_attributes_values["attribute"] == "PRAEGENDE_JUGENDJAHRE"]

Unnamed: 0,attribute,description,value,meaning
2055,PRAEGENDE_JUGENDJAHRE,dominating movement in the person's youth (ava...,"-1, 0",unknown
2056,PRAEGENDE_JUGENDJAHRE,dominating movement in the person's youth (ava...,1,"40ies - war years (Mainstream, O+W)"
2057,PRAEGENDE_JUGENDJAHRE,dominating movement in the person's youth (ava...,2,"40ies - reconstruction years (Avantgarde, O+W)"
2058,PRAEGENDE_JUGENDJAHRE,dominating movement in the person's youth (ava...,3,"50ies - economic miracle (Mainstream, O+W)"
2059,PRAEGENDE_JUGENDJAHRE,dominating movement in the person's youth (ava...,4,50ies - milk bar / Individualisation (Avantgar...
2060,PRAEGENDE_JUGENDJAHRE,dominating movement in the person's youth (ava...,5,"60ies - economic miracle (Mainstream, O+W)"
2061,PRAEGENDE_JUGENDJAHRE,dominating movement in the person's youth (ava...,6,60ies - generation 68 / student protestors (Av...
2062,PRAEGENDE_JUGENDJAHRE,dominating movement in the person's youth (ava...,7,60ies - opponents to the building of the Wall ...
2063,PRAEGENDE_JUGENDJAHRE,dominating movement in the person's youth (ava...,8,"70ies - family orientation (Mainstream, O+W)"
2064,PRAEGENDE_JUGENDJAHRE,dominating movement in the person's youth (ava...,9,"70ies - peace movement (Avantgarde, O+W)"


In [106]:
df_attributes_values.loc[df_attributes_values["attribute"] == "LP_LEBENSPHASE_FEIN"]

Unnamed: 0,attribute,description,value,meaning
1915,LP_LEBENSPHASE_FEIN,lifestage fine,1,single low-income earners of younger age
1916,LP_LEBENSPHASE_FEIN,lifestage fine,2,single low-income earners of middle age
1917,LP_LEBENSPHASE_FEIN,lifestage fine,3,single average earners of younger age
1918,LP_LEBENSPHASE_FEIN,lifestage fine,4,single average earners of middle age
1919,LP_LEBENSPHASE_FEIN,lifestage fine,5,single low-income earners of advanced age
1920,LP_LEBENSPHASE_FEIN,lifestage fine,6,single low-income earners at retirement age
1921,LP_LEBENSPHASE_FEIN,lifestage fine,7,single average earners of advanced age
1922,LP_LEBENSPHASE_FEIN,lifestage fine,8,single average earners at retirement age
1923,LP_LEBENSPHASE_FEIN,lifestage fine,9,single independant persons
1924,LP_LEBENSPHASE_FEIN,lifestage fine,10,wealthy single homeowners


In [105]:
df_attributes_values.loc[df_attributes_values["attribute"] == "ALTER_HH"]

Unnamed: 0,attribute,description,value,meaning
12,ALTER_HH,main age within the household,0,unknown / no main age detectable
13,ALTER_HH,main age within the household,1,01.01.1895 bis 31.12.1899
14,ALTER_HH,main age within the household,2,01.01.1900 bis 31.12.1904
15,ALTER_HH,main age within the household,3,01.01.1905 bis 31.12.1909
16,ALTER_HH,main age within the household,4,01.01.1910 bis 31.12.1914
17,ALTER_HH,main age within the household,5,01.01.1915 bis 31.12.1919
18,ALTER_HH,main age within the household,6,01.01.1920 bis 31.12.1924
19,ALTER_HH,main age within the household,7,01.01.1925 bis 31.12.1929
20,ALTER_HH,main age within the household,8,01.01.1930 bis 31.12.1934
21,ALTER_HH,main age within the household,9,01.01.1935 bis 31.12.1939


### Get an overview over nan values

In [10]:
# get nans for demographic data
df_azdias_nan = get_column_nans(df_azdias)
df_azdias_nan.loc[df_azdias_nan["na_percentage"] > 0.3]

Unnamed: 0,count,na_percentage
ALTER_KIND4,890016,0.998648
ALTER_KIND3,885051,0.993077
ALTER_KIND2,861722,0.9669
ALTER_KIND1,810163,0.909048
EXTSEL992,654153,0.733996
KK_KUNDENTYP,584612,0.655967


In [11]:
# get nans for customer data
df_customers_nan = get_column_nans(df_customers)
df_customers_nan.loc[df_customers_nan["na_percentage"] > 0.3]

Unnamed: 0,count,na_percentage
ALTER_KIND4,191416,0.998769
ALTER_KIND3,190377,0.993347
ALTER_KIND2,186552,0.973389
ALTER_KIND1,179886,0.938607
KK_KUNDENTYP,111937,0.584064
EXTSEL992,85283,0.444989


In [14]:
# Check those columns for further information in the dataframes df_attributes_values and df_information_levels
high_nan_columns = df_azdias_nan.loc[df_azdias_nan["na_percentage"] > 0.3].index.tolist()

for col in high_nan_columns:
  if((df_attributes_values.loc[df_attributes_values["attribute"] == col].shape[0] == 0) & (df_information_levels.loc[df_information_levels["attribute"] == col].shape[0] == 0)):
    print(f"There is no further information available for column: {col}")

There is no further information available for column: ALTER_KIND4
There is no further information available for column: ALTER_KIND3
There is no further information available for column: ALTER_KIND2
There is no further information available for column: ALTER_KIND1
There is no further information available for column: EXTSEL992
There is no further information available for column: KK_KUNDENTYP


In [15]:
# There is not much information to be gained about these columns
# Also these columns have very high nan values. So I will drop them
# Drop the columns with high nan percentages
print(f"df_azdias: {df_azdias.shape}")
print(f"df_customers: {df_customers.shape}")
df_azdias = df_azdias.drop(columns=high_nan_columns)
df_customers = df_customers.drop(columns=high_nan_columns)
print(f"New df_azdias: {df_azdias.shape}")
print(f"New df_customers: {df_customers.shape}")

df_azdias: (891221, 366)
df_customers: (191652, 369)
New df_azdias: (891221, 360)
New df_customers: (191652, 363)


### Which colums differ between df_azdias and df_customers?

In [16]:
# Check which columns differ between the two data sets
only_customer_cols = list(set(df_customers_nan.index.tolist()).difference(set(df_azdias_nan.index.tolist())))
only_customer_cols

['PRODUCT_GROUP', 'ONLINE_PURCHASE', 'CUSTOMER_GROUP']

In [17]:
# Inspect the column that are only in customer df
df_customers[only_customer_cols].head(5)

Unnamed: 0,PRODUCT_GROUP,ONLINE_PURCHASE,CUSTOMER_GROUP
0,COSMETIC_AND_FOOD,0,MULTI_BUYER
1,FOOD,0,SINGLE_BUYER
2,COSMETIC_AND_FOOD,0,MULTI_BUYER
3,COSMETIC,0,MULTI_BUYER
4,FOOD,0,MULTI_BUYER


In [18]:
# Check na percentage for columns that are only in customer df
df_customers_nan.loc[df_customers_nan.index.isin(only_customer_cols)]

Unnamed: 0,count,na_percentage
ONLINE_PURCHASE,0,0.0
CUSTOMER_GROUP,0,0.0
PRODUCT_GROUP,0,0.0


### How are numeric and non-numeric columns distributed?

In [19]:
# Check all for unique dtypes in df_azdias
set(df_azdias.dtypes)

{dtype('int64'), dtype('float64'), dtype('O')}

In [20]:
# Check out how many numeric and how many non-numeric cols we have
num_cols, non_num_cols = [], []

for col in df_azdias.columns:
  if(df_azdias[col].dtype == "O"):
    non_num_cols.append(col)
  else:
    num_cols.append(col)

print(f"num_cols: {len(num_cols)}")
print(f"non_num_cols: {len(non_num_cols)}")

num_cols: 354
non_num_cols: 6


### Analyze the nom-numeric columns

In [21]:
# Subset df_azdias to inspect non_num_cols
df_azdias[non_num_cols].head(3)

Unnamed: 0,CAMEO_DEU_2015,CAMEO_DEUG_2015,CAMEO_INTL_2015,D19_LETZTER_KAUF_BRANCHE,EINGEFUEGT_AM,OST_WEST_KZ
0,,,,,,
1,8A,8.0,51.0,,1992-02-10 00:00:00,W
2,4C,4.0,24.0,D19_UNBEKANNT,1992-02-12 00:00:00,W


In [22]:
# Get further description for non_num_cols
df_information_levels.loc[df_information_levels["attribute"].isin(non_num_cols)]

Unnamed: 0,Information_Level,attribute,description,additional_notes
84,,OST_WEST_KZ,flag indicating the former GDR/FRG,
86,Microcell (RR4_ID),CAMEO_DEUG_2015,CAMEO_4.0: uppergroup,New German CAMEO Typology established together...
87,,CAMEO_DEU_2015,CAMEO_4.0: specific group,


In [23]:
# Get nan values for non_num_cols in demographic data
df_azdias_nan.loc[df_azdias_nan.index.isin(non_num_cols)]

Unnamed: 0,count,na_percentage
D19_LETZTER_KAUF_BRANCHE,257113,0.288495
CAMEO_DEU_2015,98979,0.11106
CAMEO_DEUG_2015,98979,0.11106
CAMEO_INTL_2015,98979,0.11106
EINGEFUEGT_AM,93148,0.104517
OST_WEST_KZ,93148,0.104517


In [24]:
# Get nan values for non_num_cols in customers data
df_customers_nan.loc[df_customers_nan.index.isin(non_num_cols)]

Unnamed: 0,count,na_percentage
CAMEO_DEU_2015,50428,0.263123
CAMEO_DEUG_2015,50428,0.263123
CAMEO_INTL_2015,50428,0.263123
OST_WEST_KZ,49927,0.260509
EINGEFUEGT_AM,49927,0.260509
D19_LETZTER_KAUF_BRANCHE,47697,0.248873


In [25]:
# CAMEO_DEU_2015
df_attributes_values.loc[df_attributes_values["attribute"] == "CAMEO_DEU_2015"].head(5)

Unnamed: 0,attribute,description,value,meaning
62,CAMEO_DEU_2015,CAMEO classification 2015 - detailled classifi...,1A,Work-Life-Balance
63,CAMEO_DEU_2015,CAMEO classification 2015 - detailled classifi...,1B,Wealthy Best Ager
64,CAMEO_DEU_2015,CAMEO classification 2015 - detailled classifi...,1C,Successful Songwriter
65,CAMEO_DEU_2015,CAMEO classification 2015 - detailled classifi...,1D,Old Nobility
66,CAMEO_DEU_2015,CAMEO classification 2015 - detailled classifi...,1E,City Nobility


In [26]:
# CAMEO_DEUG_2015
df_attributes_values.loc[df_attributes_values["attribute"] == "CAMEO_DEUG_2015"].head(5)

Unnamed: 0,attribute,description,value,meaning
52,CAMEO_DEUG_2015,CAMEO classification 2015 - Uppergroup,-1,unknown
53,CAMEO_DEUG_2015,CAMEO classification 2015 - Uppergroup,1,upper class
54,CAMEO_DEUG_2015,CAMEO classification 2015 - Uppergroup,2,upper middleclass
55,CAMEO_DEUG_2015,CAMEO classification 2015 - Uppergroup,3,established middleclasse
56,CAMEO_DEUG_2015,CAMEO classification 2015 - Uppergroup,4,consumption-oriented middleclass


In [27]:
# OST_WEST_KZ
df_attributes_values.loc[df_attributes_values["attribute"] == "OST_WEST_KZ"].head(5)

Unnamed: 0,attribute,description,value,meaning
2014,OST_WEST_KZ,flag indicating the former GDR/FRG,-1,unknown
2015,OST_WEST_KZ,flag indicating the former GDR/FRG,O,East (GDR)
2016,OST_WEST_KZ,flag indicating the former GDR/FRG,W,West (FRG)


In [28]:
# D19_LETZTER_KAUF_BRANCHE
set(df_customers["D19_LETZTER_KAUF_BRANCHE"].tolist())

{'D19_BANKEN_DIREKT',
 'D19_BANKEN_GROSS',
 'D19_BANKEN_LOKAL',
 'D19_BANKEN_REST',
 'D19_BEKLEIDUNG_GEH',
 'D19_BEKLEIDUNG_REST',
 'D19_BILDUNG',
 'D19_BIO_OEKO',
 'D19_BUCH_CD',
 'D19_DIGIT_SERV',
 'D19_DROGERIEARTIKEL',
 'D19_ENERGIE',
 'D19_FREIZEIT',
 'D19_GARTEN',
 'D19_HANDWERK',
 'D19_HAUS_DEKO',
 'D19_KINDERARTIKEL',
 'D19_KOSMETIK',
 'D19_LEBENSMITTEL',
 'D19_LOTTO',
 'D19_NAHRUNGSERGAENZUNG',
 'D19_RATGEBER',
 'D19_REISEN',
 'D19_SAMMELARTIKEL',
 'D19_SCHUHE',
 'D19_SONSTIGE',
 'D19_TECHNIK',
 'D19_TELKO_MOBILE',
 'D19_TELKO_REST',
 'D19_TIERARTIKEL',
 'D19_UNBEKANNT',
 'D19_VERSAND_REST',
 'D19_VERSICHERUNGEN',
 'D19_VOLLSORTIMENT',
 'D19_WEIN_FEINKOST',
 nan}

In [30]:
df = pd.get_dummies(df_azdias["CAMEO_DEU_2015"], prefix="CAMEO_DEU_2015")
df = pd.concat([df, pd.get_dummies(df_azdias["CAMEO_DEUG_2015"], prefix="CAMEO_DEUG_2015")], axis=1)
df = pd.concat([df, pd.get_dummies(df_azdias["OST_WEST_KZ"], prefix="OST_WEST_KZ")], axis=1)
df = pd.concat([df, pd.get_dummies(df_azdias["D19_LETZTER_KAUF_BRANCHE"], prefix="D19_LETZTER_KAUF_BRANCHE")], axis=1)
df.shape

(891221, 92)

### Analyze the numeric columns

In [None]:
for col in num_cols[1:]:
  col_series = df_azdias[col]
  print(f"{col}: ")
  print(f"unique values: {set(col_series.tolist())}")
  print("___________________________________________________________")

AGER_TYP: 
unique values: {0, 1, 2, 3, -1}
___________________________________________________________
AKT_DAT_KL: 
unique values: {nan, 1.0, nan, nan, nan, 5.0, 6.0, 7.0, 8.0, 9.0, nan, 4.0, nan, nan, nan, nan, 3.0, 2.0, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan

KeyboardInterrupt: ignored

In [None]:
df_azdias["AGER_TYP"]

0        -1
1        -1
2        -1
3         2
4        -1
         ..
891216   -1
891217   -1
891218   -1
891219   -1
891220   -1
Name: AGER_TYP, Length: 891221, dtype: int64

In [None]:
df_azdias[num_cols].head(3)

Unnamed: 0,LNR,AGER_TYP,AKT_DAT_KL,ALTER_HH,ALTER_KIND1,ALTER_KIND2,ALTER_KIND3,ALTER_KIND4,ALTERSKATEGORIE_FEIN,ANZ_HAUSHALTE_AKTIV,ANZ_HH_TITEL,ANZ_KINDER,ANZ_PERSONEN,ANZ_STATISTISCHE_HAUSHALTE,ANZ_TITEL,ARBEIT,BALLRAUM,CJT_GESAMTTYP,CJT_KATALOGNUTZER,CJT_TYP_1,CJT_TYP_2,CJT_TYP_3,CJT_TYP_4,CJT_TYP_5,CJT_TYP_6,D19_BANKEN_ANZ_12,D19_BANKEN_ANZ_24,D19_BANKEN_DATUM,D19_BANKEN_DIREKT,D19_BANKEN_GROSS,D19_BANKEN_LOKAL,D19_BANKEN_OFFLINE_DATUM,D19_BANKEN_ONLINE_DATUM,D19_BANKEN_ONLINE_QUOTE_12,D19_BANKEN_REST,D19_BEKLEIDUNG_GEH,D19_BEKLEIDUNG_REST,D19_BILDUNG,D19_BIO_OEKO,D19_BUCH_CD,D19_DIGIT_SERV,D19_DROGERIEARTIKEL,D19_ENERGIE,D19_FREIZEIT,D19_GARTEN,D19_GESAMT_ANZ_12,D19_GESAMT_ANZ_24,D19_GESAMT_DATUM,D19_GESAMT_OFFLINE_DATUM,D19_GESAMT_ONLINE_DATUM,D19_GESAMT_ONLINE_QUOTE_12,D19_HANDWERK,D19_HAUS_DEKO,D19_KINDERARTIKEL,D19_KONSUMTYP,D19_KONSUMTYP_MAX,D19_KOSMETIK,D19_LEBENSMITTEL,D19_LOTTO,D19_NAHRUNGSERGAENZUNG,D19_RATGEBER,D19_REISEN,D19_SAMMELARTIKEL,D19_SCHUHE,D19_SONSTIGE,D19_SOZIALES,D19_TECHNIK,D19_TELKO_ANZ_12,D19_TELKO_ANZ_24,D19_TELKO_DATUM,D19_TELKO_MOBILE,D19_TELKO_OFFLINE_DATUM,D19_TELKO_ONLINE_DATUM,D19_TELKO_ONLINE_QUOTE_12,D19_TELKO_REST,D19_TIERARTIKEL,D19_VERSAND_ANZ_12,D19_VERSAND_ANZ_24,D19_VERSAND_DATUM,D19_VERSAND_OFFLINE_DATUM,D19_VERSAND_ONLINE_DATUM,D19_VERSAND_ONLINE_QUOTE_12,D19_VERSAND_REST,D19_VERSI_ANZ_12,D19_VERSI_ANZ_24,D19_VERSI_DATUM,D19_VERSI_OFFLINE_DATUM,D19_VERSI_ONLINE_DATUM,D19_VERSI_ONLINE_QUOTE_12,D19_VERSICHERUNGEN,D19_VOLLSORTIMENT,D19_WEIN_FEINKOST,DSL_FLAG,EINGEZOGENAM_HH_JAHR,EWDICHTE,EXTSEL992,FINANZ_ANLEGER,FINANZ_HAUSBAUER,FINANZ_MINIMALIST,FINANZ_SPARER,FINANZ_UNAUFFAELLIGER,FINANZ_VORSORGER,FINANZTYP,FIRMENDICHTE,GEBAEUDETYP,GEBAEUDETYP_RASTER,GEBURTSJAHR,GEMEINDETYP,GFK_URLAUBERTYP,GREEN_AVANTGARDE,HEALTH_TYP,HH_DELTA_FLAG,HH_EINKOMMEN_SCORE,INNENSTADT,KBA05_ALTER1,KBA05_ALTER2,KBA05_ALTER3,KBA05_ALTER4,KBA05_ANHANG,KBA05_ANTG1,KBA05_ANTG2,KBA05_ANTG3,KBA05_ANTG4,KBA05_AUTOQUOT,KBA05_BAUMAX,KBA05_CCM1,KBA05_CCM2,KBA05_CCM3,KBA05_CCM4,KBA05_DIESEL,KBA05_FRAU,KBA05_GBZ,KBA05_HERST1,KBA05_HERST2,KBA05_HERST3,KBA05_HERST4,KBA05_HERST5,KBA05_HERSTTEMP,KBA05_KRSAQUOT,KBA05_KRSHERST1,KBA05_KRSHERST2,KBA05_KRSHERST3,KBA05_KRSKLEIN,KBA05_KRSOBER,KBA05_KRSVAN,KBA05_KRSZUL,KBA05_KW1,KBA05_KW2,KBA05_KW3,KBA05_MAXAH,KBA05_MAXBJ,KBA05_MAXHERST,KBA05_MAXSEG,KBA05_MAXVORB,KBA05_MOD1,KBA05_MOD2,KBA05_MOD3,KBA05_MOD4,KBA05_MOD8,KBA05_MODTEMP,KBA05_MOTOR,KBA05_MOTRAD,KBA05_SEG1,KBA05_SEG10,KBA05_SEG2,KBA05_SEG3,KBA05_SEG4,KBA05_SEG5,KBA05_SEG6,KBA05_SEG7,KBA05_SEG8,KBA05_SEG9,KBA05_VORB0,KBA05_VORB1,KBA05_VORB2,KBA05_ZUL1,KBA05_ZUL2,KBA05_ZUL3,KBA05_ZUL4,KBA13_ALTERHALTER_30,KBA13_ALTERHALTER_45,KBA13_ALTERHALTER_60,KBA13_ALTERHALTER_61,KBA13_ANTG1,KBA13_ANTG2,KBA13_ANTG3,KBA13_ANTG4,KBA13_ANZAHL_PKW,KBA13_AUDI,KBA13_AUTOQUOTE,KBA13_BAUMAX,KBA13_BJ_1999,KBA13_BJ_2000,KBA13_BJ_2004,KBA13_BJ_2006,KBA13_BJ_2008,KBA13_BJ_2009,KBA13_BMW,KBA13_CCM_0_1400,KBA13_CCM_1000,KBA13_CCM_1200,KBA13_CCM_1400,KBA13_CCM_1401_2500,KBA13_CCM_1500,KBA13_CCM_1600,KBA13_CCM_1800,KBA13_CCM_2000,KBA13_CCM_2500,KBA13_CCM_2501,KBA13_CCM_3000,KBA13_CCM_3001,KBA13_FAB_ASIEN,KBA13_FAB_SONSTIGE,KBA13_FIAT,KBA13_FORD,KBA13_GBZ,KBA13_HALTER_20,KBA13_HALTER_25,KBA13_HALTER_30,KBA13_HALTER_35,KBA13_HALTER_40,KBA13_HALTER_45,KBA13_HALTER_50,KBA13_HALTER_55,KBA13_HALTER_60,KBA13_HALTER_65,KBA13_HALTER_66,KBA13_HERST_ASIEN,KBA13_HERST_AUDI_VW,KBA13_HERST_BMW_BENZ,KBA13_HERST_EUROPA,KBA13_HERST_FORD_OPEL,KBA13_HERST_SONST,KBA13_HHZ,KBA13_KMH_0_140,KBA13_KMH_110,KBA13_KMH_140,KBA13_KMH_140_210,KBA13_KMH_180,KBA13_KMH_210,KBA13_KMH_211,KBA13_KMH_250,KBA13_KMH_251,KBA13_KRSAQUOT,KBA13_KRSHERST_AUDI_VW,KBA13_KRSHERST_BMW_BENZ,KBA13_KRSHERST_FORD_OPEL,KBA13_KRSSEG_KLEIN,KBA13_KRSSEG_OBER,KBA13_KRSSEG_VAN,KBA13_KRSZUL_NEU,KBA13_KW_0_60,KBA13_KW_110,KBA13_KW_120,KBA13_KW_121,KBA13_KW_30,KBA13_KW_40,KBA13_KW_50,KBA13_KW_60,KBA13_KW_61_120,KBA13_KW_70,KBA13_KW_80,KBA13_KW_90,KBA13_MAZDA,KBA13_MERCEDES,KBA13_MOTOR,KBA13_NISSAN,KBA13_OPEL,KBA13_PEUGEOT,KBA13_RENAULT,KBA13_SEG_GELAENDEWAGEN,KBA13_SEG_GROSSRAUMVANS,KBA13_SEG_KLEINST,KBA13_SEG_KLEINWAGEN,KBA13_SEG_KOMPAKTKLASSE,KBA13_SEG_MINIVANS,KBA13_SEG_MINIWAGEN,KBA13_SEG_MITTELKLASSE,KBA13_SEG_OBEREMITTELKLASSE,KBA13_SEG_OBERKLASSE,KBA13_SEG_SONSTIGE,KBA13_SEG_SPORTWAGEN,KBA13_SEG_UTILITIES,KBA13_SEG_VAN,KBA13_SEG_WOHNMOBILE,KBA13_SITZE_4,KBA13_SITZE_5,KBA13_SITZE_6,KBA13_TOYOTA,KBA13_VORB_0,KBA13_VORB_1,KBA13_VORB_1_2,KBA13_VORB_2,KBA13_VORB_3,KBA13_VW,KK_KUNDENTYP,KKK,KOMBIALTER,KONSUMNAEHE,KONSUMZELLE,LP_FAMILIE_FEIN,LP_FAMILIE_GROB,LP_LEBENSPHASE_FEIN,LP_LEBENSPHASE_GROB,LP_STATUS_FEIN,LP_STATUS_GROB,MIN_GEBAEUDEJAHR,MOBI_RASTER,MOBI_REGIO,NATIONALITAET_KZ,ONLINE_AFFINITAET,ORTSGR_KLS9,PLZ8_ANTG1,PLZ8_ANTG2,PLZ8_ANTG3,PLZ8_ANTG4,PLZ8_BAUMAX,PLZ8_GBZ,PLZ8_HHZ,PRAEGENDE_JUGENDJAHRE,REGIOTYP,RELAT_AB,RETOURTYP_BK_S,RT_KEIN_ANREIZ,RT_SCHNAEPPCHEN,RT_UEBERGROESSE,SEMIO_DOM,SEMIO_ERL,SEMIO_FAM,SEMIO_KAEM,SEMIO_KRIT,SEMIO_KULT,SEMIO_LUST,SEMIO_MAT,SEMIO_PFLICHT,SEMIO_RAT,SEMIO_REL,SEMIO_SOZ,SEMIO_TRADV,SEMIO_VERT,SHOPPER_TYP,SOHO_KZ,STRUKTURTYP,TITEL_KZ,UMFELD_ALT,UMFELD_JUNG,UNGLEICHENN_FLAG,VERDICHTUNGSRAUM,VERS_TYP,VHA,VHN,VK_DHT4A,VK_DISTANZ,VK_ZG11,W_KEIT_KIND_HH,WOHNDAUER_2008,WOHNLAGE,ZABEOTYP,ANREDE_KZ,ALTERSKATEGORIE_GROB
0,910215,-1,,,,,,,,,,,,,,,,2.0,5.0,1.0,1.0,5.0,5.0,5.0,5.0,0,0,10,0,0,0,10,10,,0,0,0,0,0,0,0,0,0,0,0,0,0,10,10,10,,0,0,0,,9,0,0,,0,0,0,0,0,0,,0,0,0,10,0,10,10,,0,0,0,0,10,10,10,,0,0,0,10,10,10,,0,0,0,,,,,5,3,3,4,5,3,4,,,,0,,10.0,0,-1,,2.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,9,,,2.0,2.0,15.0,4.0,1.0,1.0,,,,0,1.0,,,,,,,,,0,,,5.0,1.0,4.0,1.0,6,3,6,6,7,3,5,5,5,4,7,2,3,1,-1,,,,,,,,-1,,,,,,,,,3,1,2
1,910220,-1,9.0,0.0,,,,,21.0,11.0,0.0,0.0,2.0,12.0,0.0,3.0,6.0,5.0,1.0,5.0,5.0,2.0,3.0,1.0,1.0,0,0,10,0,0,0,10,10,,0,0,0,0,0,0,0,0,0,0,0,0,0,10,10,10,,0,0,0,,9,0,0,,0,0,0,0,0,0,,0,0,0,10,0,10,10,,0,0,0,0,10,10,10,,0,0,0,10,10,10,,0,0,0,1.0,2004.0,3.0,,5,5,1,5,4,2,1,2.0,8.0,3.0,1996,22.0,10.0,0,3,0.0,6.0,8.0,3.0,4.0,1.0,4.0,0.0,0.0,0.0,0.0,2.0,1.0,5.0,1.0,5.0,1.0,4.0,2.0,4.0,1.0,5.0,5.0,2.0,2.0,0.0,4.0,1.0,5.0,4.0,2.0,1.0,2.0,1.0,2.0,1.0,3.0,4.0,2.0,1.0,2.0,4.0,3.0,3.0,2.0,2.0,0.0,0.0,1.0,3.0,0.0,0.0,4.0,1.0,2.0,2.0,2.0,1.0,3.0,3.0,0.0,1.0,1.0,5.0,5.0,1.0,0.0,2.0,3.0,2.0,3.0,4.0,2.0,4.0,2.0,1.0,963.0,4.0,2.0,2.0,3.0,3.0,3.0,3.0,3.0,2.0,3.0,2.0,0.0,0.0,4.0,3.0,1.0,2.0,2.0,5.0,3.0,3.0,0.0,5.0,2.0,3.0,4.0,2.0,4.0,3.0,3.0,3.0,3.0,3.0,2.0,2.0,3.0,3.0,3.0,4.0,1.0,4.0,4.0,4.0,2.0,3.0,5.0,3.0,1.0,3.0,3.0,2.0,4.0,3.0,3.0,1.0,2.0,4.0,3.0,3.0,2.0,2.0,2.0,1.0,3.0,4.0,4.0,3.0,1.0,2.0,4.0,0.0,3.0,1.0,2.0,3.0,2.0,4.0,3.0,2.0,3.0,4.0,3.0,2.0,3.0,2.0,2.0,5.0,4.0,2.0,3.0,3.0,3.0,2.0,3.0,3.0,4.0,2.0,3.0,3.0,4.0,2.0,3.0,3.0,3.0,3.0,3.0,4.0,,2.0,1,1.0,1.0,5.0,3.0,21.0,6.0,2.0,1.0,1992.0,1.0,1.0,1,3.0,5.0,2.0,3.0,2.0,1.0,1.0,4.0,5.0,14,3.0,4.0,1.0,5.0,3.0,5.0,7,2,4,4,4,3,2,3,7,6,4,5,6,1,3,1.0,2.0,0.0,3.0,3.0,1.0,0.0,2,0.0,4.0,8.0,11.0,10.0,3.0,9.0,4.0,5,2,1
2,910225,-1,9.0,17.0,,,,,17.0,10.0,0.0,0.0,1.0,7.0,0.0,3.0,2.0,3.0,2.0,4.0,4.0,1.0,3.0,2.0,2.0,0,0,10,0,0,0,10,10,0.0,0,0,0,6,0,0,0,0,0,0,0,0,0,10,10,10,0.0,0,0,0,9.0,8,6,0,0.0,0,0,0,0,0,6,0.0,6,0,0,10,0,10,10,0.0,0,0,0,0,10,10,10,0.0,0,0,0,10,10,10,0.0,0,7,0,1.0,2000.0,4.0,14.0,2,5,1,4,3,1,1,4.0,1.0,4.0,1979,22.0,10.0,1,3,0.0,4.0,4.0,2.0,3.0,3.0,3.0,0.0,1.0,3.0,1.0,0.0,3.0,0.0,5.0,2.0,3.0,0.0,0.0,3.0,3.0,2.0,2.0,3.0,2.0,5.0,4.0,3.0,3.0,2.0,3.0,3.0,2.0,2.0,3.0,3.0,2.0,2.0,3.0,4.0,5.0,1.0,1.0,0.0,2.0,5.0,1.0,1.0,4.0,1.0,1.0,2.0,1.0,5.0,3.0,3.0,1.0,0.0,0.0,0.0,1.0,4.0,2.0,3.0,2.0,3.0,4.0,4.0,3.0,2.0,3.0,3.0,2.0,3.0,1.0,0.0,712.0,3.0,3.0,1.0,2.0,2.0,4.0,5.0,3.0,1.0,4.0,1.0,1.0,2.0,2.0,3.0,4.0,3.0,4.0,3.0,3.0,4.0,3.0,5.0,4.0,3.0,3.0,4.0,4.0,3.0,3.0,2.0,2.0,2.0,3.0,3.0,3.0,3.0,4.0,3.0,3.0,2.0,4.0,3.0,3.0,3.0,4.0,1.0,1.0,1.0,2.0,2.0,4.0,4.0,4.0,1.0,3.0,3.0,3.0,2.0,2.0,3.0,2.0,1.0,1.0,3.0,4.0,4.0,1.0,1.0,2.0,1.0,5.0,4.0,4.0,2.0,3.0,4.0,3.0,3.0,2.0,3.0,3.0,5.0,3.0,3.0,3.0,1.0,3.0,3.0,2.0,4.0,3.0,2.0,4.0,5.0,3.0,2.0,4.0,2.0,3.0,3.0,3.0,4.0,4.0,2.0,2.0,2.0,,2.0,2,5.0,0.0,1.0,1.0,3.0,1.0,3.0,2.0,1992.0,2.0,3.0,1,2.0,5.0,3.0,3.0,1.0,0.0,1.0,4.0,4.0,15,2.0,2.0,3.0,5.0,4.0,5.0,7,6,1,7,7,3,4,3,3,4,3,4,3,4,2,0.0,3.0,0.0,2.0,5.0,0.0,1.0,1,0.0,2.0,9.0,9.0,6.0,3.0,9.0,2.0,5,2,3


In [None]:
not_truly_num, truly_num = [], []

for col in num_cols:
  if(df_attributes_values.loc[df_attributes_values["attribute"] == col].shape[0] == 0):
    truly_num.append(col)
  else:
    not_truly_num.append(col)

print(f"truly_num: {len(truly_num)}")
print(f"not_truly_num: {len(not_truly_num)}")

truly_num: 91
not_truly_num: 269


In [None]:
df_attributes_values.loc[df_attributes_values["attribute"] == "ANZ_HAUSHALTE_AKTIV"]

Unnamed: 0,attribute,description,value,meaning
37,ANZ_HAUSHALTE_AKTIV,number of households in the building,…,numeric value (typically coded from 1-10)


In [None]:
df_azdias[not_truly_num].head(3)

Unnamed: 0,AGER_TYP,ALTER_HH,ANZ_HAUSHALTE_AKTIV,ANZ_HH_TITEL,ANZ_PERSONEN,ANZ_TITEL,BALLRAUM,CJT_GESAMTTYP,D19_BANKEN_ANZ_12,D19_BANKEN_ANZ_24,D19_BANKEN_DATUM,D19_BANKEN_OFFLINE_DATUM,D19_BANKEN_ONLINE_DATUM,D19_BANKEN_ONLINE_QUOTE_12,D19_GESAMT_ANZ_12,D19_GESAMT_ANZ_24,D19_GESAMT_DATUM,D19_GESAMT_OFFLINE_DATUM,D19_GESAMT_ONLINE_DATUM,D19_GESAMT_ONLINE_QUOTE_12,D19_KONSUMTYP,D19_TELKO_ANZ_12,D19_TELKO_ANZ_24,D19_TELKO_DATUM,D19_TELKO_OFFLINE_DATUM,D19_TELKO_ONLINE_DATUM,D19_VERSAND_ANZ_12,D19_VERSAND_ANZ_24,D19_VERSAND_DATUM,D19_VERSAND_OFFLINE_DATUM,D19_VERSAND_ONLINE_DATUM,D19_VERSAND_ONLINE_QUOTE_12,D19_VERSI_ANZ_12,D19_VERSI_ANZ_24,EWDICHTE,FINANZ_ANLEGER,FINANZ_HAUSBAUER,FINANZ_MINIMALIST,FINANZ_SPARER,FINANZ_UNAUFFAELLIGER,FINANZ_VORSORGER,FINANZTYP,GEBAEUDETYP,GEBAEUDETYP_RASTER,GEBURTSJAHR,GFK_URLAUBERTYP,GREEN_AVANTGARDE,HEALTH_TYP,HH_EINKOMMEN_SCORE,INNENSTADT,KBA05_ALTER1,KBA05_ALTER2,KBA05_ALTER3,KBA05_ALTER4,KBA05_ANHANG,KBA05_ANTG1,KBA05_ANTG2,KBA05_ANTG3,KBA05_ANTG4,KBA05_AUTOQUOT,KBA05_BAUMAX,KBA05_CCM1,KBA05_CCM2,KBA05_CCM3,KBA05_CCM4,KBA05_DIESEL,KBA05_FRAU,KBA05_GBZ,KBA05_HERST1,KBA05_HERST2,KBA05_HERST3,KBA05_HERST4,KBA05_HERST5,KBA05_HERSTTEMP,KBA05_KRSAQUOT,KBA05_KRSHERST1,KBA05_KRSHERST2,KBA05_KRSHERST3,KBA05_KRSKLEIN,KBA05_KRSOBER,KBA05_KRSVAN,KBA05_KRSZUL,KBA05_KW1,KBA05_KW2,KBA05_KW3,KBA05_MAXAH,KBA05_MAXBJ,KBA05_MAXHERST,KBA05_MAXSEG,KBA05_MAXVORB,KBA05_MOD1,KBA05_MOD2,KBA05_MOD3,KBA05_MOD4,KBA05_MOD8,KBA05_MODTEMP,KBA05_MOTOR,KBA05_MOTRAD,KBA05_SEG1,KBA05_SEG10,KBA05_SEG2,KBA05_SEG3,KBA05_SEG4,KBA05_SEG5,KBA05_SEG6,KBA05_SEG7,KBA05_SEG8,KBA05_SEG9,KBA05_VORB0,KBA05_VORB1,KBA05_VORB2,KBA05_ZUL1,KBA05_ZUL2,KBA05_ZUL3,KBA05_ZUL4,KBA13_ALTERHALTER_30,KBA13_ALTERHALTER_45,KBA13_ALTERHALTER_60,KBA13_ALTERHALTER_61,KBA13_ANZAHL_PKW,KBA13_AUDI,KBA13_AUTOQUOTE,KBA13_BJ_1999,KBA13_BJ_2000,KBA13_BJ_2004,KBA13_BJ_2006,KBA13_BJ_2008,KBA13_BJ_2009,KBA13_BMW,KBA13_CCM_0_1400,KBA13_CCM_1000,KBA13_CCM_1200,KBA13_CCM_1400,KBA13_CCM_1500,KBA13_CCM_1600,KBA13_CCM_1800,KBA13_CCM_2000,KBA13_CCM_2500,KBA13_CCM_2501,KBA13_CCM_3000,KBA13_CCM_3001,KBA13_FAB_ASIEN,KBA13_FAB_SONSTIGE,KBA13_FIAT,KBA13_FORD,KBA13_HALTER_20,KBA13_HALTER_25,KBA13_HALTER_30,KBA13_HALTER_35,KBA13_HALTER_40,KBA13_HALTER_45,KBA13_HALTER_50,KBA13_HALTER_55,KBA13_HALTER_60,KBA13_HALTER_65,KBA13_HALTER_66,KBA13_HERST_ASIEN,KBA13_HERST_AUDI_VW,KBA13_HERST_BMW_BENZ,KBA13_HERST_EUROPA,KBA13_HERST_FORD_OPEL,KBA13_HERST_SONST,KBA13_KMH_0_140,KBA13_KMH_110,KBA13_KMH_140,KBA13_KMH_140_210,KBA13_KMH_180,KBA13_KMH_211,KBA13_KMH_250,KBA13_KMH_251,KBA13_KRSAQUOT,KBA13_KRSHERST_AUDI_VW,KBA13_KRSHERST_BMW_BENZ,KBA13_KRSHERST_FORD_OPEL,KBA13_KRSSEG_KLEIN,KBA13_KRSSEG_OBER,KBA13_KRSSEG_VAN,KBA13_KRSZUL_NEU,KBA13_KW_0_60,KBA13_KW_110,KBA13_KW_120,KBA13_KW_121,KBA13_KW_30,KBA13_KW_40,KBA13_KW_50,KBA13_KW_60,KBA13_KW_61_120,KBA13_KW_70,KBA13_KW_80,KBA13_KW_90,KBA13_MAZDA,KBA13_MERCEDES,KBA13_MOTOR,KBA13_NISSAN,KBA13_OPEL,KBA13_PEUGEOT,KBA13_RENAULT,KBA13_SEG_GELAENDEWAGEN,KBA13_SEG_GROSSRAUMVANS,KBA13_SEG_KLEINST,KBA13_SEG_KLEINWAGEN,KBA13_SEG_KOMPAKTKLASSE,KBA13_SEG_MINIVANS,KBA13_SEG_MINIWAGEN,KBA13_SEG_MITTELKLASSE,KBA13_SEG_OBEREMITTELKLASSE,KBA13_SEG_OBERKLASSE,KBA13_SEG_SONSTIGE,KBA13_SEG_SPORTWAGEN,KBA13_SEG_UTILITIES,KBA13_SEG_VAN,KBA13_SEG_WOHNMOBILE,KBA13_SITZE_4,KBA13_SITZE_5,KBA13_SITZE_6,KBA13_TOYOTA,KBA13_VORB_0,KBA13_VORB_1,KBA13_VORB_1_2,KBA13_VORB_2,KBA13_VORB_3,KBA13_VW,KKK,KONSUMNAEHE,LP_FAMILIE_FEIN,LP_FAMILIE_GROB,LP_LEBENSPHASE_FEIN,LP_LEBENSPHASE_GROB,LP_STATUS_FEIN,LP_STATUS_GROB,MIN_GEBAEUDEJAHR,MOBI_REGIO,NATIONALITAET_KZ,ONLINE_AFFINITAET,ORTSGR_KLS9,PLZ8_ANTG1,PLZ8_ANTG2,PLZ8_ANTG3,PLZ8_ANTG4,PLZ8_BAUMAX,PLZ8_GBZ,PLZ8_HHZ,PRAEGENDE_JUGENDJAHRE,REGIOTYP,RELAT_AB,RETOURTYP_BK_S,SEMIO_DOM,SEMIO_ERL,SEMIO_FAM,SEMIO_KAEM,SEMIO_KRIT,SEMIO_KULT,SEMIO_LUST,SEMIO_MAT,SEMIO_PFLICHT,SEMIO_RAT,SEMIO_REL,SEMIO_SOZ,SEMIO_TRADV,SEMIO_VERT,SHOPPER_TYP,TITEL_KZ,VERS_TYP,W_KEIT_KIND_HH,WOHNDAUER_2008,WOHNLAGE,ZABEOTYP,ANREDE_KZ,ALTERSKATEGORIE_GROB
0,-1,,,,,,,2.0,0,0,10,10,10,,0,0,10,10,10,,,0,0,10,10,10,0,0,10,10,10,,0,0,,5,3,3,4,5,3,4,,,0,10.0,0,-1,2.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,2.0,2.0,15.0,4.0,1.0,1.0,,,0,1.0,,,,,,,,,0,,,5.0,6,3,6,6,7,3,5,5,5,4,7,2,3,1,-1,,-1,,,,3,1,2
1,-1,0.0,11.0,0.0,2.0,0.0,6.0,5.0,0,0,10,10,10,,0,0,10,10,10,,,0,0,10,10,10,0,0,10,10,10,,0,0,3.0,5,5,1,5,4,2,1,8.0,3.0,1996,10.0,0,3,6.0,8.0,3.0,4.0,1.0,4.0,0.0,0.0,0.0,0.0,2.0,1.0,5.0,1.0,5.0,1.0,4.0,2.0,4.0,1.0,5.0,5.0,2.0,2.0,0.0,4.0,1.0,5.0,4.0,2.0,1.0,2.0,1.0,2.0,1.0,3.0,4.0,2.0,1.0,2.0,4.0,3.0,3.0,2.0,2.0,0.0,0.0,1.0,3.0,0.0,0.0,4.0,1.0,2.0,2.0,2.0,1.0,3.0,3.0,0.0,1.0,1.0,5.0,5.0,1.0,0.0,2.0,3.0,2.0,3.0,4.0,963.0,4.0,2.0,3.0,3.0,3.0,3.0,3.0,2.0,3.0,2.0,0.0,0.0,4.0,1.0,2.0,2.0,5.0,3.0,3.0,0.0,5.0,2.0,3.0,4.0,2.0,3.0,3.0,3.0,3.0,3.0,2.0,2.0,3.0,3.0,3.0,4.0,1.0,4.0,4.0,4.0,2.0,3.0,3.0,1.0,3.0,3.0,2.0,3.0,3.0,1.0,2.0,4.0,3.0,3.0,2.0,2.0,2.0,1.0,3.0,4.0,4.0,3.0,1.0,2.0,4.0,0.0,3.0,1.0,2.0,3.0,2.0,4.0,3.0,2.0,3.0,4.0,3.0,2.0,3.0,2.0,2.0,5.0,4.0,2.0,3.0,3.0,3.0,2.0,3.0,3.0,4.0,2.0,3.0,3.0,4.0,2.0,3.0,3.0,3.0,3.0,3.0,4.0,2.0,1.0,5.0,3.0,21.0,6.0,2.0,1.0,1992.0,1.0,1,3.0,5.0,2.0,3.0,2.0,1.0,1.0,4.0,5.0,14,3.0,4.0,1.0,7,2,4,4,4,3,2,3,7,6,4,5,6,1,3,0.0,2,3.0,9.0,4.0,5,2,1
2,-1,17.0,10.0,0.0,1.0,0.0,2.0,3.0,0,0,10,10,10,0.0,0,0,10,10,10,0.0,9.0,0,0,10,10,10,0,0,10,10,10,0.0,0,0,4.0,2,5,1,4,3,1,1,1.0,4.0,1979,10.0,1,3,4.0,4.0,2.0,3.0,3.0,3.0,0.0,1.0,3.0,1.0,0.0,3.0,0.0,5.0,2.0,3.0,0.0,0.0,3.0,3.0,2.0,2.0,3.0,2.0,5.0,4.0,3.0,3.0,2.0,3.0,3.0,2.0,2.0,3.0,3.0,2.0,2.0,3.0,4.0,5.0,1.0,1.0,0.0,2.0,5.0,1.0,1.0,4.0,1.0,1.0,2.0,1.0,5.0,3.0,3.0,1.0,0.0,0.0,0.0,1.0,4.0,2.0,3.0,2.0,3.0,4.0,4.0,3.0,2.0,3.0,3.0,712.0,3.0,3.0,2.0,2.0,4.0,5.0,3.0,1.0,4.0,1.0,1.0,2.0,2.0,4.0,3.0,4.0,3.0,3.0,4.0,3.0,5.0,4.0,3.0,3.0,4.0,3.0,3.0,2.0,2.0,2.0,3.0,3.0,3.0,3.0,4.0,3.0,3.0,2.0,4.0,3.0,3.0,3.0,1.0,1.0,1.0,2.0,2.0,4.0,4.0,1.0,3.0,3.0,3.0,2.0,2.0,3.0,2.0,1.0,1.0,3.0,4.0,4.0,1.0,1.0,2.0,1.0,5.0,4.0,4.0,2.0,3.0,4.0,3.0,3.0,2.0,3.0,3.0,5.0,3.0,3.0,3.0,1.0,3.0,3.0,2.0,4.0,3.0,2.0,4.0,5.0,3.0,2.0,4.0,2.0,3.0,3.0,3.0,4.0,4.0,2.0,2.0,2.0,2.0,5.0,1.0,1.0,3.0,1.0,3.0,2.0,1992.0,3.0,1,2.0,5.0,3.0,3.0,1.0,0.0,1.0,4.0,4.0,15,2.0,2.0,3.0,7,6,1,7,7,3,4,3,3,4,3,4,3,4,2,0.0,1,3.0,9.0,2.0,5,2,3


In [None]:
df_azdias[truly_num].head(3)

Unnamed: 0,LNR,AKT_DAT_KL,ALTER_KIND1,ALTER_KIND2,ALTER_KIND3,ALTER_KIND4,ALTERSKATEGORIE_FEIN,ANZ_KINDER,ANZ_STATISTISCHE_HAUSHALTE,ARBEIT,CJT_KATALOGNUTZER,CJT_TYP_1,CJT_TYP_2,CJT_TYP_3,CJT_TYP_4,CJT_TYP_5,CJT_TYP_6,D19_BANKEN_DIREKT,D19_BANKEN_GROSS,D19_BANKEN_LOKAL,D19_BANKEN_REST,D19_BEKLEIDUNG_GEH,D19_BEKLEIDUNG_REST,D19_BILDUNG,D19_BIO_OEKO,D19_BUCH_CD,D19_DIGIT_SERV,D19_DROGERIEARTIKEL,D19_ENERGIE,D19_FREIZEIT,D19_GARTEN,D19_HANDWERK,D19_HAUS_DEKO,D19_KINDERARTIKEL,D19_KONSUMTYP_MAX,D19_KOSMETIK,D19_LEBENSMITTEL,D19_LOTTO,D19_NAHRUNGSERGAENZUNG,D19_RATGEBER,D19_REISEN,D19_SAMMELARTIKEL,D19_SCHUHE,D19_SONSTIGE,D19_SOZIALES,D19_TECHNIK,D19_TELKO_MOBILE,D19_TELKO_ONLINE_QUOTE_12,D19_TELKO_REST,D19_TIERARTIKEL,D19_VERSAND_REST,D19_VERSI_DATUM,D19_VERSI_OFFLINE_DATUM,D19_VERSI_ONLINE_DATUM,D19_VERSI_ONLINE_QUOTE_12,D19_VERSICHERUNGEN,D19_VOLLSORTIMENT,D19_WEIN_FEINKOST,DSL_FLAG,EINGEZOGENAM_HH_JAHR,EXTSEL992,FIRMENDICHTE,GEMEINDETYP,HH_DELTA_FLAG,KBA13_ANTG1,KBA13_ANTG2,KBA13_ANTG3,KBA13_ANTG4,KBA13_BAUMAX,KBA13_CCM_1401_2500,KBA13_GBZ,KBA13_HHZ,KBA13_KMH_210,KK_KUNDENTYP,KOMBIALTER,KONSUMZELLE,MOBI_RASTER,RT_KEIN_ANREIZ,RT_SCHNAEPPCHEN,RT_UEBERGROESSE,SOHO_KZ,STRUKTURTYP,UMFELD_ALT,UMFELD_JUNG,UNGLEICHENN_FLAG,VERDICHTUNGSRAUM,VHA,VHN,VK_DHT4A,VK_DISTANZ,VK_ZG11
0,910215,,,,,,,,,,5.0,1.0,1.0,5.0,5.0,5.0,5.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,9,0,0,,0,0,0,0,0,0,,0,0,,0,0,0,10,10,10,,0,0,0,,,,,,,,,,,,,,,,,9,,,1.0,4.0,1.0,,,,,,,,,,,
1,910220,9.0,,,,,21.0,0.0,12.0,3.0,1.0,5.0,5.0,2.0,3.0,1.0,1.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,9,0,0,,0,0,0,0,0,0,,0,0,,0,0,0,10,10,10,,0,0,0,1.0,2004.0,,2.0,22.0,0.0,2.0,4.0,2.0,1.0,2.0,3.0,4.0,5.0,4.0,,1,1.0,1.0,5.0,3.0,5.0,1.0,2.0,3.0,3.0,1.0,0.0,0.0,4.0,8.0,11.0,10.0
2,910225,9.0,,,,,17.0,0.0,7.0,3.0,2.0,4.0,4.0,1.0,3.0,2.0,2.0,0,0,0,0,0,0,6,0,0,0,0,0,0,0,0,0,0,8,6,0,0.0,0,0,0,0,0,6,0.0,6,0,0.0,0,0,0,10,10,10,0.0,0,7,0,1.0,2000.0,14.0,4.0,22.0,0.0,2.0,3.0,1.0,0.0,1.0,3.0,4.0,4.0,4.0,,2,0.0,2.0,5.0,4.0,5.0,0.0,3.0,2.0,5.0,0.0,1.0,0.0,2.0,9.0,9.0,6.0


In [None]:
df_information_levels.loc[df_information_levels["attribute"] == "ALTER_HH"]

Unnamed: 0,Information_Level,attribute,description,additional_notes
44,Household,ALTER_HH,main age within the household,
