In [68]:
import requests
import tarfile
import os

import numpy as np
import pandas as pd
import plotly.express as px

In [69]:
def download_and_extract(url: str, extract_to: str):
    """
    Downloads a tar.gz file from a URL and extracts it to a directory.
    Args:
    - url (str): URL of the tar.gz file to download.
    - extract_to (str): Directory path to extract the contents of the tar.gz file.
    """
    # Get the filename from the URL
    filename = url.split('/')[-1]

    # Download the file
    print("Downloading the file...")
    response = requests.get(url, stream=True)
    if response.status_code == 200:
        with open(filename, 'wb') as file:
            file.write(response.raw.read())
        print("Download completed.")
    else:
        print("Failed to download the file.")
        return

    # Extract the tar.gz file
    print("Extracting the file...")
    try:
        with tarfile.open(filename, 'r:gz') as tar:
            tar.extractall(path=extract_to)
        print("Extraction completed.")
    except Exception as e:
        print(f"Failed to extract the file: {e}")
    finally:
        # Optionally remove the tar.gz file after extraction
        os.remove(filename)
        print("Downloaded tar.gz file removed.")

# URL of the tar.gz file
url = "https://video.udacity-data.com/topher/2024/August/66b9ba05_arvato_data.tar/arvato_data.tar.gz"

# Call the function with the URL
# download_and_extract(url, extract_to=".")

# Part 0: Get to Know the Data

In [70]:
population = pd.read_csv("../data/Udacity_AZDIAS_052018.csv", sep=";", nrows=10000)
customers = pd.read_csv("../data/Udacity_CUSTOMERS_052018.csv", sep=";", nrows=10000)

population.columns = population.columns.str.lower()
customers.columns = customers.columns.str.lower()


Columns (18,19) have mixed types. Specify dtype option on import or set low_memory=False.


Columns (18,19) have mixed types. Specify dtype option on import or set low_memory=False.



In [71]:
meta = pd.read_excel("../data/meta/dias_values.xlsx", header=1)
meta.columns = meta.columns.str.lower()
meta.drop(columns="unnamed: 0", inplace=True)

meta[["attribute", "description"]] = meta[["attribute", "description"]].ffill()
meta["attribute"] = meta["attribute"].str.lower()

meta["meaning"] = meta["meaning"].ffill()

meta["value"] = [v.split(", ") if type(v) == str and "," in v else v for v in meta["value"]]
meta = meta.explode("value")

## Data Types
Besides numeric values there are strings in the datasets which need a separate preprocessing:
- `CAMEO_DEU_2015`, `CAMEO_DEUG_2015` and `CAMEO_INTL_2015`
  - have `X` or `XX` values that should be treated as null values
  - partially have numeric values as strings that should be unified
  - are classifications that should be one-hot-encoded
    - `CAMEO_INTL_2015` values are a composition of household and family type and should be separated
- `D19_LETZTER_KAUF_BRANCHE` shows the sector of the last purchase which is ineligible for our segmentation report and model
- `EINGEFUEGT_AM` is the timestamp where the data was created in the database and also unnecessary
- `OST_WEST_KZ` is a flag labeled as `W` or `O` that should be one-hot-encoded

In [72]:
population.dtypes.value_counts()

float64    267
int64       93
object       6
Name: count, dtype: int64

In [73]:
population.select_dtypes(include="object").head()

Unnamed: 0,cameo_deu_2015,cameo_deug_2015,cameo_intl_2015,d19_letzter_kauf_branche,eingefuegt_am,ost_west_kz
0,,,,,,
1,8A,8.0,51.0,,1992-02-10 00:00:00,W
2,4C,4.0,24.0,D19_UNBEKANNT,1992-02-12 00:00:00,W
3,2A,2.0,12.0,D19_UNBEKANNT,1997-04-21 00:00:00,W
4,6B,6.0,43.0,D19_SCHUHE,1992-02-12 00:00:00,W


In [74]:
population["cameo_deu_2015"].unique()

array([nan, '8A', '4C', '2A', '6B', '8C', '4A', '2D', '1A', '1E', '9D',
       '5C', '8B', '7A', '5D', '9E', '9B', '1B', '3D', '4E', '4B', '3C',
       '5A', '7B', '9A', '6D', '6E', '2C', '7C', '9C', '7D', '5E', '1D',
       '8D', '6C', '6A', '5B', '4D', '3A', '2B', '7E', '3B', '6F', '5F',
       '1C', 'XX'], dtype=object)

In [75]:
population["cameo_deug_2015"].unique()

array([nan, 8.0, 4.0, 2.0, 6.0, 1.0, 9.0, 5.0, 7.0, 3.0, '4', '3', '7',
       '2', '8', '9', '6', '5', '1', 'X'], dtype=object)

In [76]:
population["cameo_intl_2015"].unique()

array([nan, 51.0, 24.0, 12.0, 43.0, 54.0, 22.0, 14.0, 13.0, 15.0, 33.0,
       41.0, 34.0, 55.0, 25.0, 23.0, 31.0, 52.0, 35.0, 45.0, 44.0, 32.0,
       '22', '24', '41', '12', '54', '51', '44', '35', '23', '25', '14',
       '34', '52', '55', '31', '32', '15', '13', '43', '33', '45', 'XX'],
      dtype=object)

In [77]:
meta[meta["attribute"] == "cameo_intl_2015"].head(10)

Unnamed: 0,attribute,description,value,meaning
105,cameo_intl_2015,CAMEO classification 2015 - international typo...,-1,unknown
106,cameo_intl_2015,(each German CAMEO code belongs to one interna...,11,Wealthy Households-Pre-Family Couples & Singles
107,cameo_intl_2015,(each German CAMEO code belongs to one interna...,12,Wealthy Households-Young Couples With Children
108,cameo_intl_2015,(each German CAMEO code belongs to one interna...,13,Wealthy Households-Families With School Age Ch...
109,cameo_intl_2015,(each German CAMEO code belongs to one interna...,14,Wealthy Households-Older Families & Mature Co...
110,cameo_intl_2015,(each German CAMEO code belongs to one interna...,15,Wealthy Households-Elders In Retirement
111,cameo_intl_2015,(each German CAMEO code belongs to one interna...,21,Prosperous Households-Pre-Family Couples & Sin...
112,cameo_intl_2015,(each German CAMEO code belongs to one interna...,22,Prosperous Households-Young Couples With Children
113,cameo_intl_2015,(each German CAMEO code belongs to one interna...,23,Prosperous Households-Families With School Age...
114,cameo_intl_2015,(each German CAMEO code belongs to one interna...,24,Prosperous Households-Older Families & Mature ...


In [78]:
population["ost_west_kz"].unique()

array([nan, 'W', 'O'], dtype=object)

In [79]:
population = population.drop(columns=["d19_letzter_kauf_branche", "eingefuegt_am"])

In [80]:
def prepare_cameo_classifications(df: pd.DataFrame) -> pd.DataFrame:

    cameo_columns = df.columns[df.columns.str.startswith("cameo")]
    df[cameo_columns] = df[cameo_columns].replace(["X", "XX"], np.nan)

    for cameo_column in cameo_columns:
        if cameo_column != "cameo_deu_2015":
            df[cameo_column] = df[cameo_column].astype(float)

    df["cameo_intl_2015_household"] = [str(v)[0] if v != -1 and pd.notnull(v) else v for v in df["cameo_intl_2015"]]
    df["cameo_intl_2015_family"] = [str(v)[1] if v != -1 and pd.notnull(v) else v for v in df["cameo_intl_2015"]]
    df = df.drop(columns=["cameo_intl_2015"])

    return df

population = prepare_cameo_classifications(population)

## Unknown Values
The provided list that explains the meanings of the numeric attribute values can be used to identify unknown values and handle them as null values.

### Missing Meta Attributes
- some attribute names have to be rectified to map with the population dataset
- these attributes from the meta list couldn't be assigned: `bip_flag`, `geoscore_kls7`, `haushaltsstruktur`, `wachstumsgebiet_nb`


In [81]:
meta[meta["meaning"].str.contains("unknown") | meta["meaning"].str.startswith("no transaction")]

Unnamed: 0,attribute,description,value,meaning
0,ager_typ,best-ager typology,-1,unknown
5,alterskategorie_grob,age classification through prename analysis,-1,unknown
5,alterskategorie_grob,age classification through prename analysis,0,unknown
11,alter_hh,main age within the household,0,unknown / no main age detectable
33,anrede_kz,gender,-1,unknown
...,...,...,...,...
2238,wachstumsgebiet_nb,growing area (population growth in the last 5 ...,0,unknown
2244,w_keit_kind_hh,likelihood of a child present in this household,-1,unknown
2244,w_keit_kind_hh,likelihood of a child present in this household,0,unknown
2251,zabeotyp,typification of energy consumers,-1,unknown


In [172]:
def identify_missing_meta_attributes(meta: pd.DataFrame, df: pd.DataFrame) -> pd.DataFrame:
    missing_meta_attributes = list(set(meta["attribute"].unique()).difference(df.columns))
    missing_meta_attributes.sort()
    missing_meta_attributes.remove("cameo_intl_2015")
    return missing_meta_attributes

In [173]:
identify_missing_meta_attributes(meta, population)

['bip_flag', 'geoscore_kls7', 'haushaltsstruktur', 'wachstumsgebiet_nb']

In [167]:
def rectify_meta_attributes(meta: pd.DataFrame) -> pd.DataFrame:
    meta["attribute"] = meta["attribute"].replace(r"_rz$", "", regex=True)

    attribute_renaming = {
        "d19_buch": "d19_buch_cd",
        "d19_kk_kundentyp": "kk_kundentyp",
        "kba13_ccm_1400_2500": "kba13_ccm_1401_2500",
        "soho_flag": "soho_kz"
    }

    meta["attribute"] = meta["attribute"].replace(attribute_renaming)

    return meta

meta = rectify_meta_attributes(meta)

In [190]:
def convert_unknown_values_to_null(df: pd.DataFrame, meta: pd.DataFrame) -> pd.DataFrame:
    unknown_values = meta[meta["meaning"].str.contains("unknown") | meta["meaning"].str.startswith("no transaction")]

    for attribute, unknown_value in zip(unknown_values.loc[:, "attribute"], unknown_values.loc[:, "value"]):
        if attribute in df:
            df[attribute] = df[attribute].replace(unknown_value, np.nan)

    return df

population = convert_unknown_values_to_null(population, meta)

### Population Attributes without Meta Information
- the additional `d19` columns will be handled like the other ones, so 0 and 10 will be assumed as null values
- there are no remaining -1 values found, where the chance were high that they represent unknown values
- `lnr` is an identifier for the dataset records and should be removed

In [187]:
missing_population_attributes = list(set(population.columns).difference(meta["attribute"].unique()))
missing_population_attributes.sort()
missing_population_attributes.remove("cameo_intl_2015_household")
missing_population_attributes.remove("cameo_intl_2015_family")

population[missing_population_attributes].describe()

Unnamed: 0,akt_dat_kl,alter_kind1,alter_kind2,alter_kind3,alter_kind4,alterskategorie_fein,anz_kinder,anz_statistische_haushalte,arbeit,cjt_katalognutzer,...,strukturtyp,umfeld_alt,umfeld_jung,ungleichenn_flag,verdichtungsraum,vha,vhn,vk_dht4a,vk_distanz,vk_zg11
count,9003.0,875.0,291.0,75.0,9.0,6894.0,9003.0,8778.0,8734.0,9942.0,...,8733.0,8711.0,8711.0,9003.0,8733.0,9003.0,8438.0,8977.0,8977.0,8977.0
mean,4.429635,11.737143,13.542955,14.32,12.444444,13.756455,0.150172,7.565619,3.163957,3.36381,...,2.524562,3.222248,4.005625,0.090525,4.631284,0.45918,2.410287,5.990531,7.527682,5.951097
std,3.633788,4.239342,3.376497,2.88594,2.788867,5.056536,0.490406,14.327056,0.99921,1.491454,...,0.759727,1.256155,1.115012,0.286949,8.541046,1.168143,1.187305,2.856689,3.253193,2.774372
min,1.0,2.0,5.0,7.0,9.0,0.0,0.0,1.0,1.0,1.0,...,1.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0
25%,1.0,8.0,11.0,12.0,10.0,11.0,0.0,1.0,3.0,2.0,...,2.0,2.0,3.0,0.0,0.0,0.0,2.0,3.0,6.0,4.0
50%,3.0,12.0,14.0,15.0,13.0,14.0,0.0,4.0,3.0,4.0,...,3.0,3.0,4.0,0.0,1.0,0.0,2.0,6.0,8.0,6.0
75%,9.0,15.0,17.0,17.0,14.0,17.0,0.0,9.0,4.0,5.0,...,3.0,4.0,5.0,0.0,5.0,0.0,3.0,9.0,10.0,8.0
max,9.0,18.0,18.0,18.0,16.0,25.0,5.0,292.0,9.0,5.0,...,3.0,5.0,5.0,1.0,45.0,5.0,4.0,11.0,13.0,11.0


In [195]:
population = population.drop(columns=["lnr"])

In [197]:
def convert_unknown_d19_values_to_null(df: pd.DataFrame) -> pd.DataFrame:
    d19_columns = df.columns[df.columns.str.startswith("d19_")]
    df[d19_columns] = df[d19_columns].replace([0, 10], np.nan)
    return df

population = convert_unknown_d19_values_to_null(population)

## Missing Values
- `alter_kind1` to `alter_kind4` are largely null because most of the persons in the dataset haven't kids

In [191]:
missing_values_share = pd.Series(round(population.isnull().sum() / len(population) * 100, 1).sort_values(ascending=False), name="missing_values_share")

In [192]:
missing_values_share[missing_values_share > 0]

alter_kind4                 99.9
alter_kind3                 99.2
d19_telko_online_datum      99.2
d19_banken_lokal            98.3
d19_banken_offline_datum    97.6
                            ... 
lp_status_fein               0.6
gfk_urlaubertyp              0.6
lp_familie_grob              0.6
cjt_gesamttyp                0.6
online_affinitaet            0.6
Name: missing_values_share, Length: 331, dtype: float64

In [198]:
population.isnull().sum(axis=1).sort_values(ascending=False)

5099    316
784     314
9159    314
497     314
7630    314
       ... 
2083     22
5190     21
607      21
9203     20
6014     19
Length: 10000, dtype: int64

In [200]:
population.loc[784]

ager_typ                     NaN
akt_dat_kl                   NaN
alter_hh                     NaN
alter_kind1                  NaN
alter_kind2                  NaN
                            ... 
zabeotyp                       3
anrede_kz                      1
alterskategorie_grob           3
cameo_intl_2015_household    NaN
cameo_intl_2015_family       NaN
Name: 784, Length: 364, dtype: object