In [1]:
import requests
import tarfile
import os

import numpy as np
import pandas as pd
from pandas import notna

In [5]:
def download_and_extract(url: str, extract_to: str):
    """
    Downloads a tar.gz file from a URL and extracts it to a directory.

    The archive is streamed to a temporary file in the current working
    directory (named after the last URL path segment), extracted, and the
    temporary file is removed afterwards. Progress and failures are reported
    with print(); a non-200 response or an extraction error does not raise.
    Network errors raised by requests still propagate to the caller.

    Args:
    - url (str): URL of the tar.gz file to download.
    - extract_to (str): Directory path to extract the contents of the tar.gz file.
    """
    # Get the filename from the URL
    filename = url.split('/')[-1]
    chunk_size = 1024 * 1024  # copy in 1 MiB pieces instead of one huge read

    # Download the file
    print("Downloading the file...")
    # The context manager releases the connection even on early return/errors;
    # the timeout keeps a stalled server from hanging the notebook forever.
    with requests.get(url, stream=True, timeout=60) as response:
        if response.status_code != 200:
            print("Failed to download the file.")
            return
        try:
            with open(filename, 'wb') as file:
                # Read the raw (undecoded) byte stream so the gzip payload is
                # stored exactly as sent, but never hold it all in memory.
                for chunk in iter(lambda: response.raw.read(chunk_size), b""):
                    file.write(chunk)
        except BaseException:
            # Do not leave a truncated archive behind if the transfer breaks.
            if os.path.exists(filename):
                os.remove(filename)
            raise
    print("Download completed.")

    # Extract the tar.gz file
    print("Extracting the file...")
    try:
        with tarfile.open(filename, 'r:gz') as tar:
            if hasattr(tarfile, "data_filter"):
                # Refuse members that would escape extract_to (absolute paths,
                # "..", unsafe links) on Python versions that support filters.
                tar.extractall(path=extract_to, filter="data")
            else:
                tar.extractall(path=extract_to)
        print("Extraction completed.")
    except Exception as e:
        print(f"Failed to extract the file: {e}")
    finally:
        # Remove the downloaded tar.gz file whether or not extraction worked
        os.remove(filename)
        print("Downloaded tar.gz file removed.")

# URL of the tar.gz file
url = "https://video.udacity-data.com/topher/2024/August/66b9ba05_arvato_data.tar/arvato_data.tar.gz"

# Call the function with the URL; the archive is unpacked into the current
# working directory.
# NOTE(review): later cells read from "../data/..." while this extracts into
# "." - confirm the archive layout and working directory line up.
download_and_extract(url, extract_to=".")

Downloading the file...
Download completed.
Extracting the file...
Extraction completed.
Downloaded tar.gz file removed.


# Part 0: Get to Know the Data

In [28]:
# Read the first 10,000 rows of each semicolon-separated dataset and
# lower-case the column labels in the same step.
population = pd.read_csv(
    "../data/Udacity_AZDIAS_052018.csv", sep=";", nrows=10000
).rename(columns=str.lower)
customers = pd.read_csv(
    "../data/Udacity_CUSTOMERS_052018.csv", sep=";", nrows=10000
).rename(columns=str.lower)

  population = pd.read_csv("../data/Udacity_AZDIAS_052018.csv", sep=";", nrows=10000)
  customers = pd.read_csv("../data/Udacity_CUSTOMERS_052018.csv", sep=";", nrows=10000)


In [38]:
def _parse_code(piece: str):
    """Return ``piece`` as an int when it is integer-like, otherwise unchanged."""
    try:
        return int(piece)
    except ValueError:
        return piece


# header=1: the column names are taken from the second row of the sheet.
meta = pd.read_excel("../data/meta/dias_values.xlsx", header=1)
meta.columns = meta.columns.str.lower()
meta = meta.drop(columns="unnamed: 0")

# Presumably attribute/description/meaning are only filled on the first row of
# each group in the sheet, so carry them forward onto the rows below.
meta[["attribute", "description", "meaning"]] = meta[["attribute", "description", "meaning"]].ffill()
meta["attribute"] = meta["attribute"].str.lower()

# A cell such as "-1, 0" lists several codes at once: split it into one row
# per code. The pieces are converted to int where possible so they compare
# equal to the numeric codes in the datasets (a string "-1" never matches -1).
meta["value"] = [
    [_parse_code(piece) for piece in v.split(", ")] if isinstance(v, str) and "," in v else v
    for v in meta["value"]
]
meta = meta.explode("value")

## Data Types
Besides numeric values, the datasets contain string columns that need separate preprocessing:
- `CAMEO_DEU_2015`, `CAMEO_DEUG_2015` and `CAMEO_INTL_2015`
  - have `X` or `XX` values that should be treated as null values
  - partially have numeric values as strings that should be unified
  - are classifications that should be one-hot-encoded
    - `CAMEO_INTL_2015` values are a composition of household and family type and should be separated
- `D19_LETZTER_KAUF_BRANCHE` shows the sector of the last purchase, which is not relevant for our segmentation report and model
- `EINGEFUEGT_AM` is the timestamp at which the record was created in the database and is likewise not needed
- `OST_WEST_KZ` is a flag labeled as `W` or `O` that should be one-hot-encoded

In [4]:
# Count how many columns there are of each dtype.
population.dtypes.value_counts()

float64    267
int64       93
object       6
Name: count, dtype: int64

In [5]:
# Preview the first rows of the object-typed (non-numeric) columns.
population.select_dtypes(include="object").head()

Unnamed: 0,cameo_deu_2015,cameo_deug_2015,cameo_intl_2015,d19_letzter_kauf_branche,eingefuegt_am,ost_west_kz
0,,,,,,
1,8A,8.0,51.0,,1992-02-10 00:00:00,W
2,4C,4.0,24.0,D19_UNBEKANNT,1992-02-12 00:00:00,W
3,2A,2.0,12.0,D19_UNBEKANNT,1997-04-21 00:00:00,W
4,6B,6.0,43.0,D19_SCHUHE,1992-02-12 00:00:00,W


In [6]:
# List the distinct values of cameo_deu_2015 (NaN included).
population["cameo_deu_2015"].unique()

array([nan, '8A', '4C', '2A', '6B', '8C', '4A', '2D', '1A', '1E', '9D',
       '5C', '8B', '7A', '5D', '9E', '9B', '1B', '3D', '4E', '4B', '3C',
       '5A', '7B', '9A', '6D', '6E', '2C', '7C', '9C', '7D', '5E', '1D',
       '8D', '6C', '6A', '5B', '4D', '3A', '2B', '7E', '3B', '6F', '5F',
       '1C', 'XX'], dtype=object)

In [7]:
# List the distinct values of cameo_deug_2015 (NaN included).
population["cameo_deug_2015"].unique()

array([nan, 8.0, 4.0, 2.0, 6.0, 1.0, 9.0, 5.0, 7.0, 3.0, '4', '3', '7',
       '2', '8', '9', '6', '5', '1', 'X'], dtype=object)

In [8]:
# List the distinct values of cameo_intl_2015 (NaN included).
population["cameo_intl_2015"].unique()

array([nan, 51.0, 24.0, 12.0, 43.0, 54.0, 22.0, 14.0, 13.0, 15.0, 33.0,
       41.0, 34.0, 55.0, 25.0, 23.0, 31.0, 52.0, 35.0, 45.0, 44.0, 32.0,
       '22', '24', '41', '12', '54', '51', '44', '35', '23', '25', '14',
       '34', '52', '55', '31', '32', '15', '13', '43', '33', '45', 'XX'],
      dtype=object)

In [11]:
# Show the first ten metadata rows documenting cameo_intl_2015.
meta[meta["attribute"] == "cameo_intl_2015"].head(10)

Unnamed: 0,attribute,description,value,meaning
105,cameo_intl_2015,CAMEO classification 2015 - international typo...,-1,unknown
106,cameo_intl_2015,(each German CAMEO code belongs to one interna...,11,Wealthy Households-Pre-Family Couples & Singles
107,cameo_intl_2015,(each German CAMEO code belongs to one interna...,12,Wealthy Households-Young Couples With Children
108,cameo_intl_2015,(each German CAMEO code belongs to one interna...,13,Wealthy Households-Families With School Age Ch...
109,cameo_intl_2015,(each German CAMEO code belongs to one interna...,14,Wealthy Households-Older Families & Mature Co...
110,cameo_intl_2015,(each German CAMEO code belongs to one interna...,15,Wealthy Households-Elders In Retirement
111,cameo_intl_2015,(each German CAMEO code belongs to one interna...,21,Prosperous Households-Pre-Family Couples & Sin...
112,cameo_intl_2015,(each German CAMEO code belongs to one interna...,22,Prosperous Households-Young Couples With Children
113,cameo_intl_2015,(each German CAMEO code belongs to one interna...,23,Prosperous Households-Families With School Age...
114,cameo_intl_2015,(each German CAMEO code belongs to one interna...,24,Prosperous Households-Older Families & Mature ...


In [12]:
# List the distinct values of ost_west_kz (NaN included).
population["ost_west_kz"].unique()

array([nan, 'W', 'O'], dtype=object)

In [13]:
# Remove the two object columns that the notes above mark as not needed.
# NOTE(review): `population` is reassigned, so running this cell a second time
# raises KeyError (the columns are already gone); reload the data first.
population = population.drop(columns=["d19_letzter_kauf_branche", "eingefuegt_am"])

In [31]:
def prepare_cameo_classifications(df: pd.DataFrame) -> pd.DataFrame:
    """
    Clean the CAMEO classification columns and split ``cameo_intl_2015``.

    Steps, applied to every column whose name starts with "cameo":
    - "X" / "XX" entries are replaced with NaN,
    - every such column except ``cameo_deu_2015`` is cast to float,
    - ``cameo_intl_2015`` is split into its first and second character
      (``cameo_intl_2015_household`` / ``cameo_intl_2015_family``, stored as
      strings; -1 and NaN are passed through unchanged) and then dropped.

    The input frame is left untouched; a converted copy is returned. Calling
    the function again on its own result is safe: the derived columns are
    skipped and a missing ``cameo_intl_2015`` is not an error.

    Args:
    - df (pd.DataFrame): Frame with lower-cased column names.

    Returns:
    - pd.DataFrame: Converted copy of ``df``.
    """
    derived_columns = ["cameo_intl_2015_household", "cameo_intl_2015_family"]

    # Work on a copy so the caller's frame is never left half-converted.
    result = df.copy()

    # The derived columns also start with "cameo"; exclude them so a repeated
    # call does not cast their string digits to float.
    cameo_columns = [
        column
        for column in result.columns
        if isinstance(column, str) and column.startswith("cameo") and column not in derived_columns
    ]
    if cameo_columns:
        result[cameo_columns] = result[cameo_columns].replace(["X", "XX"], np.nan)

    for cameo_column in cameo_columns:
        if cameo_column != "cameo_deu_2015":
            result[cameo_column] = result[cameo_column].astype(float)

    if "cameo_intl_2015" not in result.columns:
        # Already split (or never present): nothing left to do.
        return result

    codes = result["cameo_intl_2015"]

    def digit_at(position: int) -> list:
        # str(51.0) == "51.0": character 0 is the household digit, 1 the family digit.
        return [str(v)[position] if v != -1 and pd.notnull(v) else v for v in codes]

    result["cameo_intl_2015_household"] = digit_at(0)
    result["cameo_intl_2015_family"] = digit_at(1)

    return result.drop(columns=["cameo_intl_2015"])

# Clean the CAMEO columns of the population frame and keep the returned frame.
population = prepare_cameo_classifications(population)

## Unknown Values

In [41]:
# Show the first metadata rows whose meaning mentions "unknown".
meta[meta["meaning"].str.contains("unknown")].head()

Unnamed: 0,attribute,description,value,meaning
0,ager_typ,best-ager typology,-1,unknown
5,alterskategorie_grob,age classification through prename analysis,-1,unknown
5,alterskategorie_grob,age classification through prename analysis,0,unknown
11,alter_hh,main age within the household,0,unknown / no main age detectable
33,anrede_kz,gender,-1,unknown


In [63]:
def _unknown_value_variants(value) -> list:
    """Return ``value`` plus its float form when it is a numeric string."""
    variants = [value]
    if isinstance(value, str):
        try:
            variants.append(float(value))
        except ValueError:
            pass
    return variants


def convert_unknown_values_to_null(df: pd.DataFrame, meta: pd.DataFrame) -> pd.DataFrame:
    """
    Replace the codes documented as "unknown" in ``meta`` with NaN.

    Every ``meta`` row whose ``meaning`` contains "unknown" contributes its
    ``value`` as a code to blank out in the column named by ``attribute``.
    Codes stored as numeric strings (e.g. "-1") are also matched in their
    numeric form, so they apply to numeric columns as well. Attributes that
    are not columns of ``df`` are ignored.

    Args:
    - df (pd.DataFrame): Data to clean. Matching columns are reassigned on
      this frame, which is also the return value.
    - meta (pd.DataFrame): Needs the columns "attribute", "value", "meaning".

    Returns:
    - pd.DataFrame: ``df`` with the unknown codes set to NaN.
    """
    # na=False: rows without a meaning are simply "not unknown" instead of
    # producing a NaN mask that cannot be used for indexing.
    is_unknown = meta["meaning"].str.contains("unknown", na=False)
    unknown_values = meta.loc[is_unknown, ["attribute", "value"]]

    # Collect all codes per attribute so each column is rewritten only once.
    codes_by_attribute = {}
    for attribute, unknown_value in zip(unknown_values["attribute"], unknown_values["value"]):
        if attribute not in df:
            continue
        bucket = codes_by_attribute.setdefault(attribute, [])
        for variant in _unknown_value_variants(unknown_value):
            if variant not in bucket:
                bucket.append(variant)

    for attribute, codes in codes_by_attribute.items():
        column = df[attribute]
        if column.dtype.kind in "biuf":
            # A numeric column cannot contain the string spellings.
            codes = [code for code in codes if not isinstance(code, str)]
        if codes:
            df[attribute] = column.replace(codes, np.nan)

    return df

# Replace the codes documented as "unknown" in the metadata with NaN.
population = convert_unknown_values_to_null(population, meta)

In [81]:
# Sanity check: no -1 code may be left anywhere in the frame after the
# unknown-value conversion.
assert -1 not in population.values, "population still contains -1 values"