In [32]:
import requests
import tarfile
import os

import numpy as np
import pandas as pd

In [5]:
def download_and_extract(url: str, extract_to: str):
    """
    Downloads a tar.gz file from a URL and extracts it to a directory.

    The archive is streamed to a temporary file in the current working
    directory (named after the last URL path segment), extracted, and the
    temporary file is removed afterwards whether or not extraction succeeded.
    Progress and failures are reported with ``print``; a non-200 response or
    an extraction error does not raise.

    Args:
    - url (str): URL of the tar.gz file to download.
    - extract_to (str): Directory path to extract the contents of the tar.gz file.

    Returns:
    - None
    """
    # Get the filename from the URL; fall back to a fixed name when the URL
    # ends with "/" and the last segment would otherwise be empty.
    filename = url.split('/')[-1] or "download.tar.gz"
    chunk_size = 1024 * 1024  # stream in 1 MiB pieces instead of buffering the whole archive

    # Download the file
    print("Downloading the file...")
    # The timeout prevents a stalled connection from hanging the notebook;
    # the context manager releases the connection on every exit path.
    with requests.get(url, stream=True, timeout=60) as response:
        if response.status_code != 200:
            print("Failed to download the file.")
            return
        with open(filename, 'wb') as file:
            # Read from the raw stream (same bytes as before, no content
            # decoding) but in bounded chunks.
            for chunk in iter(lambda: response.raw.read(chunk_size), b""):
                file.write(chunk)
    print("Download completed.")

    # Extract the tar.gz file
    print("Extracting the file...")
    try:
        with tarfile.open(filename, 'r:gz') as tar:
            if hasattr(tarfile, "data_filter"):
                # Reject members that would escape extract_to (absolute paths,
                # "..", unsafe links) on Python versions with extraction filters.
                tar.extractall(path=extract_to, filter="data")
            else:
                tar.extractall(path=extract_to)
        print("Extraction completed.")
    except Exception as e:
        print(f"Failed to extract the file: {e}")
    finally:
        # Remove the downloaded archive; guard so cleanup itself cannot raise
        # if the file is already gone.
        if os.path.exists(filename):
            os.remove(filename)
            print("Downloaded tar.gz file removed.")

# Location of the arvato_data tar.gz archive
url = "https://video.udacity-data.com/topher/2024/August/66b9ba05_arvato_data.tar/arvato_data.tar.gz"

# Fetch the archive and unpack it into the current working directory
download_and_extract(url=url, extract_to=".")

Downloading the file...
Download completed.
Extracting the file...
Extraction completed.
Downloaded tar.gz file removed.


# Part 0: Get to Know the Data

In [58]:
# Read the first 10,000 rows of each semicolon-separated extract.
population, customers = (
    pd.read_csv(csv_path, sep=";", nrows=10000)
    for csv_path in (
        "../data/Udacity_AZDIAS_052018.csv",
        "../data/Udacity_CUSTOMERS_052018.csv",
    )
)

  population = pd.read_csv("data/Udacity_AZDIAS_052018.csv", sep=";", nrows=10000)
  customers = pd.read_csv("data/Udacity_CUSTOMERS_052018.csv", sep=";", nrows=10000)


In [97]:
# Load the attribute/value dictionary, using the sheet's second row as the header.
meta = pd.read_excel("../data/meta/dias_values.xlsx", header=1)
meta.columns = meta.columns.str.lower()
meta = meta.drop(columns="unnamed: 0")

# Forward-fill so rows with a blank attribute/description inherit the one above.
meta[["attribute", "description"]] = meta[["attribute", "description"]].ffill()

# Turn comma-separated value strings into lists, then give each value its own row.
meta["value"] = meta["value"].map(
    lambda v: v.split(", ") if isinstance(v, str) and "," in v else v
)
meta = meta.explode("value")

## Data Types
In addition to numeric values, the datasets contain string columns that need separate preprocessing:
- `CAMEO_DEU_2015`, `CAMEO_DEUG_2015` and `CAMEO_INTL_2015`
  - have `X` or `XX` values that should be treated as null values
  - contain a mix of numbers and numeric strings that should be converted to a single numeric type
  - are classifications that should be one-hot-encoded
    - `CAMEO_INTL_2015` values are a composition of household and family type and should be separated
- `D19_LETZTER_KAUF_BRANCHE` shows the sector of the last purchase, which is not relevant to our segmentation report and model
- `EINGEFUEGT_AM` is the timestamp at which the record was inserted into the database and is likewise not needed
- `OST_WEST_KZ` is a flag labeled as `W` or `O` that should be one-hot-encoded

In [72]:
# Tally how many columns of the population sample have each dtype.
population.dtypes.value_counts()

float64    269
int64       93
object       2
Name: count, dtype: int64

In [73]:
# Preview the first rows of the object-typed columns only.
population.select_dtypes(include='object').head()

Unnamed: 0,CAMEO_DEU_2015,OST_WEST_KZ
0,,
1,8A,W
2,4C,W
3,2A,W
4,6B,W


In [68]:
# Distinct values present in CAMEO_DEU_2015 (missing values show up as nan).
population["CAMEO_DEU_2015"].unique()

array([nan, '8A', '4C', '2A', '6B', '8C', '4A', '2D', '1A', '1E', '9D',
       '5C', '8B', '7A', '5D', '9E', '9B', '1B', '3D', '4E', '4B', '3C',
       '5A', '7B', '9A', '6D', '6E', '2C', '7C', '9C', '7D', '5E', '1D',
       '8D', '6C', '6A', '5B', '4D', '3A', '2B', '7E', '3B', '6F', '5F',
       '1C'], dtype=object)

In [69]:
# Distinct values present in CAMEO_DEUG_2015 (missing values show up as nan).
population["CAMEO_DEUG_2015"].unique()

array([nan,  8.,  4.,  2.,  6.,  1.,  9.,  5.,  7.,  3.])

In [70]:
# Distinct values present in CAMEO_INTL_2015 (missing values show up as nan).
population["CAMEO_INTL_2015"].unique()

array([nan, 51., 24., 12., 43., 54., 22., 14., 13., 15., 33., 41., 34.,
       55., 25., 23., 31., 52., 35., 45., 44., 32.])

In [100]:
# Show the first ten metadata rows that describe CAMEO_INTL_2015.
meta[meta["attribute"] == "CAMEO_INTL_2015"].head(10)

Unnamed: 0,attribute,description,value,meaning
105,CAMEO_INTL_2015,CAMEO classification 2015 - international typo...,-1,unknown
106,CAMEO_INTL_2015,(each German CAMEO code belongs to one interna...,11,Wealthy Households-Pre-Family Couples & Singles
107,CAMEO_INTL_2015,(each German CAMEO code belongs to one interna...,12,Wealthy Households-Young Couples With Children
108,CAMEO_INTL_2015,(each German CAMEO code belongs to one interna...,13,Wealthy Households-Families With School Age Ch...
109,CAMEO_INTL_2015,(each German CAMEO code belongs to one interna...,14,Wealthy Households-Older Families & Mature Co...
110,CAMEO_INTL_2015,(each German CAMEO code belongs to one interna...,15,Wealthy Households-Elders In Retirement
111,CAMEO_INTL_2015,(each German CAMEO code belongs to one interna...,21,Prosperous Households-Pre-Family Couples & Sin...
112,CAMEO_INTL_2015,(each German CAMEO code belongs to one interna...,22,Prosperous Households-Young Couples With Children
113,CAMEO_INTL_2015,(each German CAMEO code belongs to one interna...,23,Prosperous Households-Families With School Age...
114,CAMEO_INTL_2015,(each German CAMEO code belongs to one interna...,24,Prosperous Households-Older Families & Mature ...


In [71]:
# Distinct values present in OST_WEST_KZ (missing values show up as nan).
population["OST_WEST_KZ"].unique()

array([nan, 'W', 'O'], dtype=object)

In [65]:
# Remove the two columns that are not used further. errors="ignore" keeps this
# cell re-runnable: without it a second run raises KeyError because the
# columns are already gone.
population = population.drop(
    columns=["D19_LETZTER_KAUF_BRANCHE", "EINGEFUEGT_AM"],
    errors="ignore",
)

In [120]:
def prepare_cameo_classifications(df: pd.DataFrame) -> pd.DataFrame:
    """
    Clean the CAMEO classification columns and split CAMEO_INTL_2015.

    For every column whose name starts with "CAMEO":
    - the placeholder strings "X" and "XX" are replaced with NaN;
    - every column except CAMEO_DEU_2015 is cast to float.

    If CAMEO_INTL_2015 is present it is split into two new columns and then
    dropped:
    - CAMEO_INTL_2015_HOUSEHOLD: the tens digit (value // 10)
    - CAMEO_INTL_2015_FAMILY: the ones digit (value % 10)
    The code -1 is copied unchanged into both new columns; NaN stays NaN.

    The input frame is not modified. Calling the function on its own output
    returns an equal frame, so the calling cell can be re-run safely.

    Args:
    - df (pd.DataFrame): Frame containing the CAMEO columns.

    Returns:
    - pd.DataFrame: A cleaned copy of ``df``.
    """
    # Work on a copy so the caller's frame keeps its original contents.
    df = df.copy()

    cameo_columns = df.columns[df.columns.str.startswith("CAMEO")]
    for cameo_column in cameo_columns:
        column = df[cameo_column]
        # Null out the "X"/"XX" placeholders.
        cleaned = column.mask(column.isin(["X", "XX"]))
        if cameo_column != "CAMEO_DEU_2015":
            # CAMEO_DEU_2015 holds codes such as "8A" and must stay non-numeric.
            cleaned = cleaned.astype(float)
        df[cameo_column] = cleaned

    # Skip the split when the column was already processed by an earlier call.
    if "CAMEO_INTL_2015" in df.columns:
        intl = df["CAMEO_INTL_2015"]
        # -1 must bypass the arithmetic: -1 // 10 == -1 but -1 % 10 == 9.
        is_unknown = intl == -1
        df["CAMEO_INTL_2015_HOUSEHOLD"] = intl.where(is_unknown, intl // 10)
        df["CAMEO_INTL_2015_FAMILY"] = intl.where(is_unknown, intl % 10)
        df = df.drop(columns=["CAMEO_INTL_2015"])

    return df

# Rebind population to the frame returned by the CAMEO preparation helper.
population = prepare_cameo_classifications(population)