In [1]:
import requests
import tarfile
import os

import pandas as pd

In [5]:
def download_and_extract(url: str, extract_to: str):
    """
    Downloads a tar.gz file from a URL and extracts it to a directory.
    Args:
    - url (str): URL of the tar.gz file to download.
    - extract_to (str): Directory path to extract the contents of the tar.gz file.
    """
    # Get the filename from the URL
    filename = url.split('/')[-1]

    # Download the file
    print("Downloading the file...")
    response = requests.get(url, stream=True)
    if response.status_code == 200:
        with open(filename, 'wb') as file:
            file.write(response.raw.read())
        print("Download completed.")
    else:
        print("Failed to download the file.")
        return

    # Extract the tar.gz file
    print("Extracting the file...")
    try:
        with tarfile.open(filename, 'r:gz') as tar:
            tar.extractall(path=extract_to)
        print("Extraction completed.")
    except Exception as e:
        print(f"Failed to extract the file: {e}")
    finally:
        # Optionally remove the tar.gz file after extraction
        os.remove(filename)
        print("Downloaded tar.gz file removed.")

# URL of the tar.gz file
url = "https://video.udacity-data.com/topher/2024/August/66b9ba05_arvato_data.tar/arvato_data.tar.gz"

# Call the function with the URL
download_and_extract(url, extract_to=".")

Downloading the file...
Download completed.
Extracting the file...
Extraction completed.
Downloaded tar.gz file removed.


## Part 0: Get to Know the Data

In [20]:
population = pd.read_csv("data/Udacity_AZDIAS_052018.csv", sep=";")
customers = pd.read_csv("data/Udacity_CUSTOMERS_052018.csv", sep=";")

  population = pd.read_csv("data/Udacity_AZDIAS_052018.csv", sep=";")
  customers = pd.read_csv("data/Udacity_CUSTOMERS_052018.csv", sep=";")


## Data Types
Besides numeric values there are strings in the datasets which need a separate preprocessing:
- `CAMEO_DEUG_2015` and `CAMEO_INTL_2015` both have
  - `X` and `XX` values that should be treated as null values
  - numeric values as strings that should be converted
- `CAMEO_DEU_2015` is the detailed classification of `CAMEO_DEUG_2015` which should be converted to a separated numeric scale
- `D19_LETZTER_KAUF_BRANCHE` shows the sector of the last purchase which is ineligible for our segmentation report and prediction model
- `EINGEFUEGT_AM` is the timestamp where the data was created in the database and also unnecessary - we should remove that column as well
- `OST_WEST_KZ` is a flag labeled as `W` or `O` that we should convert to a numeric type

In [21]:
population.dtypes.value_counts()

float64    267
int64       93
object       6
Name: count, dtype: int64

In [22]:
population.select_dtypes(include='object').head()

Unnamed: 0,CAMEO_DEU_2015,CAMEO_DEUG_2015,CAMEO_INTL_2015,D19_LETZTER_KAUF_BRANCHE,EINGEFUEGT_AM,OST_WEST_KZ
0,,,,,,
1,8A,8.0,51.0,,1992-02-10 00:00:00,W
2,4C,4.0,24.0,D19_UNBEKANNT,1992-02-12 00:00:00,W
3,2A,2.0,12.0,D19_UNBEKANNT,1997-04-21 00:00:00,W
4,6B,6.0,43.0,D19_SCHUHE,1992-02-12 00:00:00,W


In [23]:
population["CAMEO_DEU_2015"].unique()

array([nan, '8A', '4C', '2A', '6B', '8C', '4A', '2D', '1A', '1E', '9D',
       '5C', '8B', '7A', '5D', '9E', '9B', '1B', '3D', '4E', '4B', '3C',
       '5A', '7B', '9A', '6D', '6E', '2C', '7C', '9C', '7D', '5E', '1D',
       '8D', '6C', '6A', '5B', '4D', '3A', '2B', '7E', '3B', '6F', '5F',
       '1C', 'XX'], dtype=object)

In [24]:
population["CAMEO_DEUG_2015"].unique()

array([nan, 8.0, 4.0, 2.0, 6.0, 1.0, 9.0, 5.0, 7.0, 3.0, '4', '3', '7',
       '2', '8', '9', '6', '5', '1', 'X'], dtype=object)

In [25]:
population["CAMEO_INTL_2015"].unique()

array([nan, 51.0, 24.0, 12.0, 43.0, 54.0, 22.0, 14.0, 13.0, 15.0, 33.0,
       41.0, 34.0, 55.0, 25.0, 23.0, 31.0, 52.0, 35.0, 45.0, 44.0, 32.0,
       '22', '24', '41', '12', '54', '51', '44', '35', '23', '25', '14',
       '34', '52', '55', '31', '32', '15', '13', '43', '33', '45', 'XX'],
      dtype=object)

In [26]:
population["OST_WEST_KZ"].unique()

array([nan, 'W', 'O'], dtype=object)