In [23]:
import warnings

import pandas as pd
import plotly.express as px
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from sklearn.metrics import silhouette_score

from src.files_io import download_and_extract_raw_datasets
from src.preprocessing import preprocess_data

In [8]:
# Fetch and unpack the raw datasets through the project helper from src.files_io.
# NOTE(review): the recorded output suggests the helper reports and skips the
# work when the raw files are already present — helper body not visible here.
download_and_extract_raw_datasets()

Raw Datasets already exists.


In [9]:
# Load both raw datasets (semicolon-separated CSVs) and lower-case their
# column names.
#
# pandas' DtypeWarning is silenced only while the two files are being read:
# `warnings.catch_warnings()` restores the previous filter list on exit (also
# when a read raises), whereas `warnings.resetwarnings()` would discard every
# warning filter in the process, including the interpreter/kernel defaults.
with warnings.catch_warnings():
    warnings.simplefilter(action="ignore", category=pd.errors.DtypeWarning)

    population = pd.read_csv("../data/Udacity_AZDIAS_052018.csv", sep=";")
    customer = pd.read_csv("../data/Udacity_CUSTOMERS_052018.csv", sep=";")

population.columns = population.columns.str.lower()
customer.columns = customer.columns.str.lower()

In [11]:
# Run the project's cleaning step over both frames. The results are bound back
# to the same names, so re-running this cell on its own feeds the already
# cleaned frames into preprocess_data again — re-run the load cell first.
population, customer = preprocess_data(
    population,
    customer,
    names=("population", "customer"),
)

Metadata loaded.
Metadata rectified.
Feature config loaded.

Start cleaning population dataset...
Removed 3 irrelevant features.
Converted 324 unknown values from the meta information to null in the dataset.
Converted 8 invalid values to null in the dataset.
Removed 105801 records with over 33.0% missing feature values.
Removed 16 features with over 20.0% missing values:
alter_hh, alter_kind3, alter_kind4, alterskategorie_fein, d19_banken_online_quote_12, d19_gesamt_online_quote_12, d19_konsumtyp, d19_lotto, d19_soziales, d19_telko_online_quote_12, d19_versand_online_quote_12, d19_versi_online_quote_12, extsel992, geburtsjahr, kba05_baumax, kk_kundentyp
Replaced missing values in 129 features.
Extracted decade and cultural orientation as separate features.
Extracted wealth and life phase as separate features.
Removed 51 redundant features.
Removed 24 uncertain features.

Start cleaning customer dataset...
Removed 6 irrelevant features.
Converted 324 unknown values from the meta informa

In [12]:
# Tag every row with the dataset it came from so the two groups stay
# distinguishable once the frames are stacked together.
population["is_customer"], customer["is_customer"] = False, True

In [21]:
# Stack population rows on top of customer rows; ignore_index=True discards the
# original row labels and gives the combined frame a fresh 0..n-1 index.
df = pd.concat(
    objs=[population, customer],
    ignore_index=True,
)

In [25]:
# Reduce the feature columns (everything except the is_customer flag) with PCA.
# Passing a float between 0 and 1 as n_components asks PCA to keep as many
# components as are needed to reach that share of explained variance; the
# resulting count is printed below.
explained_variance = 0.8
pca = PCA(n_components=explained_variance, random_state=42)

pca_data = pca.fit_transform(df.drop(columns=["is_customer"]))
print(f"Number of PCA components retained: {pca.n_components_}")

# Build the component frame on df's own index. The column-wise concat below
# aligns rows by index label, so reusing df.index makes the pairing of each
# component row with its is_customer flag hold by construction instead of
# relying on df happening to carry a default 0..n-1 index.
pca_df = pd.DataFrame(
    data=pca_data,
    index=df.index,
    columns=[f"principal_component_{i+1}" for i in range(pca.n_components_)],
)

combined_pca_df = pd.concat([pca_df, df["is_customer"]], axis=1)

Number of PCA components retained: 110


In [26]:
# Cluster on the principal-component columns only: the is_customer flag is
# dropped first so it is not part of the input to KMeans.
kmeans = KMeans(
    n_clusters=6,
    init="k-means++",
    random_state=42,
)
pca_features = combined_pca_df.drop(columns=["is_customer"])
cluster_labels = kmeans.fit_predict(pca_features)