In [1]:
import os
import sys
import pandas as pd  # noqa: F401
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score, davies_bouldin_score, calinski_harabasz_score
# ------------------------------------------------------------
# Make the project root importable: resolve the parent of the current
# working directory (the project root, not /core) and append it to
# sys.path unless it is already listed there.
# ------------------------------------------------------------
project_root = os.path.abspath(os.pardir)
if sys.path.count(project_root) == 0:
    sys.path.append(project_root)

print("Project root added to sys.path:", project_root)

Project root added to sys.path: /Users/claudiatagbo/Masterschool/traveltide


In [2]:
# Verify that the project-local `core` package is importable.
# Only the import itself sits inside the try block, so the except clause
# cannot mask an unrelated error raised while reporting the location.
try:
    import core
except ModuleNotFoundError:
    print(":x: core module not found. Check your sys.path!")
else:
    # `core.__file__` can be None (e.g. for a package directory without an
    # __init__.py, i.e. a namespace package). In that case report the package
    # search path instead, so the message shows where `core` was found.
    core_location = getattr(core, "__file__", None) or list(getattr(core, "__path__", []))
    print(":weißes_häkchen: core module found at:", core_location)

:weißes_häkchen: core module found at: None


In [3]:
## **Pfad zum Speichern von PCA**

In [4]:
# Output directory for the processed datasets of this analysis
# (the variable is named pca_*, the folder on disk is 'kmean').
pca_data_path = os.path.join(project_root, "data", "processed", "kmean")

# Output directory for visualisations (plots, heatmaps, charts) of the PCA/KMeans analysis.
pca_fig_path = os.path.join(project_root, "reports", "viz", "kmean")

# Optional path for EDA results (currently disabled).
# eda_data_path = os.path.join(project_root, 'reports', 'eda', 'results')

# Create both directories if they do not exist yet.
# exist_ok=True avoids an error when a directory is already present.
for output_dir in (pca_data_path, pca_fig_path):
    os.makedirs(output_dir, exist_ok=True)

# Optional EDA directory (currently disabled).
# os.makedirs(eda_data_path, exist_ok=True)


## **Import Core Package** 

In [5]:
from core.load_data import load_table

:weißes_häkchen: Connected to PostgreSQL database.


In [6]:
## **Load PCA Components aus CSV**

In [7]:
# Load the two inputs of this notebook through the project's load_table helper:
# the per-user base table and the per-user segment table.
# NOTE(review): judging by the saved log output, both are read from CSV files
# under data/processed/ — confirm against core.load_data.
users = load_table(table_name="user_base", data_type="feature")
user_segment = load_table(table_name="user_segment", data_type="kmean")


 Lade Tabelle 'user_base' aus CSV: /Users/claudiatagbo/Masterschool/traveltide/core/../data/processed/feature_metrics/user_base.csv
 CSV geladen. Zeilen: 5998
 Lade Tabelle 'user_segment' aus CSV: /Users/claudiatagbo/Masterschool/traveltide/core/../data/processed/kmean/user_segment.csv
 CSV geladen. Zeilen: 5998


In [8]:
# Join the user table with the segment table on user_id
# (default inner join: only ids present in both tables are kept).
segment_df = users.merge(user_segment, on="user_id")

# Quick sanity check of the joined frame: dimensions, then column dtypes.
print(segment_df.shape)
display(segment_df.dtypes)

(5998, 38)


user_id                       int64
num_clicks                    int64
avg_session_clicks          float64
num_empty_sessions            int64
num_canceled_trips            int64
num_sessions                  int64
avg_session_duration        float64
num_trips                   float64
num_destinations            float64
num_flights                 float64
num_hotels                  float64
avg_money_spent_flight      float64
avg_time_after_booking      float64
avg_money_spent_per_seat    float64
avg_money_spent_hotel       float64
avg_km_flown                float64
avg_bags                    float64
gender                       object
married                        bool
has_children                   bool
home_country                 object
home_city                    object
age                         float64
pca_0                       float64
pca_1                       float64
pca_2                       float64
pca_3                       float64
pca_4                       

In [9]:
import pandas as pd
def preprocess_travel_data(df: pd.DataFrame) -> pd.DataFrame:
    """
    Return a numerically encoded copy of the travel dataset.

    The identifier/location columns ``user_id`` and ``home_city`` are removed
    (silently skipped when absent) and the remaining categorical/boolean
    features are converted to integers. The input frame is not modified.

    Parameters
    ----------
    df : pd.DataFrame
        Raw feature frame. Must contain the columns ``gender``, ``married``,
        ``has_children`` and ``home_country``; a missing one raises ``KeyError``.

    Returns
    -------
    pd.DataFrame
        New frame with the same column order (minus the dropped columns) where
        ``gender`` is coded F=0, M=1, O=2 (any other value becomes NaN),
        ``married`` and ``has_children`` are cast to int, and ``home_country``
        is 1 for the exact string ``"canada"`` and 0 otherwise.
    """
    gender_codes = {"F": 0, "M": 1, "O": 2}

    # Drop the columns that are not used in the analysis; drop() returns a new frame.
    trimmed = df.drop(columns=["user_id", "home_city"], errors="ignore")

    # Re-encode all four features in one step; assign() keeps the column positions.
    return trimmed.assign(
        gender=lambda frame: frame["gender"].map(gender_codes),
        married=lambda frame: frame["married"].astype(int),
        has_children=lambda frame: frame["has_children"].astype(int),
        # Binary flag: Canada vs. every other country.
        home_country=lambda frame: (frame["home_country"] == "canada").astype(int),
    )

In [10]:
# Encode the merged user/segment frame; segment_df itself is left unchanged.
df = preprocess_travel_data(segment_df)

In [None]:
# Number of users per cluster label.
# NOTE(review): the saved output of this cell is labelled 'cluster_3' and the
# cell is marked as not executed — confirm which cluster column is intended
# and re-run.
df.groupby(by="cluster_5").size()

cluster_3
0    4292
1    1472
2     234
dtype: int64