In [13]:
# Notebook: notebooks/load_dat.ipynb

import os
import sys
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.cluster import KMeans # For KMeans clustering
from sklearn.metrics import silhouette_score # For evaluating clustering performance
from sklearn.metrics import silhouette_score, davies_bouldin_score, calinski_harabasz_score

# ============================================================
# Put the parent of the current working directory on sys.path.
# The later `import core` cell relies on this (presumably `core`
# lives directly under that directory — verify against the repo layout).
# ============================================================
project_root = os.path.abspath(os.pardir)

# Append only once so re-running the cell does not grow sys.path.
if sys.path.count(project_root) == 0:
    sys.path.append(project_root)

print("Project root added to sys.path:", project_root)

Project root added to sys.path: /Users/sadiqqais/Masterschool/Projekt/Mastery_projekt/Travel_tide


In [14]:
# Sanity check: try to import the `core` package and report where it was loaded from.
# Only ModuleNotFoundError is handled; any other import-time error still propagates.
try:
    import core
except ModuleNotFoundError:
    print("❌ core module not found. Check your sys.path!")
else:
    print("✅ core module found at:", core.__file__)

✅ core module found at: /Users/sadiqqais/Masterschool/Projekt/Mastery_projekt/Travel_tide/core/__init__.py


In [15]:
# Output locations for the PCA / KMeans results:
#   - pca_data_path: <project_root>/data/processed/kmean
#   - pca_fig_path:  <project_root>/reports/viz/kmean
pca_data_path = os.path.join(project_root, 'data', 'processed', 'kmean')
pca_fig_path = os.path.join(project_root, 'reports', 'viz', 'kmean')

# Create the "kmean" folder inside the processed-data directory if it is missing.
# exist_ok=True makes the call safe to re-run, so one call is sufficient.
os.makedirs(pca_data_path, exist_ok=True)

# NOTE(review): pca_fig_path is only defined here, not created on disk.
# Call os.makedirs(pca_fig_path, exist_ok=True) before saving figures —
# confirm whether that directory should be created in this cell.

In [16]:
from core.load_data import load_table

In [17]:
# Load the two inputs through the project loader.
# Both tables are later joined on `user_id`.
users = load_table(table_name='user_base', data_type='feature')
user_segment = load_table(table_name='user_segment', data_type='kmean')

:aktenordner: Lade Tabelle 'user_base' aus CSV: /Users/sadiqqais/Masterschool/Projekt/Mastery_projekt/Travel_tide/core/../data/processed/feature_metrics/user_base.csv
:weißes_häkchen: CSV geladen. Zeilen: 5998
:aktenordner: Lade Tabelle 'user_segment' aus CSV: /Users/sadiqqais/Masterschool/Projekt/Mastery_projekt/Travel_tide/core/../data/processed/kmean/user_segment.csv
:weißes_häkchen: CSV geladen. Zeilen: 5998


In [18]:
# Join the user features with the segment table on `user_id`
# (default merge semantics, i.e. an inner join), then inspect the result.
segment_df = users.merge(user_segment, on='user_id')
print(segment_df.shape)
display(segment_df.dtypes)

(5998, 38)


user_id                       int64
num_clicks                    int64
avg_session_clicks          float64
num_empty_sessions            int64
num_canceled_trips            int64
num_sessions                  int64
avg_session_duration        float64
num_trips                   float64
num_destinations            float64
num_flights                 float64
num_hotels                  float64
avg_money_spent_flight      float64
avg_time_after_booking      float64
avg_money_spent_per_seat    float64
avg_money_spent_hotel       float64
avg_km_flown                float64
avg_bags                    float64
gender                       object
married                        bool
has_children                   bool
home_country                 object
home_city                    object
age                         float64
pca_0                       float64
pca_1                       float64
pca_2                       float64
pca_3                       float64
pca_4                       

In [19]:
import pandas as pd
def preprocess_travel_data(df: pd.DataFrame) -> pd.DataFrame:
    """
    Turn the merged travel/user table into an all-numeric frame.

    Steps:
    - drop the identifier-like columns ``user_id`` and ``home_city``
      (silently skipped when absent)
    - encode ``gender`` as an integer code (F=0, M=1, O=2)
    - cast the boolean flags ``married`` and ``has_children`` to 0/1
    - replace ``home_country`` with a 0/1 indicator for the exact
      value ``"canada"``

    Parameters
    ----------
    df : pd.DataFrame
        Input frame. Must contain ``gender``, ``married``,
        ``has_children`` and ``home_country``; a missing one raises
        ``KeyError``.

    Returns
    -------
    pd.DataFrame
        A new frame with the columns above re-encoded; all remaining
        columns are passed through unchanged. The input frame is not
        modified because the work is done on the result of ``drop``.
    """
    gender_codes = {"F": 0, "M": 1, "O": 2}

    # `drop` returns a new frame, so the assignments below never touch the caller's data.
    prepared = df.drop(columns=["user_id", "home_city"], errors="ignore")

    # Values outside the mapping are not handled specially here.
    prepared["gender"] = prepared["gender"].map(gender_codes)

    for flag in ("married", "has_children"):
        prepared[flag] = prepared[flag].astype(int)

    # Binary indicator: 1 for "canada" (exact, case-sensitive match), 0 for anything else.
    prepared["home_country"] = prepared["home_country"].eq("canada").astype(int)

    return prepared

In [20]:
# Encode the merged table for analysis; `segment_df` itself is left untouched.
df = preprocess_travel_data(segment_df)
display(df.dtypes)

num_clicks                    int64
avg_session_clicks          float64
num_empty_sessions            int64
num_canceled_trips            int64
num_sessions                  int64
avg_session_duration        float64
num_trips                   float64
num_destinations            float64
num_flights                 float64
num_hotels                  float64
avg_money_spent_flight      float64
avg_time_after_booking      float64
avg_money_spent_per_seat    float64
avg_money_spent_hotel       float64
avg_km_flown                float64
avg_bags                    float64
gender                        int64
married                       int64
has_children                  int64
home_country                  int64
age                         float64
pca_0                       float64
pca_1                       float64
pca_2                       float64
pca_3                       float64
pca_4                       float64
pca_5                       float64
pca_6                       

In [26]:
# Number of rows per value of `cluster_7` (presumably the 7-cluster KMeans label — TODO confirm).
df.groupby(by="cluster_7").size()

cluster_7
0    2373
1     948
2      45
3    1786
4     120
5     404
6     322
dtype: int64