In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, cross_val_score, StratifiedKFold
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.utils.class_weight import compute_class_weight
import xgboost as xgb
import matplotlib.pyplot as plt
import seaborn as sns

# Read the data

In [None]:
# Input locations, relative to the notebook's working directory.
predictions_path = "../Data_Sources/Data_Cleaned/Predictions/Segmented_Visitor_Demand_Prediction.csv"
modelling_path = "../Data_Sources/Data_Cleaned/Modelling/Table_for_modelling.csv"

# predictions_df: segmented visitor demand forecasts; df: base modelling table.
predictions_df = pd.read_csv(predictions_path)
df = pd.read_csv(modelling_path)

In [None]:
# Preview only the first rows instead of rendering the whole frame.
predictions_df.head()

In [None]:
# Visitor segments for which the predictions file carries a `<segment>_pred`
# and a `<segment>_actual` column.
visitor_segments = [
    'Recreatief NL',
    'Recreatief Buitenland',
    'PO',
    'VO',
    'Student',
    'Extern',
    'Total Visitors',
]

# Columns pulled from the predictions frame: pred/actual per segment, plus the join key.
cols_to_merge = [
    f'{segment}_{suffix}'
    for segment in visitor_segments
    for suffix in ('pred', 'actual')
] + ['Date']

In [None]:
# Left-join the forecast columns onto the modelling table by Date, so every
# modelling row is kept even when no forecast exists for that date.
merged_df = pd.merge(df, predictions_df[cols_to_merge], how='left', on='Date')

# Drop the first 30 rows of the merged frame.
# NOTE(review): presumably a warm-up period without usable forecasts — confirm.
new_df = merged_df.iloc[30:]

# Model Crew Size Prediction

In [None]:
class CrewSizePredictionModel:
    def __init__(self):
        """Create an untrained model with empty preprocessing state."""
        self.model = None                    # set by train()
        self.scaler = StandardScaler()       # fitted in train(), reused in predict()
        self.label_encoder = LabelEncoder()  # maps crew size labels <-> integer classes
        self.feature_names = None            # columns fed to the scaler, set by train()
        self.class_mapping = {}
        # Declared here so the attributes always exist; train() fills them in.
        self.feature_selector = None
        self.selected_features = None

        # Define crew size categories in order of capacity
        self.crew_size_order = ['Gesloten', 'Gesloten maandag', 'A min', 'A', 'B', 'C', 'D']
        
    def clean_crew_size_data(self, df):
        """Clean and prepare crew size data"""
        df_clean = df.copy()
        
        # Handle NaN values in maat_visitors
        # You might want to fill these based on business logic
        df_clean['maat_visitors'] = df_clean['maat_visitors'].fillna('Unknown')
        
        # Create a mapping for crew sizes to numerical values for ordering
        crew_mapping = {size: idx for idx, size in enumerate(self.crew_size_order)}
        crew_mapping['Unknown'] = -1  # For NaN values
        
        df_clean['crew_size_numeric'] = df_clean['maat_visitors'].map(crew_mapping)
        
        print("Crew size distribution:")
        print(df_clean['maat_visitors'].value_counts())
        
        return df_clean
    
    def engineer_crew_features(self, df):
        """Engineer features specifically for crew size prediction with rolling and lagged features"""
        df_features = df.copy()
        
        # Ensure Date is datetime
        if 'Date' in df.columns:
            df_features['Date'] = pd.to_datetime(df_features['Date'])
            df_features = df_features.sort_values('Date').reset_index(drop=True)
        
        # Calculate visitor type ratios
        if 'Total Visitors_pred' in df.columns:
            total_pred = df_features['Total Visitors_pred']
            df_features['recreatief_nl_ratio'] = df_features['Recreatief NL_pred'] / (total_pred + 1)
            df_features['recreatief_buitenland_ratio'] = df_features['Recreatief Buitenland_pred'] / (total_pred + 1)
            df_features['educational_ratio'] = (
                df_features['PO_pred'] + df_features['VO_pred'] + df_features['Student_pred']
            ) / (total_pred + 1)
            df_features['extern_ratio'] = df_features['Extern_pred'] / (total_pred + 1)
        
        # Time-based features
        if 'Date' in df.columns:
            df_features['day_of_week'] = df_features['Date'].dt.dayofweek
            df_features['is_weekend'] = df_features['day_of_week'].isin([5, 6]).astype(int)
            df_features['is_monday'] = (df_features['day_of_week'] == 0).astype(int)
            df_features['month'] = df_features['Date'].dt.month
            df_features['is_summer'] = df_features['month'].isin([6, 7, 8]).astype(int)
        
        # Weather impact on crew needs
        if 'MeanTemp_C' in df.columns and 'Precipitation_mm' in df.columns:
            df_features['good_weather'] = (
                (df_features['MeanTemp_C'] > 15) & 
                (df_features['Precipitation_mm'] < 1)
            ).astype(int)
            df_features['bad_weather'] = (
                (df_features['MeanTemp_C'] < 10) | 
                (df_features['Precipitation_mm'] > 5)
            ).astype(int)
        
        # Holiday and special events
        holiday_cols = [col for col in df.columns if 'holiday' in col.lower()]
        if holiday_cols:
            df_features['any_holiday'] = df_features[holiday_cols].max(axis=1)
        
        # Operational factors
        if 'is_open' in df.columns:
            df_features['is_open'] = df_features['is_open'].fillna(1)
        
        # Museum capacity pressure
        if 'Total Visitors_pred' in df.columns:
            df_features['high_capacity_day'] = (df_features['Total Visitors_pred'] > df_features['Total Visitors_pred'].quantile(0.8)).astype(int)
            df_features['low_capacity_day'] = (df_features['Total Visitors_pred'] < df_features['Total Visitors_pred'].quantile(0.2)).astype(int)
        
        # === LAGGED FEATURES ===
        # Visitor prediction lags
        visitor_pred_cols = [col for col in df_features.columns if col.endswith('_pred')]
        lags = [1, 7, 14]  # 1 day, 1 week, 2 weeks
        
        for lag in lags:
            for col in visitor_pred_cols:
                df_features[f'{col}_lag_{lag}'] = df_features[col].shift(lag)
            
            # Total visitors lag
            if 'Total Visitors_pred' in df.columns:
                df_features[f'total_visitors_lag_{lag}'] = df_features['Total Visitors_pred'].shift(lag)
        
        # Historical crew size patterns (if available)
        if 'maat_visitors' in df.columns:
            # Encode crew sizes numerically for lagged features
            crew_size_mapping = {
                'Gesloten': 0,
                'Gesloten maandag': 1, 
                'A min': 2,
                'A': 3,
                'B': 4,
                'C': 5,
                'D': 6
            }
            
            df_features['crew_size_numeric'] = df_features['maat_visitors'].map(crew_size_mapping)
            df_features['crew_size_numeric'] = df_features['crew_size_numeric'].fillna(-1)
            
            # Lagged crew sizes
            for lag in lags:
                df_features[f'crew_size_lag_{lag}'] = df_features['crew_size_numeric'].shift(lag)
            
            # Same day last week crew size
            df_features['crew_size_last_week'] = df_features['crew_size_numeric'].shift(7)
        
        # === ROLLING FEATURES ===
        windows = [7, 14]  # 1 week, 2 weeks
        
        # Rolling visitor statistics
        for window in windows:
            for col in visitor_pred_cols:
                df_features[f'{col}_rolling_mean_{window}'] = df_features[col].shift(1).rolling(window=window).mean()
                df_features[f'{col}_rolling_std_{window}'] = df_features[col].shift(1).rolling(window=window).std()
                df_features[f'{col}_rolling_max_{window}'] = df_features[col].shift(1).rolling(window=window).max()
        
        # Rolling crew size patterns (mode for categorical data)
        if 'crew_size_numeric' in df_features.columns:
            for window in windows:
                # Most frequent crew size (mode) using numeric values
                def numeric_mode(x):
                    """Calculate mode for numeric crew sizes"""
                    if len(x) == 0 or x.isna().all():
                        return -1  # Unknown
                    mode_result = x.mode()
                    return mode_result.iloc[0] if len(mode_result) > 0 else -1
                
                df_features[f'crew_mode_numeric_{window}'] = (
                    df_features['crew_size_numeric']
                    .shift(1)
                    .rolling(window=window)
                    .apply(numeric_mode, raw=False)
                )
                
                # Crew size stability (consistency)
                def crew_stability(x):
                    """Calculate how consistent crew sizing has been"""
                    if len(x) == 0 or x.isna().all():
                        return 0
                    mode_result = x.mode()
                    if len(mode_result) == 0:
                        return 0
                    most_common = mode_result.iloc[0]
                    return (x == most_common).mean()
                
                df_features[f'crew_stability_{window}'] = (
                    df_features['crew_size_numeric']
                    .shift(1)
                    .rolling(window=window)
                    .apply(crew_stability, raw=False)
                )
                
                # Average crew size level over window
                df_features[f'crew_avg_{window}'] = (
                    df_features['crew_size_numeric']
                    .shift(1)
                    .rolling(window=window)
                    .mean()
                )
                
                # Crew size trend (is it increasing/decreasing?)
                df_features[f'crew_trend_{window}'] = (
                    df_features['crew_size_numeric'] - 
                    df_features['crew_size_numeric'].shift(window)
                )
        
        # Rolling weather patterns
        if 'good_weather' in df_features.columns:
            for window in windows:
                df_features[f'good_weather_freq_{window}'] = (
                    df_features['good_weather']
                    .shift(1)
                    .rolling(window=window)
                    .mean()
                )
        
        # Rolling holiday density
        if 'any_holiday' in df_features.columns:
            for window in windows:
                df_features[f'holiday_density_{window}'] = (
                    df_features['any_holiday']
                    .shift(1)
                    .rolling(window=window)
                    .mean()
                )
        
        # === SEASONAL PATTERNS ===
        if 'Date' in df.columns:
            # Same weekday patterns
            df_features['weekday'] = df_features['Date'].dt.dayofweek
            
            # Average crew size for this weekday in the past (if available)
            if 'crew_size_numeric' in df_features.columns:
                weekday_crew_avg = df_features.groupby('weekday')['crew_size_numeric'].expanding().mean()
                df_features['weekday_crew_avg'] = weekday_crew_avg.reset_index(level=0, drop=True)
                df_features['weekday_crew_avg'] = df_features['weekday_crew_avg'].shift(1)  # Don't use current day
        
        # === INTERACTION FEATURES ===
        # High visitor days with weather
        if 'Total Visitors_pred' in df_features.columns and 'good_weather' in df_features.columns:
            df_features['high_visitors_good_weather'] = (
                df_features['high_capacity_day'] * df_features['good_weather']
            )
        
        # Weekend + holiday interaction
        if 'is_weekend' in df_features.columns and 'any_holiday' in df_features.columns:
            df_features['weekend_holiday'] = (
                df_features['is_weekend'] * df_features['any_holiday']
            )
        
        # === RECENT TREND FEATURES ===
        # Visitor trend (is it increasing/decreasing?)
        if 'Total Visitors_pred' in df_features.columns:
            df_features['visitor_trend_3d'] = (
                df_features['Total Visitors_pred'] - 
                df_features['Total Visitors_pred'].shift(3)
            )
            df_features['visitor_trend_7d'] = (
                df_features['Total Visitors_pred'] - 
                df_features['Total Visitors_pred'].shift(7)
            )
        
        print(f"Feature engineering complete. Shape: {df_features.shape}")
        print(f"Added lagged and rolling features for crew size prediction")
        
        return df_features
    
    def select_features(self, df):
        """Select relevant features for crew size prediction including lagged and rolling features"""
        # Primary features: predicted visitor numbers
        feature_cols = [col for col in df.columns if col.endswith('_pred')]
        
        # Basic engineered features
        engineered_features = [
            'recreatief_nl_ratio', 'recreatief_buitenland_ratio',
            'educational_ratio', 'extern_ratio', 'day_of_week', 'is_weekend', 
            'is_monday', 'month', 'is_summer', 'good_weather', 'bad_weather',
            'any_holiday', 'is_open', 'high_capacity_day', 'low_capacity_day'
        ]
        
        # Weather and operational features
        additional_features = [
            'MeanTemp_C', 'Precipitation_mm', 'school_holiday', 'public_holiday',
            'Events_in_Ams', 'hotel_occupancy_index', 'peak_season_flag'
        ]
        
        # === LAGGED FEATURES ===
        lagged_features = []
        
        # Visitor prediction lags (1, 7, 14 days)
        visitor_pred_base = [col for col in df.columns if col.endswith('_pred')]
        for base_col in visitor_pred_base:
            for lag in [1, 7, 14]:
                lagged_features.append(f'{base_col}_lag_{lag}')
        
        # Total visitors lags
        for lag in [1, 7, 14]:
            lagged_features.append(f'total_visitors_lag_{lag}')
        
        # Historical crew size lags (if available)
        crew_lag_features = [
            'crew_size_lag_1', 'crew_size_lag_7', 'crew_size_lag_14',
            'crew_size_last_week'
        ]
        lagged_features.extend(crew_lag_features)
        
        # === ROLLING FEATURES ===
        rolling_features = []
        
        # Rolling visitor statistics (7, 14, 30 day windows)
        for window in [7, 14, 30]:
            # For each visitor prediction column
            for base_col in visitor_pred_base:
                rolling_features.extend([
                    f'{base_col}_rolling_mean_{window}',
                    f'{base_col}_rolling_std_{window}',
                    f'{base_col}_rolling_max_{window}'
                ])
        
        # Rolling crew size patterns
        crew_rolling_features = []
        for window in [7, 14, 30]:
            crew_rolling_features.extend([
                f'crew_mode_numeric_{window}',    # Most frequent crew size
                f'crew_stability_{window}',       # How consistent crew sizing was
            ])
        rolling_features.extend(crew_rolling_features)
        
        # Rolling weather and holiday patterns
        weather_rolling_features = []
        for window in [7, 14, 30]:
            weather_rolling_features.extend([
                f'good_weather_freq_{window}',    # Good weather frequency
                f'holiday_density_{window}',      # Holiday density
            ])
        rolling_features.extend(weather_rolling_features)
        
        # === SEASONAL PATTERN FEATURES ===
        seasonal_features = [
            'weekday_crew_avg',               # Average crew size for this weekday
        ]
        
        # === INTERACTION FEATURES ===
        interaction_features = [
            'high_visitors_good_weather',     # High capacity + good weather
            'weekend_holiday',                # Weekend + holiday interaction
        ]
        
        # === TREND FEATURES ===
        trend_features = [
            'visitor_trend_3d',               # 3-day visitor change
            'visitor_trend_7d',               # 7-day visitor change
        ]
        
        # === COMBINE ALL FEATURE CATEGORIES ===
        all_features = (
            feature_cols +                    # Visitor predictions
            engineered_features +             # Basic engineered features
            additional_features +             # Weather/operational features
            lagged_features +                 # Lagged features
            rolling_features +                # Rolling window features
            seasonal_features +               # Seasonal patterns
            interaction_features +            # Feature interactions
            trend_features                    # Trend features
        )
        
        # Filter to only include features that actually exist in the dataframe
        selected_features = [col for col in all_features if col in df.columns]
        
        # Print feature summary
        feature_categories = {
            'Visitor Predictions': [col for col in feature_cols if col in df.columns],
            'Basic Features': [col for col in engineered_features if col in df.columns],
            'Weather/Operational': [col for col in additional_features if col in df.columns],
            'Lagged Features': [col for col in lagged_features if col in df.columns],
            'Rolling Features': [col for col in rolling_features if col in df.columns],
            'Seasonal Features': [col for col in seasonal_features if col in df.columns],
            'Interaction Features': [col for col in interaction_features if col in df.columns],
            'Trend Features': [col for col in trend_features if col in df.columns],
        }
        
        print("=== FEATURE SELECTION SUMMARY ===")
        for category, features in feature_categories.items():
            print(f"{category}: {len(features)} features")
            if len(features) <= 5:  # Show all if 5 or fewer
                print(f"  {features}")
            else:  # Show first 3 and last 2 if more than 5
                print(f"  {features[:3]} ... {features[-2:]}")
        
        print(f"\nTotal selected features: {len(selected_features)}")
        
        return selected_features
    
    
    def prepare_data(self, df):
        """Prepare data for training"""
        # Clean crew size data
        df_clean = self.clean_crew_size_data(df)
        
        # Remove rows where museum is closed or crew size is unknown
        # You might want to predict these separately
        df_model = df_clean[
            (~df_clean['maat_visitors'].isin(['Gesloten', 'Unknown', 'Gesloten maandag'])) &
            (df_clean['is_open'] == 1)
        ].copy()
        
        print(f"Training data shape after filtering: {df_model.shape}")
        print("Remaining crew size distribution:")
        print(df_model['maat_visitors'].value_counts())
        
        # Engineer features
        df_features = self.engineer_crew_features(df_model)
        
        # Select features
        feature_cols = self.select_features(df_features)
        
        X = df_features[feature_cols]
        y = df_features['maat_visitors']
        
        # Handle any remaining NaN values
        X = X.fillna(X.median())
        
        return X, y, feature_cols


    def plot_confusion_matrix(self, y_true, y_pred):
        """Show an annotated heatmap of the confusion matrix of ``y_true`` vs ``y_pred``.

        Rows are actual classes and columns are predicted classes. Tick labels
        come from the fitted label encoder.
        """
        matrix = confusion_matrix(y_true, y_pred)

        fig, ax = plt.subplots(figsize=(8, 6))

        # NOTE(review): tick labels line up with the matrix only when every
        # encoder class occurs in y_true or y_pred — confirm for new callers.
        class_names = self.label_encoder.classes_
        sns.heatmap(
            matrix,
            annot=True,
            fmt='d',
            cmap='Blues',
            xticklabels=class_names,
            yticklabels=class_names,
            ax=ax,
        )
        ax.set_title('Confusion Matrix - Crew Size Prediction')
        ax.set_xlabel('Predicted')
        ax.set_ylabel('Actual')
        fig.tight_layout()
        plt.show()

    
    def train(self, df):
        """Fit the scaler, feature selector and XGBoost classifier on ``df``.

        The data is prepared with ``prepare_data``, labels are integer-encoded,
        and an 80/20 stratified split is made. The scaler and a ``SelectKBest``
        selector (at most 20 features) are fitted on the training part only.
        Prints accuracy figures and a test-set classification report, and shows
        a confusion matrix and a feature importance plot.

        Returns:
            Tuple ``(X_train_selected, X_test_selected, y_train, y_test, y_pred_test)``.
        """
        features, labels, feature_cols = self.prepare_data(df)
        self.feature_names = feature_cols

        # Integer-encode the crew size labels
        encoded_labels = self.label_encoder.fit_transform(labels)

        # Split before any fitting so the test part stays unseen by scaler/selector
        X_train, X_test, y_train, y_test = train_test_split(
            features,
            encoded_labels,
            test_size=0.2,
            random_state=42,
            stratify=encoded_labels,
        )

        X_train_scaled = self.scaler.fit_transform(X_train)
        X_test_scaled = self.scaler.transform(X_test)

        # Univariate feature selection to reduce overfitting
        n_keep = min(20, len(feature_cols))
        selector = SelectKBest(score_func=f_classif, k=n_keep)
        X_train_selected = selector.fit_transform(X_train_scaled, y_train)
        X_test_selected = selector.transform(X_test_scaled)

        # Keep the selector and the surviving column names for predict()/plots
        self.feature_selector = selector
        kept_positions = selector.get_support(indices=True)
        self.selected_features = [feature_cols[pos] for pos in kept_positions]
        print(f"Selected {len(self.selected_features)} features: {self.selected_features}")

        # Deliberately small, regularized model
        xgb_params = {
            'n_estimators': 200,
            'learning_rate': 0.08,
            'max_depth': 3,
            'subsample': 0.8,
            'colsample_bytree': 0.7,
            'min_child_weight': 5,
            'reg_alpha': 0.2,   # L1 regularization
            'reg_lambda': 1.5,  # L2 regularization
            'random_state': 42,
            'eval_metric': 'mlogloss',
        }
        self.model = xgb.XGBClassifier(**xgb_params)

        # Plain fit, no early stopping
        self.model.fit(X_train_selected, y_train)

        y_pred_test = self.model.predict(X_test_selected)
        y_pred_train = self.model.predict(X_train_selected)

        # The confusion matrix covers test and train rows together
        self.plot_confusion_matrix(
            np.concatenate([y_test, y_train]),
            np.concatenate([y_pred_test, y_pred_train]),
        )

        train_accuracy = accuracy_score(y_train, y_pred_train)
        test_accuracy = accuracy_score(y_test, y_pred_test)
        print(f"\nTrain Accuracy: {train_accuracy:.4f}")
        print(f"Test Accuracy: {test_accuracy:.4f}")
        print(f"Overfitting Gap: {train_accuracy - test_accuracy:.4f}")

        print("\nTest Set Classification Report:")
        report = classification_report(
            y_test,
            y_pred_test,
            target_names=self.label_encoder.classes_,
        )
        print(report)

        self.plot_feature_importance_selected()

        return X_train_selected, X_test_selected, y_train, y_test, y_pred_test
    
    
    def plot_feature_importance_selected(self, top_n=15):
        """Plot feature importance for selected features"""
        if self.model is None:
            return
        
        importance_df = pd.DataFrame({
            'feature': self.selected_features,
            'importance': self.model.feature_importances_
        }).sort_values('importance', ascending=False)
        
        plt.figure(figsize=(10, 6))
        sns.barplot(
            data=importance_df.head(top_n),
            x='importance',
            y='feature'
        )
        plt.title('Feature Importance - Selected Features Only')
        plt.tight_layout()
        plt.show()
    

    def predict(self, df):
        """Predict with feature selection"""
        if self.model is None:
            raise ValueError("Model not trained yet. Call train() first.")
        
        # Engineer features
        df_features = self.engineer_crew_features(df)
        
        # Select original features
        X = df_features[self.feature_names]
        X = X.fillna(X.median())
        
        # Scale features
        X_scaled = self.scaler.transform(X)
        
        # Apply feature selection
        X_selected = self.feature_selector.transform(X_scaled)
        
        # Predict
        y_pred_encoded = self.model.predict(X_selected)
        y_pred_proba = self.model.predict_proba(X_selected)
        
        # Decode predictions
        y_pred = self.label_encoder.inverse_transform(y_pred_encoded)
        
        # Create results dataframe
        results = pd.DataFrame({
            'Date': df['Date'] if 'Date' in df.columns else range(len(df)),
            'predicted_crew_size': y_pred,
            'prediction_confidence': y_pred_proba.max(axis=1)
        })
        
        # Add probability for each class
        for i, class_name in enumerate(self.label_encoder.classes_):
            results[f'prob_{class_name}'] = y_pred_proba[:, i]
        
        return results
    

    def analyze_crew_patterns(self, df):
        """Analyze patterns in crew size assignments"""
        df_clean = self.clean_crew_size_data(df)
        
        # Visitor count vs crew size analysis
        if 'Total Visitors_actual' in df.columns:
            crew_visitor_analysis = df_clean.groupby('maat_visitors').agg({
                'Total Visitors_actual': ['mean', 'median', 'std', 'min', 'max'],
                'Date': 'count'
            }).round(2)
            
            print("Visitor Statistics by Crew Size:")
            print(crew_visitor_analysis)
        
        # Day of week patterns
        if 'Date' in df.columns:
            df_clean['Date'] = pd.to_datetime(df_clean['Date'])
            df_clean['day_of_week'] = df_clean['Date'].dt.day_name()
            
            day_crew_crosstab = pd.crosstab(
                df_clean['day_of_week'], 
                df_clean['maat_visitors'], 
                normalize='index'
            ) * 100
            
            print("\nCrew Size Distribution by Day of Week (%):")
            print(day_crew_crosstab.round(1))
        
        return crew_visitor_analysis if 'Total Visitors_actual' in df.columns else None

In [None]:
if __name__ == "__main__":
    # Use the merged frame (new_df): it is the one that carries the *_pred and
    # *_actual visitor columns the model's features and the pattern analysis
    # are built on. The raw modelling table `df` was being passed instead,
    # which left the merge result unused.
    # NOTE(review): confirm that `df` does not already contain these columns.
    crew_model = CrewSizePredictionModel()

    crew_model.analyze_crew_patterns(new_df)

    crew_model.train(new_df)

    # In-sample predictions for the same rows the model was trained from.
    predictions = crew_model.predict(new_df)
    predictions.to_csv("../Data_Sources/Data_Cleaned/Predictions/Crew_Size_Predictions.csv", index=False)