In [None]:
# Part 1: Import necessary libraries
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, StratifiedKFold, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression, Lasso
from sklearn.svm import SVC
from xgboost import XGBClassifier
from sklearn.metrics import f1_score, accuracy_score, mean_squared_error, mean_absolute_error
from imblearn.under_sampling import RandomUnderSampler
from imblearn.over_sampling import SMOTE
from sklearn.feature_selection import SelectFromModel

In [None]:
# Configuration phase: determine which feature set, balancing technique and model perform best, using nested 5-fold cross-validation.

class FeatureBlockHandler:
    """Group feature columns into named blocks and run model-based selection.

    The four column collections passed to the constructor are stored under
    the keys 'demographic', 'health_lifestyle', 'personality' and
    'politics_values'. The extra key 'all' holds their concatenation in that
    order, with any column name that appears in more than one block kept
    only once.
    """

    def __init__(self, demographic_columns, health_lifestyle_columns, 
                 personality_columns, politics_values_columns):
        # Copy every input into a fresh list: the handler must not change when
        # the caller later mutates its own lists, and tuples / pandas Index
        # objects become acceptable inputs as well.
        self.blocks = {
            'demographic': list(demographic_columns),
            'health_lifestyle': list(health_lifestyle_columns),
            'personality': list(personality_columns),
            'politics_values': list(politics_values_columns),
        }
        # dict.fromkeys keeps first-seen order while dropping repeats, so a
        # column listed in two blocks is not selected twice by X[...].
        self.blocks['all'] = list(dict.fromkeys(
            column for columns in self.blocks.values() for column in columns
        ))
        
    def get_block(self, X, block_type):
        """Returns the features for the specified block.

        Parameters:
        -----------
        X : pandas DataFrame
            Input features; indexed with the block's list of column names.
        block_type : str
            One of the keys of ``self.blocks``.

        Raises:
        -------
        ValueError
            If ``block_type`` is not a known block name.
        """
        if block_type not in self.blocks:
            raise ValueError(f"Block type must be one of {list(self.blocks.keys())}")
        return X[self.blocks[block_type]]

    def get_selected_features(self, X, y, method='none', threshold='median'):
        """
        Apply feature selection on ALL features using various methods.
        
        Parameters:
        -----------
        X : pandas DataFrame
            Input features (all features, not block-specific)
        y : array-like
            Target variable
        method : str
            'none', 'lasso', or 'rf'
        threshold : str or float
            Threshold for feature selection, 'median' or float value
        
        Returns:
        --------
        selected_features : list
            List of selected feature names
        selector : object
            Fitted selector object (None when method is 'none')

        Raises:
        -------
        ValueError
            If ``method`` is not one of the three supported names.
        """
        if method == 'none':
            # No selection requested: every column of X is kept.
            return list(X.columns), None
            
        elif method == 'lasso':
            # NOTE(review): if Lasso zeroes out more than half of the
            # coefficients, a 'median' threshold presumably evaluates to 0 and
            # every feature passes -- confirm, and consider a float threshold.
            selector = SelectFromModel(
                Lasso(random_state=42),
                threshold=threshold
            )
            
        elif method == 'rf':
            selector = SelectFromModel(
                RandomForestClassifier(random_state=42),
                threshold=threshold
            )
            
        else:
            raise ValueError("Method must be one of ['none', 'lasso', 'rf']")
        
        # Fit selector and get the boolean mask of retained columns
        selector.fit(X, y)
        feature_mask = selector.get_support()
        
        # Translate the mask back into column names
        selected_features = X.columns[feature_mask].tolist()
        
        return selected_features, selector

def find_best_combination(self, X, y, feature_blocks):
    """
    Find the best combination comparing:
    1. Different blocks without feature selection
    2. Feature selection methods (lasso, rf) on all features

    Every (balancing method, feature approach, model) configuration is
    tuned with an inner 3-fold GridSearchCV (scoring='f1') on the balanced
    training part of each of 5 outer stratified folds, then scored with F1
    on that fold's untouched validation part.

    Parameters:
    -----------
    X : pandas DataFrame
        All features; split 80/20 and only the 80% part is used here.
    y : pandas Series
        Target; indexed positionally with ``.iloc``.
    feature_blocks : FeatureBlockHandler
        Supplies ``get_block`` and ``get_selected_features``.

    Returns:
    --------
    best_config : dict or None
        Keys 'balancing', 'approach_type', 'approach_name', 'model',
        'params' for the configuration with the highest MEAN outer-fold F1.
        'params' are the grid-search parameters from the fold in which that
        configuration scored highest. None if nothing was evaluated.
    results : dict
        Maps "<balancing>_<approach_type>_<approach_name>_<model>" to the
        mean F1 over the outer folds.
    """
    # Only the training part is used; the hold-out part stays unseen.
    X_train, _, y_train, _ = train_test_split(
        X, y, test_size=0.2, stratify=y, random_state=self.random_state
    )
    
    # Define approaches to compare
    approaches = [
        # Block-based approaches (no feature selection)
        {'type': 'block', 'name': 'demographic'},
        {'type': 'block', 'name': 'health_lifestyle'},
        {'type': 'block', 'name': 'personality'},
        {'type': 'block', 'name': 'politics_values'},
        # Feature selection approaches (on all features)
        {'type': 'feature_selection', 'name': 'lasso'},
        {'type': 'feature_selection', 'name': 'rf'}
    ]
    
    # Outer CV for honest performance estimation
    outer_cv = StratifiedKFold(n_splits=5, shuffle=True, 
                             random_state=self.random_state)
    
    fold_scores = {}  # config name -> list of outer-fold F1 scores
    best_fold = {}    # config name -> (best fold score, config dict of that fold)
    
    for balancing in ['undersampling', 'oversampling', 'smote']:
        print(f"\nTrying balancing method: {balancing}")
        
        for train_idx, val_idx in outer_cv.split(X_train, y_train):
            X_fold_train, X_fold_val = X_train.iloc[train_idx], X_train.iloc[val_idx]
            y_fold_train, y_fold_val = y_train.iloc[train_idx], y_train.iloc[val_idx]
            
            # Balance the training part only; validation keeps its real class ratio.
            X_balanced, y_balanced = self.apply_balancing(
                X_fold_train, y_fold_train, balancing
            )
            
            for approach in approaches:
                approach_type = approach['type']
                approach_name = approach['name']
                
                if approach_type == 'block':
                    # Use block-based features
                    X_features = feature_blocks.get_block(X_balanced, approach_name)
                    X_val_features = feature_blocks.get_block(X_fold_val, approach_name)
                else:  # feature_selection
                    # Select on the balanced training data, then apply the
                    # same column subset to the validation data.
                    selected_features, _ = feature_blocks.get_selected_features(
                        X_balanced, y_balanced, method=approach_name
                    )
                    X_features = X_balanced[selected_features]
                    X_val_features = X_fold_val[selected_features]
                
                # Try different models with the selected features
                for model_name, model in self.models.items():
                    print(f"  Trying {approach_type}:{approach_name} with {model_name}")
                    
                    # NOTE(review): the inner CV runs on already-balanced data,
                    # so its folds may share resampled rows -- confirm this is
                    # acceptable for hyper-parameter tuning.
                    grid_search = GridSearchCV(
                        model, 
                        self.model_params[model_name],
                        cv=3,
                        scoring='f1'
                    )
                    
                    grid_search.fit(X_features, y_balanced)
                    y_pred = grid_search.predict(X_val_features)
                    score = f1_score(y_fold_val, y_pred)
                    
                    config_name = f"{balancing}_{approach_type}_{approach_name}_{model_name}"
                    fold_scores.setdefault(config_name, []).append(score)
                    
                    # Remember the parameters from this configuration's best fold.
                    if config_name not in best_fold or score > best_fold[config_name][0]:
                        best_fold[config_name] = (score, {
                            'balancing': balancing,
                            'approach_type': approach_type,
                            'approach_name': approach_name,
                            'model': model_name,
                            'params': grid_search.best_params_
                        })
    
    # Calculate average scores across folds
    results = {name: np.mean(scores) for name, scores in fold_scores.items()}
    
    # Pick the winner by mean score, so one lucky fold cannot decide it and
    # best_config always agrees with the returned results.
    best_config = None
    if results:
        best_config = best_fold[max(results, key=results.get)][1]
    
    return best_config, results

In [None]:
# Split the dataset again to recover the hold-out test set at notebook level.
# There is no `self` in cell scope, so the seed is named explicitly here.
# NOTE(review): 42 mirrors the seed hard-coded in FeatureBlockHandler. It must
# equal the random_state of the object that ran find_best_combination,
# otherwise this split differs from the one used for model selection and the
# test rows may have been seen during tuning -- confirm the value.
RANDOM_STATE = 42
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=RANDOM_STATE
)



In [None]:
# Train the model with the best configuration and collect its per-fold cross-validation scores.

from sklearn.model_selection import cross_val_score

def train_final_model(self, X, y, best_config, feature_blocks):
    """Cross-validate and fit the configuration chosen during model selection.

    Balancing and feature selection are repeated inside every CV fold on the
    training part only, and each fold is scored on its untouched validation
    part. Resampling before splitting would place resampled rows in the
    validation folds and inflate the score.

    Parameters:
    -----------
    X : pandas DataFrame
        Training features (column-name indexing is required).
    y : pandas Series or array-like
        Training target.
    best_config : dict
        Must provide 'balancing', 'approach_type', 'approach_name', 'model'
        and 'params'.
    feature_blocks : FeatureBlockHandler
        Supplies ``get_block`` and ``get_selected_features``.

    Returns:
    --------
    final_model : estimator
        A fresh clone of ``self.models[best_config['model']]`` with
        ``best_config['params']`` applied, fitted on the balanced full data.
        The estimator stored in ``self.models`` is left untouched.
    test_score : float
        Mean F1 over 5 stratified folds.
    """
    # Local import: clone is not among the names the file imports at the top.
    from sklearn.base import clone

    def _take(data, idx):
        # Positional row selection for pandas objects and plain sequences.
        return data.iloc[idx] if hasattr(data, 'iloc') else np.asarray(data)[idx]

    def _feature_names(X_bal, y_bal):
        # Column names the chosen approach uses, derived from the given data.
        if best_config['approach_type'] == 'block':
            block = feature_blocks.get_block(X_bal, best_config['approach_name'])
            return list(block.columns)
        selected, _ = feature_blocks.get_selected_features(
            X_bal, y_bal, method=best_config['approach_name']
        )
        return selected

    # Work on a clone so the shared estimator in self.models is not
    # reconfigured or fitted as a side effect.
    template = clone(self.models[best_config['model']])
    template.set_params(**best_config['params'])

    # Evaluate using cross-validation, with balancing inside each fold
    cv = StratifiedKFold(n_splits=5, shuffle=True,
                         random_state=self.random_state)
    fold_scores = []
    for train_idx, val_idx in cv.split(X, y):
        X_bal, y_bal = self.apply_balancing(
            _take(X, train_idx), _take(y, train_idx), best_config['balancing']
        )
        columns = _feature_names(X_bal, y_bal)
        fold_model = clone(template).fit(X_bal[columns], y_bal)
        y_pred = fold_model.predict(_take(X, val_idx)[columns])
        fold_scores.append(f1_score(_take(y, val_idx), y_pred))
    test_score = np.mean(fold_scores)

    # Fit final model on all (balanced) data
    X_balanced, y_balanced = self.apply_balancing(X, y, best_config['balancing'])
    final_columns = _feature_names(X_balanced, y_balanced)
    final_model = clone(template).fit(X_balanced[final_columns], y_balanced)

    return final_model, test_score

In [None]:
# cross_val_score returns a list of F1 scores, one for each fold of the
# cross-validation. Their mean is the overall cross-validated F1 score of
# the final model.
scores = cross_val_score(final_model, X_features, y_balanced, cv=5, scoring='f1')
# `test-score` is not a valid identifier (it parses as a subtraction).
test_score = np.mean(scores)


In [None]:
# train_final_model returns the fitted model object (final_model). Besides the
# cross-validation metric, the model can be scored on the hold-out test set to
# check how well it generalizes.
# Comparing the cross-validated F1 with the test-set F1 indicates how the
# model is likely to behave on unseen data; a large gap between the two may
# point to problems such as overfitting.
y_pred = final_model.predict(X_test_features)
test_set_f1 = f1_score(y_true=y_test, y_pred=y_pred)