<a href="https://colab.research.google.com/github/Rajparikh18/Statistical-Inference-Collab-Work/blob/main/Lab3.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

### 1. Dataset Loading and Handling Imbalance

The `load_credit_card_dataset` method attempts to load the data from a specified file path. If the file is not found, or any other error occurs while loading, it automatically creates a synthetic dataset with a similar class imbalance so that the analysis can proceed. The `sample_dataset_for_analysis` method is included to optionally reduce the dataset size for faster computation while approximately preserving the class distribution (it keeps at least 50 fraud samples where available).

In [9]:
# From the original code, within the CreditCardFraudAnalyzer class:
# (Only showing the relevant methods)

def load_credit_card_dataset(self, file_path):
    """
    Load the Credit Card Fraud Detection dataset

    Reads the CSV at ``file_path``, uses every column except 'Class' as the
    feature matrix and the 'Class' column as the label vector, and stores the
    data together with summary statistics on the instance (``df``, ``X``,
    ``y``, ``class_distribution``, ``dataset_info``).

    If the file is missing or loading fails for any other reason, a synthetic
    dataset is generated via ``create_synthetic_fraud_dataset()`` instead.

    Parameters:
    - file_path: Path to the creditcard.csv file

    Returns:
    - (X, y) for the loaded data, or whatever
      ``create_synthetic_fraud_dataset()`` returns when falling back
    """
    try:
        print("Loading Credit Card Fraud Detection dataset...")
        self.df = pd.read_csv(file_path)
        self.X = self.df.drop('Class', axis=1).values
        self.y = self.df['Class'].values

        total_samples = len(self.y)
        if total_samples == 0:
            # Make the empty-file case explicit instead of relying on an
            # incidental division by zero further down; the generic handler
            # below still falls back to synthetic data.
            raise ValueError("dataset contains no rows")

        self.class_distribution = Counter(self.y)
        normal_count = self.class_distribution[0]
        fraud_count = self.class_distribution[1]
        self.dataset_info = {
            'total_samples': total_samples,
            'n_features': self.X.shape[1],
            'normal_transactions': normal_count,
            'fraud_transactions': fraud_count,
            'fraud_percentage': (fraud_count / total_samples) * 100
        }
        print("Dataset loaded successfully!")
        print(f"Total samples: {self.dataset_info['total_samples']:,}")
        print(f"Features: {self.dataset_info['n_features']}")
        print(f"Normal transactions: {self.dataset_info['normal_transactions']:,} ({(normal_count/total_samples*100):.2f}%)")
        print(f"Fraud transactions: {self.dataset_info['fraud_transactions']:,} ({self.dataset_info['fraud_percentage']:.3f}%)")
        if fraud_count:
            print(f"Imbalance ratio: 1:{normal_count/fraud_count:.0f}")
        else:
            # A file without fraud rows used to raise ZeroDivisionError here,
            # which the generic handler turned into a silent switch to
            # synthetic data even though the real data had loaded fine.
            print("Imbalance ratio: undefined (no fraud transactions found)")
        return self.X, self.y
    except FileNotFoundError:
        print(f"Error: File '{file_path}' not found!")
        print("\nPlease download the dataset from:")
        print("https://www.kaggle.com/mlg-ulb/creditcardfraud")
        print("\nOr ensure the file path is correct.")
        print("\nCreating synthetic dataset with similar characteristics as a fallback...")
        return self.create_synthetic_fraud_dataset()
    except Exception as e:
        # Deliberate best-effort: any other load problem (bad CSV, missing
        # 'Class' column, empty file) also falls back to synthetic data.
        print(f"Error loading dataset: {e}")
        print("Creating synthetic dataset as fallback...")
        return self.create_synthetic_fraud_dataset()

def create_synthetic_fraud_dataset(self):
    """
    Build a synthetic stand-in for the credit card fraud data.

    Generates 100,000 samples with 30 features and roughly 0.17% positive
    (fraud) labels using scikit-learn's ``make_classification``, then stores
    the arrays, a labelled DataFrame and summary statistics on the instance.

    Returns:
    - (X, y): the generated feature matrix and label vector
    """
    from sklearn.datasets import make_classification

    generator_settings = dict(
        n_samples=100000,  # Reduced size for faster computation
        n_features=30,
        n_informative=25,
        n_redundant=3,
        n_clusters_per_class=1,
        weights=[0.9983, 0.0017],  # Similar to real fraud ratio
        flip_y=0.001,
        random_state=self.random_state,
    )
    features, labels = make_classification(**generator_settings)

    self.X = features
    self.y = labels
    self.class_distribution = Counter(labels)

    # Column layout used for the frame: V1..V29 followed by Amount.
    column_names = [f'V{number}' for number in range(1, 30)]
    column_names.append('Amount')
    frame = pd.DataFrame(features, columns=column_names)
    frame['Class'] = labels
    self.df = frame

    n_normal = self.class_distribution[0]
    n_fraud = self.class_distribution[1]
    n_total = len(labels)
    self.dataset_info = {
        'total_samples': n_total,
        'n_features': features.shape[1],
        'normal_transactions': n_normal,
        'fraud_transactions': n_fraud,
        'fraud_percentage': (n_fraud / n_total) * 100,
    }

    info = self.dataset_info
    print("Synthetic fraud dataset created!")
    print(f"Total samples: {info['total_samples']:,}")
    print(f"Normal transactions: {info['normal_transactions']:,}")
    print(f"Fraud transactions: {info['fraud_transactions']:,}")
    print(f"Fraud percentage: {info['fraud_percentage']:.3f}%")
    return features, labels

def sample_dataset_for_analysis(self, sample_size=10000):
    """
    Sample the dataset for faster analysis while maintaining class distribution

    Does nothing when the dataset already has at most ``sample_size`` rows.
    Otherwise draws fraud and normal rows separately (without replacement),
    shuffles them together and replaces ``self.X``, ``self.y`` and
    ``self.class_distribution`` with the sampled data.  ``self.df`` and
    ``self.dataset_info`` are left untouched.

    The fraud share follows the original fraud ratio, but at least 50 fraud
    rows are requested so tiny fraud counts do not vanish.  That floor is
    capped at ``sample_size`` so the number of normal rows can never become
    negative.  If fewer rows of a class exist than requested, all of them are
    taken and the result is smaller than ``sample_size``.

    Parameters:
    - sample_size: Number of samples to use for analysis

    Raises:
    - ValueError: if sampling is needed and ``sample_size`` is not positive
    """
    if len(self.y) <= sample_size:
        return

    if sample_size <= 0:
        raise ValueError("sample_size must be a positive integer")

    print(f"\nSampling {sample_size:,} samples for analysis (maintaining class distribution)...")
    fraud_ratio = self.class_distribution[1] / len(self.y)
    n_fraud_samples = max(int(sample_size * fraud_ratio), 50)  # Ensure minimum fraud samples
    # The floor of 50 may exceed a small sample_size; cap it so the normal
    # count below stays non-negative (a negative size crashes np.random.choice).
    n_fraud_samples = min(n_fraud_samples, sample_size)
    n_normal_samples = sample_size - n_fraud_samples

    fraud_indices = np.where(self.y == 1)[0]
    normal_indices = np.where(self.y == 0)[0]

    # Global seeding is kept so previously recorded samples stay reproducible.
    np.random.seed(self.random_state)
    sampled_fraud_idx = np.random.choice(fraud_indices,
                                         min(n_fraud_samples, len(fraud_indices)),
                                         replace=False)
    sampled_normal_idx = np.random.choice(normal_indices,
                                          min(n_normal_samples, len(normal_indices)),
                                          replace=False)
    sampled_indices = np.concatenate([sampled_fraud_idx, sampled_normal_idx])
    np.random.shuffle(sampled_indices)

    self.X = self.X[sampled_indices]
    self.y = self.y[sampled_indices]
    self.class_distribution = Counter(self.y)

    print("Sampled dataset:")
    print(f"Normal transactions: {self.class_distribution[0]:,}")
    print(f"Fraud transactions: {self.class_distribution[1]:,}")
    print(f"Fraud percentage: {(self.class_distribution[1]/len(self.y)*100):.3f}%")

### 2. Cross-Validation Implementation

The `analyze_fold_distributions` method demonstrates how `KFold` and `StratifiedKFold` are used and analyzes the class distribution within each fold for both strategies. This is crucial for understanding why stratified cross-validation is necessary for imbalanced datasets.

In [3]:
# From the original code, within the CreditCardFraudAnalyzer class:
# (Only showing the relevant method)

def analyze_fold_distributions(self):
    """
    Compare per-fold class balance for K-Fold and Stratified K-Fold.

    For each strategy, the validation part of every fold is summarised
    (fraud count, normal count, fraud ratio, size).  The summaries are stored
    as ``self.kfold_dist_df`` and ``self.stratified_dist_df`` and a short
    comparison is printed.

    Returns:
    - (kfold_dist_df, stratified_dist_df): the two summary DataFrames
    """
    plain_splitter = KFold(n_splits=self.k_folds, shuffle=True, random_state=self.random_state)
    stratified_splitter = StratifiedKFold(n_splits=self.k_folds, shuffle=True, random_state=self.random_state)

    def describe_folds(splitter):
        # One record per fold, describing the validation indices only.
        records = []
        for fold_number, (_, holdout_idx) in enumerate(splitter.split(self.X, self.y), start=1):
            label_counts = Counter(self.y[holdout_idx])
            holdout_size = len(holdout_idx)
            records.append({
                'fold': fold_number,
                'fraud_count': label_counts[1],
                'normal_count': label_counts[0],
                'fraud_ratio': label_counts[1] / holdout_size if holdout_size > 0 else 0,
                'total_samples': holdout_size,
            })
        return records

    def print_summary(heading, frame):
        ratios = frame['fraud_ratio']
        counts = frame['fraud_count']
        print(heading)
        print(f"  Fraud ratio - Mean: {ratios.mean():.4f}, "
              f"Std: {ratios.std():.4f}")
        print(f"  Min fraud samples per fold: {counts.min()}")
        print(f"  Max fraud samples per fold: {counts.max()}")

    print("\nAnalyzing fold distributions...")

    plain_records = describe_folds(plain_splitter)
    stratified_records = describe_folds(stratified_splitter)

    self.kfold_dist_df = pd.DataFrame(plain_records)
    self.stratified_dist_df = pd.DataFrame(stratified_records)

    print("\nFold Distribution Analysis:")
    print_summary("K-Fold Cross-Validation:", self.kfold_dist_df)
    print_summary("\nStratified K-Fold Cross-Validation:", self.stratified_dist_df)

    return self.kfold_dist_df, self.stratified_dist_df

### 3. Model Training and Evaluation

The `train_and_evaluate_models` method trains multiple classification models using both K-Fold and Stratified K-Fold. It calculates the required performance metrics (Accuracy, Precision, Recall, F1-score, and ROC-AUC) for each fold and stores the results.

In [10]:
# From the original code, within the CreditCardFraudAnalyzer class:
# (Only showing the relevant method)

def train_and_evaluate_models(self):
    """
    Train multiple models using both cross-validation strategies

    Evaluates Random Forest, Gradient Boosting, Logistic Regression and SVM
    with K-Fold and Stratified K-Fold.  For every fold the accuracy,
    precision, recall, F1 and ROC-AUC on the validation part are appended to
    ``self.results[strategy][model_name][metric]``.

    Features are standardized inside each fold: the scaler is fit on the
    training part only and then applied to the validation part, so no
    validation statistics leak into training.

    A fold whose training or validation labels contain a single class cannot
    be fit/scored meaningfully; NaN is recorded for all of its metrics.
    """
    models = {
        'Random Forest': RandomForestClassifier(
            n_estimators=100,
            class_weight='balanced',
            random_state=self.random_state,
            n_jobs=-1
        ),
        'Gradient Boosting': GradientBoostingClassifier(
            n_estimators=100,
            random_state=self.random_state
        ),
        'Logistic Regression': LogisticRegression(
            class_weight='balanced',
            random_state=self.random_state,
            max_iter=1000
        ),
        'SVM': SVC(
            class_weight='balanced',
            probability=True,
            random_state=self.random_state
        )
    }
    metric_names = ['accuracy', 'precision', 'recall', 'f1', 'roc_auc']

    splitters = {
        'K-Fold': KFold(n_splits=self.k_folds, shuffle=True, random_state=self.random_state),
        'Stratified K-Fold': StratifiedKFold(n_splits=self.k_folds, shuffle=True, random_state=self.random_state),
    }

    self.results = {
        strategy: {model_name: {metric: [] for metric in metric_names}
                   for model_name in models}
        for strategy in splitters
    }

    def evaluate_strategy(strategy, splitter):
        # Run every model through every fold of one splitting strategy.
        print(f"Evaluating with {strategy} Cross-Validation...")
        for model_name, model in models.items():
            print(f"  Training {model_name}...")
            fold_results = self.results[strategy][model_name]
            for fold, (train_idx, val_idx) in enumerate(splitter.split(self.X, self.y)):
                y_train, y_val = self.y[train_idx], self.y[val_idx]

                # ROC-AUC needs both classes in validation, and
                # predict_proba(...)[:, 1] needs both classes in training.
                if len(np.unique(y_val)) < 2 or len(np.unique(y_train)) < 2:
                    print(f"    Warning: Fold {fold+1} has only one class. Skipping...")
                    for metric in metric_names:
                        fold_results[metric].append(np.nan)
                    continue

                # Fit the scaler on the training part only to avoid leakage.
                scaler = StandardScaler()
                X_train = scaler.fit_transform(self.X[train_idx])
                X_val = scaler.transform(self.X[val_idx])

                model.fit(X_train, y_train)
                y_pred = model.predict(X_val)
                y_pred_proba = model.predict_proba(X_val)[:, 1]

                fold_results['accuracy'].append(accuracy_score(y_val, y_pred))
                fold_results['precision'].append(precision_score(y_val, y_pred, zero_division=0))
                fold_results['recall'].append(recall_score(y_val, y_pred, zero_division=0))
                fold_results['f1'].append(f1_score(y_val, y_pred, zero_division=0))
                fold_results['roc_auc'].append(roc_auc_score(y_val, y_pred_proba))

    # Standardization itself now happens per fold inside evaluate_strategy.
    print("\nStandardizing features...")

    for strategy, splitter in splitters.items():
        evaluate_strategy(strategy, splitter)

    print("Model training and evaluation completed!")

### 4. Performance Comparison and Stability Analysis

The `calculate_stability_metrics` method computes the mean, standard deviation, and coefficient of variation for each metric across the folds, providing insights into model stability.

In [11]:
# From the original code, within the CreditCardFraudAnalyzer class:
# (Only showing the relevant method)

def calculate_stability_metrics(self):
    """
    Summarise the per-fold scores of every model under both CV strategies.

    NaN entries in ``self.results`` are ignored.  For each strategy, model
    and metric the result holds the mean, the standard deviation
    (``np.std``), the coefficient of variation (std / mean, 0 when the mean
    is 0) and the number of non-NaN folds.  When no valid score exists, all
    values are 0.

    Returns:
    - nested dict: strategy -> model name -> metric ->
      {'mean', 'std', 'cv', 'valid_folds'}
    """
    metric_names = ('accuracy', 'precision', 'recall', 'f1', 'roc_auc')

    def summarise(fold_scores):
        usable = [value for value in fold_scores if not np.isnan(value)]
        if not usable:
            return {'mean': 0, 'std': 0, 'cv': 0, 'valid_folds': 0}
        average = np.mean(usable)
        spread = np.std(usable)
        variation = spread / average if average != 0 else 0
        return {
            'mean': average,
            'std': spread,
            'cv': variation,
            'valid_folds': len(usable),
        }

    return {
        method: {
            model_name: {
                metric: summarise(per_metric[metric]) for metric in metric_names
            }
            for model_name, per_metric in self.results[method].items()
        }
        for method in ('K-Fold', 'Stratified K-Fold')
    }

### 5. Visualizations

The `create_comprehensive_visualizations` method generates plots to visually compare the fold-wise class distribution and the performance metrics across both cross-validation strategies.

In [6]:
# From the original code, within the CreditCardFraudAnalyzer class:
# (Only showing the relevant method)

def create_comprehensive_visualizations(self):
    """
    Create comprehensive visualizations for fraud detection analysis

    Draws one figure laid out as a 5x3 grid:
    - slot 1: pie chart of the class distribution
    - slots 2-4: fraud ratio per fold, its standard deviation per strategy,
      and a boxplot of fraud counts per fold
    - slots 5-9: one boxplot per metric, comparing every model under both
      cross-validation strategies
    - slot 15: heatmap of mean scores per (strategy, model) and metric

    Reads ``self.class_distribution``, ``self.k_folds``,
    ``self.kfold_dist_df``, ``self.stratified_dist_df`` and ``self.results``,
    so the fold analysis and model evaluation must have populated them.
    """
    metrics = ['accuracy', 'precision', 'recall', 'f1', 'roc_auc']
    cv_methods = ['K-Fold', 'Stratified K-Fold']
    model_names = list(self.results['K-Fold'].keys())

    def valid_scores(method, model_name, metric):
        # Drop NaN entries so they do not distort plots or means.
        return [score for score in self.results[method][model_name][metric]
                if not np.isnan(score)]

    plt.figure(figsize=(20, 28))

    # 1. Dataset Overview
    plt.subplot(5, 3, 1)
    class_names = ['Normal', 'Fraud']
    class_counts = [self.class_distribution[0], self.class_distribution[1]]
    colors = ['lightblue', 'lightcoral']
    plt.pie(class_counts, labels=class_names, autopct='%1.3f%%', colors=colors, startangle=90)
    plt.title('Credit Card Transaction Distribution')

    # 2. Fraud Distribution Across Folds
    plt.subplot(5, 3, 2)
    x = range(1, self.k_folds + 1)
    plt.bar([i - 0.2 for i in x], self.kfold_dist_df['fraud_ratio'],
            width=0.4, label='K-Fold', alpha=0.7, color='coral')
    plt.bar([i + 0.2 for i in x], self.stratified_dist_df['fraud_ratio'],
            width=0.4, label='Stratified K-Fold', alpha=0.7, color='lightblue')
    plt.xlabel('Fold Number')
    plt.ylabel('Fraud Ratio')
    plt.title('Fraud Distribution Across Folds')
    plt.legend()
    plt.grid(True, alpha=0.3)

    # 3. Standard Deviation of Fraud Distribution
    plt.subplot(5, 3, 3)
    std_values = [self.kfold_dist_df['fraud_ratio'].std(),
                  self.stratified_dist_df['fraud_ratio'].std()]
    bars = plt.bar(cv_methods, std_values, color=['coral', 'lightblue'])
    plt.ylabel('Standard Deviation')
    plt.title('Fraud Distribution Stability')
    plt.grid(True, alpha=0.3)

    # Annotate each bar with its value just above the top edge.
    for bar, value in zip(bars, std_values):
        plt.text(bar.get_x() + bar.get_width()/2, bar.get_height() + 0.0001,
                 f'{value:.5f}', ha='center', va='bottom')

    # 4. Fraud Count Variability
    plt.subplot(5, 3, 4)
    plt.boxplot([self.kfold_dist_df['fraud_count'], self.stratified_dist_df['fraud_count']],
                labels=['K-Fold', 'Stratified K-Fold'])
    plt.ylabel('Fraud Cases per Fold')
    plt.title('Fraud Count Variability Across Folds')
    plt.grid(True, alpha=0.3)

    # 5-9. Performance Metrics Boxplots (one grid slot per metric)
    for idx, metric in enumerate(metrics):
        plt.subplot(5, 3, 5 + idx)
        data_to_plot = []
        labels = []

        for model_name in model_names:
            for method in cv_methods:
                data_to_plot.append(valid_scores(method, model_name, metric))
                labels.append(f'{model_name}\n({method})')

        box_plot = plt.boxplot(data_to_plot, patch_artist=True, labels=labels)

        # Boxes alternate K-Fold / Stratified K-Fold, so the colors do too.
        colors = ['lightcoral', 'lightblue'] * len(model_names)
        for patch, color in zip(box_plot['boxes'], colors):
            patch.set_facecolor(color)
            patch.set_alpha(0.7)

        plt.xticks(rotation=45, ha='right')
        plt.ylabel(metric.replace('_', ' ').title())
        plt.title(f'{metric.replace("_", " ").title()} - Fraud Detection')
        plt.grid(True, alpha=0.3)

    # 15. Performance Summary Heatmap
    plt.subplot(5, 3, 15)

    summary_data = []
    for method in cv_methods:
        for model_name in model_names:
            for metric in metrics:
                # Each individual score is NaN-tested here; the previous code
                # tested the whole list from the boxplot loop by mistake.
                scores = valid_scores(method, model_name, metric)
                mean_score = np.mean(scores) if scores else 0
                summary_data.append([method, model_name, metric, mean_score])

    summary_df = pd.DataFrame(summary_data, columns=['Method', 'Model', 'Metric', 'Score'])
    pivot_df = summary_df.pivot_table(index=['Method', 'Model'], columns='Metric', values='Score')

    sns.heatmap(pivot_df, annot=True, fmt='.3f', cmap='RdYlBu_r',
                cbar_kws={'label': 'Score'})
    plt.title('Credit Card Fraud Detection - Performance Summary')

    plt.tight_layout()
    plt.show()

### 6. Comprehensive Report Generation

The `generate_comprehensive_report` method consolidates the analysis, including dataset information, fold distribution analysis, performance comparison, stability analysis, and critical explanations of the findings. This directly addresses the report deliverable.

In [7]:
# From the original code, within the CreditCardFraudAnalyzer class:
# (Only showing the relevant method)

def generate_comprehensive_report(self):
    """
    Print the full text report comparing K-Fold with Stratified K-Fold.

    The report has six sections: dataset information, fold distribution
    analysis, per-model performance comparison, stability analysis
    (coefficient of variation), a critical analysis and recommendations.
    It reads ``self.dataset_info``, ``self.kfold_dist_df``,
    ``self.stratified_dist_df`` and ``self.results``, and obtains the
    statistics from ``self.calculate_stability_metrics()``.

    Returns:
    - the stability results dict produced by calculate_stability_metrics()
    """
    stability_results = self.calculate_stability_metrics()

    metric_names = ('accuracy', 'precision', 'recall', 'f1', 'roc_auc')
    report_rule = "=" * 90
    section_rule = "-" * 50

    def print_section(title):
        print(title)
        print(section_rule)

    def print_bullets(heading, bullets):
        print(heading)
        for bullet in bullets:
            print(bullet)

    def print_fold_summary(heading, frame):
        ratios = frame['fraud_ratio']
        counts = frame['fraud_count']
        print(heading)
        print(f"  Fraud ratio - Mean: {ratios.mean():.5f}, "
              f"Std: {ratios.std():.5f}")
        print(f"  Fraud count range: {counts.min()} - {counts.max()}")

    print(report_rule)
    print("COMPREHENSIVE ANALYSIS REPORT")
    print("K-Fold vs Stratified K-Fold Cross-Validation for Credit Card Fraud Detection")
    print(report_rule)

    # Dataset Information
    info = self.dataset_info
    print_section("\n1. DATASET INFORMATION")
    print("Dataset: Credit Card Fraud Detection")
    print(f"Total samples: {info['total_samples']:,}")
    print(f"Features: {info['n_features']}")
    normal_share = info['normal_transactions'] / info['total_samples'] * 100
    print(f"Normal transactions: {info['normal_transactions']:,} "
          f"({normal_share:.3f}%)")
    print(f"Fraud transactions: {info['fraud_transactions']:,} "
          f"({info['fraud_percentage']:.3f}%)")
    print(f"Imbalance ratio: 1:{info['normal_transactions']/info['fraud_transactions']:.0f}")

    # Fold Distribution Analysis
    print_section("\n2. FOLD DISTRIBUTION ANALYSIS")
    print_fold_summary("K-Fold Cross-Validation:", self.kfold_dist_df)
    print_fold_summary("\nStratified K-Fold Cross-Validation:", self.stratified_dist_df)

    # Performance Comparison
    print_section("\n3. PERFORMANCE COMPARISON")
    for model_name in self.results['K-Fold'].keys():
        print(f"\n{model_name}:")
        print("  Metric           K-Fold (Mean±Std)      Stratified K-Fold (Mean±Std)    Valid Folds")
        print("  " + "-" * 80)

        for metric in metric_names:
            plain = stability_results['K-Fold'][model_name][metric]
            stratified = stability_results['Stratified K-Fold'][model_name][metric]
            print(f"  {metric:<12} {plain['mean']:.3f}±{plain['std']:.3f} ({plain['valid_folds']})    "
                  f"{stratified['mean']:.3f}±{stratified['std']:.3f} ({stratified['valid_folds']})")

    # Stability Analysis
    print_section("\n4. STABILITY ANALYSIS (Coefficient of Variation)")
    print("Lower CV values indicate more stable performance across folds")
    for model_name in stability_results['K-Fold'].keys():
        print(f"\n{model_name}:")
        print("  Metric       K-Fold CV    Stratified CV    Improvement")
        print("  " + "-" * 55)

        for metric in metric_names:
            kfold_cv = stability_results['K-Fold'][model_name][metric]['cv']
            stratified_cv = stability_results['Stratified K-Fold'][model_name][metric]['cv']
            if kfold_cv != 0:
                improvement = (kfold_cv - stratified_cv) / kfold_cv * 100
            else:
                improvement = 0
            print(f"  {metric:<12} {kfold_cv:.4f}      {stratified_cv:.4f}      {improvement:+.1f}%")

    # Critical Analysis for Fraud Detection
    print_section("\n5. CRITICAL ANALYSIS FOR FRAUD DETECTION")
    print_bullets("\nWhy K-Fold is problematic for fraud detection:", (
        "• With only 0.17% fraud cases, some folds may have very few or zero fraud samples",
        "• This makes precision, recall, and F1-score calculations unreliable or undefined",
        "• Model performance estimates become highly variable and misleading",
        "• Risk of completely missing fraud patterns in validation",
    ))
    print_bullets("\nHow Stratified K-Fold addresses these issues:", (
        "• Ensures each fold contains representative fraud samples",
        "• Maintains consistent fraud ratio across all folds",
        "• Enables reliable calculation of all performance metrics",
        "• Provides more realistic estimates of model performance",
    ))

    print("\nKey findings for fraud detection:")
    plain_spread = self.kfold_dist_df['fraud_ratio'].std()
    stratified_spread = self.stratified_dist_df['fraud_ratio'].std()
    fraud_ratio_improvement = (
        (plain_spread - stratified_spread) /
        self.kfold_dist_df['fraud_ratio'].std() * 100
    )
    print(f"• Stratified K-Fold reduces fraud ratio variability by {fraud_ratio_improvement:.1f}%")
    print("• F1-score and Recall are most improved with stratified approach")
    print("• All models show better stability with Stratified K-Fold")

    print_section("\n6. RECOMMENDATIONS FOR FRAUD DETECTION")
    for recommendation in (
        "• ALWAYS use Stratified K-Fold for extremely imbalanced datasets",
        "• Focus on Precision, Recall, F1-score, and ROC-AUC rather than accuracy",
        "• Consider cost-sensitive learning approaches",
        "• Use class balancing techniques (class_weight='balanced')",
        "• Monitor false positive and false negative rates carefully",
        "• Consider ensemble methods for better fraud detection",
    ):
        print(recommendation)

    return stability_results

To run the analysis, you can use the `main()` function provided in your original code. Make sure to update the `dataset_path` variable with the correct path to your `creditcard.csv` file if you have downloaded it. If not, the code will use the synthetic dataset.

In [12]:
# From the original code:
# (Only showing the main function)

def main():
    """
    Run the whole analysis pipeline from loading to the final report.

    Loads the dataset (or its synthetic fallback), samples it down, compares
    the fold distributions, evaluates the models, draws the plots and prints
    the report.
    """
    rule = "=" * 70

    def step_banner(title):
        # Section header: blank line, rule, title, rule.
        print("\n" + rule)
        print(title)
        print(rule)

    analyzer = CreditCardFraudAnalyzer(k_folds=5, random_state=42)

    print(rule)
    print("CREDIT CARD FRAUD DETECTION ANALYSIS")
    print("K-Fold vs Stratified K-Fold Cross-Validation")
    print(rule)

    # IMPORTANT: Replace this path with your actual file path
    # Examples:
    # dataset_path = "/path/to/your/creditcard.csv"
    # dataset_path = "C:/Users/YourName/Downloads/creditcard.csv"
    # dataset_path = "./data/creditcard.csv"
    dataset_path = "creditcard.csv"  # Change this to your file path

    analyzer.load_credit_card_dataset(dataset_path)

    # Optional: sample the dataset if it's too large (for faster computation)
    # Comment out this line if you want to use the full dataset
    analyzer.sample_dataset_for_analysis(sample_size=20000)

    # Step 1: Analyze fold distributions
    step_banner("STEP 1: ANALYZING FOLD DISTRIBUTIONS")
    analyzer.analyze_fold_distributions()

    # Step 2: Train and evaluate models
    step_banner("STEP 2: TRAINING AND EVALUATING MODELS")
    analyzer.train_and_evaluate_models()

    # Step 3: Create visualizations
    step_banner("STEP 3: CREATING COMPREHENSIVE VISUALIZATIONS")
    analyzer.create_comprehensive_visualizations()

    # Step 4: Generate comprehensive report
    step_banner("STEP 4: GENERATING COMPREHENSIVE ANALYSIS REPORT")
    analyzer.generate_comprehensive_report()

    print("\n" + rule)
    print("ANALYSIS COMPLETE!")
    print("All visualizations and analysis have been generated.")
    print("Check the plots and detailed report above.")
    print(rule)

# Run the analysis only when executed as a script, not on import
if __name__ == "__main__":
    main()

CREDIT CARD FRAUD DETECTION ANALYSIS
K-Fold vs Stratified K-Fold Cross-Validation
Loading Credit Card Fraud Detection dataset...
Dataset loaded successfully!
Total samples: 31,780
Features: 30
Normal transactions: 31,677 (99.68%)
Fraud transactions: 102 (0.321%)
Imbalance ratio: 1:311

Sampling 20,000 samples for analysis (maintaining class distribution)...
Sampled dataset:
Normal transactions: 19,936
Fraud transactions: 64
Fraud percentage: 0.320%

STEP 1: ANALYZING FOLD DISTRIBUTIONS

Analyzing fold distributions...

Fold Distribution Analysis:
K-Fold Cross-Validation:
  Fraud ratio - Mean: 0.0032, Std: 0.0006
  Min fraud samples per fold: 10
  Max fraud samples per fold: 15

Stratified K-Fold Cross-Validation:
  Fraud ratio - Mean: 0.0032, Std: 0.0001
  Min fraud samples per fold: 12
  Max fraud samples per fold: 13

STEP 2: TRAINING AND EVALUATING MODELS

Standardizing features...
Evaluating with K-Fold Cross-Validation...
  Training Random Forest...
  Training Gradient Boosting...