# 🎯 Classification Analysis: customers_messy

**Generated:** 2025-12-09 12:50:54  
**Type:** Classification Modeling  
**Dataset:** customers_messy

## 🎯 Objective
This notebook provides a complete classification modeling workflow including data exploration, preprocessing, model training, and evaluation.

## 📋 Workflow Steps
1. **Data Loading & Exploration**
2. **Target Variable Analysis** 
3. **Feature Engineering & Preprocessing**
4. **Model Training & Selection**
5. **Model Evaluation & Metrics**
6. **Feature Importance Analysis**
7. **Predictions & Results**

In [None]:
# Import essential libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import (
    classification_report, confusion_matrix, accuracy_score,
    precision_score, recall_score, f1_score, roc_auc_score, roc_curve
)
from sklearn.preprocessing import LabelEncoder, StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
import warnings
warnings.filterwarnings('ignore')

# Set plotting style
# Select matplotlib's built-in 'default' style sheet, then choose seaborn's
# "husl" colour palette for the plots drawn in later cells.
plt.style.use('default')
sns.set_palette("husl")
# IPython magic (not plain Python syntax): render figures inline in the notebook.
%matplotlib inline

# Confirmation message shown once every statement above has executed.
print("‚úÖ All libraries imported successfully!")

## 1. 📁 Data Loading & Initial Exploration

In [None]:
# Load your dataset - REPLACE the path below with your actual file path.
# The path is written as a raw string (r'...') so that the Windows backslashes
# are taken literally. In a normal string, sequences such as '\d' or '\e' are
# invalid escapes (newer Python versions warn about them), and a folder name
# starting with 't' or 'n' would silently turn into a tab or newline.
df = pd.read_csv(r'D:\data-dojo-1\datasets\ecommerce\customers_messy.csv')

print("=== DATASET OVERVIEW ===")
print(f"Dataset shape: {df.shape}")
print(f"Memory usage: {df.memory_usage(deep=True).sum() / 1024**2:.2f} MB")

print("\n=== BASIC INFO ===")
# DataFrame.info() writes its report itself and returns None, so it must not be
# wrapped in print() (that would emit a stray "None" after the report).
df.info()

print("\n=== FIRST 5 ROWS ===")
display(df.head())

# Per-column count of missing cells; only columns with at least one gap are listed.
print("\n=== MISSING VALUES ===")
missing_data = df.isnull().sum()
if missing_data.sum() > 0:
    print(missing_data[missing_data > 0])
else:
    print("No missing values found!")

# Number of rows that are exact copies of an earlier row.
print("\n=== DUPLICATE ROWS ===")
duplicates = df.duplicated().sum()
print(f"Duplicate rows: {duplicates}")

## 2. 🎯 Target Variable Analysis

In [None]:
# IMPORTANT: Define your target variable here
# REPLACE 'target_column' with your actual target column name
target_column = 'target_column'  # UPDATE THIS WITH YOUR TARGET COLUMN

# Handle the "column is missing" case first so the analysis below reads flat.
if target_column not in df.columns:
    print(f"‚ùå Column '{target_column}' not found!")
    print(f"Available columns: {list(df.columns)}")
    print("\nPlease update the 'target_column' variable above.")

else:
    print(f"‚úÖ Target variable found: {target_column}")

    # Absolute number of rows per class
    print("\n=== TARGET DISTRIBUTION ===")
    target_counts = df[target_column].value_counts()
    print(target_counts)

    # Relative share of each class, expressed in percent
    class_percentages = df[target_column].value_counts(normalize=True) * 100
    print("\n=== CLASS PERCENTAGES ===")
    for label, share in class_percentages.items():
        print(f"{label}: {share:.2f}%")

    # Two side-by-side panels: bar chart of counts (left), pie chart of shares (right)
    plt.figure(figsize=(12, 5))

    plt.subplot(1, 2, 1)
    target_counts.plot.bar(color='skyblue')
    plt.title(f'Distribution of {target_column}')
    plt.ylabel('Count')
    plt.xticks(rotation=45)

    plt.subplot(1, 2, 2)
    plt.pie(
        target_counts.values,
        labels=target_counts.index,
        autopct='%1.1f%%',
    )
    plt.title(f'Proportion of {target_column}')

    plt.tight_layout()
    plt.show()

    # Flag the dataset as imbalanced when the rarest class is below 10% of rows
    min_class_pct = class_percentages.min()
    if min_class_pct < 10:
        print(f"‚ö†Ô∏è  WARNING: Class imbalance detected! Smallest class: {min_class_pct:.1f}%")
        print("Consider using techniques like SMOTE, class weights, or stratified sampling.")
    else:
        print("‚úÖ Classes are reasonably balanced.")

## 3. 📊 Feature Analysis & Preprocessing

In [None]:
# Split the DataFrame into a feature matrix X and a target vector y.
# Nothing is done when the configured target column is absent.
if target_column not in df.columns:
    print("‚ùå Please define the target column first!")

else:
    # Columns that never become features: known identifier names plus the target
    id_columns = ['id', 'ID', 'index', 'customer_id', 'user_id']  # Add more ID columns if needed
    excluded_columns = [target_column, *id_columns]
    feature_columns = [col for col in df.columns if col not in excluded_columns]

    # Work on copies so the loaded DataFrame stays untouched
    X = df[feature_columns].copy()
    y = df[target_column].copy()

    print(f"‚úÖ Features selected: {len(feature_columns)}")
    print(f"Feature columns: {feature_columns}")
    print(f"Target variable: {target_column}")

    # Group the features by dtype: numeric vs. object/category
    numeric_features = list(X.select_dtypes(include=[np.number]).columns)
    categorical_features = list(X.select_dtypes(include=['object', 'category']).columns)

    print(f"\nüìä Numeric features ({len(numeric_features)}): {numeric_features}")
    print(f"üìã Categorical features ({len(categorical_features)}): {categorical_features}")

    # Report every feature that has at least one missing cell
    feature_missing = X.isnull().sum()
    columns_with_gaps = feature_missing[feature_missing > 0]
    if columns_with_gaps.empty:
        print("\n‚úÖ No missing values in features!")
    else:
        print("\n‚ö†Ô∏è  Missing values in features:")
        print(columns_with_gaps)

## 4. 🔧 Data Preprocessing

In [None]:
# Data preprocessing pipeline: impute, encode and scale the feature matrix.
# NOTE(review): the imputers and encoders below are fitted on ALL rows, i.e.
# before the train/test split, so test-set statistics influence them. Wrapping
# these steps in a scikit-learn Pipeline fitted on the training split would
# remove that leakage.
if target_column in df.columns and 'X' in locals():

    # Start again from the untouched DataFrame so that this cell is idempotent.
    # The steps below overwrite X and y; re-running them on already-encoded
    # data would re-label the categories from their string codes and reset
    # target_encoder to None. (Any manual edits made to X after the feature
    # selection cell are discarded here.)
    X = df[feature_columns].copy()
    y = df[target_column].copy()

    # Rows without a label cannot be used for supervised training and would
    # break label encoding and the stratified split.
    labelled_rows = y.notna()
    if not labelled_rows.all():
        print(f"WARNING: Dropping {int((~labelled_rows).sum())} rows with a missing target value")
        X = X.loc[labelled_rows]
        y = y.loc[labelled_rows]

    # Columns with no observed value carry no information, and SimpleImputer
    # discards them by default, which would make the assignment below fail
    # with a column-count mismatch.
    empty_columns = [col for col in X.columns if X[col].isna().all()]
    if empty_columns:
        print(f"WARNING: Dropping columns without any values: {empty_columns}")
        X = X.drop(columns=empty_columns)
        numeric_features = [col for col in numeric_features if col not in empty_columns]
        categorical_features = [col for col in categorical_features if col not in empty_columns]

    # Handle missing values
    print("=== HANDLING MISSING VALUES ===")

    # For numeric features: fill with median
    if len(numeric_features) > 0:
        numeric_imputer = SimpleImputer(strategy='median')
        X[numeric_features] = numeric_imputer.fit_transform(X[numeric_features])
        print(f"‚úÖ Filled missing values in numeric features with median")

    # For categorical features: fill with mode
    if len(categorical_features) > 0:
        categorical_imputer = SimpleImputer(strategy='most_frequent')
        X[categorical_features] = categorical_imputer.fit_transform(X[categorical_features])
        print(f"‚úÖ Filled missing values in categorical features with mode")

    # Encode categorical variables as integer codes; the fitted encoders are
    # kept per column so the mapping can be inverted or reused later.
    print("\n=== ENCODING CATEGORICAL VARIABLES ===")
    label_encoders = {}

    for col in categorical_features:
        le = LabelEncoder()
        X[col] = le.fit_transform(X[col].astype(str))
        label_encoders[col] = le
        print(f"‚úÖ Encoded {col}: {len(le.classes_)} unique values")

    # Encode the target whenever it is not numeric. Testing only for the
    # 'object' dtype would miss 'category' and string-dtype targets.
    target_encoder = None
    if not pd.api.types.is_numeric_dtype(y):
        target_encoder = LabelEncoder()
        y = target_encoder.fit_transform(y)
        print(f"\n‚úÖ Encoded target variable: {target_encoder.classes_}")

    # Feature scaling (for algorithms that need it)
    scaler = StandardScaler()
    X_scaled = scaler.fit_transform(X)
    X_scaled_df = pd.DataFrame(X_scaled, columns=X.columns, index=X.index)

    print(f"\n‚úÖ Preprocessing completed!")
    print(f"Final feature matrix shape: {X.shape}")
    print(f"Target variable shape: {y.shape}")

else:
    print("‚ùå Please complete previous steps first!")

## 5. 🚂 Train-Test Split

In [None]:
# Split data into training and testing sets.
# X and y already exist after the feature selection cell, so their presence
# alone does not prove that preprocessing ran; X_scaled_df is only created by
# the preprocessing cell and is therefore used as the marker.
if all(name in globals() for name in ('X', 'y', 'X_scaled_df')):

    test_size = 0.2  # 80% train, 20% test
    random_state = 42

    # Stratification keeps the class proportions equal in both splits, but
    # train_test_split raises when a class has fewer than two samples; fall
    # back to a plain random split in that case.
    class_counts = pd.Series(y).value_counts()
    if class_counts.min() >= 2:
        stratify_labels = y
    else:
        stratify_labels = None
        print("WARNING: a class has fewer than 2 samples; using an unstratified split")

    X_train, X_test, y_train, y_test = train_test_split(
        X, y,
        test_size=test_size,
        random_state=random_state,
        stratify=stratify_labels
    )

    # Scaled versions of the same rows. The scaler is fitted on the training
    # split only, so no test-set statistics leak into the features the models
    # are trained on; the fitted scaler is kept for scoring new data later.
    scaler = StandardScaler().fit(X_train)
    X_train_scaled = pd.DataFrame(
        scaler.transform(X_train), columns=X_train.columns, index=X_train.index
    )
    X_test_scaled = pd.DataFrame(
        scaler.transform(X_test), columns=X_test.columns, index=X_test.index
    )

    print("=== TRAIN-TEST SPLIT COMPLETED ===")
    print(f"Training set: {X_train.shape}")
    print(f"Test set: {X_test.shape}")

    # Check class distribution in splits
    print("\n=== CLASS DISTRIBUTION ===")
    print("Training set:")
    print(pd.Series(y_train).value_counts(normalize=True).sort_index())

    print("\nTest set:")
    print(pd.Series(y_test).value_counts(normalize=True).sort_index())

else:
    print("‚ùå Please complete preprocessing first!")

## 6. 🤖 Model Training & Selection

In [None]:
# Fit several classifiers on the training split, score each one on the test
# split, and keep the one with the highest weighted F1 as `best_model`.
if 'X_train' not in locals():
    print("‚ùå Please complete train-test split first!")

else:
    # Candidate estimators, keyed by the display name used in all reports
    models = {
        'Random Forest': RandomForestClassifier(n_estimators=100, random_state=42),
        'Logistic Regression': LogisticRegression(random_state=42, max_iter=1000),
        'Gradient Boosting': GradientBoostingClassifier(random_state=42),
        'K-Nearest Neighbors': KNeighborsClassifier(n_neighbors=5),
        'Support Vector Machine': SVC(random_state=42, probability=True)
    }

    # These estimators are trained on the standardized matrices; the
    # tree-based ones use the unscaled features.
    scale_sensitive = ('Logistic Regression', 'K-Nearest Neighbors', 'Support Vector Machine')

    model_results = {}

    print("=== TRAINING MULTIPLE MODELS ===")

    for name, model in models.items():
        print(f"\nTraining {name}...")

        if name in scale_sensitive:
            train_features, test_features = X_train_scaled, X_test_scaled
        else:
            train_features, test_features = X_train, X_test

        model.fit(train_features, y_train)
        y_pred = model.predict(test_features)

        # Positive-class probabilities are only kept for binary problems
        if len(np.unique(y)) == 2:
            y_pred_proba = model.predict_proba(test_features)[:, 1]
        else:
            y_pred_proba = None

        # Test-set metrics (class-frequency weighted averages)
        accuracy = accuracy_score(y_test, y_pred)
        precision = precision_score(y_test, y_pred, average='weighted')
        recall = recall_score(y_test, y_pred, average='weighted')
        f1 = f1_score(y_test, y_pred, average='weighted')

        model_results[name] = dict(
            model=model,
            accuracy=accuracy,
            precision=precision,
            recall=recall,
            f1_score=f1,
            predictions=y_pred,
            probabilities=y_pred_proba,
        )

        print(f"‚úÖ {name} - Accuracy: {accuracy:.4f}, F1: {f1:.4f}")

    # One row per model, best weighted F1 first
    print("\n=== MODEL COMPARISON SUMMARY ===")
    results_df = pd.DataFrame(
        [
            {
                'Model': model_name,
                'Accuracy': scores['accuracy'],
                'Precision': scores['precision'],
                'Recall': scores['recall'],
                'F1-Score': scores['f1_score'],
            }
            for model_name, scores in model_results.items()
        ]
    )

    results_df = results_df.sort_values('F1-Score', ascending=False)
    display(results_df)

    # The top row of the sorted table is the winner
    best_model_name = results_df['Model'].iloc[0]
    best_entry = model_results[best_model_name]
    best_model = best_entry['model']
    best_predictions = best_entry['predictions']

    print(f"\nüèÜ BEST MODEL: {best_model_name}")

## 7. 📈 Model Evaluation & Metrics

In [None]:
# Detailed evaluation of the best model: text report, a three-panel figure
# (confusion matrix, feature importance, ROC curve) and headline metrics.
if 'best_model' not in locals():
    print("‚ùå Please complete model training first!")

else:
    print(f"=== DETAILED EVALUATION: {best_model_name} ===")

    # Per-class precision / recall / F1
    print("\nüìä CLASSIFICATION REPORT:")
    print(classification_report(y_test, best_predictions))

    # Probabilities stored during training (None for non-binary problems)
    best_probabilities = model_results[best_model_name]['probabilities']
    has_binary_scores = len(np.unique(y)) == 2 and best_probabilities is not None

    plt.figure(figsize=(15, 5))

    # Panel 1: confusion matrix heatmap
    plt.subplot(1, 3, 1)
    cm = confusion_matrix(y_test, best_predictions)
    sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', cbar=False)
    plt.title(f'Confusion Matrix\n{best_model_name}')
    plt.ylabel('Actual')
    plt.xlabel('Predicted')

    # Panel 2: feature importance, only for models that expose it
    if hasattr(best_model, 'feature_importances_'):
        plt.subplot(1, 3, 2)
        importance_df = pd.DataFrame({
            'Feature': X.columns,
            'Importance': best_model.feature_importances_
        }).sort_values('Importance', ascending=False)

        top_features = importance_df.head(10)
        bar_positions = range(len(top_features))
        plt.barh(bar_positions, top_features['Importance'])
        plt.yticks(bar_positions, top_features['Feature'])
        plt.title('Top 10 Feature Importance')
        plt.xlabel('Importance')
        plt.gca().invert_yaxis()  # largest importance at the top

    # Panel 3: ROC curve, only for binary problems with stored probabilities
    if has_binary_scores:
        plt.subplot(1, 3, 3)
        fpr, tpr, _ = roc_curve(y_test, best_probabilities)
        auc_score = roc_auc_score(y_test, best_probabilities)

        plt.plot(fpr, tpr, color='darkorange', lw=2,
                 label=f'ROC curve (AUC = {auc_score:.2f})')
        plt.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')  # chance line
        plt.xlim([0.0, 1.0])
        plt.ylim([0.0, 1.05])
        plt.xlabel('False Positive Rate')
        plt.ylabel('True Positive Rate')
        plt.title('ROC Curve')
        plt.legend(loc="lower right")

    plt.tight_layout()
    plt.show()

    # Headline metrics of the best model on the test split
    accuracy = accuracy_score(y_test, best_predictions)
    precision = precision_score(y_test, best_predictions, average='weighted')
    recall = recall_score(y_test, best_predictions, average='weighted')
    f1 = f1_score(y_test, best_predictions, average='weighted')

    print("\nüéØ FINAL PERFORMANCE METRICS:")
    print(f"Accuracy:  {accuracy:.4f} ({accuracy*100:.2f}%)")
    print(f"Precision: {precision:.4f}")
    print(f"Recall:    {recall:.4f}")
    print(f"F1-Score:  {f1:.4f}")

    if has_binary_scores:
        auc = roc_auc_score(y_test, best_probabilities)
        print(f"AUC-ROC:   {auc:.4f}")

## 8. 🔍 Feature Importance Analysis

In [None]:
# Rank the features by how much the best model relies on them.
# Tree-based models expose feature_importances_; linear models fall back to
# coefficient magnitudes; anything else only gets a notice.
if 'best_model' not in locals():
    print("‚ùå Please complete model training first!")

else:
    print(f"=== FEATURE IMPORTANCE ANALYSIS ===")

    if not hasattr(best_model, 'feature_importances_'):
        print(f"Feature importance not available for {best_model_name}")

        # For linear models, show coefficients
        if hasattr(best_model, 'coef_'):
            # NOTE(review): for a 2-D coef_ only the first row is shown; for a
            # multiclass model that is presumably one class only - confirm.
            coefficients = best_model.coef_
            if coefficients.ndim > 1:
                coefficients = coefficients[0]

            coef_df = pd.DataFrame({
                'Feature': X.columns,
                'Coefficient': coefficients
            })
            coef_df['Abs_Coefficient'] = coef_df['Coefficient'].abs()
            coef_df = coef_df.sort_values('Abs_Coefficient', ascending=False)

            print("\nüìä TOP 10 FEATURES BY COEFFICIENT MAGNITUDE:")
            display(coef_df.head(10))

    else:
        # Importance table, most important feature first
        importances = best_model.feature_importances_
        importance_df = pd.DataFrame({
            'Feature': X.columns,
            'Importance': importances,
            'Importance_Percentage': importances * 100
        }).sort_values('Importance', ascending=False)

        print("\nüìä TOP 15 MOST IMPORTANT FEATURES:")
        display(importance_df.head(15))

        # Horizontal bar chart of the 20 strongest features
        plt.figure(figsize=(12, 8))
        top_20_features = importance_df.head(20)
        bar_positions = range(len(top_20_features))

        plt.barh(bar_positions, top_20_features['Importance'])
        plt.yticks(bar_positions, top_20_features['Feature'])
        plt.xlabel('Feature Importance')
        plt.title(f'Top 20 Feature Importance - {best_model_name}')
        plt.gca().invert_yaxis()

        # Write the raw importance value (3 decimals) next to each bar
        for position, value in enumerate(top_20_features['Importance']):
            plt.text(value + 0.001, position, f'{value:.3f}', va='center')

        plt.tight_layout()
        plt.show()

        # Short textual takeaway about the three strongest features
        print("\nüí° KEY INSIGHTS:")
        top_3_features = importance_df.head(3)
        total_importance = top_3_features['Importance'].sum()
        print(f"‚Ä¢ Top 3 features account for {total_importance:.1%} of model decisions")

        ranked = zip(top_3_features['Feature'], top_3_features['Importance_Percentage'])
        for rank, (feature_name, pct) in enumerate(ranked, start=1):
            print(f"‚Ä¢ #{rank} Most important: '{feature_name}' ({pct:.1f}%)")

## 9. 🎯 Predictions & Business Insights

In [None]:
# Generate predictions and business insights for the best model:
# per-class accuracy, sample predictions, error patterns and a readiness verdict.
if 'best_model' in locals():

    print(f"=== PREDICTION ANALYSIS ===")

    # y_test is a plain NumPy array when the target was label-encoded, so it
    # carries no row labels. Build the frame from positional arrays and attach
    # the test-set index explicitly; otherwise the frame gets a 0..n-1 index
    # and the index-based merge below pairs predictions with the wrong rows.
    test_indices = X_test.index
    actual_values = np.asarray(y_test)
    predicted_values = np.asarray(best_predictions)
    results_df = pd.DataFrame(
        {
            'Actual': actual_values,
            'Predicted': predicted_values,
            'Correct': actual_values == predicted_values,
        },
        index=test_indices,
    )

    # Add original feature values for analysis
    results_df = results_df.merge(
        X.loc[test_indices],
        left_index=True,
        right_index=True,
        how='left'
    )

    # Prediction accuracy by class. The loop variable is deliberately NOT
    # called `accuracy`: that name holds the overall accuracy used further
    # down and in the summary cell, and reusing it here would overwrite it
    # with the accuracy of whichever class happens to be iterated last.
    print("\nüìä PREDICTION ACCURACY BY CLASS:")
    class_accuracy = results_df.groupby('Actual')['Correct'].mean()
    for class_value, class_acc in class_accuracy.items():
        if target_encoder is not None:
            # Map the integer code back to the original label
            class_name = target_encoder.classes_[class_value]
            print(f"Class '{class_name}': {class_acc:.2%} correct predictions")
        else:
            print(f"Class {class_value}: {class_acc:.2%} correct predictions")

    # Show some example predictions
    print("\nüîç SAMPLE PREDICTIONS:")
    sample_results = results_df.head(10)[['Actual', 'Predicted', 'Correct']]
    display(sample_results)

    # Misclassification analysis
    misclassified = results_df[~results_df['Correct']]
    if len(misclassified) > 0:
        print(f"\n‚ùå MISCLASSIFIED EXAMPLES: {len(misclassified)} out of {len(results_df)}")

        # Most frequent (actual, predicted) confusion pairs
        print("\nMost common misclassification patterns:")
        error_patterns = misclassified.groupby(['Actual', 'Predicted']).size().sort_values(ascending=False)
        print(error_patterns.head())

    # Overall accuracy of the best model, computed here rather than taken
    # from whatever an earlier cell last left in `accuracy`.
    accuracy = accuracy_score(y_test, best_predictions)

    print("\nüéØ MODEL DEPLOYMENT READINESS:")
    print(f"‚úÖ Model Type: {best_model_name}")
    print(f"‚úÖ Overall Accuracy: {accuracy:.2%}")
    print(f"‚úÖ Test Set Size: {len(y_test)} samples")
    print(f"‚úÖ Features Used: {len(X.columns)}")

    if accuracy >= 0.8:
        print("üü¢ HIGH ACCURACY: Model ready for deployment!")
    elif accuracy >= 0.7:
        print("üü° MODERATE ACCURACY: Consider feature engineering or more data")
    else:
        print("üî¥ LOW ACCURACY: Model needs significant improvement")

else:
    print("‚ùå Please complete model training first!")

## 10. ✅ Summary & Next Steps

In [None]:
# Final summary and recommendations
print("=== üéØ CLASSIFICATION ANALYSIS COMPLETE ===")
print()

if 'best_model' in locals():
    # No earlier cell assigns dataset_name, so reading it directly raises a
    # NameError. Use a value the user may have defined, otherwise fall back to
    # the dataset this notebook was generated for.
    dataset_name = globals().get('dataset_name', 'customers_messy')

    # Recompute the best model's test accuracy here instead of relying on the
    # `accuracy` variable left behind by whichever cell ran last.
    accuracy = accuracy_score(y_test, best_predictions)

    print("üìä ANALYSIS SUMMARY:")
    print(f"‚Ä¢ Dataset: {dataset_name}")
    print(f"‚Ä¢ Target Variable: {target_column}")
    print(f"‚Ä¢ Best Model: {best_model_name}")
    print(f"‚Ä¢ Final Accuracy: {accuracy:.2%}")
    print(f"‚Ä¢ Features Used: {len(X.columns)}")
    print(f"‚Ä¢ Training Samples: {len(X_train)}")
    print(f"‚Ä¢ Test Samples: {len(X_test)}")

    print("\nüöÄ RECOMMENDED NEXT STEPS:")

    # Advice is tiered by accuracy band: >=90%, >=80%, >=70%, below 70%
    if accuracy >= 0.9:
        print("1. ‚úÖ Excellent performance! Ready for production deployment")
        print("2. üîÑ Set up model monitoring and retraining pipeline")
        print("3. üìà Consider A/B testing in production environment")
    elif accuracy >= 0.8:
        print("1. üéØ Good performance! Consider hyperparameter tuning")
        print("2. üîß Try ensemble methods or advanced algorithms")
        print("3. üìä Collect more data if possible")
    elif accuracy >= 0.7:
        print("1. üîß Feature engineering needed - create new features")
        print("2. üìä Collect more training data")
        print("3. üéØ Try different algorithms or ensemble methods")
        print("4. üîç Analyze and fix data quality issues")
    else:
        print("1. üîç Review data quality and target variable definition")
        print("2. üéØ Significant feature engineering required")
        print("3. üìä Consider if this is the right ML approach")
        print("4. ü§ù Consult domain experts for insights")

    print("\nüõ†Ô∏è  TECHNICAL IMPROVEMENTS:")
    # The scikit-learn class is RandomizedSearchCV ("RandomSearchCV" does not exist)
    print("‚Ä¢ Hyperparameter tuning with GridSearchCV or RandomizedSearchCV")
    print("‚Ä¢ Cross-validation for more robust evaluation")
    print("‚Ä¢ Feature selection techniques (RFE, SelectKBest)")
    print("‚Ä¢ Handle class imbalance (SMOTE, class weights)")
    print("‚Ä¢ Ensemble methods (Voting, Stacking)")
    print("‚Ä¢ Deep learning approaches if dataset is large")

    # The snippet below is only printed as a hint; nothing is saved by this cell
    print("\nüíæ SAVE YOUR MODEL:")
    print("# Uncomment to save the trained model")
    print("# import joblib")
    print("# joblib.dump(best_model, 'classification_model.pkl')")
    print("# print('Model saved successfully!')")

else:
    print("‚ö†Ô∏è  Analysis incomplete. Please run all previous cells.")

print("\nüéâ Classification analysis workflow completed!")