In [4]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler, PowerTransformer, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from sklearn.metrics import accuracy_score, classification_report, cohen_kappa_score, matthews_corrcoef
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline as ImbPipeline
from skopt import BayesSearchCV

# 1. Enhanced Data Preprocessing
def preprocess_data(df):
    # Create copies to avoid modifying the original dataframe
    df = df.copy()
    
    # First ensure target column is not null
    if df['ABC_Classification'].isnull().any():
        print("Dropping rows with missing target values")
        df = df.dropna(subset=['ABC_Classification'])
    
    X = df.drop('ABC_Classification', axis=1)
    y = df['ABC_Classification']
    
    # Identify column types
    numeric_cols = X.select_dtypes(include=['number']).columns.tolist()
    categorical_cols = X.select_dtypes(exclude=['number']).columns.tolist()
    
    # Create transformers for numeric and categorical features
    transformers = []
    
    if len(numeric_cols) > 0:
        numeric_transformer = Pipeline(steps=[
            ('imputer', SimpleImputer(strategy='median')),
            ('scaler', StandardScaler()),
            ('power', PowerTransformer(method='yeo-johnson', standardize=True))
        ])
        transformers.append(('num', numeric_transformer, numeric_cols))
    
    if len(categorical_cols) > 0:
        categorical_transformer = Pipeline(steps=[
            ('imputer', SimpleImputer(strategy='most_frequent')),
            ('onehot', OneHotEncoder(handle_unknown='ignore', sparse_output=False))
        ])
        transformers.append(('cat', categorical_transformer, categorical_cols))
    
    preprocessor = ColumnTransformer(transformers)
    
    return X, y, preprocessor

# 2. Train Model with Bayesian Optimization
def optimize_models(X_train, y_train, preprocessor):
    # Process the training data once to avoid repeated preprocessing
    print("Preprocessing training data...")
    X_train_processed = preprocessor.fit_transform(X_train)
    print(f"Processed training data shape: {X_train_processed.shape}")
    
    # Create SMOTE instance outside the pipeline
    smote = SMOTE(random_state=42)
    X_train_resampled, y_train_resampled = smote.fit_resample(X_train_processed, y_train)
    print(f"Resampled training data shape: {X_train_resampled.shape}")
    
    classifiers = {
        'SVM': SVC(probability=True, random_state=42),
        'KNN': KNeighborsClassifier(),
        'NaiveBayes': GaussianNB(),
        'RandomForest': RandomForestClassifier(random_state=42),
        'DecisionTree': DecisionTreeClassifier(random_state=42)
    }
    
    param_spaces = {
        'SVM': {
            'C': (0.1, 100, 'log-uniform'), 
            'gamma': (0.001, 1, 'log-uniform'), 
            'kernel': ['rbf', 'poly']
        },
        'KNN': {
            'n_neighbors': (1, 30),
            'weights': ['uniform', 'distance'],
            'p': [1, 2]  # 1 for Manhattan, 2 for Euclidean
        },
        'NaiveBayes': {
            'var_smoothing': (1e-9, 1e-1, 'log-uniform')
        },
        'RandomForest': {
            'n_estimators': (50, 500), 
            'max_depth': (3, 50)
        },
        'DecisionTree': {
            'max_depth': (3, 50),
            'min_samples_split': (2, 20),
            'min_samples_leaf': (1, 20)
        }
    }
    
    best_models = {}
    for name, clf in classifiers.items():
        print(f"Optimizing {name}...")
        
        # Use BayesSearchCV directly on the classifier with preprocessed data
        optimizer = BayesSearchCV(
            clf, 
            param_spaces[name], 
            n_iter=10,
            cv=3,
            scoring='accuracy', 
            n_jobs=-1,
            random_state=42
        )
        
        optimizer.fit(X_train_resampled, y_train_resampled)
        best_models[name] = optimizer.best_estimator_
        
        print(f"{name} Best Params: {optimizer.best_params_}")
        print(f"{name} Best Score: {optimizer.best_score_:.4f}")
    
    return best_models, preprocessor, smote

# 3. Create Ensemble
def create_ensemble(best_models):
    ensemble = VotingClassifier(
        estimators=[
            ('svm', best_models['SVM']),
            ('knn', best_models['KNN']),
            ('nb', best_models['NaiveBayes']),
            ('rf', best_models['RandomForest']),
            ('dt', best_models['DecisionTree'])
        ], 
        voting='soft'
    )
    return ensemble

# 4. Evaluate Model
def evaluate_model(model, X_test_transformed, y_test):
    y_pred = model.predict(X_test_transformed)
    accuracy = accuracy_score(y_test, y_pred)
    kappa = cohen_kappa_score(y_test, y_pred)
    mcc = matthews_corrcoef(y_test, y_pred)
    print(f"Accuracy: {accuracy:.4f}")
    print(f"Cohen's Kappa: {kappa:.4f}")
    print(f"MCC: {mcc:.4f}")
    print("Classification Report:\n", classification_report(y_test, y_pred))
    return accuracy

# Main Execution Function
def improve_model_performance(df):
    # Make a copy to avoid modifying the original dataframe
    df = df.copy()
    
    # Check if target exists
    if 'ABC_Classification' not in df.columns:
        raise ValueError("Target column 'ABC_Classification' not found in dataset")
    
    # Handle NaN values explicitly before modeling
    print("\nHandling missing values before processing...")
    
    # Report on missing values
    missing_counts = df.isnull().sum()
    print("Missing values per column before preprocessing:")
    print(missing_counts[missing_counts > 0])
    
    # Drop rows where target is NaN
    if df['ABC_Classification'].isnull().any():
        df = df.dropna(subset=['ABC_Classification'])
        print(f"Dropped rows with missing target values. Remaining rows: {df.shape[0]}")
    
    # Preprocess data
    X, y, preprocessor = preprocess_data(df)
    
    # Split dataset
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, stratify=y, random_state=42
    )
    
    # Print dataset shapes
    print(f"Training set: {X_train.shape}, Test set: {X_test.shape}")
    
    # Get baseline model performance
    print("Class distribution:")
    print(y_train.value_counts(normalize=True))
    
    # Optimize models - now returns preprocessor and smote
    best_models, fitted_preprocessor, fitted_smote = optimize_models(X_train, y_train, preprocessor)
    
    # Create ensemble
    ensemble_model = create_ensemble(best_models)
    
    # Transform test data using the fitted preprocessor
    X_test_transformed = fitted_preprocessor.transform(X_test)
    
    # Train the ensemble on preprocessed and resampled data
    print("Training final ensemble model...")
    X_train_processed = fitted_preprocessor.transform(X_train)
    X_train_resampled, y_train_resampled = fitted_smote.fit_resample(X_train_processed, y_train)
    ensemble_model.fit(X_train_resampled, y_train_resampled)
    
    # Evaluate the final model
    print("Final Ensemble Model Performance:")
    final_accuracy = evaluate_model(ensemble_model, X_test_transformed, y_test)
    
    # Also evaluate individual models
    print("\nIndividual Model Performances:")
    for name, model in best_models.items():
        print(f"\n{name} Performance:")
        _ = evaluate_model(model, X_test_transformed, y_test)
    
    # Return the full pipeline components
    return ensemble_model, fitted_preprocessor, fitted_smote, final_accuracy

# Load Dataset and run
if __name__ == "__main__":
    try:
        # Load data with error handling
        print("Loading dataset...")
        df = pd.read_csv("Inventory_dataset (2).csv")
        df=df.head(600)
        print(f"Dataset loaded: {df.shape[0]} rows, {df.shape[1]} columns")
        
        # Only use first 600 rows if specified
        df = df.head(600)
        
        # Quick data overview
        print("\nData overview:")
        print(df.dtypes)
        print("\nMissing values per column:")
        print(df.isnull().sum())
        
        # Preprocess NaN values for the entire dataset
        print("\nPreprocessing NaN values...")
        
        # 1. For numeric columns, replace NaN with median
        numeric_cols = df.select_dtypes(include=['number']).columns
        for col in numeric_cols:
            if df[col].isnull().any():
                median_val = df[col].median()
                df[col] = df[col].fillna(median_val)
                print(f"Filled NaN in {col} with median: {median_val}")
        
        # 2. For categorical columns, replace NaN with mode
        categorical_cols = df.select_dtypes(exclude=['number']).columns
        for col in categorical_cols:
            if df[col].isnull().any():
                mode_val = df[col].mode()[0]
                df[col] = df[col].fillna(mode_val)
                print(f"Filled NaN in {col} with mode: {mode_val}")
        
        # Verify NaN values are handled
        remaining_nans = df.isnull().sum().sum()
        print(f"Remaining NaN values: {remaining_nans}")
        
        if remaining_nans > 0:
            print("Warning: Still have NaN values. Removing rows with NaN...")
            df = df.dropna()
            print(f"After dropping NaNs: {df.shape[0]} rows remaining")
        
        # Run the optimization
        print("\nStarting model optimization...")
        ensemble_model, preprocessor, smote, best_accuracy = improve_model_performance(df)
        print(f"\nBest model achieved {best_accuracy:.4f} accuracy")
        
        # Save the model components if needed
        # import joblib
        # joblib.dump(ensemble_model, 'ensemble_model.pkl')
        # joblib.dump(preprocessor, 'preprocessor.pkl')
        # joblib.dump(smote, 'smote.pkl')
        
    except Exception as e:
        print(f"Error occurred: {str(e)}")
        import traceback
        traceback.print_exc()


Loading dataset...
Dataset loaded: 600 rows, 14 columns

Data overview:
SKU_ID                    object
Item_Cost                float64
Item_Count               float64
Total_Cost               float64
Lead_Time                float64
Shelf_Life               float64
EOQ                      float64
Lead_Time_Variability     object
Seasonality               object
Warehouse_Location        object
Customer_Reviews         float64
Historical_Sales_Data     object
Demand_Fluctuation        object
ABC_Classification        object
dtype: object

Missing values per column:
SKU_ID                    0
Item_Cost                 1
Item_Count               16
Total_Cost               10
Lead_Time                14
Shelf_Life                6
EOQ                       4
Lead_Time_Variability     5
Seasonality               5
Warehouse_Location        1
Customer_Reviews         12
Historical_Sales_Data    15
Demand_Fluctuation        7
ABC_Classification       24
dtype: int64

Preprocessing NaN 



KNN Best Params: OrderedDict([('n_neighbors', 1), ('p', 2), ('weights', 'distance')])
KNN Best Score: 0.8599
Optimizing NaiveBayes...
NaiveBayes Best Params: OrderedDict([('var_smoothing', 0.04403762515730915)])
NaiveBayes Best Score: 0.7837
Optimizing RandomForest...
RandomForest Best Params: OrderedDict([('max_depth', 24), ('n_estimators', 463)])
RandomForest Best Score: 0.9096
Optimizing DecisionTree...
DecisionTree Best Params: OrderedDict([('max_depth', 41), ('min_samples_leaf', 4), ('min_samples_split', 13)])
DecisionTree Best Score: 0.8262
Training final ensemble model...
Final Ensemble Model Performance:
Accuracy: 0.9000
Cohen's Kappa: 0.8485
MCC: 0.8488
Classification Report:
               precision    recall  f1-score   support

           A       0.91      0.86      0.89        36
           B       0.92      0.92      0.92        37
           C       0.88      0.91      0.90        47

    accuracy                           0.90       120
   macro avg       0.90      0.90