In [7]:
import pickle
from collections import OrderedDict
import deepchecks
import deepchecks.tabular.checks as checks
import ipywidgets as widgets
from deepchecks.tabular import Suite, Dataset
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

def train_and_save_model(train_dataset, filepath='model.pkl'):
    """Train a sample RandomForest model and save it to a pickle file.

    Args:
        train_dataset: deepchecks ``Dataset`` whose frame contains a
            ``'label'`` column; every other column is used as a feature.
        filepath: Destination path for the pickled model.

    Returns:
        The fitted ``RandomForestClassifier``.
    """
    # Read the frame through ``.data``, the accessor the rest of this notebook
    # uses on Dataset objects; ``.dataframe`` does not appear to be part of the
    # deepchecks Dataset API.
    train_df = train_dataset.data
    X_train = train_df.drop(columns=['label'])
    y_train = train_df['label']

    # Fixed random_state keeps the fitted forest reproducible between runs.
    model = RandomForestClassifier(n_estimators=50, random_state=42)
    model.fit(X_train, y_train)

    with open(filepath, 'wb') as file:
        pickle.dump(model, file)

    print(f"Model saved to {filepath}")
    return model

In [8]:
def load_model_from_pickle(filepath='model.pkl'):
    """Read a pickled object back from disk and return it.

    Args:
        filepath: Path of the pickle file to read.

    Returns:
        Whatever object was stored in the file.
    """
    # Unpickling can execute arbitrary code: only point this at trusted files.
    with open(filepath, 'rb') as handle:
        restored = pickle.load(handle)
    print(f"Model loaded from {filepath}")
    return restored


In [9]:
#def train_sample_model(train_dataset):
#    """Train a sample RandomForest model."""
#    train_df = train_dataset.data
#    X_train = train_df.drop(columns=['label'])
#    y_train = train_df['label']
#    model = RandomForestClassifier(n_estimators=50, random_state=42)
#    model.fit(X_train, y_train)
#    return model
def generate_sample_data():
    """Build a reproducible synthetic train/test pair of deepchecks Datasets.

    Returns:
        ``(train_dataset, test_dataset)``: a 50/50 split of 2000 generated
        rows, each wrapped in a ``Dataset`` with ``'label'`` as the label.
    """
    np.random.seed(42)
    n_rows = 2000

    # The draws consume the seeded global RNG in sequence, so their order
    # (feature1, feature2, feature3, label) must stay as is.
    columns = {}
    columns['feature1'] = np.random.normal(0, 1, n_rows)
    columns['feature2'] = np.random.normal(5, 2, n_rows)
    columns['feature3'] = np.random.randint(0, 2, n_rows)
    columns['label'] = np.random.randint(0, 2, n_rows)
    frame = pd.DataFrame(columns)

    train_part, test_part = train_test_split(frame, test_size=0.5, random_state=42)
    return Dataset(train_part, label='label'), Dataset(test_part, label='label')

from sklearn.ensemble import RandomForestClassifier

def train_sample_model(train_dataset, test_dataset, filepath='model.pkl'):
    """Train a sample RandomForest model and add predicted probabilities and labels.

    Args:
        train_dataset: deepchecks ``Dataset`` with a ``'label'`` column; all
            other columns are used as features.
        test_dataset: deepchecks ``Dataset`` with the same columns.
        filepath: Destination path for the pickled model.

    Returns:
        ``(model, train_df, test_df)``: the fitted classifier plus pandas
        DataFrames holding the original columns and two added ones,
        ``predicted_probability`` and ``predicted_label``.
    """
    # Work on copies so the prediction columns added below are not written
    # into the frames held by the Dataset objects that were passed in.
    train_df = train_dataset.data.copy()
    test_df = test_dataset.data.copy()

    X_train = train_df.drop(columns=['label'])
    y_train = train_df['label']
    X_test = test_df.drop(columns=['label'])

    # Train the model
    model = RandomForestClassifier(n_estimators=50, random_state=42)
    model.fit(X_train, y_train)

    with open(filepath, 'wb') as file:
        pickle.dump(model, file)

    print(f"Model saved to {filepath}")

    # Column 1 of predict_proba is taken as the positive-class probability,
    # which assumes binary classification.
    train_df["predicted_probability"] = model.predict_proba(X_train)[:, 1]
    train_df["predicted_label"] = model.predict(X_train)

    test_df["predicted_probability"] = model.predict_proba(X_test)[:, 1]
    test_df["predicted_label"] = model.predict(X_test)

    return model, train_df, test_df



In [10]:
# Generate the synthetic split, fit and pickle the model, then persist the
# scored frames as CSV. After the second statement `train_dataset` and
# `test_dataset` no longer name deepchecks Datasets: they are rebound to the
# pandas DataFrames returned by train_sample_model (with prediction columns).
train_dataset, test_dataset = generate_sample_data()
model, train_dataset, test_dataset = train_sample_model(train_dataset, test_dataset)

# index = False keeps the shuffled row index out of the files.
train_dataset.to_csv(r"train.csv", index = False)
test_dataset.to_csv(r"test.csv", index = False)



Model saved to model.pkl


In [11]:
# Wrap the scored training frame in a deepchecks Dataset and display the frame
# it holds. The label column name is spelled out here because the `label`
# variable is only assigned in a later cell, so a fresh-kernel run of this
# cell would otherwise fail with NameError.
tmp = Dataset(train_dataset, label="label")
tmp.data



Unnamed: 0,feature1,feature2,feature3,label,predicted_probability,predicted_label
440,0.384065,8.499168,0,0,0.42,0
573,-0.219101,4.899591,1,0,0.04,0
946,-1.525525,5.170419,0,0,0.32,0
997,0.640843,2.516479,1,0,0.22,0
503,0.562969,3.775527,0,0,0.28,0
...,...,...,...,...,...,...
1130,0.321357,4.821532,1,0,0.12,0
1294,0.081829,2.585957,0,0,0.12,0
860,0.202923,3.937571,0,1,0.84,1
1459,0.673181,2.656691,1,1,0.70,1


In [12]:
print(model.feature_names_in_)
# Build the column list in two steps: list.append returns None, so
# `list(...).append(...)` cannot be used as the value itself.
feat_list = list(model.feature_names_in_)
# The label name is written out because the `label` variable is only assigned
# in a later cell; referencing it here breaks a top-to-bottom run.
feat_list.append("label")
feat_list

['feature1' 'feature2' 'feature3']


['feature1', 'feature2', 'feature3', 'label']

In [20]:
from collections import OrderedDict
import deepchecks
import deepchecks.tabular.checks as checks
import ipywidgets as widgets
from deepchecks.tabular import Suite, Dataset
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

def get_user_input():
    """Return the default thresholds for checks that take a parameter.

    No interactive input is read; a new dict of fixed values is returned on
    every call.
    """
    return dict(
        drift_threshold=0.1,
        correlation_threshold=0.9,
        max_missing_ratio=0.05,
        min_accuracy=0.7,
        max_overfit_ratio=1.5,
    )

def get_model_evaluation_checks(params):
    """Retrieve and organize model evaluation checks into an OrderedDict with conditions.

    Args:
        params: Threshold dict as produced by ``get_user_input``. It is
            accepted for interface compatibility but not consulted; the
            condition thresholds are the literals below.

    Returns:
        ``OrderedDict`` mapping an integer slot to a configured deepchecks
        check. Slot 3 is intentionally absent.
    """
    # Built once: the previous version constructed the identical mapping twice
    # and threw the first one away.
    check_dict_default = OrderedDict([
        (0, checks.TrainTestPerformance().add_condition_test_performance_greater_than(min_score = 0.20)),
        (1, checks.RocReport().add_condition_auc_greater_than(min_auc = 0.7)),
        (2, checks.SimpleModelComparison()),
        (4, checks.CalibrationScore()),
    ])

    return check_dict_default

def create_model_evaluation_suite():
    """Assemble the "Model Evaluation Suite" from the configured checks.

    Returns:
        A deepchecks ``Suite`` holding every check returned by
        ``get_model_evaluation_checks``, in mapping order.
    """
    configured_checks = get_model_evaluation_checks(get_user_input())
    return Suite("Model Evaluation Suite", *configured_checks.values())

def evaluate_model(model, train_dataset, test_dataset):
    """Run model evaluation checks using deepchecks.

    Up to two suite runs are performed: one scored from predictions stored in
    the data, and one scored by calling ``model`` directly.

    Reads two notebook-level globals: ``label`` (name of the label column) and
    ``prediction_label_column`` (name of the column with stored predicted
    labels, or ``None`` to skip the stored-prediction run).

    Args:
        model: Fitted estimator exposing ``feature_names_in_``, or ``None`` to
            skip the model-based run.
        train_dataset: pandas DataFrame with the training rows.
        test_dataset: pandas DataFrame with the test rows.

    Returns:
        ``(result1, result2)``: ``result1`` is the suite result from the
        stored predictions and ``result2`` the one from ``model``. An entry is
        ``None`` when its run was skipped.
    """
    print("Creating datasets...")

    # Start from None so the return statement never touches an unbound name
    # when one of the two runs is skipped.
    result1 = None
    result2 = None
    y_pred_train = None
    y_pred_test = None

    if prediction_label_column is not None:
        # Select the stored predictions and drop the column on a new frame.
        # DataFrame.pop would remove the column from the caller's frame, so a
        # second call with the same frames would raise KeyError.
        y_pred_train = train_dataset[prediction_label_column]
        y_pred_test = test_dataset[prediction_label_column]
        train_dataset = train_dataset.drop(columns=[prediction_label_column])
        test_dataset = test_dataset.drop(columns=[prediction_label_column])
        print(y_pred_train)

    # NOTE(review): any other prediction-derived column still present (the
    # notebook's "predicted_probability", for instance) stays in the frame and
    # is treated as a feature by these Datasets - confirm that is intended.
    train_dataset = Dataset(train_dataset, label = label)
    test_dataset = Dataset(test_dataset, label = label)

    if prediction_label_column is not None:
        suite1 = create_model_evaluation_suite()
        result1 = suite1.run(train_dataset = train_dataset,
                             test_dataset = test_dataset,
                             y_pred_train = y_pred_train,
                             y_pred_test = y_pred_test)

        result1.show()

    if model is not None:
        # Restrict the data to the columns the model was fitted on, plus the
        # label, before letting the suite call the model.
        feat_list = list(model.feature_names_in_)
        feat_list.append(label)
        print(feat_list)
        display(train_dataset)
        train_dataset = Dataset(train_dataset.data[feat_list], label=label)
        test_dataset = Dataset(test_dataset.data[feat_list], label=label)
        suite2 = create_model_evaluation_suite()
        result2 = suite2.run(train_dataset = train_dataset, test_dataset = test_dataset, model = model)
        result2.show()

    return result1, result2

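# Example Usage:
# NOTE: create_train_test_suite() is not defined at this point in the notebook;
# define it before running the next three lines.
# suite = create_train_test_suite()
# train_dataset, test_dataset = generate_sample_data()
# suite.run(train_dataset, test_dataset)

# Running Model Evaluation (the globals `label` and `prediction_label_column`
# must be assigned before evaluate_model is called):
# model, train_df, test_df = train_sample_model(train_dataset, test_dataset)
# evaluate_model(model, train_df, test_df)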


In [21]:
def load_csv(file_path):
    """Read a CSV file into a pandas DataFrame, reporting failures instead of raising.

    Args:
        file_path: Path of the CSV file to read.

    Returns:
        The loaded DataFrame, or ``None`` when reading fails or the file holds
        no rows (the reason is printed).
    """
    try:
        frame = pd.read_csv(file_path)
        if frame.empty:
            # Routed through the handler below so an empty file is reported
            # the same way as any other load failure.
            raise ValueError("Uploaded file is empty.")
    except Exception as e:
        print(f"Error loading {file_path}: {e}")
        return None

    print(f"Loaded {file_path} with shape: {frame.shape}")
    return frame

# Run configuration: input files plus the column names that the evaluate_model
# defined in the preceding cell reads from the notebook namespace.
train_path = "train.csv"
test_path = "test.csv"
label = "label"  # label column name; read as a global by evaluate_model
model_file = "model.pkl"  # NOTE(review): not used below - `model` is the object left in the kernel by the earlier training cell
prediction_label_column = "predicted_label"  # stored-prediction column; read as a global by evaluate_model
probability_column = "predicted_probability"  # NOTE(review): assigned but not referenced by evaluate_model

import pandas as pd
print("Loading data...")
# load_csv returns None on failure; the frames are passed on without a check.
train_df = load_csv(train_path)
test_df = load_csv(test_path)    

# Last expression of the cell, so the returned (result1, result2) tuple is displayed.
evaluate_model(model, train_df, test_df)



Loading data...
Loaded train.csv with shape: (1000, 6)
Loaded test.csv with shape: (1000, 6)
Creating datasets...
0      0
1      0
2      0
3      0
4      0
      ..
995    0
996    0
997    1
998    1
999    1
Name: predicted_label, Length: 1000, dtype: int64




Accordion(children=(VBox(children=(HTML(value='\n<h1 id="summary_4XC0HJ3TMN8HMPIWKAFPTCBH5">Model Evaluation S…

['feature1', 'feature2', 'feature3', 'label']


Unnamed: 0,Column,DType,Kind,Additional Info
0,label,integer,,
1,feature1,floating,Numerical Feature,
2,feature2,floating,Numerical Feature,
3,feature3,integer,Categorical Feature,
4,predicted_probability,floating,Numerical Feature,

Unnamed: 0,label,feature1,feature2,feature3,predicted_probability
0,0,0.384065,8.499168,0,0.42
1,0,-0.219101,4.899591,1,0.04
2,0,-1.525525,5.170419,0,0.32
3,0,0.640843,2.516479,1,0.22
4,0,0.562969,3.775527,0,0.28
...,...,...,...,...,...
995,0,0.321357,4.821532,1,0.12
996,0,0.081829,2.585957,0,0.12
997,1,0.202923,3.937571,0,0.84
998,1,0.673181,2.656691,1,0.70




Accordion(children=(VBox(children=(HTML(value='\n<h1 id="summary_Z4MME56EVRDRSSW3BQDUQ3U8H">Model Evaluation S…

(Model Evaluation Suite, Model Evaluation Suite)

In [None]:
from collections import OrderedDict
import deepchecks
import deepchecks.tabular.checks as checks
import ipywidgets as widgets
from deepchecks.tabular import Suite, Dataset
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

def get_user_input():
    """Return the fixed default thresholds used to configure checks.

    Nothing is read from the user; each call returns a new dict.
    """
    defaults = (
        ('drift_threshold', 0.1),
        ('correlation_threshold', 0.9),
        ('max_missing_ratio', 0.05),
        ('min_accuracy', 0.7),
        ('max_overfit_ratio', 1.5),
    )
    return dict(defaults)

#def get_train_test_checks(params):
#    """Retrieve and organize train-test validation checks into an OrderedDict with conditions."""
#    check_dict = OrderedDict({
#        0: checks.TrainTestFeatureDrift().add_condition_drift_score_less_than(params['drift_threshold']),
#        1: checks.TrainTestLabelDrift().add_condition_drift_score_less_than(params['drift_threshold']),
#        2: checks.TrainTestFeatureCorrelation().add_condition_correlation_less_than(params['correlation_threshold']),
#        3: checks.TrainTestLabelCorrelationChange(),
#        4: checks.TrainTestPredictionDrift().add_condition_drift_score_less_than(params['drift_threshold']),
#        5: checks.TrainTestSamplesMix(),
#        6: checks.TrainTestMissingValuesComparison().add_condition_max_missing_fraction_less_than(params['max_missing_ratio']),
#        7: checks.TrainTestDuplicateSamples(),
#        8: checks.TrainTestCategoryMismatch()
#    })
#    return check_dict

#def create_train_test_suite():
#    """Create a Train-Test Evaluation suite with configured conditions."""
#    params = get_user_input()
#    check_dict = get_train_test_checks(params)
#    suite = Suite("Train-Test Evaluation Suite", *check_dict.values())
#    return suite

def get_model_evaluation_checks(params):
    """Build the ordered mapping of model evaluation checks.

    Args:
        params: Threshold dict; accepted but not consulted - the condition
            thresholds are the literals used below.

    Returns:
        ``OrderedDict`` keyed by integer slot (0, 1, 2, 4) whose values are
        configured deepchecks checks. Slots 3 and 5 are left unused.
    """
    check_dict = OrderedDict()
    # Slot 0: train/test performance, test score must exceed 0.20.
    check_dict[0] = checks.TrainTestPerformance().add_condition_test_performance_greater_than(min_score = 0.20)
    # Slot 1: ROC report, AUC must exceed 0.7.
    check_dict[1] = checks.RocReport().add_condition_auc_greater_than(min_auc = 0.7)
    # Slots 2 and 4 run with no extra condition attached.
    check_dict[2] = checks.SimpleModelComparison()
    check_dict[4] = checks.CalibrationScore()
    return check_dict

def create_model_evaluation_suite():
    """Assemble the "Model Evaluation Suite" from the configured checks.

    Returns:
        A deepchecks ``Suite`` containing each check produced by
        ``get_model_evaluation_checks``, in mapping order.
    """
    thresholds = get_user_input()
    selected_checks = list(get_model_evaluation_checks(thresholds).values())
    return Suite("Model Evaluation Suite", *selected_checks)

def generate_sample_data():
    """Create a seeded synthetic dataset and split it into train and test Datasets.

    Returns:
        ``(train_dataset, test_dataset)``: two deepchecks ``Dataset`` objects
        with ``'label'`` as the label column, from a 50/50 split of 2000 rows.
    """
    np.random.seed(42)
    sample_count = 2000

    # Keep the draw order (feature1, feature2, feature3, label): each call
    # advances the seeded global RNG.
    feature1 = np.random.normal(0, 1, sample_count)
    feature2 = np.random.normal(5, 2, sample_count)
    feature3 = np.random.randint(0, 2, sample_count)
    target = np.random.randint(0, 2, sample_count)
    frame = pd.DataFrame({
        'feature1': feature1,
        'feature2': feature2,
        'feature3': feature3,
        'label': target,
    })

    train_part, test_part = train_test_split(frame, test_size=0.5, random_state=42)
    train_dataset = Dataset(train_part, label='label')
    test_dataset = Dataset(test_part, label='label')
    return train_dataset, test_dataset

def train_sample_model(train_dataset):
    """Fit a RandomForest classifier on the training Dataset.

    NOTE(review): this one-argument definition replaces the earlier
    ``train_sample_model(train_dataset, test_dataset, filepath)`` once the
    cell runs; it neither pickles the model nor adds prediction columns.

    Args:
        train_dataset: deepchecks ``Dataset`` whose frame has a ``'label'``
            column; all remaining columns are used as features.

    Returns:
        The fitted ``RandomForestClassifier``.
    """
    frame = train_dataset.data
    classifier = RandomForestClassifier(n_estimators=50, random_state=42)
    classifier.fit(frame.drop(columns=['label']), frame['label'])
    return classifier

def evaluate_model(model, train_dataset, test_dataset):
    """Run the model evaluation suite and return its result without showing it.

    Args:
        model: Model handed to the suite.
        train_dataset: Training data handed to the suite.
        test_dataset: Test data handed to the suite.

    Returns:
        The object returned by ``Suite.run``.
    """
    return create_model_evaluation_suite().run(train_dataset, test_dataset, model)

# Example Usage:
# suite = create_train_test_suite()
# train_dataset, test_dataset = generate_sample_data()
# suite.run(train_dataset, test_dataset)

# Running Model Evaluation:
# model = train_sample_model(train_dataset)
# evaluate_model(model, train_dataset, test_dataset)


In [None]:
#train_dataset, test_dataset = generate_sample_data()
#train_dataset.data.to_csv(r"C:\Users\DELL\Downloads\train_dataset.csv", index = False)
#test_dataset.data.to_csv(r"C:\Users\DELL\Downloads\test_dataset.csv", index = False)

In [None]:
# Regenerate the synthetic split; both names are deepchecks Datasets here.
train_dataset, test_dataset = generate_sample_data()

# Running Model Evaluation:
# Relies on the one-argument train_sample_model and the evaluate_model defined
# earlier in this part of the notebook. The returned suite result is the last
# expression, so it becomes the cell output.
model = train_sample_model(train_dataset)
evaluate_model(model, train_dataset, test_dataset)
