In [2]:
def compare(X_datasets, datasets_str, y_datasets):
    """
    Compare logistic regression performance across several variants of a dataset.

    Each (X, y) pair is scored with 5-fold-by-default `cross_validate` on a fresh
    `LogisticRegression(max_iter=3000)` using accuracy, and the mean training and
    validation scores over the folds are collected.

    Params: 
            X_datasets - Array with all the X sets that are the result of an action performed on X_train.
            datasets_str - Array with all the names of action performed on X_train (one per X set).
            y_datasets - Array with all the corresponding y sets (at least one per X set).
    
    Output: A table (DataFrame) indexed by 'datasets_str' with the columns 'train_score' and
            'cv_score' holding the mean training and cross validation accuracy per action.

    Raises: ValueError if the number of names or of y sets does not match the number of X sets.
    """
    num_sets = len(X_datasets)

    # Fail fast: a mismatch would otherwise only surface after every (expensive)
    # cross-validation run has already finished.
    if len(datasets_str) != num_sets or len(y_datasets) < num_sets:
        raise ValueError(
            f"Expected {num_sets} dataset names and y sets, got "
            f"{len(datasets_str)} names and {len(y_datasets)} y sets."
        )

    train_score = []
    cv_score = []

    for X, y in zip(X_datasets, y_datasets):
        # cross_validate fits its own clones of the estimator on every fold, so
        # fitting the model on the full data beforehand would be wasted work.
        cv_model_LR = cross_validate(LogisticRegression(max_iter=3000), X, y, scoring='accuracy',
                                     n_jobs=-1, return_train_score=True)

        train_score.append(cv_model_LR['train_score'].mean())
        cv_score.append(cv_model_LR['test_score'].mean())

    # One row per action, one column per score type
    return pd.DataFrame({'train_score': train_score, 'cv_score': cv_score}, index=datasets_str)

In [4]:
from scipy import stats
from scipy.stats import zscore

def remove_outliers(dataset, feature, *, threshold=3):
    """
    Drop the outliers of 'feature' in 'dataset' using Z-score. A data point is identified as an
    outlier when the absolute value of its Z-score is greater than or equal to 'threshold'.

    Missing values are ignored when computing the Z-scores and are never treated as outliers,
    so rows with a missing 'feature' value are kept. A constant feature has no outliers.
    
    Params: 
            dataset - DataFrame that contains the data set.
            feature - The name of the current feature for which we would like to look for outliers.
            threshold - Absolute Z-score from which a sample counts as an outlier (default 3).
    
    Output: DataFrame that contains samples from 'dataset' that are not found to be outliers for 'feature'.
    """
    # Without nan_policy='omit' a single NaN turns every Z-score into NaN.
    z_scores = np.asarray(stats.zscore(dataset[feature], nan_policy='omit'))
    is_outlier = np.abs(z_scores) >= threshold

    # Negate the outlier mask instead of testing '< threshold': comparisons with NaN are
    # False, so rows with an undefined Z-score are kept rather than silently dropped.
    return dataset[~is_outlier]

In [None]:
def show_outliers(dataset, feature, *, threshold=3):
    """
    Return the samples that are considered as outliers for 'feature' in 'dataset' using Z-score.
    A sample is an outlier when the absolute value of its Z-score is greater than or equal to
    'threshold'. Missing values are ignored when computing the Z-scores and are never reported
    as outliers.
    
    Params: 
            dataset - DataFrame that contains the data set.
            feature - The name of the current feature for which we would like to look for outliers.
            threshold - Absolute Z-score from which a sample counts as an outlier (default 3).
    
    Output: DataFrame that contains samples from 'dataset' that are found to be outliers for 'feature'.
    """
    # Without nan_policy='omit' a single NaN turns every Z-score into NaN, which would
    # hide all the real outliers.
    z_scores = np.asarray(stats.zscore(dataset[feature], nan_policy='omit'))
    is_outlier = np.abs(z_scores) >= threshold

    return dataset[is_outlier]

In [2]:
def model_evaluation(model):
    """
    Print the best estimator found for 'model' during GridSearchCV and the corresponding
    training and cross-validation scores.

    Params:
            model - A fitted search object exposing 'best_params_', 'best_estimator_',
                    'best_score_', 'best_index_' and 'cv_results_'.

    Output: None. The summary is printed. If the search did not record training scores,
            a note is printed in place of the training score.
    """
    print(f"\033[1mBest parameters found: \033[0m {model.best_params_}")
    print(f"\033[1mBest estimator found: \033[0m {model.best_estimator_}")
    print(f"\033[1mBest CV score found: \033[0m {model.best_score_}")

    # 'mean_train_score' is only recorded when the search was created with
    # return_train_score=True; report that instead of failing with a KeyError.
    if 'mean_train_score' in model.cv_results_:
        train_summary = model.cv_results_['mean_train_score'][model.best_index_]
    else:
        train_summary = "not available (search was run without return_train_score=True)"
    print(f"\033[1mCorresponding training set score: \033[0m {train_summary}")

In [None]:
def test_results(predictions, y_test):
    """
    Report how well 'predictions' match 'y_test' on the test set.

    Prints, in order: the accuracy score, the classification report (classes named
    'target 0' and 'target 1') and the confusion matrix labelled with actual classes
    as rows and predicted classes as columns.

    Params:
            predictions - The labels predicted for the test set.
            y_test - The true labels of the test set.
    """
    accuracy = accuracy_score(y_test, predictions)
    print(f"\n\033[1mTest accuracy score: \033[0m {accuracy}")

    report = classification_report(y_test, predictions, target_names=['target 0', 'target 1'])
    print(report)

    # Label the raw matrix so that rows/columns are readable in the printout
    labelled_matrix = pd.DataFrame(
        confusion_matrix(y_test, predictions),
        index=['Actual 0', 'Actual 1'],
        columns=['Predicted 0', 'Predicted 1'],
    )
    print("\033[1mConfusion matrix:\033[0m")
    print(labelled_matrix)