## Graphical Visualization for Data Analysis

In [8]:
def next_hist(row, col, ind, dataset, feature, str_feature, bins, color):
    """
    The function draws a histogram of one feature in a cell of a subplot grid.

    Args:
        row: A number of rows of the subplot grid.
        col: A number of columns of the subplot grid.
        ind: An index of the subplot to draw into.
        dataset: A data set indexed by feature.
        feature: A key of the plotted feature in dataset.
        str_feature: A feature name used in the title and as the x-axis label.
        bins: A bins specification passed to the histogram.
        color: A color of the bars.

    Returns:
        Nothing
    """
    ax = plt.subplot(row, col, ind)
    ax.hist(dataset[feature], bins=bins, color=color, edgecolor='black')
    ax.set_xlabel(str_feature)
    ax.set_ylabel('Frequency')
    ax.set_title(f'{str_feature} Distribution')

def next_scatter(row, col, ind, x, y, hue, str_x, str_y, legend_adjustment):
    """
    The function draws a hue-colored scatter plot in a cell of a subplot grid.

    Args:
        row: A number of rows of the subplot grid.
        col: A number of columns of the subplot grid.
        ind: An index of the subplot to draw into.
        x: Values for the x-axis.
        y: Values for the y-axis.
        hue: Values used for coloring the points.
        str_x: An x-axis label.
        str_y: A y-axis label.
        legend_adjustment: A legend location passed as loc.

    Returns:
        Nothing
    """
    ax = plt.subplot(row, col, ind)
    sns.scatterplot(x=x, y=y, hue=hue, palette='viridis', ax=ax)
    ax.set_title(f'Scatter Plot: {str_y} vs. {str_x}')
    ax.set_xlabel(str_x)
    ax.set_ylabel(str_y)
    ax.legend(title='Obes/No', loc=legend_adjustment, fancybox=True, framealpha=0.7)
    ax.grid(True)

def next_hist_multi(row, col, ind, dataset, multiplied_df, feature, str_feature, k, colors, labels):
    """
    The function draws a multi-series histogram in a cell of a subplot grid.

    Args:
        row: A number of rows of the subplot grid.
        col: A number of columns of the subplot grid.
        ind: An index of the subplot to draw into.
        dataset: A data set whose column feature defines the range of the bins.
        multiplied_df: The data to plot (one histogram series per column).
        feature: A key of the feature in dataset.
        str_feature: A feature name used in the title and as the x-axis label.
        k: A number of bins.
        colors: Colors of the series.
        labels: Legend labels of the series.

    Returns:
        Nothing
    """
    ax = plt.subplot(row, col, ind)
    column = dataset[feature]
    # k equal-width bins need k + 1 edges between the feature minimum and maximum
    edges = np.linspace(column.min(), column.max(), num=k + 1)
    ax.hist(multiplied_df, bins=edges, color=colors, edgecolor='black', label=labels)
    ax.legend(prop={'size': 8})
    ax.set_title(f'{str_feature} Distribution')
    ax.set_xlabel(str_feature)
    ax.grid(True)

def get_hot_encoder(dataset):
    """
    The function fits a one-hot encoder on the 'NObeyesdad' column.

    Args:
        dataset: A data frame containing the column 'NObeyesdad'.

    Returns:
        cat_encoder: The fitted OneHotEncoder.
        NObeyesdad_cat_1hot: The encoded 'NObeyesdad' column as returned by fit_transform.
    """
    # Double brackets keep the selection two-dimensional (a one-column frame)
    target_frame = dataset[['NObeyesdad']]
    encoder = OneHotEncoder()
    encoded = encoder.fit_transform(target_frame)
    return encoder, encoded

def multiply_feature_by_one_hot(dataset, feature_name, cat_encoder, NObeyesdad_cat_1hot):
    """
    The function multiplies a feature column by every column of a one-hot matrix.

    Args:
        dataset: A data frame containing the column feature_name.
        feature_name: A name of the feature to multiply.
        cat_encoder: An encoder providing get_feature_names_out for the column names.
        NObeyesdad_cat_1hot: A one-hot matrix providing multiply(); the product must provide toarray().

    Returns:
        A data frame with one column per encoded category; each row holds the feature
        value where the one-hot entry is 1 and 0 elsewhere.
    """
    # A column vector lets the product broadcast over all one-hot columns
    column_vector = dataset[feature_name].values.reshape(-1, 1)
    product = NObeyesdad_cat_1hot.multiply(column_vector)
    category_columns = cat_encoder.get_feature_names_out(['NObeyesdad'])
    return pd.DataFrame(product.toarray(), columns=category_columns)

def next_boxplot_multi(row, col, ind, multiplied_df, feature, str_feature, labels):
    """
    The function draws one box plot per column of multiplied_df in a cell of a subplot grid.

    Args:
        row: A number of rows of the subplot grid.
        col: A number of columns of the subplot grid.
        ind: An index of the subplot to draw into.
        multiplied_df: A data frame; the non-zero values of each column form one box.
        feature: Not used by the function.
        str_feature: A feature name used in the title.
        labels: Labels of the boxes.

    Returns:
        Nothing
    """
    # Zeros are dropped per column (presumably the entries masked out by the
    # one-hot multiplication - confirm against the caller)
    non_zero_groups = [
        np.array(multiplied_df[name][multiplied_df[name] != 0])
        for name in multiplied_df.columns
    ]
    ax = plt.subplot(row, col, ind)
    ax.boxplot(non_zero_groups, labels=labels)
    ax.set_title('Data Distribution over ' + str_feature)
    ax.grid(True)
    
def get_hot_encoder(dataset):
    """
    The function creates a OneHotEncoder and fits it on the 'NObeyesdad' column.

    Args:
        dataset: A data frame containing the column 'NObeyesdad'.

    Returns:
        cat_encoder: The fitted OneHotEncoder.
        NObeyesdad_cat_1hot: The encoded 'NObeyesdad' column as returned by fit_transform.
    """
    # NOTE: a function with the same name and body is defined earlier in this
    # file; this later definition is the one that stays bound.
    labels_frame = dataset[['NObeyesdad']]
    cat_encoder = OneHotEncoder()
    return cat_encoder, cat_encoder.fit_transform(labels_frame)

def multiply_feature_by_one_hot(dataset, feature_name, cat_encoder, NObeyesdad_cat_1hot):
    """
    The function spreads a feature over the one-hot columns by element-wise multiplication.

    Args:
        dataset: A data frame containing the column feature_name.
        feature_name: A name of the feature to multiply.
        cat_encoder: An encoder providing get_feature_names_out for the column names.
        NObeyesdad_cat_1hot: A one-hot matrix providing multiply(); the product must provide toarray().

    Returns:
        A data frame with one column per encoded category holding the feature value
        where the one-hot entry is 1 and 0 elsewhere.
    """
    # NOTE: a function with the same name and body is defined earlier in this
    # file; this later definition is the one that stays bound.
    values_as_column = dataset[feature_name].values.reshape(-1, 1)
    dense_product = NObeyesdad_cat_1hot.multiply(values_as_column).toarray()
    return pd.DataFrame(
        dense_product,
        columns=cat_encoder.get_feature_names_out(['NObeyesdad']),
    )

def box_plot_zoom(df, feature_y, feature_x, labels, title):
    """
    The function draws box plots of feature_y grouped by the values of feature_x.

    Args:
        df: A data frame containing the columns feature_y and feature_x.
        feature_y: A name of the column whose distribution is plotted.
        feature_x: A name of the column used for grouping.
        labels: Labels of the boxes (one per unique value of feature_x, in sorted order).
        title: A title of the plot.

    Returns:
        Nothing
    """
    # The values are taken from df itself; the previous version read them from
    # the global public_trans, ignoring the data frame passed by the caller.
    data_vectors = [
        np.array(df[feature_y][df[feature_x] == value])
        for value in sorted(pd.unique(df[feature_x]))
    ]
    plt.boxplot(data_vectors, labels=labels)
    plt.title(title)
    plt.grid(True)

## Model Fitting

In [1]:
# Functions for fitting the models
import sklearn
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier

def LR_fitting(X_train, y_train, hyper_C, max_iter):
    """
    The function builds a multinomial logistic regression (lbfgs solver) and fits it.

    Args:
        X_train: A train feature matrix.
        y_train: A train output vector.
        hyper_C: A hyperparameter passed to LogisticRegression as C.
        max_iter: A hyperparameter - maximum number of solver iterations.

    Returns:
        The fitted model.
    """
    # NOTE(review): newer scikit-learn releases reportedly deprecate the
    # multi_class argument - confirm against the installed version.
    settings = {
        'C': hyper_C,
        'multi_class': 'multinomial',
        'solver': 'lbfgs',
        'max_iter': max_iter,
    }
    classifier = LogisticRegression(**settings)
    classifier.fit(X_train, y_train)
    return classifier

def KNN_fitting(X_train, y_train, n_neighbors = 3):
    """
    The function builds a k-nearest-neighbors classifier and fits it.

    Args:
        X_train: A train feature matrix.
        y_train: A train output vector.
        n_neighbors: A hyperparameter - a number of neighbors (3 by default).

    Returns:
        The fitted model.
    """
    classifier = KNeighborsClassifier(n_neighbors=n_neighbors)
    classifier.fit(X_train, y_train)
    return classifier


def DT_fitting(X_train, y_train, min_samples_leaf, max_depth):
    """
    The function builds a decision tree classifier with the entropy criterion and fits it.

    Args:
        X_train: A train feature matrix.
        y_train: A train output vector.
        min_samples_leaf: A hyperparameter passed to DecisionTreeClassifier as min_samples_leaf.
        max_depth: A hyperparameter - maximum depth of the tree.

    Returns:
        The fitted model.
    """
    tree = DecisionTreeClassifier(
        criterion='entropy',
        min_samples_leaf=min_samples_leaf,
        max_depth=max_depth,
    )
    tree.fit(X_train, y_train)
    return tree

def SVM_fitting(X_train, y_train, hyper_C):
    """
    The function creates and fits a support vector classifier with the linear kernel.

    Args:
        X_train: A train feature matrix.
        y_train: A train output vector.
        hyper_C: A hyperparameter passed to SVC as C.

    Returns:
        The fitted model.
    """
    # SVC is not among the imports next to the fitting functions, so it is
    # imported here to keep the function usable on its own.
    from sklearn.svm import SVC

    model = SVC(C=hyper_C, kernel='linear')
    model.fit(X_train, y_train)

    return model


In [9]:
def model_choice(X_train_i, y_train, model_name, dict_of_params):
    """
    The function fits the model selected by model_name.

    Args:
        X_train_i: A train feature matrix.
        y_train: A train output vector.
        model_name: A model name: 'LR', 'KNN', 'DT' or 'SVC'.
        dict_of_params: A dictionary containing hyperparameters:
                        'LR'  - 'C' and 'max_iter';
                        'KNN' - 'n_neighbors';
                        'DT'  - 'min_samples_leaf' and 'max_depth';
                        'SVC' - 'C'.

    Returns:
        The fitted model.

    Raises:
        ValueError: If model_name is not one of the supported names.
        KeyError: If a hyperparameter required by the model is missing in dict_of_params.
    """
    if model_name == 'LR':
        return LR_fitting(X_train_i, y_train, dict_of_params['C'], dict_of_params['max_iter'])
    if model_name == 'KNN':
        return KNN_fitting(X_train_i, y_train, dict_of_params['n_neighbors'])
    if model_name == 'DT':
        return DT_fitting(X_train_i, y_train,
                          dict_of_params['min_samples_leaf'], dict_of_params['max_depth'])
    if model_name == 'SVC':
        return SVM_fitting(X_train_i, y_train, dict_of_params['C'])

    # Previously an unknown name fell through to `return model` and failed
    # with an UnboundLocalError that did not name the offending value.
    raise ValueError(
        f"Unknown model_name {model_name!r}; expected one of 'LR', 'KNN', 'DT', 'SVC'"
    )

## Feature Extraction Using Backward Stepwise Algorithm

In this section, the backward stepwise algorithm is used for constructing performance metrics in the space of the number of features and a chosen hyperparameter. In KNN, the number of neighbors is used as the hyperparameter. In the Decision Tree, the depth of the tree is used as the hyperparameter.

In [10]:
def feature_extraction(X_train_scaled, y_train, X_test_scaled, y_test, model_name, dict_of_params):
    """
    The function extracts features using the backward stepwise algorithm.

    The model is first fitted on all features. Then, at each step, every remaining
    column is removed in turn, the model is re-fitted, and the removal giving the
    highest training F1 score is kept. The steps continue until one feature is left.

    Args:
        X_train_scaled: A scaled train feature matrix.
        y_train:        A train output vector.
        X_test_scaled:  A scaled test feature matrix.
        y_test:         A test output vector.
        model_name:     A model name.
        dict_of_params: A dictionary containing hyperparameters.

    Returns:
        df_train: A data frame that contains the following columns
                 'acc' accuracy of the best among the models containing a reduced number of features
                 'f1'  f1 score of the best among the models containing a reduced number of features
                 'voided_column' the index (in the current reduced matrix) of the column
                                 excluded at each step; the first row describes the full
                                 model and holds the total number of features, which is
                                 not a valid column index
        df_test:  A data frame that contains the same columns for test data
    """
    n_total = X_train_scaled.shape[1]
    X_train_i = X_train_scaled
    X_test_i  = X_test_scaled

    # Baseline row: the model fitted on all features. The sentinel is the
    # feature count (the original constant 16 only fits 16-feature data).
    model = model_choice(X_train_i, y_train, model_name, dict_of_params)
    acc_train, f1_train = clf_performance(model, X_train_i, y_train)
    acc_test, f1_test   = clf_performance(model, X_test_i, y_test)
    train_rows = [(acc_train, f1_train, n_total)]
    test_rows  = [(acc_test,  f1_test,  n_total)]

    for n_remaining in range(n_total, 1, -1):
        # best = (f1, acc, model, column). Starting from None instead of zeros
        # guarantees a fitted model is selected even if every F1 score is 0.
        best = None
        for j in range(n_remaining):
            X_train_j = np.delete(X_train_i, j, axis = 1)
            model = model_choice(X_train_j, y_train, model_name, dict_of_params)
            acc_train, f1_train = clf_performance(model, X_train_j, y_train)
            # Strict comparison keeps the first column among ties
            if best is None or f1_train > best[0]:
                best = (f1_train, acc_train, model, j)

        f1_max, acc_max, mod_max, j_max = best
        X_train_i = np.delete(X_train_i, j_max, axis = 1)
        X_test_i  = np.delete(X_test_i,  j_max, axis = 1)
        acc_test, f1_test = clf_performance(mod_max, X_test_i, y_test)

        train_rows.append((acc_max,  f1_max,  j_max))
        test_rows.append((acc_test, f1_test, j_max))

    columns = ['acc', 'f1', 'voided_column']
    df_train = pd.DataFrame(train_rows, columns = columns)
    df_test  = pd.DataFrame(test_rows,  columns = columns)
    df_train['voided_column'] = df_train['voided_column'].astype(int)
    df_test['voided_column']  = df_test['voided_column'].astype(int)

    return df_train, df_test

def features_list(features_list, j_list, n_features = 16, is_print = False):
    """
    The function replays the removals recorded by the backward stepwise algorithm
    and returns the feature set left at the requested step.

    Args:
        features_list: A list of all features.
        j_list:        A list of feature numbers excluded at each step of the backward
                       stepwise algorithm; an entry that is not smaller than the initial
                       number of features is skipped.
        n_features:    A number of features in the output feature list.
        is_print:      If True, the feature set is printed initially and after each removal.

    Returns:
        return_list: The feature set captured at step len(features_list) - n_features,
                     or the unchanged input if no removal happens at that step.
    """
    total = len(features_list)
    target_step = total - n_features
    if is_print:
        print(features_list)

    remaining = features_list
    selected = features_list
    for step in range(total):
        removed = j_list[step]
        if removed >= total:
            # Not a column index (e.g. the entry recorded for the full model)
            continue
        remaining = np.delete(remaining, removed)
        if is_print:
            print(remaining)
        if step == target_step:
            selected = remaining
    return selected

def get_matrices(n_features, X_train_scaled, X_test_scaled, j_list):
    """
    The function replays the removals recorded by the backward stepwise algorithm
    on the train and test matrices until n_features columns are left.

    Args:
        n_features:     A number of features in the output matrices.
        X_train_scaled: A scaled matrix of train data.
        X_test_scaled:  A scaled matrix of test data.
        j_list:         A list of feature numbers excluded at each step of the backward
                        stepwise algorithm; an entry that is not smaller than the number
                        of columns of X_test_scaled is skipped.

    Returns:
        X_train_i: A shrunk matrix of train data.
        X_test_i:  A shrunk matrix of test data.
    """
    # One extra step accounts for the leading entry of j_list that is skipped
    n_steps = X_train_scaled.shape[1] - n_features + 1
    n_columns = X_test_scaled.shape[1]

    train_part, test_part = X_train_scaled, X_test_scaled
    for step in range(n_steps):
        removed = j_list[step]
        if removed < n_columns:
            # The same column leaves both matrices so they stay aligned
            train_part = np.delete(train_part, removed, axis=1)
            test_part = np.delete(test_part, removed, axis=1)

    return train_part, test_part

def get_performace_on_grid(X_train_scaled, y_train, X_test_scaled, y_test, x_range, model_name):
    """
    The function runs the backward stepwise feature extraction for every value of a
    hyperparameter grid and collects the performance metrics.

    Args:
        X_train_scaled: A scaled matrix of train data.
        y_train:        A train output.
        X_test_scaled:  A scaled matrix of test data.
        y_test:         A test output.
        x_range:        A grid for a hyperparameter: the number of neighbors for 'KNN',
                        the maximum depth for 'DT' (min_samples_leaf is fixed to 3),
                        C for 'SVC'.
        model_name:     A model name: 'KNN', 'DT' or 'SVC'.

    Returns:
        Acc_train_matr: A matrix of the training accuracy of each grid node.
        F1_train_matr:  A matrix of the training F1 score of each grid node.
        Acc_test_matr:  A matrix of the test accuracy of each grid node.
        F1_test_matr:   A matrix of the test F1 score of each grid node.
        Voided_col:     A matrix of voided columns for each number of features.

        Each grid value gives one column. The metric columns are stored in reversed
        step order (the last extraction step first); Voided_col keeps the step order.
        If x_range holds a single value, the outputs are 1-D vectors.

    Raises:
        ValueError: If model_name is not supported or x_range is empty.
    """
    grid_settings = {
        'KNN': ('Number of neighbors:', lambda value: {'n_neighbors': value}),
        'DT':  ('Max depth:',           lambda value: {'min_samples_leaf': 3, 'max_depth': value}),
        'SVC': ('C:',                   lambda value: {'C': value}),
    }
    # Fail early with a clear message; previously both cases ended in an
    # UnboundLocalError (dict_of_params or the result matrices never assigned).
    if model_name not in grid_settings:
        raise ValueError(
            f"Unsupported model_name {model_name!r}; expected one of 'KNN', 'DT', 'SVC'"
        )
    if len(x_range) == 0:
        raise ValueError('x_range must contain at least one hyperparameter value')

    caption, make_params = grid_settings[model_name]

    acc_train_cols, f1_train_cols = [], []
    acc_test_cols,  f1_test_cols  = [], []
    voided_cols = []
    for value in x_range:
        print(caption, value)
        df_train, df_test = feature_extraction(X_train_scaled, y_train, X_test_scaled, y_test,
                                               model_name, make_params(value))
        acc_train_cols.append(df_train['acc'].to_numpy()[::-1])
        f1_train_cols.append(df_train['f1'].to_numpy()[::-1])
        acc_test_cols.append(df_test['acc'].to_numpy()[::-1])
        f1_test_cols.append(df_test['f1'].to_numpy()[::-1])
        # feature_extraction writes the same voided columns to both frames
        voided_cols.append(df_train['voided_column'].to_numpy())

    def as_matrix(columns):
        # A single grid value keeps the 1-D output of the original implementation
        return columns[0] if len(columns) == 1 else np.column_stack(columns)

    return (as_matrix(acc_train_cols), as_matrix(f1_train_cols),
            as_matrix(acc_test_cols), as_matrix(f1_test_cols),
            as_matrix(voided_cols))

#### Plot of a performance metric in a space of number of features and a chosen hyperparameter

In [11]:
def plot_3d(ax, metric, x_range, y_range, metric_name, param_name, is_error = False):
    """
    The function plots a performance metric as a 3D surface over a hyperparameter
    (x-axis) and a number of features (y-axis).

    Args:
        ax: A subplot providing plot_surface.
        metric: A matrix of a performance metric.
        x_range: An x-axis range.
        y_range: A y-axis range.
        metric_name: A name of a performance metric (used as the title).
        param_name: A hyperparameter name (used as the x-axis label).
        is_error: If True, one minus the metric is plotted.

    Returns:
        Nothing
    """
    grid_x, grid_y = np.meshgrid(x_range, y_range)
    # An error surface is the complement of the metric to one
    surface = np.ones(metric.shape) - metric if is_error else metric
    ax.plot_surface(grid_x, grid_y, surface, cmap='viridis', alpha=0.7)

    ax.set_title(metric_name)
    ax.set_xlabel(param_name)
    ax.set_ylabel('Features')

def plot_performance_metric(row, col, ind, train_metric, test_metric, x_range, metric_name, n_features, x_label, loc_pos):
    """
    The function plots one row of the train and test metric matrices versus a hyperparameter.

    Args:
        row: A row of a subplot.
        col: A column of a subplot.
        ind: An index of a graph.
        train_metric: A matrix of a performance metric obtained for the train data.
        test_metric:  A matrix of a performance metric obtained for the test data.
        x_range: A vector of x-axis positions.
        metric_name:  A name of a performance metric; for 'Error' one minus the metric is plotted.
        n_features:   An index of the plotted matrix row; shown in the title as the
                      number of features when it is greater than zero.
        x_label: A name of the x-axis.
        loc_pos: A legend location (4 - right bottom, 5 - right center)

    Returns:
        Nothing
    """
    # NOTE(review): the row with index n_features is plotted. Confirm that this row
    # really corresponds to n_features features in the matrices passed by the caller.
    train_row = train_metric[n_features][:]
    test_row  = test_metric[n_features][:]
    if metric_name == 'Error':
        train_row = np.ones(train_metric.shape[1]) - train_row
        test_row  = np.ones(test_metric.shape[1])  - test_row

    heading = f'{metric_name} vs. {x_label}'
    if n_features > 0:
        heading = f'{heading} for {n_features} Features'

    ax = plt.subplot(row, col, ind)
    ax.plot(x_range, train_row, marker='o', linestyle='-', color='b', label='Train ' + metric_name)
    ax.plot(x_range, test_row, marker='o', linestyle='-', color='g', label='Test ' + metric_name)
    ax.set_xlabel(x_label)
    ax.set_ylabel(metric_name)
    ax.set_title(heading)
    ax.legend(loc=loc_pos)
    ax.grid(True)

## Getting Model Performance

In [None]:
def clf_performance(model, X, y, is_print = False, y_pred = None):
    """
    The function calculates the performance of a classifier.

    Args:
        model: A model for the performance estimation; used only if y_pred is not given.
        X: A feature matrix passed to model.predict.
        y: A true output vector.
        is_print: If True, the results and the confusion matrix will be printed.
        y_pred: A predicted output vector (array or list). If it is None or empty,
                the predictions are obtained from model.predict(X).

    Returns:
        acc: The accuracy
        f1: The macro-averaged F1-score.
    """
    # None replaces the former array default; an explicitly passed empty array
    # still triggers the prediction, and plain lists are accepted as well.
    if y_pred is None or len(y_pred) == 0:
        y_pred = model.predict(X)

    f1 = f1_score(y, y_pred, average = 'macro')
    acc = accuracy_score(y, y_pred)
    if is_print:
        print(f"Accuracy = {acc:.4f}")
        print(f"F1 = {f1:.4f}")
        cm = confusion_matrix(y, y_pred)
        print('Confusion matrix:')
        print(cm)

    return acc, f1
