Sahib Athwal\
A13450589\
COGS 118A: Final\
Professor Fleischer






# **Experiment Overview**

This section runs through each of the supervised machine learning algorithms used: a linear Support Vector Machine, Decision Tree, Random Forest, and KNN. The results are displayed for each of the datasets the algorithms are run on. Each technique is implemented with the Python libraries imported below.

#### **Import Libraries**

In [None]:
# Import for reading and manipulating data
import pandas as pd

# Numerical operations
import numpy as np

# Importing stats module for calculating z-scores, mean, etc.
from scipy import stats

# Importing seaborn for data visualization
import seaborn as sns

# Importing scikit-learn modules
from sklearn.svm import SVR  # Support Vector Regression
from sklearn import tree  # Decision Tree
from sklearn import preprocessing  # Preprocessing tools
from sklearn.model_selection import GridSearchCV  # Grid Search for hyperparameter tuning
from sklearn.preprocessing import LabelEncoder  # Encoding categorical variables
#from sklearn.preprocessing import MultiColumnLabelEncoder  # Encoding categorical variables
from sklearn.preprocessing import OneHotEncoder  # One-hot encoding categorical variables
from sklearn.compose import ColumnTransformer  # Combining transformers for different data types
from sklearn.preprocessing import StandardScaler  # Scaling features
from sklearn.feature_selection import SelectKBest, f_classif  # Feature selection
from sklearn.utils import shuffle  # Shuffling data
from sklearn.model_selection import cross_val_score  # Cross-validation
from sklearn import svm  # Support Vector Machines
from sklearn.tree import DecisionTreeClassifier  # Decision Tree Classifier
from sklearn.ensemble import RandomForestClassifier  # Random Forest Classifier
from sklearn.neighbors import KNeighborsClassifier  # K-Nearest Neighbors Classifier
from sklearn.datasets import make_classification  # Creating classification datasets
from sklearn.linear_model import LogisticRegression  # Logistic Regression
from sklearn.model_selection import train_test_split  # Splitting data into train and test sets
from sklearn.pipeline import make_pipeline  # Creating pipelines

# Importing matplotlib for data visualization
import matplotlib.pyplot as plt
%config InlineBackend.figure_format = 'retina'  # High-resolution figures

# Import for reading and manipulating data
import scipy.io as sio


## **Classifier Functions**

#### **Heat Map Visualization**

This is the helper method that allows us to view each of our respective classifiers as a heat map.

In [None]:
# Running tally of how many heat maps have been rendered.
count = 0


def draw_heatmap(acc, acc_desc, C_list, character):
    """Show one grid of scores as an annotated heat map.

    Args:
        acc: score grid forwarded unchanged to ``sns.heatmap``.
        acc_desc: text placed at the start of the plot title.
        C_list: labels used for the y-axis ticks.
        character: symbol for the tuned hyperparameter; it is wrapped in
            ``$...$`` for the y-axis label and the title.

    Side effects:
        Opens and shows a new matplotlib figure and increments the
        module-level ``count``.
    """
    global count

    plt.figure(figsize=(2, 4))
    heat_axes = sns.heatmap(
        acc,
        annot=True,
        fmt='.3f',
        yticklabels=C_list,
        xticklabels=[],
    )
    heat_axes.collections[0].colorbar.set_label("accuracy")

    symbol = '$' + character + '$'
    heat_axes.set(ylabel=symbol)
    plt.title(acc_desc + ' w.r.t ' + symbol)

    # Kept in its original position: the style is set after the map is drawn.
    sns.set_style("whitegrid", {'axes.grid': False})
    plt.show()
    count += 1

#### **Linear Support Vector Machine Classifier**

This helper method is designed to perform binary classification using a Support Vector Machine (SVM) algorithm with a linear kernel. It aims to find the optimal value of the regularization parameter 'C' for the SVM model by performing a grid search with k-fold cross-validation.

In [None]:
def svm_func():
    """Tune and evaluate a linear-kernel SVM on the current data split.

    Reads the module-level ``X_train_val``, ``Y_train_val``, ``X_test`` and
    ``Y_test_val`` that the experiment loop assigns before calling this
    helper, and draws the train/validation heat maps via ``draw_heatmap``.

    Returns:
        tuple: ``(test_acc, best_train_acc, best_C)`` where ``test_acc`` is
        the accuracy of the refit model on the held-out rows,
        ``best_train_acc`` is the mean cross-validation training accuracy of
        the winning candidate, and ``best_C`` is the selected ``C``.
    """
    # Linear kernel instead of RBF (faster runtime)
    C_list = [10**-5, 10**-4, 10**-3, 10**-2, 10**-1, 1]

    # 5-fold grid search over C
    clf = GridSearchCV(svm.SVC(kernel='linear'), {'C': C_list},
                       return_train_score=True, cv=5)
    train_labels = Y_train_val.values.ravel()
    clf.fit(X_train_val, train_labels)

    # Plot training and validation accuracy per C
    train_acc = clf.cv_results_['mean_train_score']
    draw_heatmap(train_acc.reshape(-1, 1), 'train accuracy', C_list, 'C')

    val_acc = clf.cv_results_['mean_test_score']
    draw_heatmap(val_acc.reshape(-1, 1), 'val accuracy', C_list, 'C')

    # best_index_ points at the winning row of cv_results_, so no float
    # comparison against C_list is needed to look up its training score.
    best_C = clf.best_params_['C']
    best_train_acc = train_acc[clf.best_index_]

    # Refit with the best C and score on the held-out rows. The labels for
    # those rows are named Y_test_val by the experiment loop (the original
    # referenced an undefined Y_test here).
    optimal_classifier = svm.SVC(kernel='linear', C=best_C)
    optimal_classifier.fit(X_train_val, train_labels)
    test_acc = optimal_classifier.score(X_test, Y_test_val.values.ravel())
    return test_acc, best_train_acc, best_C


#### **Decision Tree Classifier**

This is the helper method that is designed to perform a grid search and cross validation to find the optimal maximum depth parameter (max_depth) for a Decision Tree Classifier. The function aims to identify the (max_depth) value that yields the best performance on the given classification task.

In [None]:
def decision_Tree():
    """Grid-search ``max_depth`` for an entropy decision tree and test it.

    Uses the module-level ``X_train_val``, ``Y_train_val``, ``X_test`` and
    ``Y_test_val``. Two heat maps (training and validation accuracy per
    depth) are drawn through ``draw_heatmap``.

    Returns:
        tuple: ``(test accuracy, mean CV training accuracy at the chosen
        depth, chosen depth)``.
    """
    D_list = np.array([1, 2, 3, 4, 5])

    # 5-fold search over the candidate depths
    search = GridSearchCV(
        DecisionTreeClassifier(criterion="entropy"),
        {'max_depth': D_list},
        cv=5,
        return_train_score=True,
    )
    search.fit(X_train_val, Y_train_val)

    # One heat map per score type, one row per depth
    mean_train = search.cv_results_['mean_train_score']
    mean_val = search.cv_results_['mean_test_score']
    draw_heatmap(mean_train.reshape(5, 1), 'DT train accuracy', D_list, 'D')
    draw_heatmap(mean_val.reshape(5, 1), 'DT val accuracy', D_list, 'D')

    # Refit a fresh tree at the selected depth and score the held-out rows
    D_star = search.best_params_['max_depth']
    final_tree = DecisionTreeClassifier(max_depth=D_star, criterion="entropy")
    final_tree.fit(X_train_val, Y_train_val)
    held_out_acc = final_tree.score(X_test, Y_test_val)

    # Candidates are unique, so the winning row is the one matching D_star
    return held_out_acc, mean_train[search.best_index_], D_star

#### **Random Forest Classifier**

This is the helper method that allows us to perform a grid search and cross-validation to find the optimal maximum depth for a Random Forest Classifier. This function aims to identify the max_depth value that yields the best performance on the given classification task.

In [None]:
def rand_Forest():
    """Grid-search ``max_depth`` for an entropy random forest and test it.

    Uses the module-level ``X_train_val``, ``Y_train_val``, ``X_test`` and
    ``Y_test_val``. Training and validation accuracies per depth are drawn
    through ``draw_heatmap``.

    Returns:
        tuple: ``(test accuracy, mean CV training accuracy at the chosen
        depth, chosen depth)``.
    """
    D_list = np.array([1, 2, 3, 4, 5])
    parameters = {'max_depth': D_list}

    # 5-fold grid search over the candidate depths
    classifier_grid = GridSearchCV(RandomForestClassifier(criterion="entropy"),
                                   parameters, cv=5, return_train_score=True)
    classifier_grid.fit(X_train_val, Y_train_val)

    # Heat maps are labelled 'D' (depth); the original used 'K', which is the
    # KNN symbol and mislabelled the axis.
    train_acc = classifier_grid.cv_results_['mean_train_score']
    val_acc = classifier_grid.cv_results_['mean_test_score']
    draw_heatmap(train_acc.reshape(-1, 1), 'RF train accuracy', D_list, 'D')
    draw_heatmap(val_acc.reshape(-1, 1), 'RF val accuracy', D_list, 'D')

    # Train and test with the best depth (entropy rather than default gini)
    D_star = classifier_grid.best_params_['max_depth']
    classifier_test1 = RandomForestClassifier(max_depth=D_star,
                                              criterion="entropy")
    classifier_test1.fit(X_train_val, Y_train_val)
    randForest_acc = classifier_test1.score(X_test, Y_test_val)

    # best_index_ is the cv_results_ row of the winning candidate
    best_train_acc = train_acc[classifier_grid.best_index_]

    return randForest_acc, best_train_acc, D_star

#### **KNN Classifier**

This is the helper method that allows us to run a grid search and cross-validation to find the optimal number of neighbors parameter for a K-Nearest Neighbors (KNN) classifier. The function aims to identify the n_neighbors parameter that results in the highest accuracy score.

In [None]:

def knn_classifier():
    """Grid-search ``n_neighbors`` for a KNN classifier and test it.

    Uses the module-level ``X_train_val``, ``Y_train_val``, ``X_test`` and
    ``Y_test_val``. Training and validation accuracies per ``k`` are drawn
    through ``draw_heatmap``.

    Returns:
        tuple: ``(test accuracy, mean CV training accuracy at the chosen k,
        chosen k)``.
    """
    k_list = np.array([1, 2, 3, 4, 5, 6])

    # 5-fold search over the candidate neighbour counts
    search = GridSearchCV(
        KNeighborsClassifier(),
        {'n_neighbors': k_list},
        cv=5,
        return_train_score=True,
    )
    search.fit(X_train_val, Y_train_val)

    # Heat maps for the training and validation scores respectively
    mean_train = search.cv_results_['mean_train_score']
    mean_val = search.cv_results_['mean_test_score']
    draw_heatmap(mean_train.reshape(6, 1), 'KNN train accuracy', k_list, 'K')
    draw_heatmap(mean_val.reshape(6, 1), 'KNN val accuracy', k_list, 'K')

    # Refit at the selected k and score the held-out rows
    k_star = search.best_params_['n_neighbors']
    final_knn = KNeighborsClassifier(n_neighbors=k_star)
    final_knn.fit(X_train_val, Y_train_val)
    held_out_acc = final_knn.score(X_test, Y_test_val)

    # Candidates are unique, so the winning row is the one matching k_star
    return held_out_acc, mean_train[search.best_index_], k_star

## **Bankruptcy Dataset**

This dataset provides 96 attributes that give us the information needed to predict whether a company will go bankrupt. It was chosen as a basis for a real-world scenario, in which a company exhibiting similar features may need to take decisive action if it is heading toward bankruptcy.


### **Cleaning Bankruptcy Dataset**

In [None]:
# Load the dataset; keep the raw frame untouched and work on a copy
bankrupcy_data_preserved = pd.read_csv('Bankrupcy.csv')
bankrupcy_data = bankrupcy_data_preserved.copy()

# Drop columns with a high percentage of missing values (if any).
# This must run BEFORE imputation: once the gaps are filled no column is
# sparse any more and the threshold could never remove anything.
# A column is kept when at least 80% of its rows are non-null.
min_non_null = int(np.ceil(bankrupcy_data.shape[0] * 0.8))
bankrupcy_data = bankrupcy_data.dropna(axis=1, thresh=min_non_null)

# Impute the remaining gaps with the column mean. numeric_only=True keeps
# this working on pandas >= 2.0 if the file contains non-numeric columns.
bankrupcy_data = bankrupcy_data.fillna(bankrupcy_data.mean(numeric_only=True))

# Drop duplicate entries (if any)
bankrupcy_data.drop_duplicates(inplace=True)


### **Cleaning Bankruptcy Dataset: Dealing with outliers**

In [None]:
# Flag rows whose value falls outside the 1.5 * IQR fences of a feature
def detect_outliers(data, features):
    """Collect index labels of IQR outliers, feature by feature.

    Args:
        data: DataFrame holding the columns named in ``features``.
        features: iterable of column names to examine.

    Returns:
        list: index labels of outlying rows, appended in feature order. A
        row that is an outlier in several features appears once per feature.
    """
    flagged = []
    for name in features:
        column = data[name]
        first_quartile = column.quantile(0.25)
        third_quartile = column.quantile(0.75)
        spread = third_quartile - first_quartile

        below_fence = column < first_quartile - 1.5 * spread
        above_fence = column > third_quartile + 1.5 * spread
        flagged.extend(data[below_fence | above_fence].index)
    return flagged

def _iqr_outlier_mask(series):
    """Return a boolean Series, True where a value is outside the 1.5 * IQR fences."""
    q1 = series.quantile(0.25)
    q3 = series.quantile(0.75)
    iqr = q3 - q1
    return (series < q1 - 1.5 * iqr) | (series > q3 + 1.5 * iqr)


# Replace outliers with the median or a specified value
def handle_outliers(data, features, strategy='median', value=None):
    """Overwrite each feature's IQR outliers with a replacement value.

    Outliers are judged per feature: a cell is replaced only when its own
    value lies outside that column's 1.5 * IQR fences. (The previous version
    also replaced ordinary values that happened to equal a value found in a
    row flagged for any other feature, and recomputed the outlier scan for
    every single cell.)

    Args:
        data: DataFrame to clean. It is modified in place and also returned.
        features: iterable of column names to process.
        strategy: ``'median'`` replaces outliers with the column median
            (computed before replacement); ``'value'`` replaces them with
            ``value``. Any other strategy leaves ``data`` unchanged.
        value: replacement used by the ``'value'`` strategy.

    Returns:
        The same ``data`` object.

    Raises:
        ValueError: if ``strategy == 'value'`` and ``value`` is None.
    """
    if strategy == 'value' and value is None:
        raise ValueError("You must specify a value when using the 'value' strategy.")
    if strategy not in ('median', 'value'):
        return data

    for feature in features:
        column = data[feature]
        replacement = column.median() if strategy == 'median' else value
        # One vectorised pass per column instead of a Python-level scan per cell
        data[feature] = column.mask(_iqr_outlier_mask(column), replacement)
    return data

# Example usage
# Numeric columns to clean. The label column 'Bankrupt?' is removed from the
# list: outlier replacement is meant for features, and if the label is an
# imbalanced 0/1 column its IQR is 0, so every minority label would be
# treated as an outlier and overwritten by the median.
numerical_features = bankrupcy_data.select_dtypes(include=['float64', 'int64']).columns
numerical_features = numerical_features.drop('Bankrupt?', errors='ignore')

# Identify outliers (index labels, one entry per offending feature)
outlier_indices = detect_outliers(bankrupcy_data, numerical_features)

# Replace outliers with the median
bankrupcy_data = handle_outliers(bankrupcy_data, numerical_features, strategy='median')

In [None]:
# Textual overview of the cleaned bankruptcy frame: names, sample rows,
# dtypes, distinct values per column, and summary statistics.
print("Column names:")
print(bankrupcy_data.columns)

print("\nFirst few rows:")
print(bankrupcy_data.head())

print("\nData types:")
print(bankrupcy_data.dtypes)

print("\nUnique values in each column:")
for column, column_values in bankrupcy_data.items():
    print(f"{column}: {column_values.unique()}")

print("\nSummary statistics of numerical columns:")
print(bankrupcy_data.describe())


In [None]:
# Separate numerical and categorical features by dtype
# NOTE(review): these lists come from the whole frame, so numerical_features
# may still contain the 'Bankrupt?' target — confirm before fitting anything on X.
numerical_features = bankrupcy_data.select_dtypes(include=['float64', 'int64']).columns
categorical_features = bankrupcy_data.select_dtypes(include=['object']).columns

# Build a transformer that scales numeric columns and one-hot encodes
# categorical ones. It is only constructed here; this cell never fits it.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numerical_features),
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features)
    ])

# Split the dataset into features (X) and target (y)
X = bankrupcy_data.drop('Bankrupt?', axis=1)  # Features
y = bankrupcy_data['Bankrupt?'] # Target variable

# Feature selection: keep the 10 columns with the highest ANOVA F-value.
# NOTE(review): the selector is fitted on all rows, i.e. before the
# train/test split below.
selector = SelectKBest(f_classif, k=10)  # Select the top 10 features based on ANOVA F-value
X_selected = selector.fit_transform(X, y)

# Split the selected features into training (80%) and testing (20%) sets
X_train, X_test, y_train, y_test = train_test_split(X_selected, y, test_size=0.2, random_state=42)


In [None]:
# Train fractions to evaluate: 80/20, 50/50 and 20/80 train/test splits
partitionVal = [0.8, 0.5, 0.2]
# Rows = partitions; columns used below: 1 = KNN, 2 = Decision Tree,
# 3 = Random Forest (column 0 is reserved for SVM).
result_table = np.zeros((3, 7))    # mean test accuracy over the trials
result_table1 = np.zeros((3, 7))   # CV training accuracy from the last trial
result_table2 = np.zeros((3, 7))   # chosen hyperparameter from the last trial

for i, partition in enumerate(partitionVal):
    print("Partition: ", partition)
    knn_test_acc = []
    rand_forest_test_acc = []
    decision_tree_test_acc = []
    svm_test_acc = []

    NUM_TRIALS = 5
    for trial in range(NUM_TRIALS):
        # Mix up the data
        bankrupcy_data = bankrupcy_data.sample(frac=1).reset_index(drop=True)
        # Find the point where to split the data
        breakNum = int(partition * len(bankrupcy_data))

        # Positional slices keep the two parts disjoint. The label-based
        # .loc[0:breakNum] used previously includes its end label, so row
        # breakNum ended up in both the training and the test set.
        X_train_full = bankrupcy_data.iloc[:breakNum]
        X_train_val = X_train_full.drop("Bankrupt?", axis=1)
        Y_train_val = X_train_full["Bankrupt?"]
        X_test_full = bankrupcy_data.iloc[breakNum:]
        X_test = X_test_full.drop("Bankrupt?", axis=1)
        Y_test_val = X_test_full["Bankrupt?"]

        # Call the svm classifier
        test_acc, best_train0, C0 = svm_func()
        svm_test_acc.append(test_acc)

        # Call the knn classifier
        test_acc, best_train1, C1 = knn_classifier()
        knn_test_acc.append(test_acc)

        # Call the Decision Tree classifier
        test_acc, best_train2, C2 = decision_Tree()
        decision_tree_test_acc.append(test_acc)

        # Call the Random Forest classifier
        test_acc, best_train3, C3 = rand_Forest()
        rand_forest_test_acc.append(test_acc)

    # The SVM assignments for column 0 were already disabled in the
    # original and are left that way, so column 0 stays at zero.
    # result_table[i, 0] = sum(svm_test_acc)/NUM_TRIALS
    result_table[i, 1] = sum(knn_test_acc) / NUM_TRIALS
    result_table[i, 2] = sum(decision_tree_test_acc) / NUM_TRIALS
    result_table[i, 3] = sum(rand_forest_test_acc) / NUM_TRIALS

    # result_table1[i, 0] = best_train0
    result_table1[i, 1] = best_train1
    result_table1[i, 2] = best_train2
    result_table1[i, 3] = best_train3

    # result_table2[i, 0] = C0
    result_table2[i, 1] = C1
    result_table2[i, 2] = C2
    result_table2[i, 3] = C3

    # Average the test accuracies over all NUM_TRIALS trials
    print("Test Accuracy Average for knn = ", sum(knn_test_acc) / NUM_TRIALS)
    print("Test Accuracy Average for Random Forest = ", sum(rand_forest_test_acc) / NUM_TRIALS)
    print("Test Accuracy Average for Decision Tree = ", sum(decision_tree_test_acc) / NUM_TRIALS)
    print("Test Accuracy Average for SVM = ", sum(svm_test_acc)/NUM_TRIALS)

    # y-axis: partition
    # x-axis: classifier
    print(result_table)
    print("############################")
    print(result_table1)
    print("############################")
    print(result_table2)

# Visualize the results
classifiers = ['KNN', 'Decision Tree', 'Random Forest']
x = np.arange(len(classifiers))
width = 0.2

fig, ax = plt.subplots(figsize=(8, 6))
for i, partition in enumerate(partitionVal):
    ax.bar(x + i * width, result_table[i, 1:4], width, label=f'Partition {partition}')

# Bars of a group sit at x, x + width, ...; put the tick under the middle
# of the group (the previous x + width / 2 was only right for two bars).
ax.set_xticks(x + width * (len(partitionVal) - 1) / 2)
ax.set_xticklabels(classifiers)
ax.set_xlabel('Classifier')
ax.set_ylabel('Average Test Accuracy')
ax.set_title('Performance of Different Classifiers for Bankruptcy Prediction')
ax.legend()

plt.show()

## **Students Performance Dataset**

This dataset provides 8 attributes that give us the information needed to predict how a student might score in math, reading, and writing. It was chosen as a basis for a real-world scenario, in which a school wants to see how students are doing based on their background.


### **Students Performance Dataset: Dealing with outliers**

In [None]:
# Custom MultiColumnLabelEncoder class
class MultiColumnLabelEncoder:
    """Label-encode several DataFrame columns in one call.

    Every selected column is encoded independently with a fresh
    ``LabelEncoder``; no fitted state is stored on the instance.
    """

    def __init__(self, columns=None):
        # Column names to encode; None means "encode every column".
        self.columns = columns

    def fit(self, X, y=None):
        """Do nothing and return ``self`` (kept for a fit/transform API)."""
        return self

    def transform(self, X):
        """Return a copy of ``X`` with the selected columns label-encoded.

        Transforms the columns of ``X`` listed in ``self.columns`` using
        ``LabelEncoder()``. If no columns were specified, transforms all
        columns in ``X``. ``X`` itself is not modified.
        """
        output = X.copy()
        if self.columns is not None:
            for col in self.columns:
                output[col] = LabelEncoder().fit_transform(output[col])
        else:
            # DataFrame.iteritems() was removed in pandas 2.0; items() yields
            # the same (name, Series) pairs on old and new versions.
            for colname, col in output.items():
                output[colname] = LabelEncoder().fit_transform(col)
        return output

    def fit_transform(self, X, y=None):
        """Equivalent to ``fit(X, y)`` followed by ``transform(X)``."""
        return self.fit(X, y).transform(X)

# Read the raw CSV and keep it untouched; all edits go to the working copy
students_data_preserved = pd.read_csv('StudentsPerformance.csv')
students_data = students_data_preserved.copy()

# Basic cleanup: discard rows with any missing value, then take a quick look
students_data = students_data.dropna()
print(students_data.shape)
print(students_data.head())

# Target variable and the remaining feature columns
target_column = 'math score'  # Replace with the desired target variable
y = students_data[target_column]
X = students_data.drop(columns=target_column)

# Feature names by dtype; the target is taken out of the numeric list
categorical_features = students_data.select_dtypes(include=['object']).columns
numerical_features = (
    students_data.select_dtypes(include=['float64', 'int64'])
    .columns
    .drop(target_column)
)

# Detect outliers: rows more than 3 standard deviations from a feature's mean
def detect_outliers(data, features):
    """Return the distinct index labels of z-score outliers.

    Args:
        data: DataFrame holding the columns named in ``features``.
        features: iterable of column names to examine.

    Returns:
        list: de-duplicated index labels of rows whose absolute z-score
        exceeds 3 in at least one of the features.
    """
    flagged = []
    for name in features:
        scores = stats.zscore(data[name])
        is_extreme = np.abs(scores) > 3
        flagged.extend(data.loc[is_extreme, name].index)
    unique_labels = set(flagged)
    return list(unique_labels)

# detect_outliers returns index LABELS, so rows are dropped by label. The
# previous X.index[outlier_indices] treated them as positions, which picks
# the wrong rows (or raises) once dropna has removed rows from the frame.
outlier_indices = detect_outliers(X, numerical_features)
X = X.drop(index=outlier_indices)
y = y.drop(index=outlier_indices)

# Encode categorical features using the custom MultiColumnLabelEncoder
categorical_transformer = MultiColumnLabelEncoder(columns=categorical_features)
X_categorical_encoded = categorical_transformer.fit_transform(X[categorical_features])

# Encode numerical features using StandardScaler
numerical_transformer = StandardScaler()
X_numerical = X[numerical_features]
X_numerical_scaled = numerical_transformer.fit_transform(X_numerical)

# Combine encoded categorical and scaled numerical features
# (categorical columns first, then the scaled numeric columns)
X_encoded = np.concatenate([X_categorical_encoded, X_numerical_scaled], axis=1)

# Split the data into training (80%) and testing (20%) sets
X_train, X_test, y_train, y_test = train_test_split(X_encoded, y, test_size=0.2, random_state=42)


In [None]:
# SVM function
def svm_func(X_train, X_test, y_train, y_test):
    """Tune ``C`` for a linear-kernel SVM by 5-fold CV, then score on the test set.

    Args:
        X_train, y_train: data used for the grid search and the final fit.
        X_test, y_test: held-out data used only for the final score.

    Returns:
        tuple: ``(test accuracy, mean CV training accuracy at the chosen C,
        chosen C)``.
    """
    # Linear kernel rather than RBF (faster runtime)
    C_list = [10**-5, 10**-4, 10**-3, 10**-2, 10**-1, 1]

    # 5-fold grid search over the candidate C values
    search = GridSearchCV(
        svm.SVC(kernel='linear'),
        {'C': C_list},
        return_train_score=True,
        cv=5,
    )
    search.fit(X_train, y_train)

    # Training score of the winning candidate (candidates are unique, so the
    # winning row is the one whose C equals the best C)
    best_C = search.best_params_['C']
    best_train_acc = search.cv_results_['mean_train_score'][search.best_index_]

    # Refit a fresh model at the best C and measure test accuracy
    final_svm = svm.SVC(kernel='linear', C=best_C)
    final_svm.fit(X_train, y_train)
    test_acc = final_svm.score(X_test, y_test)
    return test_acc, best_train_acc, best_C

# KNN function
def knn_classifier(X_train, X_test, y_train, y_test):
    """Tune ``n_neighbors`` for KNN by 5-fold CV, then score on the test set.

    Args:
        X_train, y_train: data used for the grid search and the final fit.
        X_test, y_test: held-out data used only for the final score.

    Returns:
        tuple: ``(test accuracy, mean CV training accuracy at the chosen k,
        chosen k)``.
    """
    k_list = np.array([1, 2, 3, 4, 5, 6])

    # 5-fold grid search over the candidate neighbour counts
    search = GridSearchCV(
        KNeighborsClassifier(),
        {'n_neighbors': k_list},
        cv=5,
        return_train_score=True,
    )
    search.fit(X_train, y_train)

    # Training score of the winning candidate
    k_star = search.best_params_['n_neighbors']
    best_train_acc = search.cv_results_['mean_train_score'][search.best_index_]

    # Refit at the selected k and measure test accuracy
    final_knn = KNeighborsClassifier(n_neighbors=k_star)
    final_knn.fit(X_train, y_train)
    knn_acc = final_knn.score(X_test, y_test)

    return knn_acc, best_train_acc, k_star

# Decision Tree function
def decision_Tree(X_train, X_test, y_train, y_test):
    """Tune ``max_depth`` for an entropy decision tree by 5-fold CV, then test it.

    Args:
        X_train, y_train: data used for the grid search and the final fit.
        X_test, y_test: held-out data used only for the final score.

    Returns:
        tuple: ``(test accuracy, mean CV training accuracy at the chosen
        depth, chosen depth)``.
    """
    D_list = np.array([1, 2, 3, 4, 5])

    # 5-fold grid search over the candidate depths
    search = GridSearchCV(
        DecisionTreeClassifier(criterion="entropy"),
        {'max_depth': D_list},
        cv=5,
        return_train_score=True,
    )
    search.fit(X_train, y_train)

    # Training score of the winning candidate
    D_star = search.best_params_['max_depth']
    best_train_acc = search.cv_results_['mean_train_score'][search.best_index_]

    # Refit at the selected depth and measure test accuracy
    final_tree = DecisionTreeClassifier(max_depth=D_star, criterion="entropy")
    final_tree.fit(X_train, y_train)
    decision_tree_acc = final_tree.score(X_test, y_test)

    return decision_tree_acc, best_train_acc, D_star

# Random Forest function
def rand_Forest(X_train, X_test, y_train, y_test):
    """Tune ``max_depth`` for an entropy random forest by 5-fold CV, then test it.

    Args:
        X_train, y_train: data used for the grid search and the final fit.
        X_test, y_test: held-out data used only for the final score.

    Returns:
        tuple: ``(test accuracy, mean CV training accuracy at the chosen
        depth, chosen depth)``.
    """
    D_list = np.array([1, 2, 3, 4, 5])

    # 5-fold grid search over the candidate depths
    search = GridSearchCV(
        RandomForestClassifier(criterion="entropy"),
        {'max_depth': D_list},
        cv=5,
        return_train_score=True,
    )
    search.fit(X_train, y_train)

    # Training score of the winning candidate
    D_star = search.best_params_['max_depth']
    best_train_acc = search.cv_results_['mean_train_score'][search.best_index_]

    # Refit at the selected depth and measure test accuracy
    final_forest = RandomForestClassifier(max_depth=D_star, criterion="entropy")
    final_forest.fit(X_train, y_train)
    random_forest_acc = final_forest.score(X_test, y_test)

    return random_forest_acc, best_train_acc, D_star

# Train fractions to evaluate: 80/20, 50/50 and 20/80 train/test splits
partitionVal = [0.8, 0.5, 0.2]
# Rows = partitions; columns: 0 = SVM, 1 = KNN, 2 = Decision Tree, 3 = Random Forest
result_table = np.zeros((3, 7))    # mean test accuracy over the trials
result_table1 = np.zeros((3, 7))   # CV training accuracy from the last trial
result_table2 = np.zeros((3, 7))   # chosen hyperparameter from the last trial

for i, partition in enumerate(partitionVal):
    print("Partition: ", partition)
    knn_test_acc = []
    rand_forest_test_acc = []
    decision_tree_test_acc = []
    svm_test_acc = []

    NUM_TRIALS = 5
    for trial in range(NUM_TRIALS):
        # Use a different (but reproducible) seed per trial. With a fixed
        # random_state=42 every trial drew the identical split, so averaging
        # over trials added nothing. Trial 0 still reproduces the old split.
        X_train, X_test, y_train, y_test = train_test_split(
            X_encoded, y, test_size=1-partition, random_state=42 + trial)

        test_acc, best_train0, C0 = svm_func(X_train, X_test, y_train, y_test)
        svm_test_acc.append(test_acc)

        test_acc, best_train1, C1 = knn_classifier(X_train, X_test, y_train, y_test)
        knn_test_acc.append(test_acc)

        test_acc, best_train2, C2 = decision_Tree(X_train, X_test, y_train, y_test)
        decision_tree_test_acc.append(test_acc)

        test_acc, best_train3, C3 = rand_Forest(X_train, X_test, y_train, y_test)
        rand_forest_test_acc.append(test_acc)

    result_table[i, 0] = sum(svm_test_acc) / NUM_TRIALS
    result_table[i, 1] = sum(knn_test_acc) / NUM_TRIALS
    result_table[i, 2] = sum(decision_tree_test_acc) / NUM_TRIALS
    result_table[i, 3] = sum(rand_forest_test_acc) / NUM_TRIALS

    result_table1[i, 0] = best_train0
    result_table1[i, 1] = best_train1
    result_table1[i, 2] = best_train2
    result_table1[i, 3] = best_train3

    result_table2[i, 0] = C0
    result_table2[i, 1] = C1
    result_table2[i, 2] = C2
    result_table2[i, 3] = C3

    # Print average test accuracy for each model
    print("Test Accuracy Average for SVM = ", sum(svm_test_acc) / NUM_TRIALS)
    print("Test Accuracy Average for knn = ", sum(knn_test_acc) / NUM_TRIALS)
    print("Test Accuracy Average for Random Forest = ", sum(rand_forest_test_acc) / NUM_TRIALS)
    print("Test Accuracy Average for Decision Tree = ", sum(decision_tree_test_acc) / NUM_TRIALS)

    # y-axis: partition
    # x-axis: classifier
    print(result_table)
    print("############################")
    print(result_table1)
    print("############################")
    print(result_table2)
