# Breast Cancer Wisconsin (Diagnostic)

## Data Set Preparation 

### Accesing and Splitting Data

The dataset is already available in the folder so please don't change the directory of the dataset.

In [1]:
import pandas as pd
import numpy as np
import os
#install the pandas library which is used for data manipulation, preparation
#os is a built in library

In [2]:
#define columns
columns = [
    'id', 'diagnosis', 
    'radius_mean', 'texture_mean', 'perimeter_mean', 'area_mean', 'smoothness_mean',
    'compactness_mean', 'concavity_mean', 'concave_points_mean', 'symmetry_mean', 'fractal_dimension_mean',
    'radius_se', 'texture_se', 'perimeter_se', 'area_se', 'smoothness_se',
    'compactness_se', 'concavity_se', 'concave_points_se', 'symmetry_se', 'fractal_dimension_se',
    'radius_worst', 'texture_worst', 'perimeter_worst', 'area_worst', 'smoothness_worst',
    'compactness_worst', 'concavity_worst', 'concave_points_worst', 'symmetry_worst', 'fractal_dimension_worst'
]
column_types= ['categorical','categorical','continuous','continuous','continuous','continuous','continuous','continuous','continuous','continuous','continuous','continuous','continuous','continuous','continuous','continuous','continuous','continuous','continuous','continuous','continuous','continuous','continuous','continuous','continuous','continuous','continuous','continuous','continuous','continuous','continuous','continuous',]
base_dir = os.getcwd()
data_path = os.path.join(base_dir, 'dataset', 'Breast_Cancer_Dataset', 'wdbc.data')
df = pd.read_csv(data_path, header=None, names=columns)
column_types.pop(df.columns.get_loc('id'))
df.drop(columns='id', inplace=True)

df['diagnosis'] = df['diagnosis'].map({'M': 1, 'B': 0})
column_types.pop(df.columns.get_loc('diagnosis'))
input_dataset=df.drop(columns='diagnosis')
output_dataset= df['diagnosis']
preview_input = input_dataset.head(10).copy()
preview_output = output_dataset.head(10).copy()
print(preview_input)
print(preview_output)

print("input_dataset: "+ str(len(input_dataset)) + "\t output_dataset: "+ str(len(output_dataset)))


   radius_mean  texture_mean  perimeter_mean  area_mean  smoothness_mean  \
0        17.99         10.38          122.80     1001.0          0.11840   
1        20.57         17.77          132.90     1326.0          0.08474   
2        19.69         21.25          130.00     1203.0          0.10960   
3        11.42         20.38           77.58      386.1          0.14250   
4        20.29         14.34          135.10     1297.0          0.10030   
5        12.45         15.70           82.57      477.1          0.12780   
6        18.25         19.98          119.60     1040.0          0.09463   
7        13.71         20.83           90.20      577.9          0.11890   
8        13.00         21.82           87.50      519.8          0.12730   
9        12.46         24.04           83.97      475.9          0.11860   

   compactness_mean  concavity_mean  concave_points_mean  symmetry_mean  \
0           0.27760         0.30010              0.14710         0.2419   
1           0

### Data Validation

In [3]:
#Checking for NaN or incompatible data entries and replacing the missing inputs with 0 and deleting the entries that lack an output
if input_dataset.isnull().values.any():
    input_dataset = input_dataset.fillna(0) #Filling missing inputs or NaN with 0
valid_mask = output_dataset.isin([0, 1])
if not valid_mask.all():
    input_dataset = input_dataset[valid_mask].reset_index(drop=True) # dropping rows that lacks valid output
    output_dataset = output_dataset[valid_mask].reset_index(drop=True)



### Data Preparation

In [4]:
#Now instead of MinMax Scaler I would like to use NormalScaler without a library which is fairly easy
input_mean=input_dataset.mean(axis=0) #axis=0 tells the function to operate columnwise
input_std=input_dataset.std(axis=0)
input_dataset_normalized = (input_dataset-input_mean)/(input_std + 0.000000000000000000001) #the constant value is to avoid division by zero because some of the datasets actually have a constant column so that std turns out to be 0

preview_input = input_dataset_normalized.head(10).copy()
preview_output = output_dataset.head(10).copy()
print(preview_input)
print(preview_output) 
#Let's see how the datasets have been prepared for further ML algorithms

   radius_mean  texture_mean  perimeter_mean  area_mean  smoothness_mean  \
0     1.096100     -2.071512        1.268817   0.983510         1.567087   
1     1.828212     -0.353322        1.684473   1.907030        -0.826235   
2     1.578499      0.455786        1.565126   1.557513         0.941382   
3    -0.768233      0.253509       -0.592166  -0.763792         3.280667   
4     1.748758     -1.150804        1.775011   1.824624         0.280125   
5    -0.475956     -0.834601       -0.386808  -0.505206         2.235455   
6     1.169878      0.160508        1.137124   1.094332        -0.123028   
7    -0.118413      0.358135       -0.072803  -0.218772         1.602639   
8    -0.319885      0.588312       -0.183919  -0.383870         2.199903   
9    -0.473118      1.104467       -0.329192  -0.508616         1.581308   

   compactness_mean  concavity_mean  concave_points_mean  symmetry_mean  \
0          3.280628        2.650542             2.530249       2.215566   
1         -0.

### Data Splitting

In [5]:
#Split the dataset for 80-20 ratio, use 80% for training and 20% for testing, kind of like rule of thumb in ML.
combined = list(zip(input_dataset_normalized.values.tolist(), output_dataset))
np.random.seed(42)
np.random.shuffle(combined)

input_normalized_shuffled, output_shuffled = zip(*combined)
input_normalized_shuffled = np.array(input_normalized_shuffled)
output_shuffled = np.array(output_shuffled)
split_point = int(0.8 * len(input_normalized_shuffled))
input_normalized_shuffled_train_dataset, input_normalized_shuffled_test_dataset = input_normalized_shuffled[:split_point], input_normalized_shuffled[split_point:]
output_shuffled_train_dataset, output_shuffled_test_dataset = output_shuffled[:split_point], output_shuffled[split_point:]


preview_input_train = input_normalized_shuffled_train_dataset[:10]
preview_output_train = output_shuffled_train_dataset[:10]
preview_input_test = input_normalized_shuffled_test_dataset[:10]
preview_output_test = output_shuffled_test_dataset[:10]

print(pd.DataFrame(preview_input_train).round(2))
print(pd.Series(preview_output_train))
print(pd.DataFrame(preview_input_test).round(2))
print(pd.Series(preview_output_test))


     0     1     2     3     4     5     6     7     8     9   ...    20  \
0 -0.47 -0.16 -0.45 -0.49  0.23  0.03 -0.11 -0.28  0.41  0.13  ... -0.27   
1  1.37  0.47  1.30  1.35 -0.45 -0.03  0.24  0.79 -0.84 -1.16  ...  1.78   
2  0.38  0.04  0.40  0.27  0.91  0.34  0.73  0.82  0.44 -0.69  ...  0.62   
3 -0.49 -0.37 -0.43 -0.53  0.64  0.52 -0.14 -0.54 -0.00  1.16  ... -0.70   
4 -0.73 -1.13 -0.71 -0.72  0.25  0.15 -0.27 -0.59  0.02  0.71  ... -0.83   
5  1.84  2.33  1.98  1.73  1.52  3.27  3.29  2.66  2.14  1.04  ...  1.96   
6  2.24  0.61  2.27  2.35  0.71  1.72  1.96  2.61  0.05 -0.20  ...  2.36   
7  0.98 -0.99  0.95  0.85  0.15  0.22  0.12  0.79 -0.26 -0.19  ...  0.77   
8 -0.22 -0.80 -0.23 -0.38  0.81  0.93  0.35  0.54  0.48  0.88  ... -0.15   
9 -0.06 -0.62 -0.12 -0.16 -2.00 -0.97 -0.83 -0.92  0.01 -1.05  ... -0.23   

     21    22    23    24    25    26    27    28    29  
0 -0.17 -0.33 -0.36  0.45 -0.10 -0.02 -0.20  0.18  0.20  
1  0.15  1.75  1.73 -0.57 -0.13 -0.02  0.98 -0.

### Linear Regression

In [6]:
class LinearRegression:
    def __init__(self, learning_rate=0.001, iter=1000, features=None, target_attribute=None):
        self.learning_rate = learning_rate
        self.iter = iter
        self.features = features
        self.target_attribute = target_attribute
        self.weights = None
        self.bias = None

    def train(self):
        num_samples = self.features.shape[0]
        num_features = self.features.shape[1]
        self.weights = np.zeros(num_features)
        self.bias = 0.0

        for _ in range(self.iter):
            predicted_output = np.dot(self.features, self.weights) + self.bias
            error = predicted_output - self.target_attribute
            dw = np.mean(self.features * error[:, np.newaxis], axis=0)
            db = np.mean(error)
            self.weights -= self.learning_rate * dw
            self.bias -= self.learning_rate * db

    def predict(self, input_test):
        return np.dot(input_test, self.weights) + self.bias

    def calculate_mse(self, input_test, actual_output):
        predicted_output = self.predict(input_test)
        mse = np.mean((predicted_output - actual_output) ** 2)
        print(f"Linear Regression MSE: {mse:.4f}")
        return mse


In [7]:
linreg = LinearRegression(
    features=input_normalized_shuffled_train_dataset,
    target_attribute=output_shuffled_train_dataset
)
linreg.train()
lin_mse=linreg.calculate_mse(input_normalized_shuffled_test_dataset, output_shuffled_test_dataset)



output_pred = linreg.predict(input_normalized_shuffled_test_dataset)
output_pred_rounded = np.round(output_pred).astype(int)

correct_predictions = 0
total_samples = len(output_shuffled_test_dataset)

for i in range(total_samples):
    predicted = output_pred_rounded[i]
    actual = output_shuffled_test_dataset[i]
    if predicted == actual:
        correct_predictions += 1

lin_accuracy = correct_predictions / total_samples
print(f"Rounded Linear Regression Classification Accuracy: {lin_accuracy * 100:.2f}%")

Linear Regression MSE: 0.1100
Rounded Linear Regression Classification Accuracy: 87.72%


### Logistic Regression

In [8]:
#Logistic Regression is used because the output is categorized not a continous variable so no linear regression was used. 
class CustomLogisticRegression:
    def __init__(self, learning_rate=0.01, iter=2000, features = None, target_attribute=None):
        self.learning_rate = learning_rate
        self.iter = iter
        self.features = features
        self.target_attribute = target_attribute
        self.weights = None
        self.bias = None
        
    def sigmoid_function(self, x):
        return 1 / (1 + np.exp(-x))
    
    def train(self):
        num_samples = self.features.shape[0]
        num_features = self.features.shape[1]
        self.weights = np.zeros(num_features)
        self.bias = 0.0
        
        for _ in range(self.iter):
            z = np.dot(self.features, self.weights) + self.bias
            y_hat = self.sigmoid_function(z)
            
            error = (y_hat - self.target_attribute)
            dw = np.mean(self.features * error[:, np.newaxis], axis=0)
            db = np.mean(error)
            
            self.weights -= self.learning_rate * dw
            self.bias -= self.learning_rate * db

    def predict(self, input_test):  # Rename to match usage
        z = np.dot(input_test, self.weights) + self.bias
        probs = self.sigmoid_function(z)
        return (probs >= 0.5).astype(int)

logreg = CustomLogisticRegression(features = input_normalized_shuffled_train_dataset, target_attribute=output_shuffled_train_dataset)
logreg.train()
output_pred = logreg.predict(input_normalized_shuffled_test_dataset)
log_accuracy = np.mean(output_pred == output_shuffled_test_dataset)
print(f"Logistic Regression Accuracy: {log_accuracy * 100:.2f}%")
# Predict the test set values and compute the accuracy of the logistic regression model.

Logistic Regression Accuracy: 96.49%


### Naive Bayes

In [9]:
import numpy as np
# Initialize dictionaries to store class probabilities as well as the variance relative to the mean
class BayessianClassifier:
    def __init__(self, features, target_attribute):
        self.features = features
        self.target_attribute = target_attribute
        self.available_classes = np.unique(self.target_attribute)
        
        self.mean = {}
        self.var = {}
        self.class_probabilities={}
        
    def get_probabilistic_values(self):
         # Calculate the probability, mean and variance
        total_samples = len(self.target_attribute)
        for cls in self.available_classes:
            # Select the data subset
            input_occurence= self.features[self.target_attribute== cls]
            self.class_probabilities[cls]=len(input_occurence)/total_samples
            self.mean[cls] = np.mean(input_occurence, axis=0)
            self.var[cls] = np.var(input_occurence, axis=0)

    # Gaussian Probability Density Function
    def predict(self, data_entry):
        log_posterior_array=[]
        for cls in self.available_classes:
            log_likelihood=0
            for i, col in enumerate(data_entry):
                log_likelihood=log_likelihood+((-((col-self.mean[cls][i]) ** 2))/(2*self.var[cls][i]+1e-6))+np.log(1/(np.sqrt(2*np.pi*self.var[cls][i]+1e-6)))
            log_posterior = log_likelihood + np.log(self.class_probabilities[cls])
            log_posterior_array.append(log_posterior)    
        return self.available_classes[np.argmax(log_posterior_array)]
    

    def calculate_efficiency(self,test_input,test_output):
        correct_predictions=0
        for i in range(len(test_input)):
            predicted = self.predict(test_input[i])
            actual = test_output[i]
            if predicted == actual:
                correct_predictions += 1
        accuracy = correct_predictions / len(test_input)
        print(f"Accuracy: {accuracy * 100:.2f}%")
        return accuracy

In [10]:
# Train the Gaussian Naive Bayes classifier
model = BayessianClassifier(features=input_normalized_shuffled_train_dataset, target_attribute=output_shuffled_train_dataset)
model.get_probabilistic_values()
naive_bayes_accuracy=model.calculate_efficiency(input_normalized_shuffled_test_dataset, output_shuffled_test_dataset)

Accuracy: 92.98%


### KNearest Neighbors

In [11]:
def replace_if_closer(neighbors, candidate):
    """
    neighbors: list of (label, distance)
    candidate: a new tuple (label, distance)
    """
    max_dist = -float('inf')
    max_index = -1

    for i in range(len(neighbors)):
        if neighbors[i][1] > max_dist:
            max_dist = neighbors[i][1]
            max_index = i

    if candidate[1] < max_dist:
        neighbors[max_index] = candidate
    
class KNNClassifier:
    def __init__(self, k=3, features=None, target_attribute=None):
        self.k = k
        self.features = features
        self.target_attribute = target_attribute

    def get_closest_neighbours(self, data_entry):
        # Start with dummy neighbors with infinite distance
        closest_neighbours = [(None, float('inf')) for _ in range(self.k)]

        for i in range(len(self.features)):
            distance = np.sqrt(np.sum((self.features[i] - data_entry) ** 2))
            label = self.target_attribute[i]
            candidate = (label, distance)

            replace_if_closer(closest_neighbours, candidate)
        return closest_neighbours

    def predict(self, closest_neighbours):
        label_counts = {}
    
        for label, _ in closest_neighbours:
            if label in label_counts:
                label_counts[label] += 1
            else:
                label_counts[label] = 1
    
        # Find the label with the highest count
        max_count = -1
        predicted_label = None
    
        for label in label_counts:
            if label_counts[label] > max_count:
                max_count = label_counts[label]
                predicted_label = label
    
        return predicted_label
    
    def calculate_efficiency(self, test_input, test_output):
        correct = 0
        total_samples = len(test_input)
    
        for i in range(total_samples):
            data_entry = test_input[i]
            true_label = test_output[i]
    
            # Step 1: Get k nearest neighbors
            closest_neighbors = self.get_closest_neighbours(data_entry)
    
            # Step 2: Predict label based on majority vote
            predicted_label = self.predict(closest_neighbors)
    
            # Step 3: Compare with actual label
            if predicted_label == true_label:
                correct += 1
    
        # Step 4: Compute accuracy
        accuracy = correct / total_samples
        print(f"KNNClassifier Accuracy: {accuracy * 100:.2f}%")
        return accuracy


In [12]:
model = KNNClassifier(features=input_normalized_shuffled_train_dataset, target_attribute=output_shuffled_train_dataset)
KNN_accuracy=model.calculate_efficiency(input_normalized_shuffled_test_dataset, output_shuffled_test_dataset)


KNNClassifier Accuracy: 94.74%


### Decision tree

In [13]:
class DecisionTree:
    def __init__(self, features, target_attribute, column_types, max_depth):
        self.depth=None
        self.features = features
        self.target_attribute = target_attribute
        self.column_types = column_types
        self.max_depth = max_depth
        self.tree = None

        if features is not None and column_types is not None:
            if len(column_types) != features.shape[1]:
                raise ValueError("Length of column_types must match number of feature columns")

    def calculate_entropy(self, output_column):
        values, counts = np.unique(output_column, return_counts=True) #get the unique class labels from output column with their corresponding occurence frequency.
        probabilities = counts / len(output_column) #array of probabilities by dividing occurence array by the number of rows. 
        entropy = -np.sum([p * np.log2(p + 1e-9) for p in probabilities]) #calculate the given samples entropy
        return entropy

    def categorical_split_entropy(self, feature_column, output_column):
        unique_values = np.unique(feature_column)
        total_entropy = 0
        for val in unique_values:
            mask = feature_column == val #boolean array of where the categorical value selected inside the feature column for the current iteration, return true if matches in array false if it doesn't match
            y_sub = output_column[mask] # clipping the output column by only showing True returned booleans from the mask
            entropy = self.calculate_entropy(y_sub) # calculate the entropy for the specific attribute chosen for this iteration in the specific feature chosen in this function
            weight = len(y_sub) / len(output_column) #calculate the weight of the specific attribute for this iteration
            total_entropy += weight * entropy #multiply specific attribute of the specific feature with its weight, iterate through each attribute for the specific feature eventually obtain the entropy for that specific feature column, will be used later for comparison
        return total_entropy

    def numerical_split_entropy(self, feature_column, output_column):
        sorted_indices = np.argsort(feature_column) #sort the indices chosen feature columns numerical attributes
        feature_column = feature_column[sorted_indices] #sort the feature column according to the sorted indices above
        output_column = output_column[sorted_indices]  #sort the output column according to the sorted indices above
    
        best_entropy = float('inf')
        best_threshold = None
        base_entropy = self.calculate_entropy(output_column) #get the parent entropy
    
        for i in range(len(feature_column) - 1):
            threshold = (feature_column[i] + feature_column[i + 1]) / 2
    
            left_mask = feature_column <= threshold #getting the mask boolean array into two categories, numerical attribute lower than threshold and numerical attribute greater than threshold
            right_mask = feature_column > threshold
    
            left_output = output_column[left_mask] #getting the array itself according to the boolean array created above
            right_output = output_column[right_mask]
    
            if len(left_output) == 0 or len(right_output) == 0: #skip these iterations if there are duplicate data right at the border and split simply split data into 0 and remaining, meaning split didn't actually split anything at all
                continue
    
            left_entropy = self.calculate_entropy(left_output) #calculate entropy for left side
            right_entropy = self.calculate_entropy(right_output) #calculate entropy for right side
    
            weighted_entropy = (len(left_output) * left_entropy + len(right_output) * right_entropy) / len(output_column) #get the split weighted entropy sum
    
            if weighted_entropy < best_entropy: #if split weighted entropy sum is lower than the best one so far, this is the best one and this splti poitn is the best split point
                best_entropy = weighted_entropy
                best_threshold = threshold
    
        info_gain = base_entropy - best_entropy #calculate info gain according to the best weighted split entropy  
        return best_entropy, best_threshold, info_gain

    def find_best_split(self, features=None, target=None, depth=0):
        if features is None:
            features = self.features
        if target is None:
            target = self.target_attribute

        if len(np.unique(target)) == 1: #if the split has only one possible output, conclude that leaf with the given output
            return {'label': target[0]}

        if len(target) <= 5:   #if the split has fewer rows than 5, take the maximum occurence of the output class as reference and conclude the leaf
            return {'label': np.bincount(target).argmax()}

        if self.max_depth is not None and depth >= self.max_depth:
            return {'label': np.bincount(target).argmax()}

        base_entropy = self.calculate_entropy(target)
        best_gain = -float('inf')
        best_column = None
        best_type = None
        best_threshold = None

        for col_index in range(features.shape[1]):
            feature_col = features[:, col_index] #extract the col
            col_type = self.column_types[col_index] #get the col type

            if col_type == 'categorical': #if the col is categorical, apply categorical entropy calculation
                entropy = self.categorical_split_entropy(feature_col, target)
                threshold = None
                info_gain = base_entropy - entropy
            else:                          #if the col is numerical, apply numerical entropy calculation
                entropy, threshold, info_gain = self.numerical_split_entropy(feature_col, target)

            if info_gain > best_gain: #get the best info gain
                best_gain = info_gain
                best_column = col_index
                best_type = col_type
                best_threshold = threshold

        return best_column, best_type, best_threshold, best_gain

    def build_tree(self, features=None, target=None, depth=0):
        if features is None:
            features = self.features
        if target is None:
            target = self.target_attribute

        split = self.find_best_split(features, target, depth)
        if isinstance(split, dict) and 'label' in split: #looks if split returned a leaf node or a split
            return split

        best_col, col_type, threshold, _ = split
        feature_col = features[:, best_col]

        if col_type == 'categorical':
            branches = {}
            for val in np.unique(feature_col):
                mask = feature_col == val
                sub_features = features[mask]
                sub_target = target[mask]
                branches[val] = self.build_tree(sub_features, sub_target, depth + 1)
            return {
                'feature_index': best_col,
                'type': col_type,
                'branches': branches
            }
        else:
            left_mask = feature_col <= threshold
            right_mask = feature_col > threshold

            left_features, left_target = features[left_mask], target[left_mask]
            right_features, right_target = features[right_mask], target[right_mask]
            return {
                'feature_index': best_col,
                'type': col_type,
                'threshold': threshold,
                'left': self.build_tree(left_features, left_target, depth + 1),
                'right': self.build_tree(right_features, right_target, depth + 1)
            }

    def train(self):
        self.tree = self.build_tree()
        self.depth = self.get_tree_depth(self.tree)  

    def get_tree_depth(self, node):
        if 'label' in node:
            return 0
        return 1 + max(self.get_tree_depth(node['left']), self.get_tree_depth(node['right']))

    def predict_one(self, input_row, tree):
        if 'label' in tree:
            return tree['label']

        if tree['type'] == 'categorical':
            val = input_row[tree['feature_index']]
            if val in tree['branches']:
                return self.predict_one(input_row, tree['branches'][val])
            else:
                return list(tree['branches'].values())[0]['label']
        else:
            if input_row[tree['feature_index']] <= tree['threshold']:
                return self.predict_one(input_row, tree['left'])
            else:
                return self.predict_one(input_row, tree['right'])

    def predict(self, input_test):
        return np.array([self.predict_one(row, self.tree) for row in input_test])

    def calculate_efficiency(self, test_input, test_output):
        predictions = self.predict(test_input)
        accuracy = np.mean(predictions == test_output)
        print(f"Decision Tree Classification Accuracy: {accuracy * 100:.2f}%")


In [14]:
best_accuracy = 0
best_depth = 1
depth = 1
max_possible_depth=None

while True:
    tree_clf = DecisionTree(
        features=input_normalized_shuffled_train_dataset,
        target_attribute=output_shuffled_train_dataset,
        column_types=column_types,
        max_depth=depth
    )
    tree_clf.train()
    predictions = tree_clf.predict(input_normalized_shuffled_test_dataset)
    accuracy = np.mean(predictions == output_shuffled_test_dataset)
    
    print(f"Max Depth Tried: {depth}, Actual Depth: {tree_clf.depth}, Accuracy: {accuracy * 100:.2f}%")
    
    if accuracy > best_accuracy:
        best_accuracy = accuracy
        best_depth = depth

    # If increasing depth no longer increases actual tree depth, break
    if tree_clf.depth < depth:
        max_possible_depth=depth
        break

    depth += 1

print(f"\nMax Possible Depth is: {max_possible_depth}")
print(f"\nBest Depth: {best_depth} with Accuracy: {best_accuracy * 100:.2f}%")

Max Depth Tried: 1, Actual Depth: 1, Accuracy: 85.96%
Max Depth Tried: 2, Actual Depth: 2, Accuracy: 85.96%
Max Depth Tried: 3, Actual Depth: 3, Accuracy: 92.11%
Max Depth Tried: 4, Actual Depth: 4, Accuracy: 91.23%
Max Depth Tried: 5, Actual Depth: 5, Accuracy: 94.74%
Max Depth Tried: 6, Actual Depth: 5, Accuracy: 94.74%

Max Possible Depth is: 6

Best Depth: 5 with Accuracy: 94.74%


In [15]:

print("Linear Regression Mean Square Error: ", lin_mse)
print("Linear Regression Rounded Accuracy Percentage: ", lin_accuracy)
print(f"Logistic Regression Accuracy: {log_accuracy * 100:.2f}%")
print(f"Decision Tree Classification Accuracy: {best_accuracy * 100:.2f}%")
print(f"KNN Classification Accuracy: {KNN_accuracy * 100:.2f}%")
print(f"Gaussian Naive Bayes Accuracy: {naive_bayes_accuracy * 100:.2f}%")


Linear Regression Mean Square Error:  0.10997694540102625
Linear Regression Rounded Accuracy Percentage:  0.8771929824561403
Logistic Regression Accuracy: 96.49%
Decision Tree Classification Accuracy: 94.74%
KNN Classification Accuracy: 94.74%
Gaussian Naive Bayes Accuracy: 92.98%
