# <div align="center">CP322-A Mini-Project 1: Machine Learning</div>
## <div align="center">Group 6</div>
### <div align="center">due on 15-Oct-2023 at 11:30 PM</div>

Imports:

In [1]:
import numpy as np
import heapq
from math import sqrt
from collections import Counter
from scipy import stats
import matplotlib.pyplot as plt
import statistics

## Task 1: Acquire, preprocess, and analyze the data

1. Load the datasets into NumPy objects (i.e., arrays or matrices) in Python. Remember to convert the wine dataset
to a binary task, as discussed above.
2. Clean the data. Are there any missing or malformed features? Are there other data oddities that need to be
dealt with? You should remove any examples with missing or malformed features and note this in your
report. For categorical variables, you can use a one-hot encoding.
3. Compute basic statistics on the data to understand it better. E.g., what are the distributions of the positive vs.
negative classes, what are the distributions of some of the numerical features? what are the correlations between
the features? how do the scatter plots of pair-wise features look like for some subset of features?

In [2]:
def readFileNP(filename):
    
    data = []
    labels = []
    with open(filename, "r") as file:
        for line in file:
            line = line.strip()
            if line:  # Skip empty lines
                row = line.split(",")
                if filename=="data/iris.data":
                    data.append([float(val) for val in row[:-1]])
                    labels.append(row[-1])
                else:
                    data.append(row)

    data = np.array(data)
    if filename == "data/iris.data":
        return np.array(data),np.array(labels)
    else:
        return data

def readFile(filename):
    data = []
    labels = []
    with open(filename, "r") as file:
        for line in file:
            line = line.strip()
            if line:  # Skip empty lines
                row = line.split(",")
                if filename == "adult.data":
                    # Convert non-numerical features to float
                    age = float(row[0])
                    fnlwgt = float(row[2])
                    education_num = float(row[4])
                    capital_gain = float(row[10])
                    capital_loss = float(row[11])
                    hours_per_week = float(row[12])
                    # Combine the numerical features
                    numerical_features = [age, fnlwgt, education_num, capital_gain, capital_loss, hours_per_week]
                    # Append the numerical features
                    data.append(numerical_features)
                    label = row[-1]
                    # Map the labels to binary values, e.g., '<=50K' to 0 and '>50K' to 1
                    labels.append(0 if label == ' <=50K' else 1)
                elif filename == "Rice_Cammeo_Osmancik.arff.txt":
                    data.append([float(val) for val in row[:-1]])
                    label = row[-1]
                    labels.append(0 if label == 'Cammeo' else 1)
                elif filename == "agaricus-lepiota.data":
                    label = row[0]
                    labels.append(0 if label == 'e' else 1)
                    if "?" not in row:
                        data.append([val for val in row[:-1]])  
                        
                else:
                    data.append([float(val) for val in row[:-1]])
                    label = row[-1]
                    labels.append(0 if label == 'b' else 1)


    return data, labels
    
class dataAnalysis:
    def __init__(self, data, categories, classifications):
        self.data = data
        self.size = len(data)
        self.categories = categories
        self.classifications = classifications
    def printLabelStats(self):
        '''
        ===================================================================================
        DESCRIPTION: 
        ===================================================================================
        Print statistics on every classification in data
        ===================================================================================
        '''
        print("=====================================================")
        print("Classification Analysis:")
        print("=====================================================")
        print(f"Total: {self.size} (%100)")
        
        for label in self.classifications:
            count = len(self.data[self.data[:, -1] == label])
            percentage = round(100*count/self.size,2)
            print(f"Value: {label}, Count: {count}, Percentage: %{percentage}")
            
    def printCategoryStats(self):
        '''
        ===================================================================================
        DESCRIPTION: 
        ===================================================================================
        Print statistics on every attribute in data
        ===================================================================================
        '''
        data = self.data
        categories = self.categories
        size = self.size
        labels = column = data[:, -1]
        print("=====================================================")
        print("Attribute Analysis:")
        print("=====================================================")
        for i in range(len(categories)):
            category = categories[i]
            column = data[:, i].astype(float)
            
            avg = np.mean(column)
            med = np.median(column)
            mode = float(stats.mode(column,keepdims=True)[0][0])
            std_dev = np.std(column)
        
            unique_values, counts = np.unique(column, return_counts=True)
            
            print("-----------------------------------------------------")
            print(f"{category.upper()} Analysis:")
            print("-----------------------------------------------------")
            for value, count in zip(unique_values, counts):
                percentage = round(100*count/self.size,2)
                print(f"Value: {value}, Count: {count}, Percentage: %{percentage}")
                
            print(f"\nMean: {avg}")
            print(f"Median: {med}")
            print(f"Mode: {mode}")
            print(f"Standard Deviation: {std_dev}")
            
            plt.figure(figsize=(4, 4))
            plt.bar(labels, column, edgecolor='black')
            plt.xlabel(category)
            plt.ylabel("Output")
            plt.title("Bar Chart of Numeric Data by Labels")
            plt.show()
           

            
            

### Dataset 1 (Ionosphere): 

In [62]:
filename = "data/ionosphere.data"

data,labels = readFile(filename)

# Count the number of positive class instances
positive_count = sum(1 for label in labels if label == 1)

# Count the number of negative class instances
negative_count = sum(1 for label in labels if label == 0)

#what are the distributions of the positive vs. negative classes?
print("Distribution of classes:")
print("Positive (g):", positive_count)
print("Negative (b):", negative_count)

print("\nData:")
print(data,labels)

#what are the distributions of some of the numerical features?

Distribution of classes:
Positive (g): 225
Negative (b): 126

Data:
[[1.0, 0.0, 0.99539, -0.05889, 0.85243, 0.02306, 0.83398, -0.37708, 1.0, 0.0376, 0.85243, -0.17755, 0.59755, -0.44945, 0.60536, -0.38223, 0.84356, -0.38542, 0.58212, -0.32192, 0.56971, -0.29674, 0.36946, -0.47357, 0.56811, -0.51171, 0.41078, -0.46168, 0.21266, -0.3409, 0.42267, -0.54487, 0.18641, -0.453], [1.0, 0.0, 1.0, -0.18829, 0.93035, -0.36156, -0.10868, -0.93597, 1.0, -0.04549, 0.50874, -0.67743, 0.34432, -0.69707, -0.51685, -0.97515, 0.05499, -0.62237, 0.33109, -1.0, -0.13151, -0.453, -0.18056, -0.35734, -0.20332, -0.26569, -0.20468, -0.18401, -0.1904, -0.11593, -0.16626, -0.06288, -0.13738, -0.02447], [1.0, 0.0, 1.0, -0.03365, 1.0, 0.00485, 1.0, -0.12062, 0.88965, 0.01198, 0.73082, 0.05346, 0.85443, 0.00827, 0.54591, 0.00299, 0.83775, -0.13644, 0.75535, -0.0854, 0.70887, -0.27502, 0.43385, -0.12062, 0.57528, -0.4022, 0.58984, -0.22145, 0.431, -0.17365, 0.60436, -0.2418, 0.56045, -0.38238], [1.0, 0.0, 1.0, -0.45

### Dataset 2 (Adult Data Set):

In [63]:
# filename = "data/adult.data"

# data, labels = readFile(filename)

# # Count the number of positive class instances
# positive_count = sum(1 for label in labels if label == 1)

# # Count the number of negative class instances
# negative_count = sum(1 for label in labels if label == 0)

# print("Distribution of classes:")
# print("Positive (>50):", positive_count)
# print("Negative (<=50):", negative_count)

# print("\nData:")
# print(data, labels)

#what are the distributions of some of the numerical features?

### Dataset 3 (Rice):

In [64]:
filename = "data/Rice_Cammeo_Osmancik.arff.txt"

data, labels = readFile(filename)

# Count the number of positive class instances
positive_count = sum(1 for label in labels if label == 1)

# Count the number of negative class instances
negative_count = sum(1 for label in labels if label == 0)

print("Distribution of classes:")
print("Positive (Cammeo):", positive_count)
print("Negative (Osmancik):", negative_count)

print("\nData:")
print(data, labels)

#what are the distributions of some of the numerical features?

Distribution of classes:
Positive (Cammeo): 3810
Negative (Osmancik): 0

Data:
[[15231.0, 525.5789794921875, 229.7498779296875, 85.09378814697266, 0.9288820028305054, 15617.0, 0.5728955268859863], [14656.0, 494.3110046386719, 206.0200653076172, 91.73097229003906, 0.8954049944877625, 15072.0, 0.615436315536499], [14634.0, 501.12200927734375, 214.10678100585938, 87.7682876586914, 0.9121180772781372, 14954.0, 0.6932588219642639], [13176.0, 458.3429870605469, 193.33738708496094, 87.44839477539062, 0.8918609023094177, 13368.0, 0.640669047832489], [14688.0, 507.1669921875, 211.74337768554688, 89.31245422363281, 0.9066908955574036, 15262.0, 0.6460239291191101], [13479.0, 477.0159912109375, 200.0530548095703, 86.6502914428711, 0.9013283252716064, 13786.0, 0.6578972935676575], [15757.0, 509.281005859375, 207.2966766357422, 98.33613586425781, 0.8803234696388245, 16150.0, 0.5897080898284912], [16405.0, 526.5700073242188, 221.61251831054688, 95.43670654296875, 0.9025205969810486, 16837.0, 0.658888

### Dataset 4 (Mushroom):

In [65]:
# filename = "data/agaricus-lepiota.data"
# data, labels = readFile(filename)

# # Count the number of positive class instances
# positive_count = sum(1 for label in labels if label == 1)

# # Count the number of negative class instances
# negative_count = sum(1 for label in labels if label == 0)

# print("Distribution of classes:")
# print("Positive (Poisinous):", positive_count)
# print("Negative (Edible):", negative_count)

# print("\nData:")
# print(data, labels)

#what are the distributions of some of the numerical features?

## Task 2: Implement the models

#### 1. Implement logistic regression, and use (full batch) gradient descent for optimization.
#### 2. Implement k-Nearest Neighbor (KNN), and find the best K.

Implement both models as Python classes. You should use the constructor for the class to initialize the model
parameters as attributes, as well as to define other important properties of the model.
• Each of your models’ classes should have (at least) two functions:
– Define a fit function, which takes the training data (i.e., x and y)—as well as other hyperparameters (e.g.,
the learning rate and/or number of gradient descent iterations)—as input. This function should train your
model by modifying the model parameters.
– Define a predict function, which takes a set of input points (i.e., x) as input and outputs predictions (i.e.,
yˆ) for these points. Note that for linear regression you need to convert probabilities to binary 0-1
predictions by thresholding the output at 0.5!
In addition to the model classes, you should also define functions evaluate_acc to evaluate the model accuracy.
This function should take the true labels (i.e., y), and target labels (i.e., yˆ) as input, and it should output the accuracy
score.
• Lastly, you should implement a script to run k-fold cross-validation

### Logistic Regression:

In [4]:
class LogisticRegression:
    def __init__(self):
        self.learning_rate = 0.01
        self.num_iterations = 1000
        self.weights = None
        self.bias = None
        self.iter = 0

    def set_learning_rate(self, val):
        self.learning_rate = val

    def sigmoid(self, z):
        # Sigmoid function to convert values to probabilities between 0 and 1
        return 1 / (1 + np.exp(-z)) #sigmoid(z) = 1 / ( 1 + e( - z ) )

    def fit(self, data, labels): #training the logistic regression model
        num_samples, num_features = data.shape
        self.weights = np.zeros(num_features)
        self.bias = 0
        converge=0.0001
        converged = False
        cost1 = 1
        count = 0
        self.iter = 0
        
        while not converged and count<self.num_iterations:
        # Gradient descent
        #for i in range(self.num_iterations):
            #Hypothesis Function
            linear_model = np.dot(data, self.weights) + self.bias
            predictions = self.sigmoid(linear_model)

            # Compute gradients
            #∂J/∂w = (1/m) * Σ[(h(x) - y) * x] , ∂J/∂b = (1/m) * Σ(h(x) - y)

            dw = (1/num_samples) * np.dot((predictions - labels),data)
            db = (1/num_samples) * np.sum(predictions - labels)

            # Update the parameters in the opposite direction of the gradient
            #w := w - α * ∂J/∂w  ,  b := b - α * ∂J/∂b
            self.weights -= self.learning_rate * dw
            self.bias -= self.learning_rate * db
            cost = 0   
            self.iter += 1
            probabilities = self.sigmoid(linear_model)
            cost = -1/num_samples * (np.dot(1 - labels, np.log(1 - probabilities + converge)) + np.dot(labels, np.log(probabilities + converge)))
            
            if abs(cost1-cost)<=converge:
                converged = True
            cost1=cost
            count+=1
            
        return 
        
    def k_fold (self, data, labels, k):
        accuracies = []
        iterations = []
        index_length = len(data)//k
        counter = 0
        
        for i in range(k):
            if i == k-1:
                data_testing_set = data[counter:]
                data_training_set = data[:counter]
                label_testing_set = labels[counter:]
                label_training_set = labels[:counter]
                
            else:
                data_testing_set = data[counter:index_length+counter]
                label_testing_set = labels[counter:index_length+counter]
                if counter == 0:
                    data_training_set = data[index_length+1:]
                    label_training_set = labels[index_length+1:]
                else:
                    data_training_set = np.concatenate((data[0:counter] , data[index_length+counter+1:]))
                    label_training_set = np.concatenate((labels[0:counter] , labels[index_length+counter+1:]))
                    
            counter+=index_length
            self.fit(np.array(data_training_set),np.array(label_training_set))
            labels_pred = self.predict(np.array(data_testing_set))
            accuracy = self.evaluate_acc(np.array(label_testing_set),np.array(labels_pred))
            accuracies.append(accuracy)
            iterations.append(self.iter)
        
        return accuracies, iterations
            
    def predict(self, data):
        #Hypothesis Function
        linear_model = np.dot(data, self.weights) + self.bias
        predictions = self.sigmoid(linear_model)
        return [1 if p >= 0.5 else 0 for p in predictions]

    def evaluate_acc(self, label_true, label_pred):
        correct = np.sum(label_true == label_pred)
        total = len(label_true)
        return correct / total

filename = "data/ionosphere.data"
#filename = "data/Rice_Cammeo_Osmancik.arff.txt"
#filename = "data/agaricus-lepiota.data"
#filename = "data/adult.data"
data, labels = readFile(filename)

# Combine features and labels
data_with_labels = list(zip(data, labels))

# Split data into training and testing sets (80% training, 20% testing)
split_ratio = 0.7
split_index = int(len(data_with_labels) * split_ratio)

train_data, train_labels = zip(*data_with_labels[:split_index])
test_data, test_labels = zip(*data_with_labels[split_index:])

model = LogisticRegression()

# Fit the model to the training data
iter = model.fit(np.array(train_data), np.array(train_labels))

# Make predictions on the test data
labels_pred = model.predict(np.array(test_data))

# Evaluate the model's accuracy
accuracy = model.evaluate_acc(np.array(test_labels), labels_pred)
print(f"Accuracy: {accuracy:.2f}")

"""
#test chosen dataset with 5-fold cross validation, print results and the average
model2 = LogisticRegression()
kfold, _ = model2.k_fold(data, labels, k = 5)
print("kfold results: ", kfold)
print("Average: ", np.average(kfold))
"""
model3 = LogisticRegression()
vals = [5,3,2,1, 0.1, 0.01, 0.001, 0.0001, 0.00001]

for val in vals:
    model3.set_learning_rate(val)
    kfold, iterations = model3.k_fold(data, labels, k = 5)
    print("\nkfold results for learning rate", val, ": ", kfold)
    print("Average kfold results for learning rate", val, ": ", np.average(kfold))
    print("Number of iterations: ", iterations)
    print("Average number of iterations: ", np.average(iterations))

Accuracy: 0.96

kfold results for learning rate 5 :  [0.7428571428571429, 0.8142857142857143, 0.8428571428571429, 0.9, 1.0]
Average kfold results for learning rate 5 :  0.86
Number of iterations:  [90, 209, 563, 1000, 1000]
Average number of iterations:  572.4

kfold results for learning rate 3 :  [0.7428571428571429, 0.8, 0.8285714285714286, 0.9142857142857143, 0.9577464788732394]
Average kfold results for learning rate 3 :  0.8486921529175049
Number of iterations:  [264, 321, 74, 91, 127]
Average number of iterations:  175.4

kfold results for learning rate 2 :  [0.7428571428571429, 0.8, 0.8285714285714286, 0.9142857142857143, 0.9577464788732394]
Average kfold results for learning rate 2 :  0.8486921529175049
Number of iterations:  [292, 347, 14, 275, 265]
Average number of iterations:  238.6

kfold results for learning rate 1 :  [0.7571428571428571, 0.7714285714285715, 0.8142857142857143, 0.9142857142857143, 0.9577464788732394]
Average kfold results for learning rate 1 :  0.84297786

### K-Nearest Neighbor (KNN):
Riley and Torin

In [67]:
# 1) a new data point is input that we need to classify
# 2) check the classification of the k nearest elements
# 3) assunming we have 2 unique classifications (a,b). we take the classification of the dominant group
# 4) if a tie exists take the class with the shortest distance from 

#to calculate distance we can use the Euclidean distance formula sqrt(sum i to N (x1_i — x2_i)²)


class kNN:
    def __init__(self, k, dist_metric="euclidean"):     
        '''
        ===================================================================================
        DESCRIPTION: 
        ===================================================================================
        initialize kNN model
        ===================================================================================
        PARAMETERS:
        ===================================================================================
        * self (kNN): 
        ----------------------------------------
        kNN model to define k values, training data, and distance metric
        ----------------------------------------
        * k (int):
        ----------------------------------------
        integer representing number of neighbours to compare to
        ----------------------------------------
        * dist_metric (string):
        ----------------------------------------
        string representing distance metric formula to follow
        ===================================================================================
        '''
        self.k = k #num of neighbours
        self.dist_metric = dist_metric #equation to calculate distance with
        self.train_data = None #initialize using fit method
        self.train_labels = None
        
    def fit(self, data, labels):
        '''
        ===================================================================================
        DESCRIPTION: 
        ===================================================================================
        set train_data and train_labelsby loading in Train data used to compare new data
        ===================================================================================
        PARAMETERS:
        ===================================================================================
        * data[] (NumPy Array): 
        ----------------------------------------
        list of data with labels seperated
        ----------------------------------------
        * labels (NumPy Array):
        ----------------------------------------
        list of labels with data removed
        ===================================================================================
        '''
        self.train_data = data
        self.train_labels = labels

    def predict(self, new_data):
        '''
        ===================================================================================
        DESCRIPTION: 
        ===================================================================================
        given new data, compare its items to the k closest elements of training data based 
        on a set distance metric and predict the datas classification.
        ===================================================================================
        PARAMETERS:
        ===================================================================================
        * self (kNN): 
        ----------------------------------------
        kNN model with predefined k values, training data, and distance metric
        ----------------------------------------
        * new_data (NumPy Array):
        ----------------------------------------
        Array of new data to predict classifications for
        ===================================================================================
        RETURNS:
        ===================================================================================
        * predictions (List):
        ----------------------------------------
        list of labels for each item in new_data
        ===================================================================================
        '''
        predictions = []#return array of predicted classifications, for each row in new_data
        for new_row in new_data:
            # calculate distances between new data and training data                   
            k_neighbours = self.__neighbours(new_row) #determine the k nearest neighbours using preffered distance metric
            classifications = []#for the given neighbors check their label
            distances = [] #for tiebreak if need be
            for result in k_neighbours:
                #print(f"new = {new_row}: train = {result[2]}")
                i = result[1]#results formatted [row, index of row], so take the index to find the associated label
                dist = result[0]
                classifications.append(self.train_labels[i])#add label at index i
                distances.append(dist)

            
            #check for ties in classifications here
            # thinking is use multimode to check for classification ties, to know then we need to check lowest distance 
            classifications_mode = statistics.multimode(classifications)
            #print(classifications_mode)
            #print(distances)

            if len(classifications_mode) > 1: #i.e we have a tie
                #go over distances, and get the lowest average and append that to predictions
                #print(classifications_mode)
                #make an array of size len(classifications) to store distances
                distances_mode = [0.0] * len(classifications_mode)
                #print(distances_mode, len(distances_mode), len(classifications_mode))
                #if label from classifications is in modes, add the distance to the appropriate index in the distances array
                classification_index = 0
                for classification in classifications:
                    if classification in classifications_mode: #if the classification is part of the multimode
                        #get the proper index for the distance array for that 
                        distance_index = classifications_mode.index(classification)
                        #print(distance_index)
                        #print(distances_mode)
                        distances_mode[distance_index] += distances[classification_index] #increase the correct distance by the distance of the classification

                    classification_index += 1

                #print(distances)
                #print(distances_mode)
                #choose the minimum distance label and append to predictions
                min_distance = min(distances_mode)

                min_distance_index = distances_mode.index(min_distance)
                #print(min_distance_index)
                min_classification = classifications_mode[min_distance_index]

                #print("The label with the minimum distance to neighbors is:", min_classification, "with distance:", min_distance)

                predictions.append(min_classification)

            else:
                predictions.append(str(max(classifications, key=classifications.count))) #from collections import Counter
        
        return predictions
    
    def evaluate_acc(self,predictions, test_labels):
        '''
        ===================================================================================
        DESCRIPTION: 
        ===================================================================================
        Compare model predictions to actual values and print success rate
        ===================================================================================
        '''
        total = len(predictions)
        hits = 0

        for i in range(total):
            if predictions[i] == test_labels[i]:
                hits += 1
#             print(f"guess:{predictions[i]}|answer:{test_labels[i]}")
        percentage = round(100*hits/total,2)
        print(f"Success Rate: %{percentage}")

#         return hits/total
    
    def __calc_distance(self,newRow, trainRow):
        '''
        ===================================================================================
        DESCRIPTION: 
        ===================================================================================
        Private Function used in self.__neighbours(). Given a row from new data, calculate 
        the distance based on a set metric from a row in Train data
        ===================================================================================
        PARAMETERS:
        ===================================================================================
        * self (kNN): 
        ----------------------------------------
        kNN model with predefined k values, training data, and distance metric
        ----------------------------------------
        * newRow[] (List of data points (float/int)):
        ----------------------------------------
        data row to compare distance from train row data 
        ----------------------------------------
        * trainRow[] (List of data points (float/int)):
        ----------------------------------------
        data row to compare distance with test data
        ===================================================================================
        RETURNS:
        ===================================================================================
        * distance (float):
        ----------------------------------------
        float distance between to rows of data
        ===================================================================================
        '''
        distance = 0
        if self.dist_metric == "euclidean":
            for i in range(len(newRow)):
                squared = pow(newRow[i] - trainRow[i],2)
                distance += squared
            distance = sqrt(distance)
        return(distance)

    def __neighbours(self, new_row):
        '''
        ===================================================================================
        DESCRIPTION: 
        ===================================================================================
        private function used in self.predict(). Given a row from new data, return k number
        of neigbours based on distance
        ===================================================================================
        PARAMETERS:
        ===================================================================================
        * self (kNN): 
        ----------------------------------------
        kNN model with predefined k values, training data, and distance metric
        ----------------------------------------
        * newRow[] (List of data points (float/int)):
        ----------------------------------------
        data row to compare distance from train row data 
        ----------------------------------------
        * trainRow[] (List of data points (float/int)):
        ----------------------------------------
        data row to compare distance with test data
        ===================================================================================
        RETURNS:
        ===================================================================================
        * k_neighbours (List):
        ----------------------------------------
        list of k closest neighbours based on distance metric
        ===================================================================================
        '''
        distances = []#heap array
        #for every row of data
        for index in range(len(self.train_data)):#use index to return that value later
            train_row = self.train_data[index]#current row of train data
            dist = self.__calc_distance(new_row, train_row)#calculate distance between new row and train data row
            heapq.heappush(distances, [-dist, index, list(train_row)])#make negative value temporarily to assure we have smallest values 
            if len(distances) > self.k:#past k values remove largest from heap
                heapq.heappop(distances)
        
        k_neighbours = [[-dist, index, train_row] for dist, index, train_row in sorted(distances)]#make positive values, only 5 smallest remain

        return k_neighbours
    
    def __foldSplit(self, data, k_folds):
        '''
        ===================================================================================
        DESCRIPTION: 
        ===================================================================================
        split data into folds used for kFoldCross function 
        ===================================================================================
        PARAMETERS:
        ===================================================================================
        * self (kNN): [not used]
        ----------------------------------------
        kNN model with predefined k values, training data, and distance metric
        ----------------------------------------
        * data (numpy array):
        ----------------------------------------
        data to split into training and test data based on k value
        ----------------------------------------
        * k_folds (int):
        ----------------------------------------
        number of folds to split data into       
        ===================================================================================
        '''  
        data_size = len(data)
        fold_size = data_size//k_folds
        data_split = []
        
        for i in range(k_folds):
#             print(i)
            start_i = i * fold_size
            end_i = (i + 1) * fold_size if i < k_folds - 1 else len(data)

            # Extract the test data for this fold
            test_data = data[start_i:end_i]
#             print(test_data)

            # Extract the training data for this fold
            train_data = np.concatenate([data[:start_i], data[end_i:]], axis=0)
            data_split.append((train_data, test_data))
            print(len(data_split))
        
        return data_split
    
    def kFoldCross(self, data, k_folds):
        '''
        ===================================================================================
        DESCRIPTION: 
        ===================================================================================
        A script to run an evaluation of the same data with different training values based 
        on k Fold cross validatio 
        ===================================================================================
        PARAMETERS:
        ===================================================================================
        * self (kNN): [not used]
        ----------------------------------------
        kNN model with predefined k values, training data, and distance metric
        ----------------------------------------
        * data (numpy array):
        ----------------------------------------
        data to split into training and test data based on k value
        ----------------------------------------
        * k_folds (int):
        ----------------------------------------
        number of folds to split data into       
        ===================================================================================
        '''    
        
        dataSplit = self.__foldSplit(data,k_folds)
        size = len(dataSplit)
        
        for i in range(size):
            print("=================================")
            print(f"Training with Fold {i+1}")
            print("---------------------------------")

            train_data_list,test_data_list = dataSplit[i]
            train_data, train_labels = self.__seperateLabels(train_data_list)
            test_data, test_labels = self.__seperateLabels(test_data_list)
            
            self.fit(train_data, train_labels)
            predictions = self.predict(test_data)
            self.evaluate_acc(predictions, test_labels)

        
    def __seperateLabels(self, data):
        '''
        ===================================================================================
        DESCRIPTION: 
        ===================================================================================
        seperate data into values and classifications (data,labels)
        ===================================================================================
        PARAMETERS:
        ===================================================================================
        * self (kNN): [not used]
        ----------------------------------------
        kNN model with predefined k values, training data, and distance metric
        ----------------------------------------
        * data (NumPy Array):
        ----------------------------------------
        data imported for assignment
        ===================================================================================
        RETURNS:
        ===================================================================================
        * values , labels
        ----------------------------------------
        a tuple of 2 numpy arrays, one of data and one of labels
        ===================================================================================
        '''
        values = data[:,:-1].astype(float)#data only
        labels = data[:,-1]#classifications only
        
        return (values,labels)
        
    def testTrainSplit(self, data, testSplit=0.7):
        '''
        ===================================================================================
        DESCRIPTION: 
        ===================================================================================
        Used externally to split data into test and train data. probably remove later
        ===================================================================================
        PARAMETERS:
        ===================================================================================
        * self (kNN): [not used]
        ----------------------------------------
        kNN model with predefined k values, training data, and distance metric
        ----------------------------------------
        * data (NumPy Array):
        ----------------------------------------
        data imported for assignment
        ----------------------------------------
        * testSplit (float):
        ----------------------------------------
        ratio of data to be used for testing, default 70/30 split
        ===================================================================================
        RETURNS:
        ===================================================================================
        * dataSplit (tuple(List1,List2)):
        ----------------------------------------
        a tuple of 2 lists where list 1 contains training data and training labels,
        similarily list 2 contains test data and tes labels
        ===================================================================================
        '''
        split = int(len(data) * testSplit )
        #Split train data (70% standard)
        train_data, train_labels = self.__seperateLabels(data[:split]) 
        test_data, test_labels = self.__seperateLabels(data[split:])
        
        dataSplit = ([train_data, train_labels], [test_data, test_labels])
        
        return(dataSplit)



In [68]:
ionosphere_dataNP = readFileNP("data/ionosphere.data")
model1 = kNN(3)
train,test = model1.testTrainSplit(ionosphere_dataNP)
model1.fit(train[0],train[1])
predictions = model1.predict(test[0])
model1.kFoldCross(ionosphere_dataNP,5)

# model2 = kNN(5)
# train,test = model2.testTrainSplit(adult_data)
# model2.fit(train[0],train[1])
# predictions = model2.predict(test[0])

rice_dataNP = readFileNP("data/Rice_Cammeo_Osmancik.arff.txt")
model3 = kNN(4)
train,test = model3.testTrainSplit(rice_dataNP)
model3.fit(train[0],train[1])
predictions = model3.predict(test[0])

# model4 = kNN(3)
# train,test = model4.testTrainSplit(cars_data)
# model.fit(train[0],train[1])
# predictions = model4.predict(test[0])



1
2
3
4
5
Training with Fold 1
---------------------------------
Success Rate: %78.57
Training with Fold 2
---------------------------------
Success Rate: %68.57
Training with Fold 3
---------------------------------
Success Rate: %82.86
Training with Fold 4
---------------------------------
Success Rate: %88.57
Training with Fold 5
---------------------------------
Success Rate: %97.18


## Task 3: Run Experiments

The goal of this project is to have you explore linear classification and compare different features and models. Use
5-fold cross-validation to estimate performance in all of the experiments. Evaluate the performance using accuracy.
You are welcome to perform any experiments and analyses you see fit (e.g., to compare different features), but at a
minimum, you must complete the following experiments in the order stated below:

#### 1. Compare the accuracy of k-nearest neighbor and logistic regression on the four datasets.

#### 2. Test different k values for the k-nearest neighbor to find the best k-value by showing the accuracy plot. 

#### 3. Test different learning rates for gradient descent applied to logistic regression. Use a threshold for change in the value of the cost function as termination criteria and plot the accuracy on the train/validation set as a function of iterations of gradient descent.

#### 4. Compare the accuracy of the two models as a function of the size of the dataset (by controlling the training size)

Note: The above experiments are the minimum requirements that you must complete; however, this project is open-ended. For example, you might investigate different stopping criteria for gradient descent in logistic regression and develop an automated approach to select a good subset of features. You do not need to do all of these things, but you should demonstrate creativity, rigor, and an understanding of the course material in how you run your chosen experiments and how you report on them in your write-up.