All the modules, constants, imports, and libraries used in this program

In [11]:
import numpy as np
import pandas as pd
from random import randint
from KNN import KNearestNeighbors

# Maps a dataset name to the CSV file that holds it.
FILE_INPUT = {'IRIS':'iris.csv', 'MNIST':'mnist_test.csv'}
# Fraction of the rows that main() randomly picks for the test split.
PERCENTAGE = 0.20

#### Read a given csv file

This function reads the input file. It takes: <ol>
<li>The input file path</li>
<li>The column names, if they are not present in the file</li> 
</ol>
Returns: 
<ol>
<li>The pandas data frame</li>
</ol>

In [12]:
def read_file(file:str, columnsName:list[str]=None)->pd.DataFrame:
    """Load a CSV file into a pandas data frame.

    Args:
        file: the path of the CSV file to read
        columnsName: the column names to assign when the file has no header
            row; when omitted (or empty) the first row of the file is used
            as the header
    Returns:
        :returns: the pandas DataFrame read from the file
    """
    if columnsName:
        # The file has no header row: every line is data and the caller
        # supplies the column names.
        return pd.read_csv(file, header=None, names=columnsName)

    # The first line of the file carries the column names.
    return pd.read_csv(file, header=0, names=None)

#### Random index generator 
Given the number of indexes you want to generate, it creates that many distinct random indexes, <br> 
bounded by the upper limit and the lower limit (0 if not given)
<br>
Returns:
- The set of random indexes 

In [13]:
def generateRandomIndex(num:int, upperLimit:int, lowerLimit:int=0)->set[int]:
    """Draw `num` distinct random integers from [lowerLimit, upperLimit].

    Args:
        num : the number of distinct numbers you want to be generated
        upperLimit : the largest number that can be generated (inclusive)
        lowerLimit : the smallest number that can be generated (inclusive)
    Returns:
        set[int]: the set of random numbers created; empty when num <= 0
    Raises:
        ValueError: if num is larger than the amount of integers available
            in the range, because that many distinct values do not exist
    """
    # Without this guard the loop below would spin forever: the set can
    # never grow beyond the number of integers in the range.
    available = max(upperLimit - lowerLimit + 1, 0)
    if num > available:
        raise ValueError(
            f"cannot generate {num} distinct numbers in "
            f"[{lowerLimit}, {upperLimit}]: only {available} available"
        )

    numbers = set()
    # Adding a duplicate leaves the set unchanged, so drawing until the set
    # is large enough guarantees exactly `num` distinct values (a plain
    # comprehension over range(num) could come up short on duplicates).
    while len(numbers) < num:
        numbers.add(randint(lowerLimit, upperLimit))

    return numbers

#### Split array

Splits a data frame into values and labels to be used for KNN. Takes:
<ol>
<li>All the given data of the data frame</li>
<li>The indexes of the rows chosen to be extracted</li>
</ol>
Returns:
<ol>
<li>The np.array (matrix) containing the values</li>
<li>The np.array (vector) containing the label associated to said values</li>
</ol>

In [14]:
def createSplitArray(data:pd.DataFrame, indexes:list[int])->[np.dtypes.Float16DType, np.dtypes.StringDType]: # type: ignore
    """_summary_

    Args:
        data (pd.DataFrame): the data frame
        indexes (list[int]): list of indexes you want to work on

    Returns:
        np.dtypes.Float16DType: the array of the values selected
        np.dtypes.StringDType: the class associated to the selected values 
    """
    
    
    val = np.zeros((len(indexes), len(data.columns)-1), dtype=np.dtypes.Float16DType)
    classes = np.zeros(len(indexes), dtype=np.dtypes.StringDType)
    
    for i in range(len(indexes)):
        val[i] = np.array([float(data.values[indexes[i]][j]) for j in range(len(data.values[indexes[i]])-1) ] , dtype=np.dtypes.Float16DType)
        classes[i] = data.values[indexes[i]][-1]
    
    return val, classes

#### Main function

This is the main function of the program:
<ol>
<li>Reads the iris.csv dataset into a pandas dataFrame</li>
<li>Randomly selects x% of our N data points</li>
<li>Creates the test arrays from the selected data and the training arrays from the remaining data</li>
</ol>

In [15]:
def main() -> None:
    """Load the iris dataset, split it into test/train arrays and try out KNN.

    A PERCENTAGE share of the rows is picked at random for the test split;
    every remaining row goes to the training split.
    """
    header = ['sepal length', 'sepal width', 'petal length', 'petal width', 'species']
    iris = read_file(FILE_INPUT['IRIS'], header)

    rowCount = len(iris)
    testRows = generateRandomIndex(int(PERCENTAGE*rowCount), rowCount-1)
    X_test, Y_test = createSplitArray(iris, sorted(testRows))

    # Every row that was not drawn for the test split is used for training.
    trainRows = set(range(rowCount)).difference(testRows)
    X_train, Y_train = createSplitArray(iris, sorted(trainRows))

    knn = KNearestNeighbors(5, "alessio")

    print(knn.manhattan(np.array([16, 15]), np.array([14, 15])))

    # NOTE(review): this passes the bound method itself as the only argument
    # and discards the result -- looks like leftover experimentation; confirm
    # against the KNN module before relying on it.
    knn.manhattan(knn.manhattan)
    
# Run the program when this cell is executed.
main()

2
