# Fisher Iris
Data Analysis - ISAE 2025/2026

In [None]:
import os  # path handling
import pandas as pd  # dataframe handling
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import confusion_matrix

In [None]:
# Folder containing the Iris data
dataPath = "./"
# Name of the data file inside that folder
dataFileName = "iris.data"
# Full path of the data file to use
dataFile = os.path.join(dataPath, dataFileName)

## 1 - Data import and formatting



In [None]:
# Column names to assign ('FType' stands for 'Flower Type')
columnNames = ["sepalL", "sepalW", "petalL", "petalW", "FType"]
# header=None: the first line of the file is read as data, so names are supplied explicitly
fullDF = pd.read_csv(dataFile, header=None, names=columnNames)
# Let's look at 10 randomly chosen rows of the full data frame
fullDF.sample(10)

In [None]:
# Quick statistical description of the dataframe
summaryDF = fullDF.describe()
summaryDF

In [None]:
# Let's define a mapping function from 'Flower type' to a class number
def dataMap(FType):
    """
    Data mapping function. Assigns a class number, given a class name.

    Args:
      FType: (str) Flower type, one of "Iris-versicolor", "Iris-setosa"
        or "Iris-virginica"
    Returns: (int) Flower class (0, 1 or 2 respectively)
    Raises:
      RuntimeWarning: if FType is not one of the known flower types
    """
    classByName = {
        "Iris-versicolor": 0,
        "Iris-setosa": 1,
        "Iris-virginica": 2,
    }
    try:
        return classByName[FType]
    except (KeyError, TypeError):
        # TypeError covers unhashable inputs, which can never be a known name.
        # No default label is applied here: silently falling back to a class
        # would mislabel the sample, so the error message must not claim one.
        raise RuntimeWarning(f"Unknown flower type: {FType}") from None

In [None]:
# Run every 'FType' entry through the mapping function and store the
# resulting class numbers in a new column called 'Y'
fullDF["Y"] = list(map(dataMap, fullDF["FType"]))

In [None]:
# Draw 10 random rows to check that 'Y' matches 'FType'
fullDF.sample(n=10)

In [None]:
# Separate the measurements (data) from the class numbers
featureColumns = ["sepalL", "sepalW", "petalL", "petalW"]
dataDF = fullDF[featureColumns]
classDF = fullDF["Y"]

## 2 - Data splitting

In [None]:
# Let's split the data into training data, and test data (10% of the full dataset).
# The same splitting is applied to the classes.
# - random_state fixes the shuffling seed so the split (and every result below) is
#   reproducible from one run to the next.
# - stratify keeps the class proportions identical in both subsets, so the small
#   test set cannot end up missing a class.
trainData, testData, trainY, testY = train_test_split(
    dataDF, classDF, test_size=0.1, random_state=42, stratify=classDF
)

## 3 - kNN

In [None]:
# Let's build a kd-tree out of our training data and use Euclidean distance as the
# metric (Minkowski distance with p=2); another custom metric could be used instead.
knnSettings = {
    "n_neighbors": 3,
    "algorithm": "kd_tree",
    "metric": "minkowski",
    "p": 2,
    "n_jobs": -1,
}
kNN = KNeighborsClassifier(**knnSettings)
kNN.fit(trainData, trainY)

In [None]:
# Evaluate the classifier on the data it was trained on
trainPredictions = kNN.predict(trainData)
# Arguments are passed positionally as (y_true, y_pred)
trainCM = confusion_matrix(trainY, trainPredictions)
print(trainCM)

## 4 - Classifier performance

In [None]:
# Evaluate the classifier on the held-out test data
predictions = kNN.predict(testData)
# Arguments are passed positionally as (y_true, y_pred)
testCM = confusion_matrix(testY, predictions)
print(testCM)

In [None]:
# The confusion matrix can also be displayed as an image
# (fignum=None is the default: the plot goes into a new figure)
plt.matshow(testCM, fignum=None)