In [None]:
!pip install scikit-learn

In [None]:
from sklearn import datasets
from sklearn.model_selection import train_test_split

# Get the iris sample dataset
iris = datasets.load_iris()

# Separate the dataset into training and test sets. The seed is fixed so the
# split, and every score and plot derived from it, is the same on each run.
train_data, test_data, train_labels, test_labels = train_test_split(
    iris['data'], iris['target'], random_state=42
)

In [None]:
def normalize(train_data, test_data):
    """Normalize features into [-1, 1] based solely on training data.

    The per-feature minimum and maximum are taken from ``train_data`` only and
    the same linear map is then applied to both datasets, so test values that
    fall outside the training range land outside [-1, 1].

    Args:
        train_data: Non-empty sequence of points, each an indexable sequence of
            numeric feature values.
        test_data: Sequence of points with at least as many features per point
            as the first training point.

    Returns:
        A ``(normalized_train_data, normalized_test_data)`` tuple of new lists
        of lists; the inputs are not modified.

    Raises:
        IndexError: If ``train_data`` is empty.
    """
    feature_count = len(train_data[0])

    # Find the extrema of the training features
    feature_mins = [
        float(min(point[i] for point in train_data)) for i in range(feature_count)
    ]
    feature_maxs = [
        float(max(point[i] for point in train_data)) for i in range(feature_count)
    ]
    feature_ranges = [high - low for low, high in zip(feature_mins, feature_maxs)]

    def scale(points):
        # A feature that is constant in the training data has a zero range;
        # map it to the centre of the interval instead of dividing by zero.
        return [
            [
                ((point[i] - feature_mins[i]) / feature_ranges[i] - 0.5) * 2
                if feature_ranges[i] else 0.0
                for i in range(feature_count)
            ]
            for point in points
        ]

    return scale(train_data), scale(test_data)

In [None]:
# Normalize every feature, then keep only the first two columns of each point
# (the sepal length and sepal width)
normalized_train_data, normalized_test_data = normalize(train_data, test_data)


def _sepal_columns(rows):
    """Return a new list holding the first two values of every row."""
    return [[row[0], row[1]] for row in rows]


sepal_train_data = _sepal_columns(normalized_train_data)
sepal_test_data = _sepal_columns(normalized_test_data)

In [None]:
from sklearn.neighbors import KNeighborsClassifier

# Build a 3-nearest-neighbors classifier and fit it on the two sepal features only
model = KNeighborsClassifier(n_neighbors=3).fit(sepal_train_data, train_labels)

# Predict the per-class probabilities for a single point whose normalized sepal
# length and width (1.05 each) are slightly larger than any training point
query_point = [[1.05, 1.05]]
print(model.predict_proba(query_point))

In [None]:
import matplotlib.pyplot as plt

from sklearn.metrics import ConfusionMatrixDisplay
from sklearn.metrics import f1_score

def calculate_scores(model, test_data, test_labels):
    """Show a confusion matrix and print the per-class F1 scores of a model.

    Args:
        model: Fitted classifier; it is passed to
            ``ConfusionMatrixDisplay.from_estimator`` and its ``predict``
            method is called on ``test_data``.
        test_data: Feature rows to evaluate the model on.
        test_labels: True class labels for ``test_data``.

    Returns:
        None. The confusion matrix is drawn with matplotlib and the scores are
        printed.
    """
    # Create a confusion matrix for the test data
    ConfusionMatrixDisplay.from_estimator(
        model,
        test_data,
        test_labels,
        cmap=plt.cm.Purples,
        display_labels=['setosa', 'versicolor', 'virginica']
    )

    plt.show()

    # Calculate the f1 score; f1_score takes the true labels first and the
    # predictions second
    print("Classwise F1 scores:")
    print(f1_score(test_labels, model.predict(test_data), average=None))

In [None]:
# Evaluate the nearest neighbors model on the held-out sepal features
calculate_scores(model, test_data=sepal_test_data, test_labels=test_labels)

In [None]:
import matplotlib.patches as patches
import numpy as np

def graph(model, point=None):
    """Graph the decision surface of a two-feature model, plus a point if given.

    The model is evaluated on a 0.01-spaced grid over the square [-1, 1) x
    [-1, 1) and its three per-class scores are drawn as the red, green and blue
    channels of an image. ``predict_proba`` is used when the model has it;
    otherwise ``decision_function`` scores are used, with non-positive scores
    set to 0 and the positive ones rescaled to sum to 1 for each grid point.

    Args:
        model: Fitted classifier taking two features and producing exactly
            three class scores per sample (the result is reshaped to 3 channels).
        point: Optional ``[x, y, label]``. When ``label`` is an integer index
            into the three classes the marker is drawn in that class's legend
            colour; any other value is handed to matplotlib as the colour.

    Returns:
        None. The figure is drawn on the current matplotlib axes.
    """
    colors = [[1,0,0],[0,1,0],[0,0,1]]
    labels = ['setosa', 'versicolor', 'virginica']

    # Graph every point inside the extrema of the training data
    xx, yy = np.meshgrid(np.arange(-1, 1, 0.01), np.arange(-1, 1, 0.01))
    grid = np.c_[xx.ravel(), yy.ravel()]
    if hasattr(model, "predict_proba"):
        Z = model.predict_proba(grid)
    else:
        # Keep only the positive scores and rescale each row to sum to 1;
        # rows with no positive score stay all zero.
        scores = np.clip(model.decision_function(grid), 0, None)
        totals = scores.sum(axis=1, keepdims=True)
        Z = np.divide(scores, totals, out=np.zeros_like(scores), where=totals > 0)
    Z = Z.reshape((xx.shape[0], xx.shape[1], 3))
    plt.imshow(Z, extent=(-1, 1, -1, 1), origin="lower")
    plt.legend(bbox_to_anchor=(1, 1), handles=[ patches.Patch(color=colors[i], label=labels[i]) for i in range(3) ], loc=2)
    if point is not None and len(point):
        marker_label = point[2]
        if isinstance(marker_label, (int, np.integer)) and 0 <= marker_label < len(colors):
            # One RGB row per marker, so the colour matches the class legend
            marker_color = [colors[marker_label]]
        else:
            marker_color = marker_label
        # The black edge keeps the marker visible on a same-coloured region
        plt.scatter(point[0], point[1], c=marker_color, edgecolors="black")


In [None]:
# Draw the decision surface learned by the nearest neighbors model, with no
# highlighted point
graph(model, point=None)

In [None]:
# Try with all four features now. The normalized features are used, as in the
# two-feature run above, so that the two sets of scores are comparable.
model.fit(normalized_train_data, train_labels)
calculate_scores(model, normalized_test_data, test_labels)

In [None]:
from sklearn.tree import DecisionTreeClassifier

# Train a decision tree classifier; the seed is fixed so the fitted tree is
# the same on every run
model = DecisionTreeClassifier(max_depth=5, random_state=42)
model.fit(sepal_train_data, train_labels)

calculate_scores(model, sepal_test_data, test_labels)
graph(model)

In [None]:
# Retrain while allowing a deeper tree; the seed is fixed so the fitted tree
# is the same on every run
model = DecisionTreeClassifier(max_depth=25, random_state=42)
model.fit(sepal_train_data, train_labels)

calculate_scores(model, sepal_test_data, test_labels)
graph(model)

In [None]:
from sklearn.tree import plot_tree

# Show the internals of the tree. The tree was fit on the two sepal features,
# so label the splits and leaves with those names instead of x[0] / x[1];
# split thresholds are in normalized units.
plt.figure(figsize=(16, 10))
plot_tree(
    model,
    feature_names=['sepal length', 'sepal width'],
    class_names=['setosa', 'versicolor', 'virginica'],
    filled=True,
)
plt.show()

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Train an ensemble of 10 trees at once; the seed is fixed so the same forest
# is built on every run
model = RandomForestClassifier(max_depth=5, n_estimators=10, random_state=42)
model.fit(sepal_train_data, train_labels)

calculate_scores(model, sepal_test_data, test_labels)
graph(model)

In [None]:
from sklearn.ensemble import AdaBoostClassifier

# Train an AdaBoost classifier; the seed is fixed so the boosted ensemble is
# the same on every run
model = AdaBoostClassifier(algorithm="SAMME", random_state=42)
model.fit(sepal_train_data, train_labels)

calculate_scores(model, sepal_test_data, test_labels)
graph(model)

In [None]:
from sklearn.naive_bayes import GaussianNB

# Fit a Gaussian Naive Bayes classifier on the sepal features
model = GaussianNB().fit(sepal_train_data, train_labels)

# Score it on the held-out points, then draw its decision surface
calculate_scores(model, test_data=sepal_test_data, test_labels=test_labels)
graph(model)

In [None]:
from sklearn.naive_bayes import MultinomialNB

# The multinomial distribution assumes nonnegative input, so rescale.
positive_sepal_train_data = [[(i[0] + 1) / 2, (i[1] + 1) / 2 ] for i in sepal_train_data]
positive_sepal_test_data = [[(i[0] + 1) / 2, (i[1] + 1) / 2 ] for i in sepal_test_data]

# Train a Naive Bayes classifier with a multinomial prior
model = MultinomialNB()
model.fit(positive_sepal_train_data, train_labels)

calculate_scores(model, positive_sepal_test_data, test_labels)
graph(model)

In [None]:
from sklearn.naive_bayes import ComplementNB

# Train a Naive Bayes classifier with a multinomial prior using the complement of the class for weights
model = ComplementNB()
model.fit(positive_sepal_train_data, train_labels)

calculate_scores(model, positive_sepal_test_data, test_labels)
graph(model)

In [None]:
from sklearn.naive_bayes import BernoulliNB

# Fit a Bernoulli Naive Bayes classifier on the sepal features
model = BernoulliNB().fit(sepal_train_data, train_labels)

# Score it on the held-out points, then draw its decision surface
calculate_scores(model, test_data=sepal_test_data, test_labels=test_labels)
graph(model)

In [None]:
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis

# Fit a Linear Discriminant Analysis classifier on the sepal features
model = LinearDiscriminantAnalysis().fit(sepal_train_data, train_labels)

# Score it on the held-out points, then draw its decision surface
calculate_scores(model, test_data=sepal_test_data, test_labels=test_labels)
graph(model)

In [None]:
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis

# Fit a Quadratic Discriminant Analysis classifier on the sepal features
model = QuadraticDiscriminantAnalysis().fit(sepal_train_data, train_labels)

# Score it on the held-out points, then draw its decision surface
calculate_scores(model, test_data=sepal_test_data, test_labels=test_labels)
graph(model)

In [None]:
from sklearn.svm import SVC

# Fit a Support Vector Machine classifier with a linear kernel
model = SVC(kernel="linear").fit(sepal_train_data, train_labels)

# Score it on the held-out points, then draw its decision surface
calculate_scores(model, test_data=sepal_test_data, test_labels=test_labels)
graph(model)

In [None]:
# Fit a Support Vector Machine classifier with a polynomial kernel function
model = SVC(kernel="poly").fit(sepal_train_data, train_labels)

# Score it on the held-out points, then draw its decision surface
calculate_scores(model, test_data=sepal_test_data, test_labels=test_labels)
graph(model)

In [None]:
# Fit a Support Vector Machine classifier with a sigmoid kernel function
model = SVC(kernel="sigmoid").fit(sepal_train_data, train_labels)

# Score it on the held-out points, then draw its decision surface
calculate_scores(model, test_data=sepal_test_data, test_labels=test_labels)
graph(model)

In [None]:
# Fit a Support Vector Machine classifier with a radial basis function kernel
model = SVC(kernel="rbf").fit(sepal_train_data, train_labels)

# Score it on the held-out points, then draw its decision surface
calculate_scores(model, test_data=sepal_test_data, test_labels=test_labels)
graph(model)

In [None]:
from sklearn.neural_network import MLPClassifier

# Train a multi-layer perceptron classifier; the seed is fixed so the weight
# initialization, and therefore the fitted network, is the same on every run
model = MLPClassifier(max_iter=1000, random_state=42)
model.fit(sepal_train_data, train_labels)

calculate_scores(model, sepal_test_data, test_labels)
graph(model)

In [None]:
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2

# Chi squared calculation requires non-negative input, so rescale every
# feature from [-1, 1] to [0, 1]
positive_train_data = [[(value + 1) / 2 for value in row] for row in normalized_train_data]

# Calculate chi squared values for all features. chi2 returns a
# (statistics, p-values) pair; only the statistics are used for ranking.
chi2_values = chi2(positive_train_data, train_labels)[0]

# Get the two features with the highest chi2 values, in column order
top_feature_indices = np.sort(np.argpartition(chi2_values, -2)[-2:])

# Get datasets with only the selected features
top_feature_train_data = [[row[top_feature_indices[0]], row[top_feature_indices[1]]] for row in normalized_train_data]
top_feature_test_data = [[row[top_feature_indices[0]], row[top_feature_indices[1]]] for row in normalized_test_data]

In [None]:
# Fit the most recently created model again, this time on the two selected features
model.fit(X=top_feature_train_data, y=train_labels)

# Score it on the held-out points, then draw its decision surface
calculate_scores(model, test_data=top_feature_test_data, test_labels=test_labels)
graph(model)

In [None]:
# Train a classifier using all features; the seed is fixed so the boosted
# ensemble, and the importances shown below, are the same on every run
model = AdaBoostClassifier(algorithm="SAMME", random_state=42)
model.fit(normalized_train_data, train_labels)

calculate_scores(model, normalized_test_data, test_labels)

# Note how internally the model has calculated a numerical score for each feature's contribution to decisions
print("Feature importances:")
model.feature_importances_

In [None]:
from sklearn.feature_selection import RFE

# Recursively eliminate features until only two remain
eliminator = RFE(model, n_features_to_select=2)
eliminator.fit(normalized_train_data, train_labels)

# Print the mask of features to keep
print(eliminator.support_)

In [None]:
from sklearn.feature_selection import RFECV

# Recursively eliminate features, letting cross validation decide how many
# features to keep
eliminator = RFECV(model)
eliminator.fit(normalized_train_data, train_labels)

# Print the mask of features to keep
print(eliminator.support_)

In [None]:
# Arbitrarily weigh point number 3 (index 2) as being 100 times more important
# than all others
p = 2
weights = [100 if index == p else 1 for index in range(len(top_feature_train_data))]

model = SVC(kernel="rbf").fit(top_feature_train_data, train_labels, sample_weight=weights)

calculate_scores(model, test_data=top_feature_test_data, test_labels=test_labels)

# Mark the heavily weighted training point on the decision surface
heavy_x, heavy_y = top_feature_train_data[p]
graph(model, point=[heavy_x, heavy_y, train_labels[p]])



In [None]:
# Simulate a mistake in data collection/entry setting the first point to extreme positive values
top_feature_train_data[0][0] = 2
top_feature_train_data[0][1] = 2

# Renormalize the data with the incorrect point
top_feature_train_data, top_feature_test_data = normalize(top_feature_train_data, top_feature_test_data)

# Refit the last model using the new features
model.fit(top_feature_train_data, train_labels)

calculate_scores(model, top_feature_test_data, test_labels)
graph(model)

In [None]:
from sklearn.neighbors import LocalOutlierFactor

# Create a local outlier factor model
outlier_model = LocalOutlierFactor()
outlier_model.fit(top_feature_train_data)

# Show all points with unusually low values for clustering
print([1 if i < -5 else 0 for i in outlier_model.negative_outlier_factor_])