Лабораторна робота 1: Класифікація

Мета: Реалізувати програмно алгоритми класифікації 1-Rule, Naive Bayes, Decision Tree, kNN.

Тестовий набір:
{(0,1,2,1), (1,0,1,0), (0,1,1,1), (0,0,1,1), (0,0,2,1), (1,1,2,1), (1,0,2,0),
(1,0,0,1), (0,0,0,0), (0,0,1,0)}. Визначити клас для (1,1,1).

Алгоритм 1: 1-Rule

In [1]:
import pandas as pd
from collections import defaultdict, Counter
import math
from statistics import mean, stdev

# Names of the three feature columns, in record order.
features = ['f1', 'f2', 'f3']

# Labelled records: each tuple is (f1, f2, f3, class).
data = [
    (0, 1, 2, 1),
    (1, 0, 1, 0),
    (0, 1, 1, 1),
    (0, 0, 1, 1),
    (0, 0, 2, 1),
    (1, 1, 2, 1),
    (1, 0, 2, 0),
    (1, 0, 0, 1),
    (0, 0, 0, 0),
    (0, 0, 1, 0),
]

# The label column 'class' follows the feature columns.
df = pd.DataFrame(data, columns=[*features, 'class'])

class OneR:
    """1-Rule classifier: predict the target from a single feature.

    For every feature column, each distinct value is mapped to the most
    frequent target value among the rows having that value. The feature
    whose mapping misclassifies the fewest training rows is kept.

    Attributes:
        target_col: name of the target column passed to ``fit``.
        best_feature: name of the selected feature, or ``None`` when the
            model has not been fitted (or no feature column was available).
        rules: mapping ``feature value -> predicted target`` for
            ``best_feature``.
    """

    def __init__(self):
        # Initialise state so the attributes exist before fit() is called.
        self.target_col = None
        self.rules = {}
        self.best_feature = None

    def fit(self, df, target_col):
        """Select the single feature with the lowest training error.

        Args:
            df: pandas DataFrame holding the feature columns and the target.
            target_col: name of the target column in ``df``.
        """
        self.target_col = target_col
        self.rules = {}
        self.best_feature = None
        min_error = float('inf')

        for feature in df.columns:
            if feature == target_col:
                continue

            rule = defaultdict(lambda: None)
            total_error = 0

            for val in df[feature].unique():
                targets = df.loc[df[feature] == val, target_col]
                most_common = targets.value_counts().idxmax()
                rule[val] = most_common
                # Rows that disagree with the majority are this rule's errors.
                total_error += int((targets != most_common).sum())

            # Strict '<' keeps the earliest column when errors are tied.
            if total_error < min_error:
                min_error = total_error
                self.rules = rule
                self.best_feature = feature

    def predict(self, df):
        """Return a list of predictions, one per row of ``df``.

        Feature values that were not seen during ``fit`` yield ``None``.

        Raises:
            ValueError: if no feature has been selected yet.
        """
        if self.best_feature is None:
            raise ValueError(
                "OneR is not fitted: call fit() on data with at least one "
                "feature column before predict()"
            )
        return [self.rules.get(val) for val in df[self.best_feature]]

# Train the 1-Rule model on the full data set with 'class' as the target.
oner = OneR()
oner.fit(df, 'class')

# Check which feature was selected and what rules were learned
print("Best feature:", oner.best_feature)
print("Rules learned:", dict(oner.rules))

# Single-row frame holding the query point (1, 1, 1) to classify.
new_data = pd.DataFrame([(1, 1, 1)], columns=['f1', 'f2', 'f3'])

# predict() returns one prediction per row; only the first is needed here.
prediction = oner.predict(new_data)
print("Prediction for (1, 1, 1):", prediction[0])


Best feature: f2
Rules learned: {1: 1, 0: 0}
Prediction for (1, 1, 1): 1



Алгоритм 2: Наївний баєсівський класифікатор

In [2]:
class NaiveBayes:
    """Gaussian Naive Bayes classifier.

    Each feature is modelled per class by a normal distribution whose mean
    and sample standard deviation are estimated from the training rows.
    A class score is the class prior multiplied by the per-feature
    Gaussian densities.

    Attributes:
        summaries: ``{class_value: [(mean, stdev), ...]}`` with one pair
            per feature column, in column order.
        priors: ``{class_value: fraction of training rows in that class}``.
    """

    def __init__(self):
        self.summaries = {}
        self.priors = {}

    def fit(self, df, target='class'):
        """Estimate per-class feature statistics and class priors.

        Args:
            df: pandas DataFrame with the feature columns and the label.
            target: name of the label column; every other column is
                treated as a feature.
        """
        feature_cols = [col for col in df.columns if col != target]
        # sort=False keeps classes in order of first appearance, which is
        # what decides ties in predict().
        groups = list(df.groupby(target, sort=False))
        total_rows = sum(len(group) for _, group in groups)

        summaries = {}
        priors = {}
        for class_value, group in groups:
            feature_stats = []
            for col in feature_cols:
                values = group[col].tolist()
                m = mean(values)
                # Sample stdev is undefined for a single observation.
                s = stdev(values) if len(values) > 1 else 0
                feature_stats.append((m, s))
            summaries[class_value] = feature_stats
            priors[class_value] = len(group) / total_rows
        self.summaries = summaries
        self.priors = priors

    def calculate_gaussian_probability(self, x, mean, stdev):
        """Return the normal density of ``x`` for the given mean and stdev.

        A small epsilon is added to ``stdev`` so a zero deviation does not
        divide by zero.
        """
        epsilon = 1e-10  # to avoid division by zero
        exponent = math.exp(-((x - mean) ** 2) / (2 * (stdev + epsilon) ** 2))
        return (1 / (math.sqrt(2 * math.pi) * (stdev + epsilon))) * exponent

    def calculate_class_probabilities(self, input_vector):
        """Return ``{class_value: unnormalised posterior}`` for one sample.

        Args:
            input_vector: feature values in the same order as the feature
                columns seen by ``fit``.
        """
        probabilities = {}
        for class_value, class_summaries in self.summaries.items():
            # Start from the class prior; fall back to 1 when summaries
            # were assigned by hand without matching priors.
            score = self.priors.get(class_value, 1.0)
            for i, (mu, sigma) in enumerate(class_summaries):
                score *= self.calculate_gaussian_probability(
                    input_vector[i], mu, sigma
                )
            probabilities[class_value] = score
        return probabilities

    def predict(self, input_vector):
        """Return the class with the highest score for ``input_vector``."""
        probabilities = self.calculate_class_probabilities(input_vector)
        return max(probabilities, key=probabilities.get)

# Train the Naive Bayes model on the full data set.
gnb = NaiveBayes()
gnb.fit(df)

# Classify the query point; values are given in feature-column order.
new_instance = (1, 1, 1)
predicted_class = gnb.predict(new_instance)
print("Predicted class for (1,1,1):", predicted_class)

Predicted class for (1,1,1): 1


Алгоритм 3: Decision tree

In [3]:
class DecisionTree:
    """Decision tree that splits on the feature with the highest information gain.

    The fitted tree is a nested dict ``{feature: {value: subtree_or_label}}``;
    leaves are class labels.
    """

    def __init__(self):
        self.tree = None
        # Feature names, used to index tuple/list samples in predict().
        self.features = None

    def entropy(self, labels):
        """Return the base-2 entropy of a pandas Series of labels."""
        n = len(labels)
        result = 0
        for freq in labels.value_counts():
            share = freq / n
            result -= share * math.log2(share)
        return result

    def info_gain(self, df, feature, target='class'):
        """Return the entropy reduction obtained by splitting ``df`` on ``feature``."""
        n_rows = len(df)
        remainder = 0
        # Accumulate in order of first appearance of each value.
        for value in df[feature].unique():
            part = df[df[feature] == value]
            remainder += (len(part) / n_rows) * self.entropy(part[target])
        return self.entropy(df[target]) - remainder

    def majority_class(self, labels):
        """Return the most frequent label in ``labels``."""
        return labels.value_counts().idxmax()

    def build_tree(self, df, features, target='class'):
        """Recursively build the tree for ``df`` using the given feature names.

        Returns a class label when the node is pure or no features remain,
        otherwise a single-key dict mapping the chosen feature to its
        per-value subtrees.
        """
        labels = df[target]
        if len(labels.unique()) == 1:
            return labels.iloc[0]
        if not features:
            return self.majority_class(labels)

        gains = {}
        for name in features:
            gains[name] = self.info_gain(df, name, target)
        split_on = max(gains, key=gains.get)

        # The chosen feature is not reused further down this branch.
        remaining = [name for name in features if name != split_on]
        branches = {
            value: self.build_tree(df[df[split_on] == value], remaining, target)
            for value in df[split_on].unique()
        }
        return {split_on: branches}

    def fit(self, df, features, target='class'):
        """Remember the feature names and build the tree from ``df``."""
        self.features = features
        self.tree = self.build_tree(df, features, target)

    def predict(self, sample):
        """Return the predicted label for ``sample``.

        ``sample`` is either a dict keyed by feature name or a tuple/list
        ordered like the features passed to ``fit``. Returns ``None`` when
        the sample has a feature value with no branch in the tree.
        """
        node = self.tree
        by_name = isinstance(sample, dict)
        while isinstance(node, dict):
            feature = next(iter(node))
            if by_name:
                value = sample.get(feature)
            else:
                value = sample[self.features.index(feature)]
            branches = node[feature]
            if value not in branches:
                return None
            node = branches[value]
        return node

# Build the decision tree from the full data set using all three features.
dt = DecisionTree()
dt.fit(df, features)

# Query point, given as a tuple ordered like `features`.
test_sample = (1,1,1)
prediction = dt.predict(test_sample)

# Show the learned nested-dict tree and the prediction for the query point.
print("Decision tree structure:")
print(dt.tree)
print(f"Prediction for {test_sample}: {prediction}")

Decision tree structure:
{'f2': {1: 1, 0: {'f1': {1: {'f3': {1: 0, 2: 0, 0: 1}}, 0: {'f3': {1: 1, 2: 1, 0: 0}}}}}}
Prediction for (1, 1, 1): 1


Алгоритм 4: KNN

In [4]:
class KNN:
    """k-nearest-neighbours classifier using Euclidean distance and majority vote."""

    def __init__(self, k=3):
        self.k = k
        self.features = None
        self.X = None
        self.y = None

    def fit(self, df, features, target='class'):
        """Store the training feature matrix and labels taken from ``df``."""
        self.features = features
        self.X = df[features].values
        self.y = df[target].values

    def distance(self, a, b):
        """Return the Euclidean distance between points ``a`` and ``b``."""
        squared_diffs = [(p - q) ** 2 for p, q in zip(a, b)]
        return math.sqrt(sum(squared_diffs))

    def predict_one(self, sample):
        """Return the majority label among the ``k`` training points nearest to ``sample``."""
        scored = [
            (self.distance(sample, point), label)
            for point, label in zip(self.X, self.y)
        ]
        # Stable sort: equally distant points keep their training order.
        nearest = sorted(scored, key=lambda pair: pair[0])[:self.k]
        tally = Counter(label for _, label in nearest)
        winner, _ = tally.most_common(1)[0]
        return winner

    def predict(self, samples):
        """Return a list with the predicted label for every sample in ``samples``."""
        results = []
        for sample in samples:
            results.append(self.predict_one(sample))
        return results

# Store the full data set as the reference points for a 3-neighbour vote.
knn = KNN(k=3)
knn.fit(df, features)

# Query point, given as a tuple ordered like `features`.
test_sample = (1, 1, 1)
prediction = knn.predict_one(test_sample)
print(f"Prediction for {test_sample}: {prediction}")


Prediction for (1, 1, 1): 1
