# NHẬN DẠNG KÝ TỰ BẰNG NAIVE BAYES

## 1. Tạo dữ liệu đầu vào

In [7]:

def load_letter_dataset(path):
    """Load a comma-separated dataset whose first column is the class label.

    Each non-blank line has the form ``label,v1,v2,...`` where every value
    after the label is parsed as an integer.

    Args:
        path: Path of the text file to read.

    Returns:
        A tuple ``(X, y)`` where ``X`` is a list of integer feature lists and
        ``y`` is the list of labels (strings), in file order.

    Raises:
        OSError: If the file cannot be opened.
        ValueError: If a feature value is not a valid integer.
    """
    X, y = [], []
    with open(path, "r") as f:
        for line in f:
            line = line.strip()
            # Blank lines (typically a trailing newline at the end of the
            # file) would otherwise become a sample with label '' and no
            # features, which breaks training later on.
            if not line:
                continue
            label, *features = line.split(',')
            y.append(label)
            X.append([int(value) for value in features])
    return X, y

## 2. Phân tập dữ liệu Train, Test


In [8]:

def train_test_split(X, y, test_ratio=0.2, seed=None):
    """Shuffle the samples and split them into a training and a test part.

    Args:
        X: Iterable of feature vectors.
        y: Iterable of labels, paired with ``X`` element by element.
        test_ratio: Fraction of the samples assigned to the test part; must
            lie in ``[0, 1]``.
        seed: Optional seed. When given, the shuffle uses a private
            ``random.Random(seed)`` so the split is reproducible; when
            ``None`` the module-level ``random`` state is used.

    Returns:
        A tuple ``(X_train, y_train, X_test, y_test)`` of four lists. Either
        part may be empty (for example with ``test_ratio=0``).

    Raises:
        ValueError: If ``test_ratio`` is outside ``[0, 1]``.
    """
    if not 0 <= test_ratio <= 1:
        # A ratio outside [0, 1] would produce a negative or oversized slice
        # index and therefore a silently wrong split.
        raise ValueError(
            "test_ratio must be in [0, 1], got {!r}".format(test_ratio)
        )

    data = list(zip(X, y))
    if seed is None:
        random.shuffle(data)
    else:
        random.Random(seed).shuffle(data)

    split = int(len(data) * (1 - test_ratio))
    train_data, test_data = data[:split], data[split:]

    # Build each list directly instead of unpacking ``zip(*part)``: unpacking
    # raises ValueError when a part is empty.
    X_train = [features for features, _ in train_data]
    y_train = [label for _, label in train_data]
    X_test = [features for features, _ in test_data]
    y_test = [label for _, label in test_data]
    return X_train, y_train, X_test, y_test


## 3. GaussianNB




In [9]:
import math
from collections import defaultdict, Counter
import random


class GaussianNB:
    """Gaussian Naive Bayes classifier written with the standard library only.

    Every feature is modelled, per class, as an independent normal
    distribution whose mean and (population) variance are estimated in
    ``fit``. Prediction picks the class with the highest log-posterior.

    Attributes set by ``fit``:
        classes: Sorted list of the distinct labels.
        prior: Dict mapping class -> relative frequency in the training labels.
        mean: Dict mapping class -> list of per-feature means.
        var: Dict mapping class -> list of per-feature variances.
    """

    # Class-level default so the smoothing value is available even on an
    # instance that was created without running __init__.
    var_smoothing = 1e-9

    def __init__(self, var_smoothing=1e-9):
        """Create an unfitted model.

        Args:
            var_smoothing: Positive constant added to every variance before
                it is used, so that features with zero variance in a class do
                not cause a division by zero or ``log(0)``.

        Raises:
            ValueError: If ``var_smoothing`` is not positive.
        """
        if var_smoothing <= 0:
            raise ValueError("var_smoothing must be positive")
        self.var_smoothing = var_smoothing

    def fit(self, X, y):
        """Estimate class priors and per-class feature means and variances.

        Args:
            X: Sequence of feature vectors (sequences of numbers), all of the
                same length.
            y: Sequence of labels, one per row of ``X``.

        Returns:
            The fitted model itself, so calls can be chained.

        Raises:
            ValueError: If ``X`` is empty, if ``X`` and ``y`` differ in
                length, or if the rows of ``X`` differ in length.
        """
        if len(X) != len(y):
            # zip() below would silently drop the surplus items while the
            # priors and class counts still used all of y, producing wrong
            # means and variances.
            raise ValueError(
                "X and y must have the same length, got {} and {}".format(
                    len(X), len(y)
                )
            )
        if len(X) == 0:
            raise ValueError("cannot fit on an empty dataset")

        self.classes = sorted(set(y))
        n_features = len(X[0])
        n_samples = len(y)

        counts = Counter(y)
        self.prior = {c: counts[c] / n_samples for c in self.classes}

        # First pass: per-class feature sums -> means.
        sums = {c: [0.0] * n_features for c in self.classes}
        for xi, yi in zip(X, y):
            if len(xi) != n_features:
                raise ValueError("all rows of X must have the same length")
            class_sums = sums[yi]
            for j, v in enumerate(xi):
                class_sums[j] += v
        self.mean = {
            c: [total / counts[c] for total in sums[c]] for c in self.classes
        }

        # Second pass: squared deviations from the class mean -> variances.
        sq_sums = {c: [0.0] * n_features for c in self.classes}
        for xi, yi in zip(X, y):
            class_mean = self.mean[yi]
            class_sq = sq_sums[yi]
            for j, v in enumerate(xi):
                d = v - class_mean[j]
                class_sq[j] += d * d
        self.var = {
            c: [total / counts[c] for total in sq_sums[c]]
            for c in self.classes
        }
        return self

    def _gauss_logprob(self, x, mean, var):
        """Return the log-density of ``x`` under N(mean, var + var_smoothing)."""
        smoothed = var + self.var_smoothing
        coeff = -0.5 * math.log(2 * math.pi * smoothed)
        expo = -((x - mean) ** 2) / (2 * smoothed)
        return coeff + expo

    def predict_log_proba(self, X):
        """Compute unnormalised log-posteriors for every sample.

        Args:
            X: Iterable of feature vectors.

        Returns:
            A list with one dict per sample, mapping each class to
            ``log(prior) + sum of per-feature Gaussian log-densities``. The
            values are not normalised over the classes.
        """
        out = []
        for xi in X:
            logps = {}
            for c in self.classes:
                class_mean = self.mean[c]
                class_var = self.var[c]
                lp = math.log(self.prior[c])
                for j, v in enumerate(xi):
                    lp += self._gauss_logprob(v, class_mean[j], class_var[j])
                logps[c] = lp
            out.append(logps)
        return out

    def predict(self, X):
        """Return, for every sample in ``X``, the class with the highest score.

        Ties are resolved in favour of the class that comes first in
        ``self.classes``.
        """
        return [max(p, key=p.get) for p in self.predict_log_proba(X)]





## 4. Đánh giá kết quả


In [14]:
def accuracy(y_true, y_pred):
    """Return the fraction of predictions that equal the true labels.

    Args:
        y_true: Sequence of ground-truth labels.
        y_pred: Iterable of predicted labels, one per element of ``y_true``.

    Returns:
        A float in ``[0, 1]``; ``0.0`` when there are no samples.

    Raises:
        ValueError: If the two inputs differ in length.
    """
    predicted = list(y_pred)
    total = len(y_true)
    if len(predicted) != total:
        # zip() would silently ignore the surplus items while the result was
        # still divided by len(y_true), giving a misleading score.
        raise ValueError(
            "y_true and y_pred must have the same length, got {} and {}".format(
                total, len(predicted)
            )
        )
    if total == 0:
        # Avoid ZeroDivisionError on an empty evaluation set.
        return 0.0
    correct = sum(t == p for t, p in zip(y_true, predicted))
    return correct / total

# Load every sample from the data file (first column is the label).
X, y = load_letter_dataset("letter-recognition.data")
# Hold out 20% of the shuffled samples for testing.
X_train, y_train, X_test, y_test = train_test_split(X, y, test_ratio=0.2)
# Train the Gaussian Naive Bayes model on the training part only.
model1 = GaussianNB()
model1.fit(X_train, y_train)

# Predict the held-out samples and report the share of correct predictions.
pred1 = model1.predict(X_test)
acc1 = accuracy(y_test, pred1)
print("Accuracy:", acc1)



Accuracy: 0.64175


In [None]:
import random

# Spot-check the trained model on one randomly chosen sample.
# randrange(len(X)) covers every valid index (including 0) and cannot go out
# of range, unlike a hard-coded randint(1, 20000).
# NOTE: X is the full dataset, so the chosen sample may be one the model was
# trained on.
x = random.randrange(len(X))
# The trained model is bound to `model1`; `model` is not defined anywhere.
pred = model1.predict([X[x]])
print(pred)
print(y[x])