## Họ tên: Nguyễn Phú Thành - MSSV: 18110014
## Bài tập thực hành Nhập môn máy học - Lab 04

In [1]:
from math import pi, sqrt, e as exp
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import CategoricalNB, GaussianNB
from sklearn.preprocessing import OrdinalEncoder

In [2]:
class NaiveBayesClassifier:
    """Naive Bayes classifier mixing categorical and Gaussian features.

    Categorical columns are modelled with per-class relative frequencies
    (optionally Laplace-smoothed); numerical columns are modelled with a
    per-class normal distribution whose standard deviation is the sample
    standard deviation (``ddof = 1``).

    Attributes set by ``fit``:
        label_dict: ``{label: prior probability}``.
        unique_val: ``{categorical column: number of distinct training values}``.
        prob_dict:  ``{column: {value: {label: P(value | label)}}}`` for
                    categorical columns and ``{column: {label: (mean, std)}}``
                    for numerical columns.
    """

    # Lower bound for every per-class standard deviation.  A feature that is
    # constant inside a class (or a class with a single sample) would otherwise
    # give std == 0 / NaN and turn every density into NaN.
    _MIN_STD = 1e-9

    def __init__(self, laplace = True):
        """Store whether categorical frequencies are Laplace (add-one) smoothed."""
        self.laplace = laplace

    def _conditional(self, hits, class_size, n_values):
        """Return P(value | label) from raw counts, smoothed if requested."""
        if self.laplace:
            return (hits + 1) / (class_size + n_values)
        return hits / class_size

    def fit(self, X, y, categorical_features = None, numerical_features = None, labels = (0, 1)):
        """Estimate priors and per-feature class-conditional parameters.

        Args:
            X: 2-D array indexable as ``X[:, i]``.
            y: Target values; a column vector is accepted and flattened.
            categorical_features: Column indices treated as categorical, or None.
            numerical_features: Column indices treated as Gaussian, or None.
            labels: The class labels to model.

        Returns:
            self, so that calls can be chained.

        Raises:
            ValueError: If a column is listed as both categorical and numerical
                (both kinds share one entry per column in ``prob_dict``).
        """
        # A (n, 1) target would broadcast against the (n,) feature columns and
        # silently produce wrong counts, so always work on a flat vector.
        y = np.asarray(y).ravel()

        if categorical_features is not None and numerical_features is not None:
            overlap = set(categorical_features) & set(numerical_features)
            if overlap:
                raise ValueError(
                    "columns listed as both categorical and numerical: %s" % sorted(overlap)
                )

        masks = {j: y == j for j in labels}
        class_sizes = {j: np.sum(masks[j]) for j in labels}

        self.label_dict = {j: int(class_sizes[j]) / y.shape[0] for j in labels}
        self.unique_val = dict()
        self.prob_dict = dict()

        categorical_prob = dict()
        if categorical_features is not None:
            for i in categorical_features:
                col_i = X[:, i]
                values = np.unique(col_i)
                n_values = values.shape[0]
                self.unique_val[i] = n_values
                categorical_prob[i] = {
                    x: {
                        j: self._conditional(
                            np.sum((col_i == x) & masks[j]), class_sizes[j], n_values
                        )
                        for j in labels
                    }
                    for x in values
                }

        numerical_prob = dict()
        if numerical_features is not None:
            for i in numerical_features:
                col_i = X[:, i]
                numerical_prob[i] = dict()
                for j in labels:
                    # Cast explicitly so object-dtype columns (mixed tables) work.
                    values = np.asarray(col_i[masks[j]], dtype = float)
                    mean = values.mean() if values.size else float("nan")
                    std = values.std(ddof = 1) if values.size > 1 else 0.0
                    # ``not (std > floor)`` is also true for NaN.
                    if not std > self._MIN_STD:
                        std = self._MIN_STD
                    numerical_prob[i][j] = (mean, std)

        self.prob_dict.update(categorical_prob)
        self.prob_dict.update(numerical_prob)

        self.numerical = numerical_features
        self.categorical = categorical_features
        self.labels = labels

        return self

    def predict(self, X):
        """Return the most probable label for every row of ``X``.

        Scores are accumulated as log-probabilities so that many features do
        not underflow to zero.  A categorical value never seen during training
        contributes ``1 / (number of distinct training values)`` for every
        label.  Ties go to the label that appears first in ``labels``; a label
        whose score is undefined (NaN) is treated as impossible.

        Args:
            X: Iterable of rows, each indexable by the column indices given to ``fit``.

        Returns:
            ``np.ndarray`` with one predicted label per row.
        """
        categorical = self.categorical if self.categorical is not None else ()
        numerical = self.numerical if self.numerical is not None else ()
        half_log_two_pi = 0.5 * np.log(2 * np.pi)

        y_predict = []
        # log(0) = -inf is a legitimate score here, so silence the warnings.
        with np.errstate(divide = "ignore", invalid = "ignore", over = "ignore"):
            for x in X:
                best_label, best_score = None, None
                for label in self.labels:
                    score = np.log(self.label_dict[label])

                    for col in categorical:
                        per_label = self.prob_dict[col].get(x[col])
                        if per_label is None:
                            prob = 1 / self.unique_val[col]
                        else:
                            prob = per_label[label]
                        score += np.log(prob)

                    for col in numerical:
                        mean, std = self.prob_dict[col][label]
                        z = (float(x[col]) - mean) / std
                        # Log of the normal density 1/(std*sqrt(2*pi)) * e^(-z^2/2).
                        score += -0.5 * z * z - np.log(std) - half_log_two_pi

                    if np.isnan(score):
                        score = -np.inf
                    if best_score is None or score > best_score:
                        best_label, best_score = label, score
                y_predict.append(best_label)
        return np.array(y_predict)

In [3]:
# Download the training and test CSV files for lab 04.
data_train = pd.read_csv('https://raw.githubusercontent.com/huynhthanh98/ML/master/lab-04/train.csv')
data_test = pd.read_csv('https://raw.githubusercontent.com/huynhthanh98/ML/master/lab-04/test.csv')

# Remove the 'Cabin' and 'Age' columns from the training frame only.
data_train = data_train.drop(['Cabin', 'Age'], axis = 1)

In [4]:
# Discard every remaining training row that contains a missing value.
data_train = data_train.dropna()

## 1. Dùng các features: Sex, Pclass, và Embarked để xây dựng thuật toán Naive Bayes Multinomial. (4đ)

In [5]:
# Features: the three categorical columns.  The target is selected with a
# column list, so `y` is a 2-D single-column array.
X, y = data_train[['Sex', 'Pclass', 'Embarked']].to_numpy(), data_train[['Survived']].to_numpy()

In [6]:
# Hold out 20% of the rows for validation; the seed is fixed so the split is repeatable.
X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size = 0.2, random_state = 0)

In [7]:
# Fit the hand-written classifier with all three columns treated as
# categorical, Laplace smoothing enabled, and a flattened (1-D) target.
clf = NaiveBayesClassifier(laplace = True).fit(
    X_train, y_train.flatten(),
    categorical_features = (0, 1, 2),
    labels = (0, 1),
)

In [8]:
# Laplace smoothing has already been applied to these conditional probabilities
clf.prob_dict

{0: {'female': {0: 0.13901345291479822, 1: 0.7026022304832714},
  'male': {0: 0.8609865470852018, 1: 0.29739776951672864}},
 1: {1: {0: 0.1476510067114094, 1: 0.3814814814814815},
  2: {0: 0.1767337807606264, 1: 0.26666666666666666},
  3: {0: 0.6756152125279642, 1: 0.35185185185185186}},
 2: {'C': {0: 0.14317673378076062, 1: 0.2814814814814815},
  'Q': {0: 0.07829977628635347, 1: 0.1037037037037037},
  'S': {0: 0.7785234899328859, 1: 0.6148148148148148}}}

In [9]:
# Fit the ordinal encoder on the training split only, then apply the same
# encoding to both splits (used as input for CategoricalNB in the next cell).
enc = OrdinalEncoder().fit(X_train)
X_train_enc = enc.transform(X_train)
X_valid_enc = enc.transform(X_valid)

In [10]:
# Reference model: scikit-learn's CategoricalNB trained on the encoded features.
clf_sklearn = CategoricalNB().fit(X_train_enc, y_train.flatten())

In [11]:
# Validation predictions of the hand-written classifier (raw, un-encoded features).
clf.predict(X_valid)

array([1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 1, 1, 1, 0, 1, 0, 1, 1, 0, 1, 1, 1,
       0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0,
       1, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 1, 1, 1, 0, 0, 1, 0, 1, 0, 0, 0,
       1, 1, 0, 1, 1, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 1, 1, 1, 1, 0, 0, 0,
       0, 0, 0, 1, 1, 0, 0, 1, 1, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 0, 0, 1,
       1, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       1, 0, 0, 0, 1, 1, 1, 0, 0, 0, 1, 0, 1, 0, 1, 1, 0, 0, 0, 0, 1, 0,
       0, 0, 1, 1, 1, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0,
       1, 1])

In [12]:
# Validation predictions of the scikit-learn model (ordinal-encoded features).
clf_sklearn.predict(X_valid_enc)

array([1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 1, 1, 1, 0, 1, 0, 1, 1, 0, 1, 1, 1,
       0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0,
       1, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 1, 1, 1, 0, 0, 1, 0, 1, 0, 0, 0,
       1, 1, 0, 1, 1, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 1, 1, 1, 1, 0, 0, 0,
       0, 0, 0, 1, 1, 0, 0, 1, 1, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 0, 0, 1,
       1, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       1, 0, 0, 0, 1, 1, 1, 0, 0, 0, 1, 0, 1, 0, 1, 1, 0, 0, 0, 0, 1, 0,
       0, 0, 1, 1, 1, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0,
       1, 1])

In [13]:
# Ground-truth validation labels as a 1-D array, for visual comparison.
y_valid.flatten()

array([0, 0, 1, 0, 1, 0, 0, 1, 0, 1, 0, 1, 1, 0, 1, 0, 1, 0, 1, 1, 0, 1,
       1, 0, 0, 0, 0, 1, 1, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0,
       1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 1, 0, 1, 0, 0, 1,
       0, 1, 1, 1, 0, 1, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 1,
       0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1, 0, 1,
       1, 1, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0,
       1, 0, 1, 1, 1, 0, 0, 0, 1, 0, 1, 1, 0, 1, 1, 1, 0, 0, 0, 0, 1, 0,
       1, 1])

In [14]:
# Validation accuracy of the hand-written classifier: fraction of matching predictions.
np.mean(clf.predict(X_valid) == y_valid.flatten())

0.7078651685393258

In [15]:
# Validation score of the scikit-learn model on the same split.
clf_sklearn.score(X_valid_enc, y_valid.flatten())

0.7078651685393258

In [16]:
# Check that both models give identical predictions on every validation row.
(clf.predict(X_valid) == clf_sklearn.predict(X_valid_enc)).all()

True

## 2. Dùng các features: SibSp, Parch, và Fare để xây dựng thuật toán Naive Bayes Gaussian (Khỏi normalize). (4đ)

In [17]:
# Features: the three numerical columns; the target is again a 2-D single-column array.
X, y = data_train[['SibSp', 'Parch', 'Fare']].to_numpy(), data_train[['Survived']].to_numpy()

In [18]:
# Same 80/20 split settings (and seed) as in part 1.
X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size = 0.2, random_state = 0)

In [19]:
# Fit the hand-written classifier with all three columns treated as
# numerical (Gaussian) features.
clf = NaiveBayesClassifier().fit(
    X_train, y_train.flatten(),
    numerical_features = (0, 1, 2),
    labels = (0, 1),
)

In [20]:
# Reference model: scikit-learn's GaussianNB on the same un-normalised features.
clf_sklearn = GaussianNB().fit(X_train, y_train.flatten())

In [21]:
# Validation predictions of the hand-written Gaussian classifier.
clf.predict(X_valid)

array([0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1,
       0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1,
       1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0,
       0, 0])

In [22]:
# Validation predictions of scikit-learn's GaussianNB.
clf_sklearn.predict(X_valid)

array([0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1,
       0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1,
       1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0,
       0, 0])

In [23]:
# Ground-truth validation labels as a 1-D array, for visual comparison.
y_valid.flatten()

array([0, 0, 1, 0, 1, 0, 0, 1, 0, 1, 0, 1, 1, 0, 1, 0, 1, 0, 1, 1, 0, 1,
       1, 0, 0, 0, 0, 1, 1, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0,
       1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 1, 0, 1, 0, 0, 1,
       0, 1, 1, 1, 0, 1, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 1,
       0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1, 0, 1,
       1, 1, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0,
       1, 0, 1, 1, 1, 0, 0, 0, 1, 0, 1, 1, 0, 1, 1, 1, 0, 0, 0, 0, 1, 0,
       1, 1])

In [24]:
# Validation accuracy of the hand-written Gaussian classifier.
np.mean(clf.predict(X_valid) == y_valid.flatten())

0.6348314606741573

In [25]:
# Validation score of scikit-learn's GaussianNB on the same split.
clf_sklearn.score(X_valid, y_valid.flatten())

0.6348314606741573

In [26]:
# Check that both Gaussian models give identical predictions on every validation row.
(clf.predict(X_valid) == clf_sklearn.predict(X_valid)).all()

True

## 3. Từ bộ dữ liệu liên minh hãy xây dựng thuật toán Naive Bayes bằng cách dùng tất cả features (mix cả Multinomial và Gaussian. Lưu ý: Khỏi chia tập train, test. Dự đoán thẳng trên tập train) (2đ)

In [27]:
# Download lienminh.csv and preview its first rows.
data = pd.read_csv(
    "https://raw.githubusercontent.com/dinhvietcuong1996/Lab-MachineLearningCourse/master/Lab04/lienminh.csv"
)
data.head()

Unnamed: 0,killsDiff,minionsKilledDiff,wardPlacedDiff,firstBlood,heralds,dragons,teamWins
0,3,-2,13,blue,none,none,red
1,0,-66,0,red,red,red,red
2,-4,-17,0,red,none,blue,red
3,-1,-34,28,red,blue,none,red
4,0,-15,58,red,none,red,red


In [28]:
# Every column except the last is a feature; the last column is the target (1-D).
X, y = data.iloc[:, :-1].to_numpy(), data.iloc[:, -1].to_numpy()

In [29]:
# Fit on the whole dataset (no train/test split): columns 3-5 are treated as
# categorical, columns 0-2 as Gaussian, and the label set is taken from the
# distinct values present in y.
clf = NaiveBayesClassifier(laplace = True).fit(
    X, y,
    categorical_features = (3, 4, 5),
    numerical_features = (0, 1, 2),
    labels = np.unique(y).tolist()
)

In [30]:
# Predict directly on the data the model was trained on.
clf.predict(X)

array(['blue', 'red', 'red', ..., 'red', 'red', 'blue'], dtype='<U4')

In [31]:
# Training-set accuracy: fraction of rows whose prediction equals the true label.
np.mean(clf.predict(X) == y.flatten())

0.7116104868913857