<a href="https://colab.research.google.com/github/Tingfang-W/Machine-Learning/blob/main/WeightedML.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Import necessary libraries and global variables/contants
import collections
from collections import defaultdict
import numpy as np
import pandas as pd
import operator
from os import listdir
from sklearn import metrics as mr
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import KFold
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

# The column names in the data
columns = ['ID', 'Clump_Thickness', 'Uniformity_of_Cell_Size', 'Uniformity_of_Cell_Shape',
           'Marginal_Adhesion', 'Single_Epithelial_Cell_Size', 'Bare_Nuclei', 'Bland_Chromatin',
           'Normal_Nucleoli', 'Mitoses', 'Class']

In [None]:
# Common Function Definitions for Naive Bayes and Weighted Naive Bayes
# Read the dataset
def get_dataset():
    dataset = pd.read_csv('breast_cancer.csv', usecols=columns)
    dataset = dataset[dataset['Bare_Nuclei'] != '?'] # Remove the observations that has missing values
    dataset['Bare_Nuclei'] = dataset['Bare_Nuclei'].astype(int)
    dataset['Bare_Nuclei'] = pd.Series(list(map(int, list(dataset['Bare_Nuclei']))))
    del dataset['ID']
    dataset.dropna(axis=0, how='any', inplace=True)
    return dataset

# Calculate the prior probabilities of the two classes.
def get_prior_prob(dataset):
    y = dataset['Class']
    total_num = len(y)
    class2_num = len(y[y == 2])
    class4_num = len(y[y == 4])
    class2_prior_prob = class2_num / total_num
    class4_prior_prob = class4_num / total_num
    return class2_prior_prob, class4_prior_prob

# Check if the functions work
cancer_df = get_dataset()
print(cancer_df.head(1))
class2_prior_prob, class4_prior_prob = get_prior_prob(cancer_df)
print(class2_prior_prob, class4_prior_prob)

   Clump_Thickness  Uniformity_of_Cell_Size  Uniformity_of_Cell_Shape  \
0                5                        1                         1   

   Marginal_Adhesion  Single_Epithelial_Cell_Size  Bare_Nuclei  \
0                  1                            2          1.0   

   Bland_Chromatin  Normal_Nucleoli  Mitoses  Class  
0                3                1        1      2  
0.6476761619190404 0.3523238380809595


In [None]:
# Function Definitions For Naive Bayes

# The probabilities of each feature given the two classes
def get_feat_postprob(dataset, feature):
    feature_values = list(collections.Counter(dataset[feature]))
    feature_probs = defaultdict(list)
    for i in range(len(feature_values)):
        instances2 = dataset[(dataset[feature] == feature_values[i]) & (dataset['Class'] == 2)]
        class2 = dataset[dataset['Class'] == 2]
        if len(instances2) != 0:
            prob2 = len(instances2) / len(class2)
            feature_probs[feature_values[i]].append(prob2)
        else:
            prob2 = 1 / (len(class2) + 1)
            feature_probs[feature_values[i]].append(prob2)
        instances4 = dataset[(dataset[feature] == feature_values[i]) & (dataset['Class'] == 4)]
        class4 = dataset[dataset['Class'] == 4]
        if len(instances4) != 0:
            prob4 = len(instances4) / len(class4)
            feature_probs[feature_values[i]].append(prob4)
        else:
            prob4 = 1 / (len(class4) + 1)
            feature_probs[feature_values[i]].append(prob4)
    feat_postprob_df = pd.DataFrame.from_dict(feature_probs, orient='Index')
    feat_postprob_df.reset_index(inplace=True)
    feat_postprob_df.columns = [feature, feature + 'PostProb2', feature + 'PostProb4']
    return feat_postprob_df

# Under the conditional independence assumption, P(x_1,...x_m|y) can be calculated by the product of the posterior probabilities of each feature.
def get_feats_postprob(dataset):
    features = [col for col in columns if col not in ['ID', 'Class']]
    df_list = list()
    for feature in features:
        df_list.append(get_feat_postprob(dataset, feature))
    feats_postprob_df = pd.concat(df_list, axis=1)
    return feats_postprob_df

# Calculate the posterior probabilities (numerator) of the two classes
def get_obs_postprob(dataset, obs):
    feats_postprob_df = get_feats_postprob(dataset)
    features = [col for col in columns if col not in ['ID', 'Class']]
    posterior_prob2 = list()
    posterior_prob4 = list()
    for feature in features:
        feature_prob2 = feats_postprob_df[feature + 'PostProb2'][feats_postprob_df[feature] == obs[feature]]
        posterior_prob2.append(feature_prob2.iloc[0])
        feature_prob4 = feats_postprob_df[feature + 'PostProb4'][feats_postprob_df[feature] == obs[feature]]
        posterior_prob4.append(feature_prob4.iloc[0])
    obs_postprob2 = np.multiply.accumulate(posterior_prob2)[-1]
    obs_postprob4 = np.multiply.accumulate(posterior_prob4)[-1]
    return obs_postprob2, obs_postprob4

# Assign the label to the observation by comparing the two posterior probabilities
def get_nbpred_label(dataset, obs):
    postprob2, postprob4 = get_obs_postprob(dataset, obs)
    class2_prob, class4_prob = get_prior_prob(dataset)
    if (postprob2 * class2_prob) > (postprob4 * class4_prob):
        pred_y = 2
    else:
        pred_y = 4
    return pred_y

# Calculate the average accuracy in cross-validation
def get_nb_accuracy(fold_num):
    x = get_dataset()
    x = x.sample(frac=1, random_state=42).reset_index(drop=True)
    n = len(x) // fold_num
    folds = list()
    for i in range(fold_num - 1):
        folds.append(x.iloc[i * n:(i + 1) * n])
    folds.append(x.iloc[(fold_num - 1) * n: len(x)])
    Accuracy = list()
    for i in range(fold_num):
        X_test = folds[i]
        X_train = pd.concat(folds[:i] + folds[i + 1:])
        count = 0
        for j in range(len(X_test)):
            predict_y = get_nbpred_label(X_train, X_test.iloc[j])
            if predict_y == X_test.iloc[j][-1]:
                count += 1
        accuracy = count / (len(X_test))
        Accuracy.append(accuracy)
    AverageAccuracy = np.mean(Accuracy) * 100
    return AverageAccuracy

# Check if the functions work
AverageAccuracy = get_nb_accuracy(3)
print(AverageAccuracy)

97.00171023040978


In [None]:
# Function Definitions For Weighted Naive Bayes

# Get the mutual information between the class and each of the features
def get_MI(dataset):
    MI = dict()
    features = [col for col in columns if col not in ['ID', 'Class']]
    for feature in features:
        mi = mr.normalized_mutual_info_score(dataset['Class'], dataset[feature])
        MI[feature] = mi
    MI_df = pd.DataFrame.from_dict(MI, orient='index', columns=['MI'])
    MI_df['feature'] = features
    MI_df['Weight2'] = MI_df['MI']
    MI_df['Weight4'] = MI_df['MI']
    del MI_df['MI']
    return MI_df

# Get features weighted posterior probabilities
def get_feats_wpostprob(dataset):
    features = [col for col in columns if col not in ['ID', 'Class']]
    feats_postprob_df = get_feats_postprob(dataset)
    feats_weight_df = get_MI(dataset)
    feats_wpostprob_df = pd.DataFrame()
    for feature in features:
        feats_wpostprob_df[feature] = feats_postprob_df[feature]
        feats_wpostprob_df[feature + 'WeightPostProb2'] = feats_postprob_df[feature + 'PostProb2'].values * \
                                                          feats_weight_df[feats_weight_df['feature'] == feature][
                                                              'Weight2'].values
        feats_wpostprob_df[feature + 'WeightPostProb4'] = feats_postprob_df[feature + 'PostProb4'].values * \
                                                          feats_weight_df[feats_weight_df['feature'] == feature][
                                                              'Weight4'].values
    return feats_wpostprob_df

# Calculate the weighted posterior probabilities (numerator) of the two classes
def get_obs_wnb_postprob(dataset, obs):
    feats_wpostprob_df = get_feats_wpostprob(dataset)
    features = [col for col in columns if col not in ['ID', 'Class']]
    feats_postprob2 = list()
    feats_postprob4 = list()
    for feature in features:
        feature_prob2 = feats_wpostprob_df[feature + 'WeightPostProb2'][feats_wpostprob_df[feature] == obs[feature]]
        feats_postprob2.append(feature_prob2.iloc[0])
        feature_prob4 = feats_wpostprob_df[feature + 'WeightPostProb4'][feats_wpostprob_df[feature] == obs[feature]]
        feats_postprob4.append(feature_prob4.iloc[0])
    obs_postprob2 = np.multiply.accumulate(feats_postprob2)[-1]
    obs_postprob4 = np.multiply.accumulate(feats_postprob4)[-1]
    return obs_postprob2, obs_postprob4

# Assign the label to the observation by comparing the two weighted posterior probabilities
def get_wnb_pred_label(dataset, obs):
    obs_postprob2, obs_postprob4 = get_obs_wnb_postprob(dataset, obs)
    class2_prob, class4_prob = get_prior_prob(dataset)
    if (obs_postprob2 * class2_prob) > (obs_postprob4 * class4_prob):
        predict_y = 2
    else:
        predict_y = 4
    return predict_y

# Calculate the average accuracy in cross-validation
def get_wnb_accuracy(fold_num):
    X = get_dataset()
    X = X.sample(frac=1, random_state=42).reset_index(drop=True)
    n = len(X) // fold_num
    folds = list()
    for i in range(fold_num - 1):
        folds.append(X.iloc[i * n:(i + 1) * n])
    folds.append(X.iloc[(fold_num - 1) * n: len(X)])
    Accuracy = list()
    for i in range(fold_num):
        X_test = folds[i]
        X_train = pd.concat(folds[:i] + folds[i + 1:])
        count = 0
        for j in range(len(X_test)):
            predict_y = get_wnb_pred_label(X_train, X_test.iloc[j])
            if predict_y == X_test.iloc[j][-1]:
                count += 1
        accuracy = count / (len(X_test))
        Accuracy.append(accuracy)
    AverageAccuracy = np.mean(Accuracy) * 100
    return AverageAccuracy

# Check if the functions work
AverageAccuracy = get_wnb_accuracy(3)
print(AverageAccuracy)

97.00171023040978


In [None]:
# Define the main function and calculate the accuracy for Naive Bayes and Weighted Naive Bayes

# The main function
def main():
    fold_nums = [3, 5, 7, 9, 10, 11]
    nb_acc = list()
    wnb_acc = list()
    for fold_num in fold_nums:
        nb_acc.append(get_nb_accuracy(fold_num))
        wnb_acc.append(get_wnb_accuracy(fold_num))
    results_df = pd.DataFrame(list(zip(fold_nums, nb_acc, wnb_acc)),
                              columns=['CV', 'Naive Bayes', 'Weighted Naive Bayes'])
    return results_df

if __name__ == "__main__":
    print(main())

   CV  Naive Bayes  Weighted Naive Bayes
0   3    97.001710             97.001710
1   5    97.153996             97.153996
2   7    97.158360             97.158360
3   9    97.005005             97.005005
4  10    97.164799             97.164799
5  11    97.017187             97.017187


In [None]:
# Define Common functions for KNN and weighted KNN

# Read the dataset
def get_data():
    dataset = pd.read_csv('breast_cancer.csv', usecols=columns)
    dataset = dataset[dataset['Bare_Nuclei'] != '?']
    dataset['Bare_Nuclei'] = pd.Series(list(map(int, list(dataset['Bare_Nuclei']))))
    del dataset['ID']
    dataset.dropna(axis=0, how='any', inplace=True)
    features = [col for col in columns if col not in ['ID', 'Class']]
    x = dataset[features]
    x = np.array(x)
    y = dataset['Class']
    y = list(y)
    return dataset, x, y

# Normalize the features
def get_std_data(x):
    minVals = x.min(0)
    maxVals = x.max(0)
    ranges = maxVals - minVals
    stdx = np.zeros(np.shape(x))
    m = x.shape[0]
    stdx = x - np.tile(minVals, (m, 1))
    stdx = stdx / np.tile(ranges, (m, 1))  # element wise divide
    return stdx

# Check if the functions work
dataset, x, y = get_data()  # load data setfrom file
print(y[:5])
stdx = get_std_data(x)
print(stdx[:5])

[2, 2, 2, 2, 2]
[[0.44444444 0.         0.         0.         0.11111111 0.
  0.22222222 0.         0.        ]
 [0.44444444 0.33333333 0.33333333 0.44444444 0.66666667 1.
  0.22222222 0.11111111 0.        ]
 [0.22222222 0.         0.         0.         0.11111111 0.11111111
  0.22222222 0.         0.        ]
 [0.55555556 0.77777778 0.77777778 0.         0.22222222 0.33333333
  0.22222222 0.66666667 0.        ]
 [0.33333333 0.         0.         0.22222222 0.11111111 0.
  0.22222222 0.         0.        ]]


In [None]:
# Define functions for KNN

# Get the labels of each observation
def get_knn_pred_label(inX, dataset, labels, k):
    datasetSize = dataset.shape[0]
    diffMat = np.tile(inX, (datasetSize, 1)) - dataset
    sqDiffMat = diffMat ** 2
    sqDistances = sqDiffMat.sum(axis=1)
    distances = sqDistances ** 0.5
    sortedDistIndicies = distances.argsort()
    classCount = {}
    for i in range(k):
        voteLabel = labels[sortedDistIndicies[i]]
        classCount[voteLabel] = classCount.get(voteLabel, 0) + 1
    sortedClassCount = sorted(classCount.items(), key=operator.itemgetter(1), reverse=True)
    return sortedClassCount[0][0]

# Perform KNN and calculate the accuracy
def KNN(k):
    ratio = 0.50  # hold out 50% of the data as the test set
    dataset, x, y = get_data()  # load data set from file
    stdx = get_std_data(x)
    m = stdx.shape[0]
    test_size = int(m * ratio)
    errorCount = 0.0
    for i in range(test_size):
        pred_label = get_knn_pred_label(stdx[i, :], stdx[test_size:m, :], y[test_size:m], k)
        if pred_label != y[i]: errorCount += 1.0
    accuracy = (1 - (errorCount / float(test_size))) * 100
    return accuracy

# Check if the functions work
accuracy = KNN(1)
print(accuracy)

88.28828828828829


In [None]:
# Define functions for weighted KNN

# Get the mutual information
def get_MI(dataset):
    MI = dict()
    features = [col for col in columns if col not in ['ID', 'Class']]
    for feature in features:
        mi = mr.normalized_mutual_info_score(dataset['Class'], dataset[feature])
        MI[feature] = mi
    MI_df = pd.DataFrame.from_dict(MI, orient='index', columns=['MI'])
    MI_df['feature'] = features
    MI_df['Weight'] = MI_df['MI']
    del MI_df['MI']
    weights = np.array(MI_df['Weight'])
    return weights

# Get the label of each observation
def get_wknn_pred_label(inX, dataset, x, y, k):
    weights = get_MI(dataset)
    xSize = x.shape[0]
    diffMat = np.tile(inX, (xSize, 1)) - x
    sqDiffMat = weights * diffMat ** 2
    sqDistances = sqDiffMat.sum(axis=1)
    distances = sqDistances ** 0.5
    sortedDistIndicies = distances.argsort()
    classCount = {}
    for i in range(k):
        voteLabel = y[sortedDistIndicies[i]]
        classCount[voteLabel] = classCount.get(voteLabel, 0) + 1
    sortedClassCount = sorted(classCount.items(), key=operator.itemgetter(1), reverse=True)
    return sortedClassCount[0][0]

# Perform weighted KNN and calculate the accuracy
def weighted_KNN(k):
    ratio = 0.50  # hold out 50% of the data as the test set
    dataset, x, y = get_data()  # load data from file
    m = stdx.shape[0]
    test_size = int(m * ratio)
    errorCount = 0.0
    for i in range(test_size):
        pred_label = get_wknn_pred_label(stdx[i, :], dataset, stdx[test_size:m, :], y[test_size:m], k)
        if (pred_label != y[i]): errorCount += 1.0
    accuracy = (1 - (errorCount / float(test_size))) * 100
    return accuracy

# Check if the functions work
accuracy = weighted_KNN(1)
print(accuracy)

90.990990990991


In [None]:
# Get the accuracy for different values of K
def main():
    k_values = [1, 3, 5, 7, 9]
    knn_acc = list()
    wknn_acc = list()
    for k in k_values:
        knn_acc.append(KNN(k))
        wknn_acc.append(weighted_KNN(k))
    results_df = pd.DataFrame(list(zip(k_values, knn_acc, wknn_acc)),
                              columns=['K', 'KNN', 'Weighted KNN'])
    return results_df


if __name__ == "__main__":
    print(main())

   K        KNN  Weighted KNN
0  1  88.288288     90.990991
1  3  89.489489     92.492492
2  5  90.390390     92.192192
3  7  90.390390     91.891892
4  9  89.789790     91.891892


In [None]:
# Define functions for logistic regression and weighted logistic regression

# Add weights to the features by taking the power of each feature with the corresponding mutual information
def apply_weights_as_powers(X, weights):
    powered_X = X ** weights.reshape(1, -1)
    return powered_X

# Define the model parameters
def train_logistic_regression(X_train, y_train):
    lr = LogisticRegression(max_iter=1000, random_state=42, solver='lbfgs', penalty=None)
    lr.fit(X_train, y_train)
    return lr

# Calculate the average accuracy in cross-validation
def get_accuracy_lr(dataset, X, y, k_values):
    results = []
    y = np.array(y)

    weights = get_MI(dataset)
    powered_X = apply_weights_as_powers(X, weights)

    for k in k_values:
        kf = KFold(n_splits=k)
        fold_accuracies_lr = []
        fold_accuracies_weighted_lr = []

        for train_index, test_index in kf.split(X):
            X_train, X_test = X[train_index], X[test_index]
            X_train_w, X_test_w = powered_X[train_index], powered_X[test_index]
            y_train, y_test = y[train_index], y[test_index]  # Correct indexing for y

            # Train logistic regression without weights
            lr = train_logistic_regression(X_train, y_train)
            y_pred_lr = lr.predict(X_test)
            accuracy_lr = accuracy_score(y_test, y_pred_lr)
            fold_accuracies_lr.append(accuracy_lr)

            # Train the weighted  logistic regression
            lr_w = train_logistic_regression(X_train_w, y_train)
            y_pred_lr_w = lr_w.predict(X_test_w)
            accuracy_lr_w = accuracy_score(y_test, y_pred_lr_w)
            fold_accuracies_weighted_lr.append(accuracy_lr_w)

        results.append({
            'K': k,
            'Logistic Regression': np.mean(fold_accuracies_lr) * 100,
            'Weighted Logistic Regression': np.mean(fold_accuracies_weighted_lr)* 100
        })

    return pd.DataFrame(results)

In [None]:
# Get the accuracy for logistic regression and weighted logistic regression in cross-validation
def main():
    k_values = [3, 5, 7, 9, 10, 11]
    dataset, X, y = get_data()
    cv_results_lr = get_accuracy_lr(dataset, X, y, k_values)
    return cv_results_lr

if __name__ == "__main__":
    print(main())

    K  Logistic Regression  Weighted Logistic Regression
0   3            95.355445                     95.354772
1   5            95.361912                     96.407811
2   7            95.809837                     95.957080
3   9            96.258258                     96.404404
4  10            95.811850                     96.411126
5  11            95.814704                     96.413313
