In [8]:
%config Completer.use_jedi = False

In [9]:
import numpy as np
import pandas as pd
import seaborn as sns
from pprint import pprint
from sklearn import metrics
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings( "ignore" )

In [10]:
df = pd.read_csv('transfusion.data')
df.columns = ['Recency', 'Frequency', 'Monetary', 'Time', 'tag']
df.head(10)

Unnamed: 0,Recency,Frequency,Monetary,Time,tag
0,2,50,12500,98,1
1,0,13,3250,28,1
2,1,16,4000,35,1
3,2,20,5000,45,1
4,1,24,6000,77,0
5,4,4,1000,4,0
6,2,7,1750,14,1
7,1,12,3000,35,0
8,2,9,2250,22,1
9,5,46,11500,98,1


<h2>Evaluation Metrics</h2>

In [11]:
def calc_confusion(y_pred, y_true, n):
    confusion = np.zeros((2, 2))
    for i in range(n):
        if y_true[i] == 1 and y_pred[i] == 1:
            confusion[0][0] += 1
        elif y_true[i] == 0 and y_pred[i] == 1:
            confusion[0][1] += 1
        elif y_true[i] == 1 and y_pred[i] == 0:
            confusion[1][0] += 1
        else:
            confusion[1][1] += 1
    return confusion

def accuracy_score(cnf):
    return (cnf[0][0] + cnf[1][1]) / np.sum(cnf)

def recall_score(cnf):
    return (cnf[0][0]) / (cnf[0][0] + cnf[1][0])

def precision_score(cnf):
    return (cnf[0][0]) / (cnf[0][0] + cnf[0][1])

def f1_score(cnf):
    r = recall_score(cnf)
    p = precision_score(cnf)
    return (2 * p * r) / (r + p)

<h2>Logistic Regression</h2>

In [5]:
train = df.sample(frac=0.8, random_state=0)
test = df.drop(train.index)

x_train = train.loc[:, ['Recency', 'Frequency', 'Monetary', 'Time']]
x_train_array = np.array(x_train)
y_train = train['tag']
y_train_array = np.array(y_train)

x_test = test.loc[:, ['Recency', 'Frequency', 'Monetary', 'Time']]
x_test_array = np.array(x_test)
y_test = test['tag']
y_test_array = np.array(y_test)
x_train_array.shape, y_train_array.shape

((598, 4), (598,))

In [6]:
def sigmoid(a):
    return 1/(1+ np.exp(-1 * a))

class LogisticRegression():
    def __init__(self, lr, max_iter):
        self.max_iter = max_iter
        self.lr = lr
        
    def predict(self, x):
        z = sigmoid(np.dot(x, self.w) + self.w0)
        return np.where(z > 0.5, 1, 0)

    def fit(self, x, y):
        self.x = x
        self.y = y
        self.w0 = 0
        self.n = x.shape[0]
        self.m = x.shape[1]
        self.w = np.zeros(self.m)
        for i in range(self.max_iter):
            yn = sigmoid(np.dot(self.x, self.w) + self.w0)
            tmp = (yn - self.y)
            self.w -= self.lr * (np.dot(self.x.T, tmp) / self.n)
            self.w0 -= self.lr * (np.sum(tmp) / self.n)
        return self


model = LogisticRegression(lr=0.1, max_iter=20000)
model.fit(x_train_array, y_train_array)
y_pred = model.predict(x_test_array)

print("TEST: ")
test_cnf = calc_confusion(y_pred, y_test_array, y_test_array.shape[0])
print("accuracy_score:", accuracy_score(test_cnf), "f1_score: ", f1_score(test_cnf), "recall_score: ", recall_score(test_cnf))

train_y_pred = model.predict(x_train_array)
print("TRAIN: ")
train_cnf = calc_confusion(train_y_pred, y_train_array, y_train_array.shape[0])
print("accuracy_score:", accuracy_score(train_cnf), "f1_score: ", f1_score(train_cnf), "recall_score: ", recall_score(train_cnf))

TEST: 
accuracy_score: 0.7533333333333333 f1_score:  0.1395348837209302 recall_score:  0.07894736842105263
TRAIN: 
accuracy_score: 0.774247491638796 f1_score:  0.21965317919075142 recall_score:  0.1357142857142857


<h2> Decision Tree </h2>

In [None]:
numeric_series = df.select_dtypes(np.number).stack().groupby(level=1)
numeric_df = numeric_series.describe()
numeric_df.insert(8, "var", numeric_series.var(), True)
numeric_df

In [None]:
plt.figure(figsize=(5, 8))
plt.hist(df['Recency'], density=False, ec='w');

plt.figure(figsize=(5, 8))
plt.hist(df['Monetary'], density=False, ec='w');

plt.figure(figsize=(5, 8))
plt.hist(df['Frequency'], density=False, ec='w');

plt.figure(figsize=(5, 8))
plt.hist(df['Time'], density=False, ec='w');

In [5]:
def split_train_test_data(df):
    train = df.sample(frac=0.8, random_state=0)
    test = df.drop(train.index)

    x_train = train.loc[:, ['Recency', 'Frequency', 'Monetary', 'Time']]
    x_train_array = np.array(x_train)
    y_train = train['tag']
    y_train_array = np.array(y_train)

    x_test = test.loc[:, ['Recency', 'Frequency', 'Monetary', 'Time']]
    x_test_array = np.array(x_test)
    y_test = test['tag']
    y_test_array = np.array(y_test)
    return train, test

def split_x_y_data(df):

    x = df.loc[:, ['Recency', 'Frequency', 'Monetary', 'Time']]
    x_array = np.array(x)
    y = df['tag']
    y_array = np.array(y)
    
    return x_array, y_array

In [15]:
class Config:
    TARGET = 'tag'
    PRIME = '_prime'
    Q = 'q'
    YES = 1
    NO = 0
    N = 2
    headers = ['Recency', 'Frequency', 'Monetary', 'Time', 'tag']


def change_data_set(df, n):
    for column in df.columns:
        if column == Config.TARGET:
            continue

        def f(x):
            min_value = df[column].min()
            max_value = df[column].max()
            step = (max_value - min_value) / n
            a = min_value
            b = a + step
            if x < a:
                return str(Config.Q) + "0"
            for i in range(1, n):
                if b > x >= a:
                    return str(Config.Q) + str(i)
                a = b
                b = a + step

            if x >= a:
                return str(Config.Q) + str(n)

        df[column + str(Config.PRIME)] = df[column].apply(lambda x: f(x))
    return df



for i in range(len(Config.headers)):
    if Config.headers[i] == Config.TARGET:
        continue
    Config.headers[i] = str(Config.headers[i]) + str(Config.PRIME)


class Question:
    def __init__(self, column, df):
        self.column = column
        self.column_possible_values = df[column].value_counts().index.tolist()
        self.num_possible_values = len(df[column].value_counts().index.tolist())
        self.gain_information = 0
        self.branches_entropy = []

    def partition(self, df):
        output = list()
        for possible_value in self.column_possible_values:
            temp = df[df[self.column] == possible_value]
            output.append(temp)
        return output

    def match_index(self, row):
        df_row = row
        row_answer_value = df_row[self.column]
        return self.column_possible_values.index(row_answer_value)

    def print(self, spacing):
        return "??" + str(self.column) + "??\n" + str(spacing) + " GI: " + str(
            self.gain_information) + "\n " + str(spacing) + "Branches entropy: " + str(self.branches_entropy)

    def __str__(self):
        return "??" + str(self.column) + "??"

def B(q):
    if q == 1 or q == 0:
        return 0
    return round(-1 * (q * np.log2(q) + (1 - q) * np.log2(1 - q)), 10)


def remainder_util(total_elements, n_k, p_k):
    return round((p_k + n_k) / total_elements * B(p_k / (p_k + n_k)), 10)


def reminder(total_elements, list_of_partitions):
    r = 0
    partitions_entropy = []
    for sub_df in list_of_partitions:
        n_k_and_pk = sub_df.shape[0]
        pk = sub_df[sub_df[Config.TARGET] == Config.YES].shape[0]
        nk = n_k_and_pk - pk
        t = remainder_util(total_elements, nk, pk)
        r += t
        partitions_entropy.append(t)
    return r, partitions_entropy



def gain(n, p, list_of_partitions):
    r, partitions_entropy = reminder(p + n, list_of_partitions)
    return (B(p / (n + p)) - r), partitions_entropy



def find_best_split(df):
    best_gain = 0
    best_question = None
    n_and_p = df.shape[0]
    p = df[df[Config.TARGET] == Config.YES].shape[0]
    n = n_and_p - p

    for feature in Config.headers[:-1]:
        question = Question(feature, df)
        question.gain_information, question.branches_entropy = gain(n, p, question.partition(df))
        if question.gain_information > best_gain:
            best_gain, best_question = question.gain_information, question
    return best_gain, best_question


def leaf_count(df):
    unique_values_count = np.array(df[Config.TARGET].value_counts())
    unique_values_name = df[Config.TARGET].value_counts().index.tolist()
    counts = {}
    for i, name in enumerate(unique_values_name):
        counts[name] = unique_values_count[i]

    return counts


class Leaf:
    def __init__(self, df):
        self.predictions = leaf_count(df)

    def predict(self):
        max_occur = 0
        max_key = None
        for key in self.predictions.keys():
            if self.predictions[key] > max_occur:
                max_occur = self.predictions[key]
                max_key = key
        return max_key


class MiddleNode:
    def __init__(self, question, branches):
        self.question = question
        self.branches = branches


def build_tree(df, max_depth):
    gain, question = find_best_split(df)


    if gain == 0 or max_depth <= 0:
        return Leaf(df)

    partitions = question.partition(df)
    partitions_nodes = [] 
    for partition in partitions:
        temp = build_tree(partition, max_depth - 1)
        partitions_nodes.append(temp)

    return MiddleNode(question, partitions_nodes)



def print_tree(node, spacing1="\t", spacing2="\t\t"):
    if isinstance(node, Leaf):
        print(spacing1, node.predictions)
        return
    print(spacing1 + (node.question.print(spacing1)))
    for i, possible_value in enumerate(node.question.column_possible_values):

        print(spacing1 + '--> :' + str(possible_value))
        print_tree(node.branches[i], spacing1 + spacing2)



def classify(row, node):
    if isinstance(node, Leaf):
        return node.predict()
    branch_index = node.question.match_index(row)
    return classify(row, node.branches[branch_index])


def predict_test_set(df, node):
    predicted_y = []
    for row_index in range(df.shape[0]):
        y = classify(df.iloc[row_index], node)
        predicted_y.append(y)
    return predicted_y



df2 = pd.read_csv('transfusion.data')
df2.columns = ['Recency', 'Frequency', 'Monetary', 'Time', 'tag']
df2 = change_data_set(df2, Config.N)
train, test = split_train_test_data(df2)
train_x, train_y = split_x_y_data(train)
test_x, test_y = split_x_y_data(test)
my_tree = build_tree(df=train, max_depth=2)
# print_tree(my_tree)

print("TRAIN:")
train_y_pred = predict_test_set(train, my_tree)
train_conf = calc_confusion(y_pred=train_y_pred, y_true=train_y, n=train_y.shape[0])
print("accuracy_score:", accuracy_score(train_conf), "f1_score: ", f1_score(train_conf), "recall_score: ", recall_score(train_conf))

print("TEST:")
test_y_pred = predict_test_set(test, my_tree)
test_conf = calc_confusion(y_pred=test_y_pred, y_true=test_y, n=test_y.shape[0])
print("accuracy_score:", accuracy_score(test_conf), "f1_score: ", f1_score(test_conf), "recall_score: ", recall_score(test_conf))

TRAIN:
accuracy_score: 0.7725752508361204 f1_score:  0.08108108108108107 recall_score:  0.04285714285714286
TEST:
accuracy_score: 0.7533333333333333 f1_score:  0.05128205128205127 recall_score:  0.02631578947368421


<h2>
    Weighted Classification
</h2>

In [9]:
df['tag'].value_counts()

0    570
1    178
Name: tag, dtype: int64

<div dir='rtl'>
     همانطور که مشاهده شد، مقادیر f1score و recall بسیار پایین هستند دلیل این امر این هست که برچسب کلاس ها unbalanced هستند و داده های 1 تعداد کمتری دارند. راه حل این امر این هست که از وزن دهی در مدل هایمان استفاده کنیم، به عبارتی وزن داده های با برچسب 1 را بیشتر قرار دهیم در زیر این تغییر ایجاد شده و مشاهده می کنید recall تا 1 پیش می رود.
</div>
    

<h2>
Weighted Logistic Regression
</h2>

In [16]:
def sigmoid(a):
    return 1/(1+ np.exp(-1 * a))

class LogisticRegression():
    def __init__(self, lr, max_iter):
        self.max_iter = max_iter
        self.lr = lr
        
    def predict(self, x):
        z = sigmoid(np.dot(x, self.w) + self.w0)
        return np.where(z > 0.5, 1, 0)

    def fit(self, x, y, weights):
        self.x = x
        self.y = y
        self.w0 = 0
        self.n = x.shape[0]
        self.m = x.shape[1]
        self.w = np.zeros(self.m)
        self.weight1 = weights['1']
        self.weight0 = weights['0']
        for i in range(self.max_iter):
            yn = sigmoid(np.dot(self.x, self.w) + self.w0)
            tmp = ((self.weight0) * yn + (self.weight1 - self.weight0) * np.multiply(yn, self.y) - (self.weight1) * self.y)
            self.w -= self.lr * (np.dot(self.x.T, tmp) / self.n)
            self.w0 -= self.lr * (np.sum(tmp) / self.n)
        return self


model = LogisticRegression(lr=0.1, max_iter=20000)
model.fit(x_train_array, y_train_array, {'1': 9, '0': 1})
y_pred = model.predict(x_test_array)
print("TEST: ")
test_cnf = calc_confusion(y_pred, y_test_array, y_test_array.shape[0])
print("accuracy_score:", accuracy_score(test_cnf), " f1_score: ", f1_score(test_cnf), " recall_score: ", recall_score(test_cnf))

train_y_pred = model.predict(x_train_array)
print("TRAIN: ")
train_cnf = calc_confusion(train_y_pred, y_train_array, y_train_array.shape[0])
print("accuracy_score:", accuracy_score(train_cnf), " f1_score: ", f1_score(train_cnf), " recall_score: ", recall_score(train_cnf))

TEST: 
accuracy_score: 0.3466666666666667  f1_score:  0.4302325581395348  recall_score:  0.9736842105263158
TRAIN: 
accuracy_score: 0.362876254180602  f1_score:  0.41653905053598766  recall_score:  0.9714285714285714


<h2>
Weighted Decision Tree
</h2>

In [12]:
class Config:
    TARGET = 'tag'
    PRIME = '_prime'
    Q = 'q'
    YES = 1
    NO = 0
    N = 2
    headers = ['Recency', 'Frequency', 'Monetary', 'Time', 'tag']


def change_data_set(df, n):
    for column in df.columns:
        if column == Config.TARGET:
            continue

        def f(x):
            min_value = df[column].min()
            max_value = df[column].max()
            step = (max_value - min_value) / n
            a = min_value
            b = a + step
            if x < a:
                return str(Config.Q) + "0"
            for i in range(1, n):
                if b > x >= a:
                    return str(Config.Q) + str(i)
                a = b
                b = a + step

            if x >= a:
                return str(Config.Q) + str(n)

        df[column + str(Config.PRIME)] = df[column].apply(lambda x: f(x))
    return df



for i in range(len(Config.headers)):
    if Config.headers[i] == Config.TARGET:
        continue
    Config.headers[i] = str(Config.headers[i]) + str(Config.PRIME)


class Question:
    def __init__(self, column, df):
        self.column = column
        self.column_possible_values = df[column].value_counts().index.tolist()
        self.num_possible_values = len(df[column].value_counts().index.tolist())
        self.gain_information = 0
        self.branches_entropy = []

    def partition(self, df):
        output = list()
        for possible_value in self.column_possible_values:
            temp = df[df[self.column] == possible_value]
            output.append(temp)
        return output

    def match_index(self, row):
        df_row = row
        row_answer_value = df_row[self.column]
        return self.column_possible_values.index(row_answer_value)

    def print(self, spacing):
        return "??" + str(self.column) + "??\n" + str(spacing) + " GI: " + str(
            self.gain_information) + "\n " + str(spacing) + "Branches entropy: " + str(self.branches_entropy)

    def __str__(self):
        return "??" + str(self.column) + "??"

def B(q):
    if q == 1 or q == 0:
        return 0
    return round(-1 * (q * np.log2(q) + (1 - q) * np.log2(1 - q)), 10)


def remainder_util(total_elements, n_k, p_k):
    return round((p_k + n_k) / total_elements * B(p_k / (p_k + n_k)), 10)


def reminder(total_elements, list_of_partitions):
    r = 0
    partitions_entropy = []
    for sub_df in list_of_partitions:
        n_k_and_pk = sub_df.shape[0]
        pk = sub_df[sub_df[Config.TARGET] == Config.YES].shape[0]
        nk = n_k_and_pk - pk
        t = remainder_util(total_elements, nk, pk)
        r += t
        partitions_entropy.append(t)
    return r, partitions_entropy



def gain(n, p, list_of_partitions):
    r, partitions_entropy = reminder(p + n, list_of_partitions)
    return (B(p / (n + p)) - r), partitions_entropy



def find_best_split(df):
    best_gain = 0
    best_question = None
    n_and_p = df.shape[0]
    p = df[df[Config.TARGET] == Config.YES].shape[0]
    n = n_and_p - p

    for feature in Config.headers[:-1]:
        question = Question(feature, df)
        question.gain_information, question.branches_entropy = gain(n, p, question.partition(df))
        if question.gain_information > best_gain:
            best_gain, best_question = question.gain_information, question
    return best_gain, best_question


def leaf_count(df, weight):
#     unique_values_count = np.array(df[Config.TARGET].value_counts())
    w0 = weight['0']
    w1 = weight['1']
    n_vals = df[Config.TARGET].value_counts()[0] * w0
    p_vals = df[Config.TARGET].value_counts()[1] * w1
    unique_values_count = [n_vals, p_vals]
    unique_values_name = df[Config.TARGET].value_counts().index.tolist()
    counts = {}
    for i, name in enumerate(unique_values_name):
        counts[name] = unique_values_count[i]

    return counts


class Leaf:
    def __init__(self, df, weight):
        self.predictions = leaf_count(df, weight)

    def predict(self):
        max_occur = 0
        max_key = None
        for key in self.predictions.keys():
            if self.predictions[key] > max_occur:
                max_occur = self.predictions[key]
                max_key = key
        return max_key


class MiddleNode:
    def __init__(self, question, branches):
        self.question = question
        self.branches = branches


def build_tree(df, max_depth, weight):
    gain, question = find_best_split(df)


    if gain == 0 or max_depth <= 0:
        return Leaf(df, weight)

    partitions = question.partition(df)
    partitions_nodes = [] 
    for partition in partitions:
        temp = build_tree(partition, max_depth - 1, weight)
        partitions_nodes.append(temp)

    return MiddleNode(question, partitions_nodes)



def print_tree(node, spacing1="\t", spacing2="\t\t"):
    if isinstance(node, Leaf):
        print(spacing1, node.predictions)
        return
    print(spacing1 + (node.question.print(spacing1)))
    for i, possible_value in enumerate(node.question.column_possible_values):

        print(spacing1 + '--> :' + str(possible_value))
        print_tree(node.branches[i], spacing1 + spacing2)



def classify(row, node):
    if isinstance(node, Leaf):
        return node.predict()
    branch_index = node.question.match_index(row)
    return classify(row, node.branches[branch_index])


def predict_test_set(df, node):
    predicted_y = []
    for row_index in range(df.shape[0]):
        y = classify(df.iloc[row_index], node)
        predicted_y.append(y)
    return predicted_y



df2 = pd.read_csv('transfusion.data')
df2.columns = ['Recency', 'Frequency', 'Monetary', 'Time', 'tag']
df2 = change_data_set(df2, Config.N)
train, test = split_train_test_data(df2)
train_x, train_y = split_x_y_data(train)
test_x, test_y = split_x_y_data(test)
my_tree = build_tree(df=train, max_depth=2, weight={'1':9, '0':1})
# print_tree(my_tree)

print("TRAIN:")
train_y_pred = predict_test_set(train, my_tree)
train_conf = calc_confusion(y_pred=train_y_pred, y_true=train_y, n=train_y.shape[0])
print("accuracy_score:", accuracy_score(train_conf), "f1_score: ", f1_score(train_conf), "recall_score: ", recall_score(train_conf))

print("TEST:")
test_y_pred = predict_test_set(test, my_tree)
test_conf = calc_confusion(y_pred=test_y_pred, y_true=test_y, n=test_y.shape[0])
print("accuracy_score:", accuracy_score(test_conf), "f1_score: ", f1_score(test_conf), "recall_score: ", recall_score(test_conf))

TRAIN:
accuracy_score: 0.22742474916387959 f1_score:  0.36712328767123287 recall_score:  0.9571428571428572
TEST:
accuracy_score: 0.24666666666666667 f1_score:  0.3957219251336898 recall_score:  0.9736842105263158


<h2>
    AdaBoost
</h2>

In [12]:
class AdaBoost:  
    def __init__(self,samples=10):
        self.samples = samples
        self.models = [None] * samples
        
    def predict(self, df):
        y = 0
        for s in range(self.samples):
            alpha, tree = self.models[s]
            y += alpha * np.array(predict_test_set(df, tree))
        y_ = np.where(np.vectorize(lambda i: 1 if i>= 0 else -1)(y) == 1, 1, -1)
        return y_
        
    def fit(self, df, x, y):    
        N = y.shape[0]
        w = np.array([1/N for i in range(N)])
        for m in range(self.samples):
            my_tree = build_tree(df=df, max_depth=1)
            y_pred = predict_test_set(df, my_tree)
            em = sum([w[i] * (1 if y[i] != y_pred[i] else 0) for i in range(N)]) / sum(w)
            alpha = np.log((1 - em) / em)  
            w = [w[i] * np.exp(alpha * (1 if y[i] != y_pred[i] else 0)) for i in range(N)] 
            self.models[m] = (alpha, my_tree)


df3 = pd.read_csv('transfusion.data')
df3.columns = ['Recency', 'Frequency', 'Monetary', 'Time', 'tag']
df3 = change_data_set(df3, Config.N)
train, test = split_train_test_data(df3)
train_x, train_y = split_x_y_data(train)
test_x, test_y = split_x_y_data(test)
clf = AdaBoost(samples=10)
clf.fit(train, train_x, train_y)

print("TEST: ")
y_test_pred = clf.predict(test)
test_conf = calc_confusion(y_pred=y_test_pred, y_true=test_y, n=test_y.shape[0])
print("accuracy_score: ", accuracy_score(test_conf), " f1_score:",f1_score(test_conf) , " recall_score: ", recall_score(test_conf))

print("TRAIN: ")
y_train_pred = clf.predict(train)
train_conf = calc_confusion(y_pred=y_train_pred, y_true=train_y, n=train_y.shape[0])
print("accuracy_score: ", accuracy_score(train_conf), " f1_score:",f1_score(train_conf) , " recall_score: ", recall_score(train_conf))

TEST: 
accuracy_score:  0.25333333333333335  f1_score: 0.4042553191489362  recall_score:  1.0
TRAIN: 
accuracy_score:  0.23411371237458195  f1_score: 0.3794037940379404  recall_score:  1.0
