# importing necessary modules

In [None]:
import math
import numpy as np
import pandas as pd
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from sklearn.utils import shuffle
import matplotlib.pyplot as plt
from sklearn.metrics import confusion_matrix

## Entropy calculation

In [None]:
# https://medium.com/@pytholabs/decision-trees-from-scratch-using-id3-python-coding-it-up-6b79e3458de4
#  https://towardsdatascience.com/entropy-and-information-gain-in-decision-trees-c7db67a3a293
def calc_entropy(dataframe, attribute):
    """Return the Shannon entropy (in bits) of one column of a dataframe.

    The column may hold any hashable values (e.g. the -1/+1 labels used in
    this file, or strings). Missing values are ignored.

    Args:
        dataframe: pandas DataFrame that contains the column.
        attribute: Name of the column whose entropy is computed.

    Returns:
        The entropy ``-sum(p * log2(p))`` over the relative frequencies of the
        distinct values; 0 for an empty or single-valued column.
    """
    # value_counts works for negative integers and non-numeric values, which
    # np.bincount rejects; normalize=True yields probabilities directly.
    probabilities = dataframe[attribute].value_counts(normalize=True).to_numpy(dtype=float)

    # Categorical columns can report unused categories with probability 0;
    # they contribute nothing and would produce log2(0).
    probabilities = probabilities[probabilities > 0]
    if probabilities.size == 0:
        return 0.0

    return float(-np.sum(probabilities * np.log2(probabilities)))

## split dataset function

In [None]:
def split_dataset(dataframe, testsize=0.20):
    """Split a dataframe into train/test features and labels.

    The last column of ``dataframe`` is taken as the label; every other
    column is a feature.

    Args:
        dataframe: DataFrame whose final column holds the label.
        testsize: Value forwarded to ``train_test_split`` as ``test_size``.

    Returns:
        Tuple ``(X_train, X_test, y_train, y_test)``.
    """
    features, labels = dataframe.iloc[:, :-1], dataframe.iloc[:, -1]

    # A fixed random_state keeps the partition identical between runs.
    parts = train_test_split(features, labels, test_size=testsize, random_state=1605084)
    return tuple(parts)

# Preprocessing

## Dataset1

In [None]:
def preprocess_dataset1(filepath):
    """Load and preprocess the customer-churn CSV into train/test frames.

    Steps: drop ``customerID``; encode ``Churn`` as +1 ('Yes') / -1 ('No');
    convert blank strings to NaN; impute missing values (mean for numeric
    features, mode for categorical ones); standardize the numeric features;
    one-hot encode the categorical features; split 80/20 via
    ``split_dataset``.

    Args:
        filepath: Path of the CSV file to read.

    Returns:
        ``(train, test)`` DataFrames with a fresh 0..n-1 index whose last
        column is the ``Churn`` label (+1 / -1).

    Raises:
        ValueError: If ``Churn`` holds a value other than 'Yes', 'No' or blank.
    """
    df = pd.read_csv(filepath)

    # https://stackoverflow.com/questions/13411544/delete-a-column-from-a-pandas-dataframe
    # removing unnecessary features
    df = df.drop(columns='customerID')

    label_name = "Churn"
    non_categorical_features = ['tenure', 'MonthlyCharges', 'TotalCharges']
    # Keep the file's column order (a set difference would make the order of
    # the one-hot columns vary from run to run).
    categorical_features = [column for column in df.columns
                            if column != label_name and column not in non_categorical_features]

    # Assign the result back instead of using inplace=True on a selection:
    # an in-place call on df.loc[...] / df[...] may only modify a temporary copy.
    df[label_name] = df[label_name].replace(to_replace=['Yes', 'No'], value=[1, -1])

    # all spaces are converted to np.nan
    df = df.replace(r'^\s*$', np.nan, regex=True)
    df['TotalCharges'] = df['TotalCharges'].astype(float)

    # SeniorCitizen is stored as 1/0; turn it into Yes/No so it is one-hot
    # encoded like the other categorical features.
    df['SeniorCitizen'] = df['SeniorCitizen'].replace([1, 0], ["Yes", "No"])

    # removing data row with missing label
    df = df.dropna(axis=0, how="any", subset=[label_name]).reset_index(drop=True)
    df[label_name] = df[label_name].astype(np.int64)

    # NOTE(review): imputation and scaling statistics are computed on the full
    # dataset before the train/test split, so test rows influence them.
    for feature in df.columns:
        is_numeric = feature in non_categorical_features

        if df[feature].isnull().any():
            # numeric features are filled with the column mean; categorical
            # ones with the mode (mode() returns a Series, take its first entry)
            fill_value = df[feature].mean() if is_numeric else df[feature].mode()[0]
            df[feature] = df[feature].fillna(fill_value)

        if is_numeric:
            values = df[feature].astype(float).to_numpy().reshape(-1, 1)
            df[feature] = preprocessing.StandardScaler().fit_transform(values).ravel()

    # https://stackabuse.com/one-hot-encoding-in-python-with-pandas-and-scikit-learn/
    # one hot encoding; dtype=float keeps the whole feature matrix numeric
    # (the default indicator dtype is bool in recent pandas versions)
    df = pd.get_dummies(df, prefix=categorical_features, columns=categorical_features, dtype=float)

    # move label column to the last
    df = df[[column for column in df.columns if column != label_name] + [label_name]]

    # splitting dataset to 80% training and 20% test
    X_train, X_test, y_train, y_test = split_dataset(df)

    # Re-attach the labels with concat (builds new frames rather than
    # assigning into slices of df) and fix indexing.
    train = pd.concat([X_train, y_train], axis=1).reset_index(drop=True)
    test = pd.concat([X_test, y_test], axis=1).reset_index(drop=True)

    return train, test


## Dataset 2 training and test set

In [None]:
def preprocess_dataset2(train_filepath, test_filepath):
    """Load and preprocess the separate train and test files of dataset 2.

    Both files are read without a header row using a fixed list of column
    names. Labels are encoded as -1 ('<=50K') / +1 ('>50K'); the test file
    spells them with a trailing dot. Cells made only of whitespace and '?'
    are treated as missing. The two files are concatenated so that
    imputation, scaling and one-hot encoding produce identical columns for
    both parts, then separated again.

    Args:
        train_filepath: Path of the training file.
        test_filepath: Path of the test file.

    Returns:
        ``(train, test)`` DataFrames with a fresh 0..n-1 index whose last
        column is the ``income`` label (+1 / -1).

    Raises:
        ValueError: If a label is neither missing nor one of the expected
            spellings for its file.
    """
    column_names = ['age', 'workclass', 'fnlwgt', 'education', 'education-num', 'marital-status', 'occupation',
                    'relationship', 'race', 'sex', 'capital-gain', 'capital-loss', 'hours-per-week', 'native-country',
                    'income']
    label_name = 'income'
    non_categorical_features = ['age', 'fnlwgt', 'education-num', 'capital-gain', 'capital-loss', 'hours-per-week']
    # Preserve the declared column order (a set difference would make the
    # order of the one-hot columns vary from run to run).
    categorical_features = [column for column in column_names
                            if column != label_name and column not in non_categorical_features]

    train_df = pd.read_csv(train_filepath, names=column_names, index_col=False, skipinitialspace=True)
    test_df = pd.read_csv(test_filepath, names=column_names, index_col=False, skipinitialspace=True)

    # Assign the result back instead of using inplace=True on a selection:
    # an in-place call on df.loc[...] may only modify a temporary copy.
    # for training dataset
    train_df[label_name] = train_df[label_name].replace(to_replace=['<=50K', '>50K'], value=[-1, 1])
    # for test dataset
    test_df[label_name] = test_df[label_name].replace(to_replace=['<=50K.', '>50K.'], value=[-1, 1])

    # Mark missing cells and drop unlabeled rows per file BEFORE concatenating,
    # so the train/test boundary is measured on the rows that are kept.
    cleaned_frames = []
    for frame in (train_df, test_df):
        frame = frame.replace(r'^[\s?]+$', np.nan, regex=True)
        cleaned_frames.append(frame.dropna(axis=0, how="any", subset=[label_name]))
    train_len = cleaned_frames[0].shape[0]

    df = pd.concat(cleaned_frames, ignore_index=True)
    df[label_name] = df[label_name].astype(np.int64)

    # NOTE(review): imputation and scaling statistics are computed on train
    # and test rows together, so test rows influence them.
    for feature in df.columns:
        is_numeric = feature in non_categorical_features

        if df[feature].isnull().any():
            # numeric features are filled with the column mean; categorical
            # ones with the mode (mode() returns a Series, take its first entry)
            fill_value = df[feature].mean() if is_numeric else df[feature].mode()[0]
            df[feature] = df[feature].fillna(fill_value)

        if is_numeric:
            values = df[feature].astype(float).to_numpy().reshape(-1, 1)
            df[feature] = preprocessing.StandardScaler().fit_transform(values).ravel()

    # https://stackabuse.com/one-hot-encoding-in-python-with-pandas-and-scikit-learn/
    # one hot encoding; dtype=float keeps the whole feature matrix numeric
    # (the default indicator dtype is bool in recent pandas versions)
    df = pd.get_dummies(df, prefix=categorical_features, columns=categorical_features, dtype=float)

    # move label column to the last
    df = df[[column for column in df.columns if column != label_name] + [label_name]]

    # splitting dataset back into its two source files and fixing the index
    X_train = df.iloc[:train_len, :].reset_index(drop=True)
    X_test = df.iloc[train_len:, :].reset_index(drop=True)

    return X_train, X_test

## Dataset3 full

In [None]:
def preprocess_dataset3(filepath):
    """Load and preprocess the full dataset-3 CSV into train/test frames.

    Steps: drop the ``Time`` column; drop rows without a ``Class`` label;
    recode label 0 as -1; impute missing feature values with the column
    mean; standardize every feature; split 80/20 via ``split_dataset``
    (which treats the last column as the label).

    Args:
        filepath: Path of the CSV file to read.

    Returns:
        ``(train, test)`` DataFrames with a fresh 0..n-1 index; the label is
        re-attached under the name ``Class``.
    """
    df = pd.read_csv(filepath)
    df = df.drop(columns='Time')
    label_name = 'Class'
    # every remaining column except the label is a numeric feature
    non_categorical_features = [column for column in df.columns if column != label_name]

    # removing data row with missing label
    df = df.dropna(axis=0, how="any", subset=[label_name]).reset_index(drop=True)

    # Assign the result back instead of using inplace=True on a selection:
    # an in-place call on df.loc[...] may only modify a temporary copy.
    df[label_name] = df[label_name].replace(to_replace=[0], value=[-1])

    # NOTE(review): imputation and scaling statistics are computed on the full
    # dataset before the train/test split, so test rows influence them.
    for feature in df.columns:
        is_numeric = feature in non_categorical_features

        if df[feature].isnull().any():
            # numeric features are filled with the column mean; anything else
            # with the mode (mode() returns a Series, take its first entry)
            fill_value = df[feature].mean() if is_numeric else df[feature].mode()[0]
            df[feature] = df[feature].fillna(fill_value)

        if is_numeric:
            values = df[feature].astype(float).to_numpy().reshape(-1, 1)
            df[feature] = preprocessing.StandardScaler().fit_transform(values).ravel()

    # splitting dataset to 80% training and 20% test
    X_train, X_test, y_train, y_test = split_dataset(df)

    # Re-attach the labels with concat (builds new frames rather than
    # assigning into slices of df) and fix indexing.
    train = pd.concat([X_train, y_train.rename(label_name)], axis=1).reset_index(drop=True)
    test = pd.concat([X_test, y_test.rename(label_name)], axis=1).reset_index(drop=True)

    return train, test

## Dataset3 with all positive samples and a variable number of negative samples

In [None]:
def preprocess_dataset3_sample(filepath, num_neg_samples=20000):
    """Build a reduced version of dataset 3 and split it into train/test.

    Keeps every positive row (``Class == 1``) and at most ``num_neg_samples``
    shuffled negative rows, shuffles the combination, writes it to
    ``dataset3/creditcard_sample.csv`` and splits it 80/20 via
    ``split_dataset`` (which treats the last column as the label).

    Args:
        filepath: Path of the CSV file to read.
        num_neg_samples: Maximum number of negative rows to keep.

    Returns:
        ``(train, test)`` DataFrames with a fresh 0..n-1 index; the label is
        re-attached under the name ``Class``.

    Side effects:
        Writes the sampled dataset to ``dataset3/creditcard_sample.csv``.
    """
    df = pd.read_csv(filepath)
    df = df.drop(columns='Time')
    label_name = 'Class'

    # removing data row with missing label
    df = df.dropna(axis=0, how="any", subset=[label_name]).reset_index(drop=True)

    # Assign the result back instead of using inplace=True on a selection:
    # if the in-place call only touched a temporary copy, the "== -1" filter
    # below would select no rows at all.
    df[label_name] = df[label_name].replace(to_replace=[0], value=[-1])

    # https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.loc.html
    positive_samples = df.loc[df[label_name] == 1]  # take all
    # take the first num_neg_samples negatives after a seeded shuffle
    negative_samples = shuffle(df[df[label_name] == -1], random_state=84)[:num_neg_samples]

    df = shuffle(pd.concat([positive_samples, negative_samples]), random_state=84).reset_index(drop=True)

    # NOTE(review): unlike preprocess_dataset3, no imputation or scaling is
    # applied here -- confirm that this is intended.
    df.to_csv('dataset3/creditcard_sample.csv', index=False, header=True)

    # splitting dataset to 80% training and 20% test
    X_train, X_test, y_train, y_test = split_dataset(df)

    # Re-attach the labels with concat (builds new frames rather than
    # assigning into slices of df) and fix indexing.
    train = pd.concat([X_train, y_train.rename(label_name)], axis=1).reset_index(drop=True)
    test = pd.concat([X_test, y_test.rename(label_name)], axis=1).reset_index(drop=True)

    return train, test

## Dataset4 (given online)

In [None]:
def preprocess_dataset4(filepath):
    """Load and preprocess the semicolon-separated dataset-4 CSV.

    Steps: encode ``y`` as +1 ('yes') / -1 ('no'); drop rows without a label;
    impute missing values (mean for numeric features, mode for categorical
    ones), reporting each affected column on stdout; standardize the numeric
    features; one-hot encode the categorical features; split 90/10 via
    ``split_dataset``.

    Args:
        filepath: Path of the CSV file to read.

    Returns:
        ``(train, test)`` DataFrames with a fresh 0..n-1 index whose last
        column is the ``y`` label (+1 / -1).

    Raises:
        ValueError: If ``y`` holds a value other than 'yes' or 'no'.
    """
    df = pd.read_csv(filepath, sep=";", index_col=False, skipinitialspace=True)

    column_names = ["age", "job", "marital", "education", "default", "housing", "loan", "contact", "month",
                    "day_of_week", "duration", "campaign", "pdays", "previous", "poutcome", "emp.var.rate",
                    "cons.price.idx", "cons.conf.idx", "euribor3m", "nr.employed", "y"]

    label_name = 'y'

    non_categorical_features = ['age', 'duration', 'campaign', 'pdays', 'previous', 'emp.var.rate', 'cons.price.idx',
                                "cons.conf.idx", "euribor3m", "nr.employed"]
    # Preserve the declared column order (a set difference would make the
    # order of the one-hot columns vary from run to run).
    categorical_features = [column for column in column_names
                            if column != label_name and column not in non_categorical_features]

    # Assign the result back instead of using inplace=True on a selection:
    # an in-place call on df.loc[...] may only modify a temporary copy.
    df[label_name] = df[label_name].replace(to_replace=['yes', 'no'], value=[1, -1])

    # removing data row with missing label
    df = df.dropna(axis=0, how="any", subset=[label_name]).reset_index(drop=True)
    df[label_name] = df[label_name].astype(np.int64)

    # NOTE(review): imputation and scaling statistics are computed on the full
    # dataset before the train/test split, so test rows influence them.
    for feature in df.columns:
        is_numeric = feature in non_categorical_features

        missing_count = df[feature].isnull().sum()
        if missing_count != 0:
            print("Column: ", feature, " has missing values. count: ", missing_count)
            # numeric features are filled with the column mean; categorical
            # ones with the mode (mode() returns a Series, take its first entry)
            fill_value = df[feature].mean() if is_numeric else df[feature].mode()[0]
            df[feature] = df[feature].fillna(fill_value)

        if is_numeric:
            values = df[feature].astype(float).to_numpy().reshape(-1, 1)
            df[feature] = preprocessing.StandardScaler().fit_transform(values).ravel()

    # one hot encoding; dtype=float keeps the whole feature matrix numeric
    # (the default indicator dtype is bool in recent pandas versions)
    df = pd.get_dummies(df, prefix=categorical_features, columns=categorical_features, dtype=float)

    # move label column to the last
    df = df[[column for column in df.columns if column != label_name] + [label_name]]

    # splitting dataset to 90% training and 10% test
    X_train, X_test, y_train, y_test = split_dataset(df, testsize=0.1)

    # Re-attach the labels with concat (builds new frames rather than
    # assigning into slices of df) and fix indexing.
    train = pd.concat([X_train, y_train], axis=1).reset_index(drop=True)
    test = pd.concat([X_test, y_test], axis=1).reset_index(drop=True)

    return train, test

# Logistic Regression Implementation

In [None]:
# https://towardsdatascience.com/logistic-regression-from-scratch-in-python-ec66603592e2

def train_loss(y, y_hat):
    """Return the mean squared error between targets and predictions.

    Args:
        y: Array of true/target values.
        y_hat: Array of predictions, broadcast-compatible with ``y``.

    Returns:
        The mean of the element-wise squared differences.
    """
    residual = y - y_hat
    return np.mean(residual * residual)


def gradient_descent(X, y, y_hat):
    """Return the weight-update direction for the tanh hypothesis.

    Computes ``X.T @ ((y - y_hat) * (1 - y_hat**2)) / m`` where ``m`` is the
    number of rows of ``X``. The caller adds ``lr`` times this value to the
    weights.

    Args:
        X: Input matrix, one row per example.
        y: True/target values.
        y_hat: Hypothesis/predictions for the rows of ``X``.

    Returns:
        Array with one entry per column of ``X``.
    """
    residual = y - y_hat
    # (1 - y_hat^2) is the derivative of tanh expressed through its output
    tanh_slope = 1 - y_hat * y_hat
    return X.T @ (residual * tanh_slope) / X.shape[0]


def logistic_regression_train(data, epochs, lr, threshold=0.0):
    """Fit a tanh-activated linear classifier by iterative weight updates.

    A bias column of ones is prepended to the features, weights start at
    zero, and each epoch adds ``lr * gradient_descent(X, y, y_hat)`` to the
    weights. Training stops early once the mean squared error of an epoch's
    predictions falls below ``threshold``.

    Args:
        data: DataFrame whose last column is the target and whose other
            columns are the features.
        epochs: Maximum number of iterations.
        lr: Learning rate.
        threshold: Early-stopping bound on the training loss; the default of
            0.0 never triggers because the loss is non-negative.

    Returns:
        ``(w, y_hat)``: the ``(n_features + 1, 1)`` weight matrix (bias
        first) and the ``(m, 1)`` predictions computed at the start of the
        last executed epoch, i.e. before that epoch's weight update. With
        ``epochs == 0`` these are the predictions of the zero weights.
    """
    # Cast explicitly to float: a frame mixing bool/int/float columns would
    # otherwise give an object-dtype array that np.tanh cannot process.
    features = data.iloc[:, :-1].to_numpy(dtype=float)
    # m -> number of training examples, n -> number of features (+1 for bias)
    X = np.column_stack((np.ones(len(features)), features))
    m, n = X.shape

    y = data.iloc[:, -1].to_numpy(dtype=float).reshape(m, 1)

    # Initializing weights
    w = np.zeros((n, 1))

    # Defined up front so the function also returns a value when epochs == 0.
    y_hat = np.tanh(np.matmul(X, w))

    # Training loop.
    for _ in range(epochs):
        # Calculating hypothesis/prediction.
        y_hat = np.tanh(np.matmul(X, w))

        # Updating the parameters along the direction returned by gradient_descent.
        w += lr * gradient_descent(X, y, y_hat)

        # The loss is measured on the predictions made before this update.
        if train_loss(y, y_hat) < threshold:
            break

    # returning weights, predictions(hyp)
    return w, y_hat


def predict(data, weight_matrix):
    """Predict -1/+1 labels for every row of ``data``.

    Args:
        data: DataFrame whose last column is the label (ignored here) and
            whose other columns are the features.
        weight_matrix: Weights with the bias weight first, as returned by
            ``logistic_regression_train``.

    Returns:
        1-D integer array holding -1 where the tanh hypothesis is negative
        and +1 otherwise.
    """
    # Cast explicitly to float: a frame mixing bool/int/float columns would
    # otherwise give an object-dtype array that np.tanh cannot process.
    features = data.iloc[:, :-1].to_numpy(dtype=float)
    X = np.column_stack((np.ones(len(features)), features))

    # Calculating predictions/y_hat.
    predictions = np.tanh(np.matmul(X, weight_matrix))

    # Vectorised sign decision; ravel gives one label per row whether the
    # weights were passed as a column matrix or a flat vector.
    return np.where(predictions < 0, -1, 1).ravel()


def performance_measure(data, y_hat, show_all_measures=True):
    """Print accuracy and, optionally, further binary-classification metrics.

    The label value 1 is the positive class; any other value counts as
    negative. Metrics whose denominator is zero are printed as ``nan``.

    Args:
        data: DataFrame whose last column holds the true labels.
        y_hat: 1-D array of predicted labels, one per row of ``data``.
        show_all_measures: When true, also print recall, specificity,
            precision, false discovery rate and F1 score.

    Returns:
        None; all results are written to stdout.
    """
    y = data.iloc[:, -1]
    acc = np.sum(y == y_hat) / len(y)
    print("Accuracy: ", acc * 100, "%")

    if not show_all_measures:
        return

    def ratio(numerator, denominator):
        # An undefined metric (empty denominator) is reported as NaN.
        return numerator / denominator if denominator else float("nan")

    # Count the confusion-matrix cells directly so that data containing a
    # single class still yields all four counts.
    actual_positive = np.asarray(y) == 1
    predicted_positive = np.asarray(y_hat) == 1
    tp = np.sum(actual_positive & predicted_positive)
    tn = np.sum(~actual_positive & ~predicted_positive)
    fp = np.sum(~actual_positive & predicted_positive)
    fn = np.sum(actual_positive & ~predicted_positive)

    TPR = ratio(tp, tp + fn)  # recall, hit rate
    TNR = ratio(tn, tn + fp)
    precision = ratio(tp, tp + fp)
    FDR = ratio(fp, fp + tp)
    f1 = ratio(2 * precision * TPR, precision + TPR)

    print(f"True positive rate(sensitivity, recall, hit rate): {TPR * 100}%")
    print(f"True negative rate (specificity): {TNR * 100}%")
    print(f"Positive predictive value (precision): {precision * 100}%")
    print(f"False discovery rate: {FDR * 100}%")
    print(f"F1 score: {f1 * 100}%\n")


# Adaboost implementation

In [None]:
def adaboost(examples, K):
    """Train an AdaBoost ensemble of tanh-based weak learners.

    Each round draws a weighted bootstrap sample of ``examples``, trains a
    weak learner on it with ``logistic_regression_train`` and measures its
    weighted error on the full set. Learners with an error above 0.5 are
    discarded; otherwise the weights of correctly classified examples are
    scaled by ``error / (1 - error)``, all weights are re-normalised and the
    learner is stored together with its vote weight
    ``log2((1 - error) / (error + eps))``.

    Args:
        examples: DataFrame of N labelled examples; the last column holds
            the label.
        K: Number of boosting rounds (upper bound on the ensemble size).

    Returns:
        ``(h, z)``: the accepted weight matrices and their vote weights.
        Both lists have the same length, which is less than ``K`` when
        learners were discarded.
    """
    # sample_size = N
    sample_size = examples.shape[0]
    epsilon = np.finfo(float).eps  # to avoid division by 0

    # w: a vector of N example weights, initially 1/N
    # h: accepted hypotheses (weight matrices)
    # z: hypothesis vote weights, aligned with h
    w = np.full(sample_size, 1.0 / sample_size)
    h = []
    z = []

    # Positional label array, so the comparison below does not depend on the
    # index of `examples`.
    real_label_values = examples.iloc[:, -1].to_numpy()

    for _ in range(K):
        data = examples.sample(n=sample_size, weights=w, replace=True, random_state=84).reset_index(drop=True)
        weight, _hypothesis = logistic_regression_train(data, epochs=100, lr=0.5, threshold=0.5)

        pred = predict(examples, weight_matrix=weight)
        correct = pred == real_label_values

        # weighted error = total weight of the misclassified examples
        error = float(w[~correct].sum())

        if error > 0.5:
            # Worse than chance: discard the learner. It must not be stored
            # in h either, otherwise h and z would fall out of alignment.
            continue

        if error > 0:
            w[correct] *= error / (1 - error)
            # normalize weights
            w /= w.sum()
        # With error == 0 every weight would be scaled by the same factor, so
        # normalisation would restore the current weights; skipping the
        # update avoids dividing by a zero sum.

        h.append(weight)
        z.append(math.log((1 - error) / (error + epsilon), 2))

    return h, z


def adaboost_accuracy(dataset, hyp_vectors, z_values):
    """Print the accuracy of a weighted-majority ensemble on ``dataset``.

    Args:
        dataset: DataFrame whose last column holds the true labels.
        hyp_vectors: Weight matrices of the ensemble members.
        z_values: Vote weight of each member, paired with ``hyp_vectors``.

    Returns:
        None; the accuracy is printed by ``performance_measure``.
    """
    # Accumulate the weighted -1/+1 votes of every hypothesis.
    votes = np.zeros(dataset.shape[0])
    for weight_matrix, vote_weight in zip(hyp_vectors, z_values):
        votes = votes + predict(data=dataset, weight_matrix=weight_matrix) * vote_weight

    # A negative total means class -1; zero or positive means class +1.
    ensemble_labels = np.where(votes < 0, -1, 1)
    performance_measure(dataset, ensemble_labels, show_all_measures=False)

# Download the datasets from the following links, as they are too large to upload to GitHub

  <b>link for dataset1: https://www.kaggle.com/blastchar/telco-customer-churn</b>
  <b>link for dataset2: https://archive.ics.uci.edu/ml/datasets/adult </b>
  <b>link for dataset3: https://www.kaggle.com/mlg-ulb/creditcardfraud </b>
  <b>link for dataset4: https://archive.ics.uci.edu/ml/datasets/Bank+Marketing </b>

# Specify datasets' file paths

In [None]:
# Relative paths of the dataset files read by the preprocess_dataset* functions.
# Dataset 2 has separate training and test files.
dataset_1_file_path = "dataset1/WA_Fn-UseC_-Telco-Customer-Churn.csv"
dataset_2_train_file_path = "dataset2/adult.data"
dataset_2_test_file_path = "dataset2/adult.test"
dataset_3_file_path = "dataset3/creditcard.csv"
dataset_4_file_path = "dataset4/bank-additional-full.csv"

# Preprocess data and get the train and test set

### get dataset1 train and test set

In [None]:
# Every dataset cell assigns X_train/X_test, so the cell executed last decides
# which dataset the later cells operate on.
X_train, X_test = preprocess_dataset1(dataset_1_file_path)

### get dataset2 train and test set

In [None]:
# Dataset 2 comes with its own train and test files; overwrites X_train/X_test.
X_train, X_test = preprocess_dataset2(dataset_2_train_file_path, dataset_2_test_file_path)

### get dataset3(full) train and test set

In [None]:
# Full dataset 3; overwrites X_train/X_test.
X_train, X_test = preprocess_dataset3(dataset_3_file_path)

### get dataset3(portion) train and test set

In [None]:
# Reduced dataset 3: all positive rows plus at most 10000 negative rows;
# overwrites X_train/X_test.
X_train, X_test = preprocess_dataset3_sample(dataset_3_file_path, num_neg_samples=10000)

### get dataset4 train and test set

In [None]:
# Dataset 4; overwrites X_train/X_test.
X_train, X_test = preprocess_dataset4(dataset_4_file_path)

# Run Logistic Regression

In [None]:
# Train one classifier on the training split, then report the full set of
# metrics on both splits using the same weights.
print("Logistic Regression Result:")
weight, predictions = logistic_regression_train(data=X_train, epochs=1000, lr=0.05)

for split_title, split_frame in (("Training set:", X_train), ("Test set:", X_test)):
    print(split_title)
    p = predict(data=split_frame, weight_matrix=weight)
    performance_measure(data=split_frame, y_hat=p)

# Run adaboost

In [None]:
# For each ensemble size, train AdaBoost on the training split only and
# report the accuracy of that same ensemble on both splits.
print("\n\nAdaboost Result:")
K_values = [5, 10, 15, 20]
for k in K_values:
    print("\nK :", k)
    print("Training set:")
    h, z = adaboost(X_train, k)
    adaboost_accuracy(X_train, h, z)
    print()

    # Reuse the ensemble trained on X_train: fitting a second ensemble on
    # X_test would report training accuracy on the test rows instead of
    # held-out performance.
    print("Test set: ")
    adaboost_accuracy(X_test, h, z)