In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import LabelEncoder, OneHotEncoder 
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split

In [58]:
missing_columns = ['age', 'workclass', 'fnlwgt', 'education', 'education-num',
                 'marital-status', 'occupation', 'relationship', 'race', 'sex',
                 'capital-gain', 'capital-loss', 'hours-per-week', 'native-country', 'income']

dataset1 = pd.read_csv('datasets/1/WA_Fn-UseC_-Telco-Customer-Churn.csv')

dataset2_train = pd.read_csv('datasets/2/adult.data', header=None, names=missing_columns, na_values=' ?')
dataset2_test = pd.read_csv('datasets/2/adult.test', header=None, names=missing_columns, na_values=' ?')
dataset2_train['income'] = dataset2_train['income'].str.replace('.', '')
dataset2_test['income'] = dataset2_test['income'].str.replace('.', '')
dataset2 = pd.concat([dataset2_train, dataset2_test])

dataset3 = pd.read_csv('datasets/3/creditcard.csv')

np.random.seed(1)

#dataset = dataset1
#dataset = dataset2
#dataset = dataset3

In [71]:
dataset_type = 3


if dataset_type == 1:
    dataset = dataset1
elif dataset_type == 2:
    dataset = dataset2
elif dataset_type == 3:
    dataset = dataset3

In [60]:
# check missing data and data type of columns
print(dataset.info())
print(dataset.isnull().sum())
    

<class 'pandas.core.frame.DataFrame'>
Index: 48843 entries, 0 to 16281
Data columns (total 15 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   age             48843 non-null  object 
 1   workclass       46043 non-null  object 
 2   fnlwgt          48842 non-null  float64
 3   education       48842 non-null  object 
 4   education-num   48842 non-null  float64
 5   marital-status  48842 non-null  object 
 6   occupation      46033 non-null  object 
 7   relationship    48842 non-null  object 
 8   race            48842 non-null  object 
 9   sex             48842 non-null  object 
 10  capital-gain    48842 non-null  float64
 11  capital-loss    48842 non-null  float64
 12  hours-per-week  48842 non-null  float64
 13  native-country  47985 non-null  object 
 14  income          48842 non-null  object 
dtypes: float64(5), object(10)
memory usage: 6.0+ MB
None
age                  0
workclass         2800
fnlwgt               1
educ

In [40]:
# unique value count of each column
for column in dataset.columns:
    print(f"{column}: {dataset[column].nunique()}, datatype: {dataset[column].dtype}")

age: 147, datatype: object
workclass: 8, datatype: object
fnlwgt: 28523, datatype: float64
education: 16, datatype: object
education-num: 16, datatype: float64
marital-status: 7, datatype: object
occupation: 14, datatype: object
relationship: 6, datatype: object
race: 5, datatype: object
sex: 2, datatype: object
capital-gain: 123, datatype: float64
capital-loss: 99, datatype: float64
hours-per-week: 96, datatype: float64
income: 2, datatype: object


In [72]:
if dataset_type != 2:    
    # extract X dataframe keeping column labels intact
    X = dataset.iloc[:, :-1]
    # extract y dataframe keeping column labels intact
    Y = dataset.iloc[:, -1]

    X.head()

In [73]:
#check class distribution of Y
print(Y.value_counts())


Class
0    284315
1       492
Name: count, dtype: int64


### Global preprocess functions

In [43]:
def get_test_train_split(X, Y):
    # test train split

    X_train, X_test, Y_train, Y_test = train_test_split(X, Y, 
                                                        test_size = 0.2, random_state = 0)
    return X_train, X_test, Y_train, Y_test

In [44]:
def global_labelEncode_oneHotEncode(X_df, columns_to_hot_encode, columns_to_label_encode):
    #one hot encode
    one_hot_encoder = OneHotEncoder(sparse_output=False, drop='first', handle_unknown='ignore')
    
    if columns_to_hot_encode is not None:
        X_df_encoded = one_hot_encoder.fit_transform(X_df[columns_to_hot_encode])
        X_df = X_df.drop(columns_to_hot_encode, axis=1)
        
        #print(f"shape of X_df before concat: {X_df.shape}")
        
        X_df = pd.concat([X_df, pd.DataFrame(X_df_encoded)], axis=1)
        
        #print(f"shape of X_df after concat: {X_df.shape}")
    
    
        
    #label encode
    label_encoder = LabelEncoder()
    if columns_to_label_encode is not None:
        for column in columns_to_label_encode:
            X_df[column] = label_encoder.fit_transform(X_df[column])
        
    return X_df

In [45]:
def global_preprocess(X,Y, 
                      columns_to_hot_encode=None, 
                      columns_to_label_encode=None,
                      columns_to_normalize=None,
                      fill_missing_type='mean'):
    
    # one hot encode and label encode before splitting
    X = global_labelEncode_oneHotEncode(X, columns_to_hot_encode, columns_to_label_encode)
    X_df_train, X_df_test, Y_train, Y_test = get_test_train_split(X, Y)
    
    
    # #convert to numeric and fill missing values
    X_df_train = X_df_train.apply(pd.to_numeric, errors='coerce')
    X_df_test = X_df_test.apply(pd.to_numeric, errors='coerce')
    if fill_missing_type == 'zero':
        X_df_train = X_df_train.fillna(0)
        X_df_test = X_df_test.fillna(0)
    else:    
        X_df_train = X_df_train.fillna(X_df_train.mean())
        X_df_test = X_df_test.fillna(X_df_test.mean())
            
    # #normalize
    scaler = MinMaxScaler()
    X_df_train[columns_to_normalize] = scaler.fit_transform(X_df_train[columns_to_normalize])
    X_df_test[columns_to_normalize] = scaler.transform(X_df_test[columns_to_normalize])
    
    return X_df_train, X_df_test, Y_train, Y_test
    

In [46]:
# print the columns of X where data type is object and unique value count is 2
# return list of those column names
def get_binary_columns(X_binary):
    binary_columns = []
    for column in X_binary.columns:
        if X_binary[column].dtype == 'object' and X_binary[column].nunique() == 2:
            binary_columns.append(column)
    return binary_columns

### Dataset specific functions

In [47]:
def preprocess_dataset1(X, Y):
    
    #label encode Y
    labelencoder_Y = LabelEncoder()
    Y = labelencoder_Y.fit_transform(Y)
    
    X_df = X.drop('customerID', axis=1)
    
    X_df['MultipleLines'] = X_df['MultipleLines'].replace('No phone service', 'No')
    internet_columns = ['OnlineSecurity', 'OnlineBackup', 'DeviceProtection', 'TechSupport', 'StreamingTV', 'StreamingMovies']
    
    for i in internet_columns:
        X_df[i] = X_df[i].replace('No internet service', 'No')
        
    columns_to_hot_encode = ['InternetService', 'Contract', 'PaymentMethod']
    # columns_to_label_encode = ['gender', 'Partner', 'Dependents', 'PhoneService', 'MultipleLines', 'OnlineSecurity', 
    #                            'OnlineBackup', 'DeviceProtection', 'TechSupport', 'StreamingTV', 'StreamingMovies', 
    #                            'PaperlessBilling']
    cols = get_binary_columns(X_df)    
    columns_to_normalize = ['tenure', 'MonthlyCharges', 'TotalCharges']
    
    
    
    X_df_train, X_df_test, Y_train, Y_test = global_preprocess(X_df, Y, 
                                                               columns_to_hot_encode=columns_to_hot_encode,
                                                                columns_to_label_encode=cols,
                                                                columns_to_normalize=columns_to_normalize,
                                                                fill_missing_type='mean')

    return X_df_train, X_df_test, Y_train, Y_test
  

In [56]:
def preprocess_dataset2(dataset_train, dataset_test):

    X_train = dataset_train.iloc[:, :-1]
    Y_train = dataset_train.iloc[:, -1]
    X_test = dataset_test.iloc[:, :-1]
    Y_test = dataset_test.iloc[:, -1]

    
    labelencoder_Y = LabelEncoder()
    Y_train = labelencoder_Y.fit_transform(Y_train)
    #Y_train = pd.DataFrame(Y_train)
    Y_test = dataset_test.iloc[:, -1]
    Y_test = labelencoder_Y.fit_transform(Y_test)
    #Y_test = pd.DataFrame(Y_test)
    
    # replace ? values with nan
    targert_columns = ['workclass', 'occupation', 'native-country']   
    X_train[targert_columns] = X_train[targert_columns].replace(' ?', np.nan)
    X_test[targert_columns] = X_test[targert_columns].replace(' ?', np.nan)
    
    
    # list columns that are of type object
    object_columns = X_train.select_dtypes(include=['object']).columns
    columns_to_hot_encode = object_columns
    X_train = global_labelEncode_oneHotEncode(X_train, columns_to_hot_encode, None)
    X_test = global_labelEncode_oneHotEncode(X_test, columns_to_hot_encode, None)
    #print(f"columns_to_hot_encode: {columns_to_hot_encode}")
    
    X_train.columns = X_train.columns.astype(str)
    X_test.columns = X_test.columns.astype(str)
    # list columns that are of type numeric for nomralization
    numeric_columns = X_train.select_dtypes(include=['int64', 'float64']).columns
    columns_to_normalize = numeric_columns
    #print(f"columns_to_normalize: {columns_to_normalize}")
    
    
    
    # #convert to numeric and fill missing values
    X_train = X_train.apply(pd.to_numeric, errors='coerce')
    X_test = X_test.apply(pd.to_numeric, errors='coerce')

    X_train = X_train.fillna(X_train.mean())
    X_test = X_test.fillna(X_test.mean())
            
    # #normalize
    scaler = MinMaxScaler()
    X_train[columns_to_normalize] = scaler.fit_transform(X_train[columns_to_normalize])
    X_test[columns_to_normalize] = scaler.transform(X_test[columns_to_normalize])
    
    return X_train, X_test, Y_train, Y_test
    

In [48]:
def preprocess_dataset3(X, Y):
    
    # list columns that are of type numeric for nomralization
    numeric_columns = X.select_dtypes(include=['int64', 'float64']).columns
    columns_to_normalize = numeric_columns
    
    #randomly select 2000 '0' class samples from Y and all '1' class samples
    concat_df = pd.concat([X, Y], axis=1)
    concat_df_0 = concat_df[concat_df['Class'] == 0]
    concat_df_1 = concat_df[concat_df['Class'] == 1]
    concat_df_0_sample = concat_df_0.sample(n=2000, random_state=0)
    concat_df_1_sample = concat_df_1
    concat_df_sample = pd.concat([concat_df_0_sample, concat_df_1_sample], axis=0)
    X = concat_df_sample.drop('Class', axis=1)
    Y = concat_df_sample['Class']
    
    
    X_train, X_test, Y_train, Y_test = global_preprocess(X, Y, 
                                                        columns_to_hot_encode=None,
                                                        columns_to_label_encode=None,
                                                        columns_to_normalize=columns_to_normalize,
                                                        fill_missing_type='mean')

    
    return X_train, X_test, Y_train, Y_test

In [74]:
if dataset_type == 1:
    X_train, X_test, Y_train, Y_test = preprocess_dataset1(X, Y)
elif dataset_type == 2:
    X_train, X_test, Y_train, Y_test = preprocess_dataset2(dataset2_train, dataset2_test)
elif dataset_type == 3:
    X_train, X_test, Y_train, Y_test = preprocess_dataset3(X, Y)


In [75]:
X_train.head()

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V20,V21,V22,V23,V24,V25,V26,V27,V28,Amount
42549,0.237758,0.766287,0.474491,0.641708,0.775829,0.493591,0.242183,0.544067,0.728048,0.340158,...,0.497793,0.502577,0.565251,0.582999,0.522098,0.432747,0.388567,0.884977,0.577907,0.020566
239781,0.870461,0.91239,0.373862,0.921117,0.252656,0.669774,0.476085,0.731555,0.674641,0.6583,...,0.444579,0.460811,0.570887,0.597741,0.543073,0.510574,0.25837,0.704874,0.543645,0.002681
215915,0.812524,0.914583,0.40065,0.838133,0.263102,0.682637,0.438572,0.718797,0.633072,0.612232,...,0.459246,0.436203,0.583058,0.604928,0.232527,0.495088,0.279943,0.672657,0.517899,0.003494
41272,0.234697,0.874487,0.416546,0.876844,0.351772,0.658422,0.436529,0.720179,0.694391,0.560626,...,0.425019,0.461192,0.53231,0.599975,0.408824,0.54859,0.252547,0.631218,0.499015,0.000233
7945,0.06279,0.96284,0.34969,0.909012,0.293089,0.642469,0.425801,0.71938,0.671096,0.696351,...,0.43905,0.455976,0.525991,0.60439,0.494035,0.564015,0.561738,0.667063,0.509673,0.009091


In [65]:
# check missing data and data type of columns
print(X_test.info())
print(X_test.isnull().sum())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 16282 entries, 0 to 16281
Columns: 104 entries, age to 97
dtypes: float64(104)
memory usage: 12.9 MB
None
age              0
fnlwgt           0
education-num    0
capital-gain     0
capital-loss     0
                ..
93               0
94               0
95               0
96               0
97               0
Length: 104, dtype: int64


In [21]:
#print type and shape of X_train
print(type(Y_train))
print(Y_train.shape)


<class 'pandas.core.frame.DataFrame'>
(32561, 1)


In [22]:
def entropy(labels):
    unique_labels, counts = np.unique(labels, return_counts=True)
    probabilities = counts / len(labels)
    entropy_value = -np.sum(probabilities * np.log2(probabilities))
    return entropy_value

In [23]:
def feature_selection_by_IG(X_train, X_test, Y_train, num_features=-1):
    base_entropy = entropy(Y_train)
    print(f"base entropy: {base_entropy}")
    
    preprocessed_X = X_train.copy()
    
    # calculate entropy for each feature
    entropies = []
    InfoGain = []
    for column in preprocessed_X.columns:
        feat_value, feat_value_counts = np.unique(preprocessed_X[column], return_counts=True)
        weighted_feature_entropy = 0
        
        for value, count in zip(feat_value, feat_value_counts):
            #weighted_feature_entropy += count * entropy(Y_train[preprocessed_X[column] == value])
            weighted_feature_entropy += (count / len(preprocessed_X)) * entropy(Y_train[preprocessed_X[column] == value])
            
            
        entropies.append(weighted_feature_entropy)
        InfoGain.append(base_entropy - weighted_feature_entropy)
        #print(f"entropy for column '{column}': {weighted_feature_entropy} and information gain: {base_entropy - weighted_feature_entropy}" )
        
    # sort by InfoGain
    sorted_indices = np.argsort(InfoGain)[::-1]
    sorted_IG = np.sort(InfoGain)[::-1]
    sorted_columns = preprocessed_X.columns[sorted_indices]
    
    #print(sorted_IG)
    
    if num_features == -1:
        num_features = len(sorted_columns)
        
    # return dataframe with selected features and extract the same features from the test set
    truncated_X_train = pd.DataFrame(preprocessed_X[sorted_columns[:num_features]], columns=sorted_columns[:num_features])
    
    # only select those features from test set which are present in the truncated_X_train
    truncated_X_test = pd.DataFrame(X_test[sorted_columns[:num_features]], columns=sorted_columns[:num_features])
    
    return truncated_X_train, truncated_X_test

        
    

In [76]:
trunc_X_train, trunc_X_test = feature_selection_by_IG(X_train, X_test, Y_train, num_features=-1)

# make Y_train and Y_test as numpy arrays
# Y_train = Y_train.values
# Y_test = Y_test.values

print(f"shape of X_train: {trunc_X_train.shape}")
print(f"shape of X_test: {trunc_X_test.shape}")
print(f"shape of Y_train: {Y_train.shape} and type: {type(Y_train)}")
print(f"shape of Y_test: {Y_test.shape} and type: {type(Y_test)}")

base entropy: 0.7091021926507215
shape of X_train: (1993, 30)
shape of X_test: (499, 30)
shape of Y_train: (1993,) and type: <class 'pandas.core.series.Series'>
shape of Y_test: (499,) and type: <class 'pandas.core.series.Series'>


### Modified Logistic regression that also works as a weak learner

In [69]:
class modified_regressor:
    def __init__(self, X_train, Y_train, 
                threshold=-1, max_feature_count=-1, learning_rate=0.1):
        self.X = X_train
        self.Y = Y_train
        self.X_test = None
        self.Y_test = None
        
        self.y_hat = None
        self.weights = None
        self.max_feature_count = max_feature_count
        self.threshold = threshold
        self.learning_rate = learning_rate
        self.selected_features = None
        
        self.sorted_indices = None
        self.sorted_IG = None
        self.sorted_columns = None
            
    def sigmoid(self, z):
        return 1 / (1 + np.exp(-z))
    
    def binary_cross_entropy(self, y, y_hat):
        return -(y * np.log(y_hat) + (1 - y) * np.log(1 - y_hat))
    
    def gradient_descent(self):
        z = np.dot(self.X, self.weights)
        y_hat = self.sigmoid(z)
        error = y_hat - self.Y
        gradient = np.dot(self.X.T, error) / len(self.Y)
        self.weights -= self.learning_rate * gradient
    
    def train(self):        
        self.weights = np.zeros(self.X.shape[1])
        iteration = 0
        if self.threshold is not None and self.threshold > 0:
            error = float('inf')
            
            
            steps = 1000
            for i in range(steps):
                self.gradient_descent()
                iteration += 1
                z = np.dot(self.X, self.weights)
                y_hat = self.sigmoid(z)
                sum_error = np.sum(self.binary_cross_entropy(self.Y, y_hat))
                error = sum_error / len(self.Y)
                if error <= self.threshold:
                    break
        else:
            steps = 1000
            for i in range(steps):
                iteration += 1
                self.gradient_descent()
                z = np.dot(self.X, self.weights)
                y_hat = self.sigmoid(z)
                sum_error = np.sum(self.binary_cross_entropy(self.Y, y_hat))
                error = sum_error / len(self.Y)
        
        #print(f"total iterations: {iteration}, error: {error}")        
        #return self.weights, error, iteration
        
    def predict(self, X_test):
        self.X_test = X_test
        z = np.dot(self.X_test, self.weights)
        y_hat = np.round(self.sigmoid(z))
        self.y_hat = y_hat
        return y_hat
    
    
    def accuracy(self, Y_test):
        self.Y_test = Y_test
        return (np.sum(self.y_hat == self.Y_test) / len(self.Y_test))*100
                

### Adaboost 

In [66]:
def adaboost(examples_X, examples_Y, L_weak, K, num_features=-1):
    """
    Parameters
    ----------
    
    examples : set of N examples
    L_weak : weak learner (logistic regression)
    K : number of weak learners to use
    
    ------------------
    """
    
    data_point_weights = np.ones(len(examples_X)) / len(examples_X)
    hypothesises = []
    hypo_weights = []
        
    for i in range(K):
        #resampling data
        # resampling data
        indices = np.random.choice(len(examples_X), len(examples_X), p=data_point_weights)
        resampled_examples_X = examples_X.values[indices]
        resampled_examples_Y = examples_Y[indices]

        
        # initialize regressor
        regressor = L_weak(resampled_examples_X, resampled_examples_Y, 0.5, num_features)
        regressor.train()
        y_hat = regressor.predict(examples_X)
        #hypothesises.append(regressor)
        
        error = 0
        for j in range(len(examples_X)):
            if y_hat[j] != examples_Y[j]:
                error += data_point_weights[j]
            
        if error > 0.5:
            continue
        else:
            hypothesises.append(regressor)
        
        for j in range(len(examples_X)):
            if y_hat[j] == examples_Y[j]:
                data_point_weights[j] *= error / (1 - error)
        
        # normalizing weights
        data_point_weights /= np.sum(data_point_weights)
        
        hypo_weights.append(np.log2((1 - error) / (error + 1e-10)))
        
    return hypothesises, hypo_weights

In [344]:
def weighted_majority(hypothesises, hypo_weights, examples_X):
    predictions = []
    
    #normalize hypothesis weights
    hypo_weights /= np.sum(hypo_weights)
    
    y_hats = np.zeros(len(examples_X))
    
    for i in range(len(hypothesises)):
        y_hats += hypo_weights[i] * hypothesises[i].predict(examples_X)
    
    predictions = np.round(y_hats)
    
    
    # for i in range(len(examples_X)):
    #     prediction = 0
    #     for j in range(len(hypothesises)):
    #         prediction += hypo_weights[j] * hypothesises[j].predict(examples_X[i])  
    #     prediction /= len(hypothesises)
    #     predictions.append(np.round(prediction))
    return predictions

### performance metrics

In [30]:
def perf_metrics(pred_Y, true_Y):
    TP = 0
    FN = 0
    FP = 0
    TN = 0
    
    for i in range(len(pred_Y)):
        if pred_Y[i] == true_Y[i] == 1:
            TP += 1
        elif pred_Y[i] == 1 and true_Y[i] == 0:
            FP += 1
        elif pred_Y[i] == 0 and true_Y[i] == 1:
            FN += 1
        else:
            TN += 1
            
    accuracy = (TP + TN) / (TP + TN + FP + FN)
    precision = TP / (TP + FP)
    sensitivity = TP / (TP + FN)
    specificity = TN / (TN + FP)
    false_discovery_rate = FP / (FP + TP)
    F1_score = 2 / ((1 / precision) + (1 / sensitivity))
    
    return accuracy, precision, sensitivity, specificity, false_discovery_rate, F1_score
    
    

In [31]:
def print_stuffs(accuracy, sensitivity, specificity, precision, false_discovery_rate, F1_score):
    print(f"accuracy: {accuracy}")
    print(f"precision: {precision}")
    print(f"sensitivity: {sensitivity}")
    print(f"specificity: {specificity}")
    print(f"false_discovery_rate: {false_discovery_rate}")
    print(f"F1_score: {F1_score}")
    print("\n---------------\n")

### without boosting

In [70]:

# if Y_train is series.series conver to ndarray
if type(Y_train) == pd.core.series.Series:
    Y_train = Y_train.values
if type(Y_test) == pd.core.series.Series:
    Y_test = Y_test.values

regressor = modified_regressor(trunc_X_train, Y_train)
regressor.train()

print(f"\nWithout Boosting:\n")
print(f"Training data")

y_pred = regressor.predict(trunc_X_train)


accuracy, precision, sensitivity, specificity, false_discovery_rate, F1_score = perf_metrics(y_pred, Y_train)
print_stuffs(accuracy, sensitivity, specificity, precision, false_discovery_rate, F1_score)

print(f"Test data")

y_pred = regressor.predict(trunc_X_test)
accuracy, precision, sensitivity, specificity, false_discovery_rate, F1_score = perf_metrics(y_pred, Y_test)
print_stuffs(accuracy, sensitivity, specificity, precision, false_discovery_rate, F1_score)





Without Boosting:

Training data
accuracy: 0.8264181075519793
precision: 0.6994714780390012
sensitivity: 0.48947838285932915
specificity: 0.9332928802588997
false_discovery_rate: 0.3005285219609987
F1_score: 0.5759303721488594

---------------

Test data
accuracy: 0.7755803955288049
precision: 0.628
sensitivity: 0.12246489859594384
specificity: 0.9775651334834352
false_discovery_rate: 0.372
F1_score: 0.20496083550913838

---------------



### With Boosting

In [None]:
hypothesises, hypo_weights = adaboost(trunc_X_train, trunc_X_test, Y_train, Y_test, modified_regressor, 10)

In [None]:
print(f"shape of truncated_X_train: {trunc_X_train.shape}")
print(f"shape of truncated_X_test: {trunc_X_test.shape}")
print(f"type of truncated_X_train: {type(trunc_X_train)}")

In [None]:
## predictions
print(f"Training data")
y_hat = weighted_majority(hypothesises, hypo_weights, trunc_X_train)
accuracy, precision, sensitivity, specificity, false_discovery_rate, F1_score = perf_metrics(y_hat, Y_train)
print_stuffs(accuracy, sensitivity, specificity, precision, false_discovery_rate, F1_score)

print(f"Test data")
y_hat = weighted_majority(hypothesises, hypo_weights, trunc_X_test)
accuracy, precision, sensitivity, specificity, false_discovery_rate, F1_score = perf_metrics(y_hat, Y_test)
print_stuffs(accuracy, sensitivity, specificity, precision, false_discovery_rate, F1_score)

In [268]:
for K in range(5, 30, 5):
    print(f"K: {K}")
    hypothesises, hypo_weights = adaboost(trunc_X_train, Y_train,  
                                          modified_regressor, K)
    ## predictions
    print(f"Training data")
    y_hat = weighted_majority(hypothesises, hypo_weights, trunc_X_train)
    accuracy, precision, sensitivity, specificity, false_discovery_rate, F1_score = perf_metrics(y_hat, Y_train)
    print_stuffs(accuracy, sensitivity, specificity, precision, false_discovery_rate, F1_score)

    print(f"Test data")
    y_hat = weighted_majority(hypothesises, hypo_weights, trunc_X_test)
    accuracy, precision, sensitivity, specificity, false_discovery_rate, F1_score = perf_metrics(y_hat, Y_test)
    print_stuffs(accuracy, sensitivity, specificity, precision, false_discovery_rate, F1_score)


K: 5
Training data
accuracy: 0.9518314099347717
precision: 1.0
sensitivity: 0.7512953367875648
specificity: 1.0
false_discovery_rate: 0.0
F1_score: 0.8579881656804734

---------------

Test data
accuracy: 0.9539078156312625
precision: 1.0
sensitivity: 0.7830188679245284
specificity: 1.0
false_discovery_rate: 0.0
F1_score: 0.8783068783068784

---------------

K: 10
Training data
accuracy: 0.963371801304566
precision: 0.924119241192412
sensitivity: 0.883419689119171
specificity: 0.9825762289981331
false_discovery_rate: 0.07588075880758807
F1_score: 0.9033112582781457

---------------

Test data
accuracy: 0.9478957915831663
precision: 0.8773584905660378
sensitivity: 0.8773584905660378
specificity: 0.9669211195928753
false_discovery_rate: 0.12264150943396226
F1_score: 0.8773584905660379

---------------

K: 15
Training data
accuracy: 0.9678876066231812
precision: 0.9497206703910615
sensitivity: 0.8808290155440415
specificity: 0.9887990043559427
false_discovery_rate: 0.05027932960893855
F1_

In [233]:

hypothesises, hypo_weights = adaboost(trunc_X_train, Y_train,  
                                        modified_regressor, 25)
## predictions
print(f"Training data")
y_hat = weighted_majority(hypothesises, hypo_weights, trunc_X_train)
accuracy, precision, sensitivity, specificity, false_discovery_rate, F1_score = perf_metrics(y_hat, Y_train)
print_stuffs(accuracy, sensitivity, specificity, precision, false_discovery_rate, F1_score)

print(f"Test data")
y_hat = weighted_majority(hypothesises, hypo_weights, trunc_X_test)
accuracy, precision, sensitivity, specificity, false_discovery_rate, F1_score = perf_metrics(y_hat, Y_test)
print_stuffs(accuracy, sensitivity, specificity, precision, false_discovery_rate, F1_score)

Training data
accuracy: 0.8363789926289926
precision: 0.6803636363636364
sensitivity: 0.5991034261927634
specificity: 0.9112210887789112
false_discovery_rate: 0.31963636363636366
F1_score: 0.6371530733866848

---------------

Test data
accuracy: 0.8309534776600644
precision: 0.6708160442600276
sensitivity: 0.6081504702194357
specificity: 0.9032126880845872
false_discovery_rate: 0.32918395573997233
F1_score: 0.637948043406774

---------------

