In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

In [None]:
# Load the raw data set from 'wbdc.csv', resolved relative to the notebook's working directory.
df = pd.read_csv('wbdc.csv')

In [None]:
# Drop the first column by position (presumably a record ID -- TODO confirm against the CSV header).
# NOTE(review): this rebinds `df`, so re-running the cell drops one more column each time.
df = df.iloc[:, 1:]

In [None]:
# Encode the label column (first column after the drop above): 'M' -> 1, 'B' -> 0.
# Assign by column name and cast explicitly: on pandas >= 2.0 `df.iloc[:, 0] = ...` writes
# in place and keeps the original object dtype, which leaves the labels non-numeric.
# `replace` leaves already-encoded values untouched, so re-running the cell is harmless;
# `astype(int)` fails loudly if any value other than 'M'/'B'/1/0 is present.
label_col = df.columns[0]
df[label_col] = df[label_col].replace({'M': 1, 'B': 0}).astype(int)

In [None]:
# Preview the first rows to check the label encoding and the remaining feature columns.
df.head()

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.svm import LinearSVC

In [None]:
from sklearn.model_selection import StratifiedKFold, GridSearchCV
from sklearn import metrics

In [None]:
import warnings
# Silence every warning category for the rest of the session.
# NOTE(review): this also hides any warnings the estimators below may emit -- confirm that is intended.
warnings.filterwarnings("ignore")

In [None]:
from sklearn.utils import shuffle

In [None]:
def train_test_df(df, test_frac=0.20, random_state=None):
    """Shuffle ``df`` and split it into stratified train/test features and labels.

    Rows are shuffled and then separated by the value of the ``'Diagnosis'``
    column (1 or 0). Within each class, the first ``int(test_frac * n_class)``
    shuffled rows go to the test set and the rest to the training set, so both
    sets keep roughly the class balance of ``df``. Rows whose ``'Diagnosis'``
    is neither 1 nor 0 end up in neither set.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a ``'Diagnosis'`` column. The label is returned from
        column position 0 and every other column is treated as a feature, so
        ``'Diagnosis'`` is expected to be the first column.
    test_frac : float, default 0.20
        Fraction of each class placed in the test set (rounded down).
    random_state : int or None, default None
        Seed forwarded to ``DataFrame.sample``; pass an int for a reproducible
        split. ``None`` gives a different split on every call.

    Returns
    -------
    x_train, y_train, x_test, y_test
        Feature frames and label series; class-1 rows precede class-0 rows.
    """
    shuffled = df.sample(frac=1, random_state=random_state)

    train_parts = []
    test_parts = []
    # Class 1 first, then class 0, matching the order of the concatenated output.
    for label in (1, 0):
        class_rows = shuffled[shuffled['Diagnosis'] == label]
        n_test = int(test_frac * len(class_rows))
        test_parts.append(class_rows.iloc[:n_test, :])
        train_parts.append(class_rows.iloc[n_test:, :])

    train_df = pd.concat(train_parts)
    test_df = pd.concat(test_parts)

    x_train = train_df.iloc[:, 1:]
    y_train = train_df.iloc[:, 0]
    x_test = test_df.iloc[:, 1:]
    y_test = test_df.iloc[:, 0]
    return x_train, y_train, x_test, y_test

i. Supervised Learning: Train an L1-penalized SVM to classify the data. Use 5 fold cross validation to choose the penalty parameter. Use normalized data. Report the average accuracy, precision, recall, F-score, and AUC, for both training and test sets over your M runs. Plot the ROC and report the confusion matrix for training and testing in one of the runs.

In [None]:
def roc_curve(model, X_test, y_test):
    """Plot the ROC curve of a fitted binary classifier and show its AUC.

    The curve is built from continuous scores so that it sweeps over many
    thresholds. Hard ``predict`` labels would collapse the curve to a single
    operating point and misstate the area under it, so they are only used as
    a last resort when the model exposes no score.

    Parameters
    ----------
    model : fitted estimator
        Uses ``decision_function`` when available, otherwise the positive-class
        column of ``predict_proba``, otherwise ``predict``.
    X_test : array-like
        Feature matrix to score.
    y_test : array-like
        True binary labels for ``X_test``.

    Returns
    -------
    None
        The figure is displayed with ``plt.show()``.
    """
    if hasattr(model, 'decision_function'):
        y_score = model.decision_function(X_test)
    elif hasattr(model, 'predict_proba'):
        y_score = model.predict_proba(X_test)[:, 1]
    else:
        y_score = model.predict(X_test)

    fpr, tpr, _ = metrics.roc_curve(y_test, y_score)
    auc_ = metrics.auc(fpr, tpr)

    plt.figure(figsize=(8, 5))
    # Diagonal reference line: the ROC of a random classifier.
    plt.plot([0, 1], [0, 1], 'k--')
    plt.plot(fpr, tpr, label=' (area = {:.3f})'.format(auc_))
    plt.xlabel('False Positive rate')
    plt.ylabel('True Positive rate')
    plt.title('ROC Curve')
    plt.legend(loc='lower right')
    plt.show()

In [None]:
def get_c_range(x_train, y_train, min_score=0.9):
    """Find a coarse range of the penalty parameter C worth searching in detail.

    An L1-penalized ``LinearSVC`` is fitted for each of 10 log-spaced values of
    C between 1e-5 and 1e8, and scored on the same data it was trained on. The
    smallest and largest C whose training accuracy exceeds ``min_score`` are
    returned as the bounds for a finer grid search.

    Parameters
    ----------
    x_train : array-like
        Training features.
    y_train : array-like
        Training labels.
    min_score : float, default 0.9
        Training accuracy a candidate C must exceed to count as viable.

    Returns
    -------
    (c_low, c_high) : tuple of float
        Bounds of the viable range. If no candidate exceeds ``min_score`` the
        full candidate range is returned, so the caller still gets a usable
        grid instead of an IndexError.
    """
    candidates = np.logspace(-5, 8, 10)

    scores = []
    for c in candidates:
        svc = LinearSVC(penalty='l1', C=c, dual=False)
        svc.fit(x_train, y_train)
        scores.append(svc.score(x_train, y_train))
    scores = np.array(scores)

    viable = np.flatnonzero(scores > min_score)
    if viable.size == 0:
        # Nothing cleared the threshold: fall back to searching everything.
        return candidates[0], candidates[-1]
    return candidates[viable[0]], candidates[viable[-1]]

In [None]:
def return_results(results):
    """Print the column-wise mean of every per-run metric.

    ``results`` is a DataFrame with one row per run, read by position:
    columns 0-4 are printed as train accuracy, precision, recall, F1 and AUC,
    and columns 5-9 as the same five metrics on the test set.
    """
    train_templates = (
        'Avg. Train Accuracy {}',
        'Avg. Train Precision {}',
        'Avg. Train Recall {}',
        'Avg. Train F-1 score {}',
        'Avg. Train AUC {}',
    )
    # The first test line carries a leading newline to separate the two groups.
    test_templates = (
        '\nTest Accuracy {}',
        'Test Precision {}',
        'Test Recall {}',
        'Test F1 Score {}',
        'Test AUC {}',
    )

    for position, template in enumerate(train_templates + test_templates):
        column_mean = np.mean(results.iloc[:, position])
        print(template.format(column_mean))

In [None]:
from tqdm import tqdm
from math import log10
from sklearn import preprocessing
from sklearn import metrics

In [None]:
# Supervised baseline: repeat a stratified 80/20 split N_SUPERVISED_RUNS times, tune C of an
# L1-penalized linear SVM with 5-fold cross validation on normalized data, and collect
# accuracy, precision, recall, F1 and AUC for training and test. The confusion matrices and
# ROC curves are shown for the last run.
N_SUPERVISED_RUNS = 15
results1 = []

for i in tqdm(range(N_SUPERVISED_RUNS)):
    print('Run Number {}'.format(i))
    each = []
    x_train, y_train, x_test, y_test = train_test_df(df)

    # Normalizer rescales each sample (row) to unit norm.
    scaler = preprocessing.Normalizer()
    x_train_ = scaler.fit_transform(x_train)
    x_test_ = scaler.transform(x_test)

    # Coarse scan for a useful C range, then a finer 20-point log grid inside it.
    c_l, c_h = get_c_range(x_train_, y_train)
    parameters = {'C':np.logspace(log10(c_l), log10(c_h), 20)}

    scoring = ['accuracy', 'precision', 'recall', 'f1', 'roc_auc']

    svc = LinearSVC(penalty='l1', dual=False)
    kf = StratifiedKFold(n_splits=5, shuffle=True)
    clf = GridSearchCV(svc, parameters, cv=kf, scoring=scoring, refit='roc_auc', return_train_score=True)
    clf.fit(x_train_, y_train)

    results = clf.cv_results_
    # Report the training scores of the selected C only. Averaging over the whole grid
    # would mix in every rejected candidate.
    best = clf.best_index_

    each.append(round(results['mean_train_accuracy'][best], 2))
    each.append(round(results['mean_train_precision'][best], 2))
    each.append(round(results['mean_train_recall'][best], 2))
    each.append(round(results['mean_train_f1'][best], 2))
    each.append(round(results['mean_train_roc_auc'][best], 2))

    # Test metrics of the refitted best model.
    sv = clf.best_estimator_
    y_pred = sv.predict(x_test_)
    # AUC needs continuous scores; hard labels give only a single threshold.
    y_score = sv.decision_function(x_test_)

    # Order must match return_results: accuracy, precision, recall, F1, AUC.
    each.append(round(metrics.accuracy_score(y_test, y_pred), 2))
    each.append(round(metrics.precision_score(y_test, y_pred), 2))
    each.append(round(metrics.recall_score(y_test, y_pred), 2))
    each.append(round(metrics.f1_score(y_test, y_pred), 2))
    each.append(round(metrics.roc_auc_score(y_test, y_score), 2))

    print('Avg. Train Accuracy {}'.format(round(results['mean_train_accuracy'][best], 2)))
    print('Test Accuracy {}'.format(round(metrics.accuracy_score(y_test, y_pred), 2)))

    # Detailed report for the final run only.
    if i == N_SUPERVISED_RUNS - 1:
        print('\nFor Run Number {}'.format(i))
        cnf_train = metrics.confusion_matrix(y_train, sv.predict(x_train_))
        print('Train Confusion Matrix')
        print(cnf_train)
        roc_curve(sv, x_train_, y_train)

        cnf_test = metrics.confusion_matrix(y_test, y_pred)
        print('\nTest Confusion Matrix')
        print(cnf_test)
        roc_curve(sv, x_test_, y_test)
    results1.append(each)
    print('\n')

return_results(pd.DataFrame(results1))

ii. Semi-Supervised Learning/ Self-training: select 50% of the positive
class along with 50% of the negative class in the training set as labeled data
and the rest as unlabelled data. You can select them randomly.   
   
A. Train an L1-penalized SVM to classify the labeled data. Use normalized
data. Choose the penalty parameter using 5 fold cross validation.   
   
B. Find the unlabeled data point that is the farthest from the decision boundary
of the SVM. Let the SVM label it (ignore its true label), add it to
the labeled data, and retrain the SVM. Continue this process until all
unlabeled data are used. Test the final SVM on the test data and report the
average accuracy, precision, recall, F-score, and AUC, for both training
and test sets over your M runs. Plot the ROC and report the confusion
matrix for training and testing in one of the runs.

In [None]:
def train_test_split_df(df, test_frac=0.20, random_state=None):
    """Shuffle ``df`` and return stratified train and test DataFrames.

    Rows are shuffled and separated by the ``'Diagnosis'`` column (1 or 0).
    Within each class the first ``int(test_frac * n_class)`` shuffled rows go
    to the test frame and the rest to the training frame. All columns,
    including ``'Diagnosis'``, are kept. Rows whose ``'Diagnosis'`` is neither
    1 nor 0 end up in neither frame.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a ``'Diagnosis'`` column.
    test_frac : float, default 0.20
        Fraction of each class placed in the test frame (rounded down).
    random_state : int or None, default None
        Seed forwarded to ``DataFrame.sample``; pass an int for a reproducible
        split.

    Returns
    -------
    train_df, test_df : pandas.DataFrame
        Class-1 rows precede class-0 rows in both frames.
    """
    shuffled = df.sample(frac=1, random_state=random_state)

    train_parts = []
    test_parts = []
    for label in (1, 0):
        class_rows = shuffled[shuffled['Diagnosis'] == label]
        n_test = int(test_frac * len(class_rows))
        test_parts.append(class_rows.iloc[:n_test, :])
        train_parts.append(class_rows.iloc[n_test:, :])

    train_df = pd.concat(train_parts)
    test_df = pd.concat(test_parts)
    return train_df, test_df

In [None]:
def lab_unlab_split(df, random_state=None):
    """Shuffle ``df`` and split each class in half into labeled and unlabeled sets.

    Rows are shuffled and separated by the ``'Diagnosis'`` column (1 or 0).
    Within each class the first ``int(0.50 * n_class)`` shuffled rows become
    the unlabeled pool and the remaining rows the labeled set, so the labeled
    set gets the extra row when a class has an odd size.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a ``'Diagnosis'`` column. Features and label are read by
        position: the LAST column is returned as the label and every other
        column as a feature, so ``'Diagnosis'`` is expected to be last.
    random_state : int or None, default None
        Seed forwarded to ``DataFrame.sample``; pass an int for a reproducible
        split.

    Returns
    -------
    lab_x, lab_y, unlab_x, unlab_y
        Features and labels of the labeled set, then of the unlabeled pool.
        ``unlab_y`` holds the true labels of the pool.
    """
    shuffled = df.sample(frac=1, random_state=random_state)

    labeled_parts = []
    unlabeled_parts = []
    for label in (1, 0):
        class_rows = shuffled[shuffled['Diagnosis'] == label]
        n_unlabeled = int(0.50 * len(class_rows))
        unlabeled_parts.append(class_rows.iloc[:n_unlabeled, :])
        labeled_parts.append(class_rows.iloc[n_unlabeled:, :])

    labeled_df = pd.concat(labeled_parts)
    unlabeled_df = pd.concat(unlabeled_parts)

    lab_x = labeled_df.iloc[:, :-1]
    lab_y = labeled_df.iloc[:, -1]
    unlab_x = unlabeled_df.iloc[:, :-1]
    unlab_y = unlabeled_df.iloc[:, -1]

    return lab_x, lab_y, unlab_x, unlab_y

In [None]:
def get_farthest_points(svc, x_test_):
    """Return the sample farthest from the SVM decision boundary and its predicted label.

    Parameters
    ----------
    svc : fitted linear classifier
        Must provide ``decision_function`` and ``predict``.
    x_test_ : pandas.DataFrame or array-like
        Candidate samples, one per row.

    Returns
    -------
    row : pandas.Series
        The farthest sample. Its ``name`` is the row's index label in
        ``x_test_``, which the caller can use to remove it from the pool.
    point : numpy.ndarray
        One-element array with the label ``svc`` predicts for that sample.

    Raises
    ------
    ValueError
        If ``x_test_`` has no rows.
    """
    dd = pd.DataFrame(x_test_)
    if len(dd) == 0:
        raise ValueError('x_test_ is empty: no unlabeled point to select')

    # Distance to the hyperplane is |decision_function| / ||w||. The norm is the same
    # for every sample, so the farthest point is the one with the largest absolute
    # score. Skipping the division also avoids 0/0 when the L1 penalty zeroes every
    # coefficient.
    scores = np.asarray(svc.decision_function(x_test_))
    d = int(np.argmax(np.abs(scores)))

    # Predict on a one-row frame so the column layout matches what the model saw.
    point = svc.predict(dd.iloc[[d]])

    return dd.iloc[d, :], point

In [None]:
# Semi-supervised self-training: for each run, split 80/20, normalize, keep half of each
# class of the training set as labeled data and treat the rest as an unlabeled pool. An
# L1-penalized linear SVM (C tuned by 5-fold cross validation) is retrained after moving
# the pool point farthest from its decision boundary, with the SVM's own label, into the
# labeled set, until the pool is empty. Metrics are then reported as in the supervised case.
# Every run refits a full grid search once per unlabeled point, so runs are expensive.
N_SELF_TRAINING_RUNS = 1
results2 = []

for i in tqdm(range(N_SELF_TRAINING_RUNS)):
    print('Run Number {}'.format(i))
    each = []

    # Stratified 80/20 train-test split of the full data set.
    train_d, test_d = train_test_split_df(df)
    feature_cols = df.columns.values[1:]

    x_train_ = train_d.iloc[:, 1:]
    y_train = train_d.iloc[:, 0]
    x_test_ = test_d.iloc[:, 1:]
    y_test = test_d.iloc[:, 0]

    scaler = preprocessing.Normalizer()

    # Fit on the training features only; the test features are only transformed.
    train_data = pd.DataFrame(scaler.fit_transform(x_train_), columns=feature_cols)
    test_data = pd.DataFrame(scaler.transform(x_test_), columns=feature_cols)

    # Keep feature-only copies for evaluation before the label column is appended.
    x_train_norm = train_data.copy()
    x_test_norm = test_data.copy()

    train_data['Diagnosis'] = [int(label) for label in y_train]
    test_data['Diagnosis'] = [int(label) for label in y_test]

    # Integer true labels, row-aligned with the normalized frames above.
    y_train_true = train_data['Diagnosis']
    y_test_true = test_data['Diagnosis']

    # Split the normalized training data 50/50 per class into labeled and unlabeled sets.
    lab_x, lab_y, unlab_x, unlab_y = lab_unlab_split(train_data)

    scoring = ['accuracy', 'precision', 'recall', 'f1', 'roc_auc']

    while True:
        # (Re)train on everything labeled so far.
        c_l, c_h = get_c_range(lab_x, lab_y)
        parameters = {'C':np.logspace(log10(c_l), log10(c_h), 20)}

        svc = LinearSVC(penalty='l1', dual=False)
        kf = StratifiedKFold(n_splits=5, shuffle=True)
        clf = GridSearchCV(svc, parameters, cv=kf, scoring=scoring, refit='roc_auc', return_train_score=True)
        clf.fit(lab_x, lab_y)
        sv = clf.best_estimator_

        # The model fitted after the last pool point was absorbed is the final one.
        if len(unlab_x) == 0:
            break

        # The largest |decision_function| marks the point farthest from the boundary
        # (the weight norm is a common factor and does not change the argmax).
        far_pos = int(np.argmax(np.abs(sv.decision_function(unlab_x))))
        far_row = unlab_x.iloc[[far_pos]]
        far_label = pd.Series(sv.predict(far_row), index=far_row.index, name=lab_y.name)

        # Move the point, with the SVM's own label, from the pool to the labeled set.
        lab_x = pd.concat([lab_x, far_row])
        lab_y = pd.concat([lab_y, far_label])
        unlab_x = unlab_x.drop(index=far_row.index)

    results = clf.cv_results_
    # Training scores of the selected C only, not an average over every grid candidate.
    best = clf.best_index_

    each.append(round(results['mean_train_accuracy'][best], 2))
    each.append(round(results['mean_train_precision'][best], 2))
    each.append(round(results['mean_train_recall'][best], 2))
    each.append(round(results['mean_train_f1'][best], 2))
    each.append(round(results['mean_train_roc_auc'][best], 2))

    # Test metrics of the final model on this run's own normalized test split.
    y_pred = sv.predict(x_test_norm)
    # AUC needs continuous scores; hard labels give only a single threshold.
    y_score = sv.decision_function(x_test_norm)

    # Order must match return_results: accuracy, precision, recall, F1, AUC.
    each.append(round(metrics.accuracy_score(y_test_true, y_pred), 2))
    each.append(round(metrics.precision_score(y_test_true, y_pred), 2))
    each.append(round(metrics.recall_score(y_test_true, y_pred), 2))
    each.append(round(metrics.f1_score(y_test_true, y_pred), 2))
    each.append(round(metrics.roc_auc_score(y_test_true, y_score), 2))

    print('Avg. Train Accuracy {}'.format(round(results['mean_train_accuracy'][best], 2)))
    print('Test Accuracy {}'.format(round(metrics.accuracy_score(y_test_true, y_pred), 2)))

    # Detailed report for the final run only.
    if i == N_SELF_TRAINING_RUNS - 1:
        print('For Run Number {}'.format(i))
        cnf_train = metrics.confusion_matrix(y_train_true, sv.predict(x_train_norm))
        print('Train Confusion Matrix')
        print(cnf_train)
        roc_curve(sv, x_train_norm, y_train_true)

        cnf_test = metrics.confusion_matrix(y_test_true, y_pred)
        print('\nTest Confusion Matrix')
        print(cnf_test)
        roc_curve(sv, x_test_norm, y_test_true)

    results2.append(each)
    print('\n')

return_results(pd.DataFrame(results2))