In this notebook we demonstrate a p-value significance test (a permutation test on the difference of two means) for model selection. Model selection matters because it determines which model works best on your data.

In [1]:
import pandas as pd

In [2]:
# Load the breast cancer dataset and show the names of its columns.
df = pd.read_csv('breast_cancer_data.csv')
list(df.columns)

['id',
 'diagnosis',
 'radius_mean',
 'texture_mean',
 'perimeter_mean',
 'area_mean',
 'smoothness_mean',
 'compactness_mean',
 'concavity_mean',
 'concave points_mean',
 'symmetry_mean',
 'fractal_dimension_mean',
 'radius_se',
 'texture_se',
 'perimeter_se',
 'area_se',
 'smoothness_se',
 'compactness_se',
 'concavity_se',
 'concave points_se',
 'symmetry_se',
 'fractal_dimension_se',
 'radius_worst',
 'texture_worst',
 'perimeter_worst',
 'area_worst',
 'smoothness_worst',
 'compactness_worst',
 'concavity_worst',
 'concave points_worst',
 'symmetry_worst',
 'fractal_dimension_worst',
 'Unnamed: 32']

In [3]:
# Pseudocode:
#
# 1. Measure the difference between the two group means.  The difference in means is measured
#    by ((sum(grpB) / len(grpB)) - (sum(grpA) / len(grpA))) * 100, i.e. in percentage points
#    when the values are 0/1 correctness indicators.
#
# 2. Set a counter to 0, this will count the number of times we get a difference
#    between the means greater than or equal to the original(calculated in step 1).  
#
# 3. Do the following 10,000 times:
#    a. Shuffle the original measurements.  To do this:
#       i. put the values from all the groups into one array but remembering the start and end indexes of each group
#       ii. shuffle the values in the array, effectively reassigning the values to different groups
#    b. Measure the difference between the two group means, just as we did in step (1).
#    c. If the difference from step (3b) is greater than or equal to the difference calculated in step 1, increment our counter 
#       from step (2). Note: if our original difference between the means were a negative value 
#       we would check for values less than or equal to that value.
#
# 4. counter / 10,000 estimates the probability of getting a difference of two means at least as
#    extreme as the observed one (in the same direction), if there is in fact no real difference.

import random

def shuffle(grps):
    """Randomly redistribute the pooled values of all groups.

    Every value from every group in ``grps`` is collected into one flat list,
    that list is shuffled in place with ``random.shuffle``, and it is then cut
    back into consecutive slices whose lengths match the original groups.

    Returns a new list of lists; the groups passed in are not modified.
    """
    # Pool all values into a single flat list.
    pool = [value for grp in grps for value in grp]
    # Mix them up.
    random.shuffle(pool)

    # Hand the values back out in chunks of the original group sizes.
    new_grps = []
    offset = 0
    for grp in grps:
        size = len(grp)
        new_grps.append(pool[offset:offset + size])
        offset += size
    return new_grps

def meandiff(grpA, grpB):
    """Return (mean of grpB - mean of grpA), multiplied by 100.

    A positive result means grpB has the larger mean. Either group being
    empty raises ZeroDivisionError.
    """
    mean_b = sum(grpB) / float(len(grpB))
    mean_a = sum(grpA) / float(len(grpA))
    return (mean_b - mean_a) * 100

def p_test(samples, num_shuffles=10000):
    """Run a one-sided permutation test on the difference of two group means.

    The observed statistic is ``meandiff(samples[0], samples[1])``. The values
    of all groups in ``samples`` are then repeatedly pooled and reshuffled
    (see ``shuffle``), and we count how often the reshuffled statistic is at
    least as extreme as the observed one, in the direction of the observed
    sign (<= for a negative observation, >= otherwise).

    Parameters:
        samples: sequence of groups of numbers; only the first two groups are
            compared.
        num_shuffles: number of random reshuffles to perform (default 10000).

    Returns:
        None. The observed difference, the count and the estimated
        probability (count / num_shuffles) are printed.

    Raises:
        ValueError: if ``num_shuffles`` is not positive.

    NOTE(review): the reshuffle ignores any pairing between the two groups.
    When both groups hold per-example results on the same test set, a paired
    test may be more appropriate -- confirm before drawing conclusions.
    """
    if num_shuffles <= 0:
        raise ValueError("num_shuffles must be a positive integer")

    a = 0
    b = 1
    observed_mean_diff = meandiff(samples[a], samples[b])
    observed_is_negative = observed_mean_diff < 0

    count = 0
    for _ in range(num_shuffles):
        new_samples = shuffle(samples)
        mean_diff = meandiff(new_samples[a], new_samples[b])
        # if the observed difference is negative, look for differences that are smaller
        # if the observed difference is positive, look for differences that are greater
        if observed_is_negative:
            if mean_diff <= observed_mean_diff:
                count += 1
        elif mean_diff >= observed_mean_diff:
            count += 1

    ######################################
    #
    # Output
    #
    ######################################

    # The same wording is used on both report lines. The original code ended
    # one of the "greater than" prints with end=" ", which put a double space
    # into the output; building the phrase once avoids that inconsistency.
    if observed_is_negative:
        direction = "less than or equal to"
    else:
        direction = "greater than or equal to"
    observed_text = "%.2f" % observed_mean_diff

    print("Observed difference of two means: %.2f" % observed_mean_diff)
    print(count, "out of", num_shuffles, "experiments had a difference of two means",
          direction, observed_text, ".")
    print("The chance of getting a difference of two means",
          direction, observed_text, "is", (count / float(num_shuffles)), ".")

As concluded in the last notebook, we only use a subset of the features with the `_mean` suffix. Hence, we remove all the other features.

In [4]:
# Columns dropped before modelling: "Unnamed: 32", "id", every "_se" and
# "_worst" feature, and four of the "_mean" features.
columns_to_remove = [
    "Unnamed: 32", "id",
    "perimeter_mean", "area_mean", "perimeter_se", "area_se",
    "radius_worst", "texture_worst", "perimeter_worst", "area_worst",
    "smoothness_worst", "compactness_worst", "concavity_worst",
    "concave points_mean",
    "symmetry_worst", "fractal_dimension_worst",
    "concavity_mean", "concavity_se",
    "concave points_worst", "concave points_se",
    "radius_se", "texture_se", "smoothness_se", "compactness_se",
    "symmetry_se", "fractal_dimension_se",
]

In [5]:
# Keep only the selected features (plus the diagnosis label).
df = df.drop(columns=columns_to_remove)

In [6]:
# Show the columns that remain after the drop.
list(df.columns)

['diagnosis',
 'radius_mean',
 'texture_mean',
 'smoothness_mean',
 'compactness_mean',
 'symmetry_mean',
 'fractal_dimension_mean']

## The Models

In [7]:
#import the required modules from sklearn
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn import svm
from sklearn.metrics import f1_score
import numpy as np

In [8]:
# Build the binary target Y from the diagnosis column ("M" -> 1, anything else -> 0);
# the remaining columns of df are later used as the feature matrix X.
labels = df['diagnosis']
Y = [int(ele == "M") for ele in labels]
# The target must not stay among the features, so drop it from the dataframe.
df = df.drop(columns='diagnosis')

In [9]:
X = df

# Hold out 20% of the rows as a test set so that every model is scored on
# samples it was not trained on. A fixed random_state makes the split itself
# reproducible, so rerunning the notebook compares the models on the same rows.
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=42)

### Logistic Regression

In [10]:
# Fit logistic regression and predict on both splits.
clf = LogisticRegression().fit(X_train, Y_train)
Y_predicted_lr_train = clf.predict(X_train)
Y_predicted_lr = clf.predict(X_test)

# Report accuracy and F1 (as percentages) on train and test.
print("Train accuracy: ",clf.score(X_train, Y_train)*100)
print("Test accuracy: ",clf.score(X_test, Y_test)*100)
print("Train F-1 score: ", f1_score(Y_train, Y_predicted_lr_train)*100)
print("Test F-1 score: ", f1_score(Y_test, Y_predicted_lr)*100)

# Per-sample correctness on the test set: NOT(prediction XOR truth) is True
# exactly where the prediction matches the label. The 0/1 list feeds p_test.
lr_accuracy = np.logical_not(np.logical_xor(Y_predicted_lr, Y_test))
lr_accuracy_int = [int(is_correct) for is_correct in lr_accuracy]

Train accuracy:  90.10989010989012
Test accuracy:  90.35087719298247
Train F-1 score:  85.43689320388349
Test F-1 score:  84.93150684931507


In [11]:
# Fit a decision tree (Gini criterion) and predict on both splits.
# random_state is fixed so that the fitted tree, and therefore the scores
# below, are reproducible when the notebook is rerun.
clf_gini = DecisionTreeClassifier(criterion="gini", random_state=42).fit(X_train, Y_train)
Y_predicted_dt_train = clf_gini.predict(X_train)
Y_predicted_dt = clf_gini.predict(X_test)

# Report accuracy and F1 (as percentages) on train and test.
print("Train accuracy: ",clf_gini.score(X_train, Y_train)*100)
print("Test accuracy: ",clf_gini.score(X_test, Y_test)*100)
print("Train F-1 score: ", f1_score(Y_train, Y_predicted_dt_train)*100)
print("Test F-1 score: ", f1_score(Y_test, Y_predicted_dt)*100)

# Per-sample correctness on the test set: NOT(prediction XOR truth) is True
# exactly where the prediction matches the label. The 0/1 list feeds p_test.
dt_accuracy = np.logical_not(np.logical_xor(Y_predicted_dt, Y_test))
dt_accuracy_int = [int(is_correct) for is_correct in dt_accuracy]

Train accuracy:  100.0
Test accuracy:  91.22807017543859
Train F-1 score:  100.0
Test F-1 score:  86.8421052631579


In [12]:
# Fit a random forest and predict on both splits.
# random_state is fixed so that the fitted forest, and therefore the scores
# below, are reproducible when the notebook is rerun.
clf_random_forest = RandomForestClassifier(random_state=42).fit(X_train, Y_train)
Y_predicted_rf_train = clf_random_forest.predict(X_train)
Y_predicted_rf = clf_random_forest.predict(X_test)

# Report accuracy and F1 (as percentages) on train and test.
print("Train accuracy: ",clf_random_forest.score(X_train, Y_train)*100)
print("Test accuracy: ",clf_random_forest.score(X_test, Y_test)*100)
print("Train F-1 score: ", f1_score(Y_train, Y_predicted_rf_train)*100)
print("Test F-1 score: ", f1_score(Y_test, Y_predicted_rf)*100)

# Per-sample correctness on the test set: NOT(prediction XOR truth) is True
# exactly where the prediction matches the label. The 0/1 list feeds p_test.
rf_accuracy = np.logical_not(np.logical_xor(Y_predicted_rf, Y_test))
rf_accuracy_int = [int(is_correct) for is_correct in rf_accuracy]

Train accuracy:  100.0
Test accuracy:  93.85964912280701
Train F-1 score:  100.0
Test F-1 score:  91.13924050632912


In [13]:
# Fit a support vector classifier and predict on both splits.
clf_svm = svm.SVC().fit(X_train, Y_train)
Y_predicted_svm_train = clf_svm.predict(X_train)
Y_predicted_svm = clf_svm.predict(X_test)

# Report accuracy and F1 (as percentages) on train and test.
print("Train accuracy: ",clf_svm.score(X_train, Y_train)*100)
print("Test accuracy: ",clf_svm.score(X_test, Y_test)*100)
print("Train F-1 score: ", f1_score(Y_train, Y_predicted_svm_train)*100)
print("Test F-1 score: ", f1_score(Y_test, Y_predicted_svm)*100)

# Per-sample correctness on the test set: NOT(prediction XOR truth) is True
# exactly where the prediction matches the label. The 0/1 list feeds p_test.
svm_accuracy = np.logical_not(np.logical_xor(Y_predicted_svm, Y_test))
svm_accuracy_int = [int(is_correct) for is_correct in svm_accuracy]

Train accuracy:  90.98901098901099
Test accuracy:  88.59649122807018
Train F-1 score:  87.22741433021807
Test F-1 score:  83.9506172839506


#### p_test: Logistic Regression vs Decision Trees(with Gini Index criterion)

In [14]:
# Group A = logistic regression, group B = decision tree (per-sample test correctness).
samples = [lr_accuracy_int, dt_accuracy_int]
p_test(samples)

Observed difference of two means: 0.88
4930 out of 10000 experiments had a difference of two means greater than or equal to  0.88 .
The chance of getting a difference of two means greater than or equal to 0.88 is 0.493 .


#### p_test: Random Forest vs SVM

In [15]:
# Group A = random forest, group B = SVM (per-sample test correctness).
samples = [rf_accuracy_int, svm_accuracy_int]
p_test(samples)

Observed difference of two means: -5.26
1221 out of 10000 experiments had a difference of two means less than or equal to -5.26 .
The chance of getting a difference of two means less than or equal to -5.26 is 0.1221 .


#### p_test: Logistic Regression vs Random Forest

In [16]:
# Group A = logistic regression, group B = random forest (per-sample test correctness).
samples = [lr_accuracy_int, rf_accuracy_int]
p_test(samples)

Observed difference of two means: 3.51
2333 out of 10000 experiments had a difference of two means greater than or equal to  3.51 .
The chance of getting a difference of two means greater than or equal to 3.51 is 0.2333 .


#### p_test: Logistic Regression vs SVM

In [17]:
# Group A = logistic regression, group B = SVM (per-sample test correctness).
samples = [lr_accuracy_int, svm_accuracy_int]
p_test(samples)

Observed difference of two means: -1.75
4157 out of 10000 experiments had a difference of two means less than or equal to -1.75 .
The chance of getting a difference of two means less than or equal to -1.75 is 0.4157 .


#### p_test: Decision Trees vs SVM

In [18]:
# Group A = decision tree, group B = SVM (per-sample test correctness).
samples = [dt_accuracy_int, svm_accuracy_int]
p_test(samples)

Observed difference of two means: -2.63
3256 out of 10000 experiments had a difference of two means less than or equal to -2.63 .
The chance of getting a difference of two means less than or equal to -2.63 is 0.3256 .


#### p_test: Decision Trees vs Random Forests

In [19]:
# Group A = decision tree, group B = random forest (per-sample test correctness).
samples = [dt_accuracy_int, rf_accuracy_int]
p_test(samples)

Observed difference of two means: 2.63
3068 out of 10000 experiments had a difference of two means greater than or equal to  2.63 .
The chance of getting a difference of two means greater than or equal to 2.63 is 0.3068 .
