In [5]:
import pandas as pd
from sklearn.model_selection import train_test_split
import random
import numpy as np
from sklearn.metrics import f1_score
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn import svm
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer

In [6]:
import warnings
warnings.filterwarnings('ignore')

In [7]:
data_original = pd.read_csv('breast_cancer_data.csv')
data_original = data_original.drop('Unnamed: 32', axis = 1)
data_original = data_original.drop('id', axis = 1)
#shuffling the dataframe
data_original = data_original.sample(frac=1)
data_original.shape

(569, 31)

In [8]:
labels = data_original['diagnosis']
Y = [1.0 if ele == "M" else 0.0 for ele in labels] #convert labels M and B to binary(1 and 0)
data_original = data_original.drop('diagnosis', axis=1)
X = data_original
#Create test dataset
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size = 0.2)

X_train['diagnosis'] = Y_train
train_original = X_train

In [9]:
train_original.shape

(455, 31)

In [10]:
def get_data_with_missing_values(data, portion_to_remove):
    ix = [(row, col) for row in range(data.shape[0]) for col in range(data.shape[1])]
    for row, col in random.sample(ix, int(round(portion*len(ix)))):
        data.iat[row, col] = np.nan
    return data

In [11]:
def remove_missing_data_row(data):
    df = data.copy()
    df = df.dropna()
    return df

In [12]:
def impute_missing_data(data):
    #we generate labels, denoted by Y (diagnosis column) and the rest of the data will be denoted by X
    Y = data['diagnosis']
    data = data.drop('diagnosis', axis=1)#drop the diagnosis column from the dataframe
    imp = IterativeImputer(max_iter=10, random_state=0)
    imp.fit(data)  
    X_transformed = imp.transform(data)
    imputed_data = pd.DataFrame(X_transformed, columns= list(data))
    imputed_data['diagnosis'] = Y
    #to remove rows with unknown labels
    imputed_data = remove_missing_data_row(imputed_data)
    return imputed_data

In [13]:
portions = [0.02, 0.05]
missing_data = {}

for portion in portions:
    temp = train_original.copy()
    missing_data[portion] = get_data_with_missing_values(temp, portion)

In [14]:
removed_data = {}
imputed_data = {}

for portion in portions:
    removed_data[portion] = remove_missing_data_row(missing_data[portion])
    imputed_data[portion] = impute_missing_data(missing_data[portion])

In [15]:
def logreg(X_train,Y_train):
    clf = LogisticRegression().fit(X_train, Y_train)
    Y_predicted = clf.predict(X_test)
    score = f1_score(Y_test, Y_predicted)
    return score

In [16]:
def decisiontree(X_train, Y_train):
    clf = DecisionTreeClassifier(criterion="gini").fit(X_train, Y_train)
    Y_predicted = clf.predict(X_test)
    score = f1_score(Y_test, Y_predicted)
    return score

In [17]:
def randomforest(X_train, Y_train):
    clf = RandomForestClassifier().fit(X_train, Y_train)
    Y_predicted = clf.predict(X_test)
    score = f1_score(Y_test, Y_predicted)
    return score

In [18]:
def mysvm(X_train, Y_train):
    clf = svm.SVC().fit(X_train, Y_train)
    Y_predicted = clf.predict(X_test)
    score = f1_score(Y_test, Y_predicted)
    return score

In [19]:
def get_score_from_data(df, method):
    #we generate labels, denoted by Y (diagnosis column) and the rest of the data will be denoted by X
    Y_train = df['diagnosis']
    df = df.drop('diagnosis', axis=1)#drop the diagnosis column from the dataframe
    df = df.drop('missing',axis=1)
    X_train = df
    
    
    if method=="logreg":
        score = logreg(X_train, Y_train)
    elif method=="decisiontree":
        score = decisiontree(X_train, Y_train)
    elif method=="randomforest":
        score = randomforest(X_train, Y_train)
    elif method=="svm":
        score = mysvm(X_train, Y_train)
    return score

In [20]:
def get_new_score(data, replacement_parameter):
    if replacement_parameter == "remove_missing":
        temp_df = remove_missing_data_row(data)
    elif replacement_parameter == "inpute":
        temp_df = impute_missing_data(data)
    new_score = get_score_from_data(temp_df, method)
    return new_score

In [21]:
def shuffle_missing_labels(data):
    new_df = data.copy()
    new_df['missing'] = np.random.permutation(new_df['missing'].values)
    return new_df

In [22]:
def significance_test_missing_data(portion, replacement_parameter, method):

    portion1 = 0.0
    portion2 = portion
    
    
    if replacement_parameter == "remove_missing":
        data_with_missing_values = removed_data[portion]
    elif replacement_parameter == "impute":
        data_with_missing_values = imputed_data[portion]
    
    data_with_missing_values["missing"] = [portion]*len(data_with_missing_values.index)
    
    data_orig = train_original.copy() #TODO
    data_orig["missing"] = [0.0]*len(data_orig.index)

    
    score_missing_data = get_score_from_data(data_with_missing_values, method)
    orig_score = get_score_from_data(data_orig, method)
    observed_score_diff =  score_missing_data - orig_score
    
    data = pd.concat([data_orig,data_with_missing_values])
        
    count = 0
    num_shuffles = 1000
        
    for i in range(num_shuffles):
#         if(i!=0 and i%20 ==0):
#             print("Done: ", i)
        new_df = shuffle_missing_labels(data)
    
        new_df_portion_1 = new_df[:][new_df.missing == portion1]
        new_df_portion_2 = new_df[:][new_df.missing == portion2]

        
        score_diff = get_score_from_data(new_df_portion_1, method) - get_score_from_data(new_df_portion_2, method)
        
        if observed_score_diff < 0 and score_diff <= observed_score_diff:
            count += 1
        elif observed_score_diff >= 0 and score_diff >= observed_score_diff:
            count += 1
    ######################################
    #
    # Output
    #
    ######################################
    print("**********Removing ", portion*100, "% of data****************")
    
    print("Score on original: ", orig_score)
    print("Score on Missing data: ", score_missing_data)
    print ("Observed difference of two scores: %.2f" % observed_score_diff)
    print (count, "out of", num_shuffles, "experiments had a difference of two scores", end=" ")
    if observed_score_diff < 0:
        print ("less than or equal to", end=" ")
    else:
        print ("greater than or equal to", end=" ")
    print ("%.2f" % observed_score_diff, ".")
    print ("The chance of getting a difference of two scores", end=" ")
    if observed_score_diff < 0:
        print ("less than or equal to", end=" ")
    else:
        print ("greater than or equal to", end=" ")
    print ("%.2f" % observed_score_diff, "is %.4f"%(count / float(num_shuffles)), "\n")

# Approach 1: Remove missing data

In [23]:
for portion in portions:
    significance_test_missing_data(portion, "remove_missing", "logreg")

**********Removing  2.0 % of data****************
Score on original:  0.9500000000000001
Score on Missing data:  0.9500000000000001
Observed difference of two scores: 0.00
721 out of 1000 experiments had a difference of two scores greater than or equal to 0.00 .
The chance of getting a difference of two scores greater than or equal to 0.00 is 0.7210 

**********Removing  5.0 % of data****************
Score on original:  0.9500000000000001
Score on Missing data:  0.8947368421052632
Observed difference of two scores: -0.06
0 out of 1000 experiments had a difference of two scores less than or equal to -0.06 .
The chance of getting a difference of two scores less than or equal to -0.06 is 0.0000 



In [24]:
for portion in portions:
    significance_test_missing_data(portion, "remove_missing", "decisiontree")

**********Removing  2.0 % of data****************
Score on original:  0.926829268292683
Score on Missing data:  0.9024390243902439
Observed difference of two scores: -0.02
249 out of 1000 experiments had a difference of two scores less than or equal to -0.02 .
The chance of getting a difference of two scores less than or equal to -0.02 is 0.2490 

**********Removing  5.0 % of data****************
Score on original:  0.9069767441860465
Score on Missing data:  0.8433734939759037
Observed difference of two scores: -0.06
40 out of 1000 experiments had a difference of two scores less than or equal to -0.06 .
The chance of getting a difference of two scores less than or equal to -0.06 is 0.0400 



In [25]:
for portion in portions:
    significance_test_missing_data(portion, "remove_missing", "randomforest")

**********Removing  2.0 % of data****************
Score on original:  0.975
Score on Missing data:  0.9210526315789475
Observed difference of two scores: -0.05
30 out of 1000 experiments had a difference of two scores less than or equal to -0.05 .
The chance of getting a difference of two scores less than or equal to -0.05 is 0.0300 

**********Removing  5.0 % of data****************
Score on original:  0.9487179487179488
Score on Missing data:  0.8493150684931506
Observed difference of two scores: -0.10
0 out of 1000 experiments had a difference of two scores less than or equal to -0.10 .
The chance of getting a difference of two scores less than or equal to -0.10 is 0.0000 



In [26]:
for portion in portions:
    significance_test_missing_data(portion, "remove_missing", "svm")

**********Removing  2.0 % of data****************
Score on original:  0.0
Score on Missing data:  0.0
Observed difference of two scores: 0.00
1000 out of 1000 experiments had a difference of two scores greater than or equal to 0.00 .
The chance of getting a difference of two scores greater than or equal to 0.00 is 1.0000 

**********Removing  5.0 % of data****************
Score on original:  0.0
Score on Missing data:  0.0
Observed difference of two scores: 0.00
993 out of 1000 experiments had a difference of two scores greater than or equal to 0.00 .
The chance of getting a difference of two scores greater than or equal to 0.00 is 0.9930 



# Approach 2: Imputation of multivariate missing data using sklearn's impute method

In [27]:
for portion in portions:
    significance_test_missing_data(portion, "impute", "logreg")

**********Removing  2.0 % of data****************
Score on original:  0.9500000000000001
Score on Missing data:  0.1276595744680851
Observed difference of two scores: -0.82
0 out of 1000 experiments had a difference of two scores less than or equal to -0.82 .
The chance of getting a difference of two scores less than or equal to -0.82 is 0.0000 

**********Removing  5.0 % of data****************
Score on original:  0.9500000000000001
Score on Missing data:  0.125
Observed difference of two scores: -0.83
0 out of 1000 experiments had a difference of two scores less than or equal to -0.83 .
The chance of getting a difference of two scores less than or equal to -0.83 is 0.0000 



In [28]:
for portion in portions:
    significance_test_missing_data(portion, "impute", "decisiontree")

**********Removing  2.0 % of data****************
Score on original:  0.9024390243902439
Score on Missing data:  0.45
Observed difference of two scores: -0.45
0 out of 1000 experiments had a difference of two scores less than or equal to -0.45 .
The chance of getting a difference of two scores less than or equal to -0.45 is 0.0000 

**********Removing  5.0 % of data****************
Score on original:  0.9156626506024096
Score on Missing data:  0.3181818181818182
Observed difference of two scores: -0.60
0 out of 1000 experiments had a difference of two scores less than or equal to -0.60 .
The chance of getting a difference of two scores less than or equal to -0.60 is 0.0000 



In [29]:
for portion in portions:
    significance_test_missing_data(portion, "impute", "randomforest")

**********Removing  2.0 % of data****************
Score on original:  0.975
Score on Missing data:  0.14492753623188406
Observed difference of two scores: -0.83
0 out of 1000 experiments had a difference of two scores less than or equal to -0.83 .
The chance of getting a difference of two scores less than or equal to -0.83 is 0.0000 

**********Removing  5.0 % of data****************
Score on original:  0.975
Score on Missing data:  0.31746031746031744
Observed difference of two scores: -0.66
0 out of 1000 experiments had a difference of two scores less than or equal to -0.66 .
The chance of getting a difference of two scores less than or equal to -0.66 is 0.0000 



In [30]:
for portion in portions:
    significance_test_missing_data(portion, "impute", "svm")

**********Removing  2.0 % of data****************
Score on original:  0.0
Score on Missing data:  0.0
Observed difference of two scores: 0.00
1000 out of 1000 experiments had a difference of two scores greater than or equal to 0.00 .
The chance of getting a difference of two scores greater than or equal to 0.00 is 1.0000 

**********Removing  5.0 % of data****************
Score on original:  0.0
Score on Missing data:  0.0
Observed difference of two scores: 0.00
1000 out of 1000 experiments had a difference of two scores greater than or equal to 0.00 .
The chance of getting a difference of two scores greater than or equal to 0.00 is 1.0000 

