In [251]:
import operator
import numpy as np
import pandas as pd
import sklearn

# preprocess
from sklearn.preprocessing import MinMaxScaler
from sklearn.decomposition import PCA
from sklearn.feature_selection import f_classif, SelectKBest

# for classifiers
from sklearn import model_selection
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis as QDA
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsRegressor, KNeighborsClassifier
from sklearn import svm
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from sklearn import tree
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import confusion_matrix, make_scorer, accuracy_score, precision_score, recall_score, f1_score 
# for cross_validate
from sklearn.cross_validation import train_test_split,cross_val_score, cross_val_predict
from sklearn.metrics import accuracy_score

# Ignore Warning
import warnings
warnings.filterwarnings('ignore')

# Inline plots
#%matplotlib inline

In [250]:
def chooseDataset(name):
    dataPath = 'Datasets/'
    if name == 'diabetes':
        colNames= ['pregnant', 'glucose', 'bp', 'skin', 'insulin', 'bmi', 'pedigree', 'age', 'label']
        df = pd.read_csv(dataPath + 'PIMA_Indiana_diabetes/pima-indians-diabetes.data.csv', names = colNames)
        missCol = ['glucose','bp','skin','insulin','bmi']
        for col in missCol:
            df[col].replace([0,0.0], np.nan, inplace = True)
                   
#       convert 0 as missing values
    elif name == 'breastCancer':
        colNames = ['sampleCodeNumber', 'clumpThickness', 'uniformityCellSize', 'uniformityCellShape', 'marginalAdhesion', 'singleEpithelialCellSize', 'bareNuclei', 'blandChromatin', 'normalNucleoli', 'mitoses', 'label']
        df = pd.read_csv(dataPath + 'BreastCancer/breast-cancer-wisconsin.data.txt', names = colNames)
        df.replace("?", np.nan, inplace= True)
        df['bareNuclei'] = pd.to_numeric(df['bareNuclei'])
        # making class labels as 0 (Benign) and 1 (Malignant)
        df['label'] = df['label'].replace(2, 0)
        df['label'] = df['label'].replace(4, 1)
        missCol = ['bareNuclei']
        k_knn = 7
#         print pd.unique(df[missCol].values.ravel('K'))
    elif name == 'parkinsons':
        colNames= ['name','MDVP:Fo','MDVP:Fhi','MDVP:Flo','MDVP:Jitter(%)','MDVP:Jitter(Abs)','MDVP:RAP','MDVP:PPQ','Jitter:DDP','MDVP:Shimmer','MDVP:Shimmer(dB)','Shimmer:APQ3','Shimmer:APQ5','MDVP:APQ','Shimmer:DDA','NHR','HNR','label','RPDE','DFA','spread1','spread2','D2','PPE']
        df=pd.read_csv(dataPath+'Parkinsons/parkinsons.data.txt',names=colNames)
        missCol=[]
        #no missing values
        
    elif name == 'BUPA':
        #not given which one is infected
        colNames = ['mcv', 'alkphos', 'sgpt','sgot','gammagt','drinks','label']
        df = pd.read_csv(dataPath + 'BUPA(Liver)/bupa.data.txt',names = colNames )
        df.drop_duplicates(subset=None, keep='first', inplace=True)
        df['Class'] = df['Class'].replace(1, 0)
        df['Class'] = df['Class'].replace(2, 1)
        missCol=[]
        
    elif name == 'Cleveland':
        colNames= ['age','sex','cp','trestbps','chol','fbs','restecg'  ,'thalach', 'exang', 'oldpeak','slope','ca', 'thal','label']
        df = pd.read_csv(dataPath + 'Cleveland(Heart)/processed.cleveland.data.txt',names = colNames)
        df.replace ("-9.0",np.nan,inplace=True)
        missCol = df.columns[df.isna().any()].tolist()
        
    elif name == 'Hepatitis':
        colNames =['label','AGE','SEX','STEROID','ANTIVIRALS','FATIGUE','MALAISE','ANOREXIA','LIVER BIG','LIVER FIRM','SPLEEN PALPABLE','SPIDERS','ASCITES','VARICES','BILIRUBIN','ALK PHOSPHATE','SGOT','ALBUMIN','PROTIME','HISTOLOGY']
        df = pd.read_csv(dataPath + 'Hepatitis/hepatitis.data.txt',names= colNames)
        df.replace("?",np.nan,inplace=True)
        missCol= df.columns[df.isna().any()].tolist()
        
    elif name == 'ILPD':
        colNames =['Age','Gender','TB','DB','Alkphos','SGPT','SGOT','TP','ALB','A/G','label']
        df = pd.read_csv(dataPath + 'ILPD(Liver)/ILPD.csv',names= colNames)
        df['Gender'] = df['Gender'].replace('Male', 0)
        df['Gender'] = df['Gender'].replace('Female', 1)
        missCol=['A/G']
        k_knn = 0
    else:
        print ("NOT FOUND")
        return
    return df, missCol, k_knn
        

In [81]:
df, missCol, k_knn = chooseDataset('breastCancer')
print (df.columns, df.dtypes, df.shape)
print (df.head())
# print df.loc[np.isnan(df['A/G'])]

Index(['sampleCodeNumber', 'clumpThickness', 'uniformityCellSize',
       'uniformityCellShape', 'marginalAdhesion', 'singleEpithelialCellSize',
       'bareNuclei', 'blandChromatin', 'normalNucleoli', 'mitoses', 'label'],
      dtype='object') sampleCodeNumber              int64
clumpThickness                int64
uniformityCellSize            int64
uniformityCellShape           int64
marginalAdhesion              int64
singleEpithelialCellSize      int64
bareNuclei                  float64
blandChromatin                int64
normalNucleoli                int64
mitoses                       int64
label                         int64
dtype: object (699, 11)
   sampleCodeNumber  clumpThickness  uniformityCellSize  uniformityCellShape  \
0           1000025               5                   1                    1   
1           1002945               5                   4                    4   
2           1015425               3                   1                    1   
3           101

In [82]:
# normalizing data
def normalizeData(df):
    scaler = MinMaxScaler(feature_range=(0, 1))
    for i in df.columns:
        null_index = df[i].isnull()
        df.loc[~null_index, [i]] = scaler.fit_transform(df.loc[~null_index, [i]])
    return df

In [83]:
df = normalizeData(df)
print (df.head())

# print pd.unique(df[missCol].values.ravel('K'))

   sampleCodeNumber  clumpThickness  uniformityCellSize  uniformityCellShape  \
0          0.070067        0.444444            0.000000             0.000000   
1          0.070285        0.444444            0.333333             0.333333   
2          0.071217        0.222222            0.000000             0.000000   
3          0.071281        0.555556            0.777778             0.777778   
4          0.071336        0.333333            0.000000             0.000000   

   marginalAdhesion  singleEpithelialCellSize  bareNuclei  blandChromatin  \
0          0.000000                  0.111111    0.000000        0.222222   
1          0.444444                  0.666667    1.000000        0.222222   
2          0.000000                  0.111111    0.111111        0.222222   
3          0.000000                  0.222222    0.333333        0.222222   
4          0.222222                  0.111111    0.000000        0.222222   

   normalNucleoli  mitoses  label  
0        0.000000   

In [84]:
def computeMissing(df,missCol, k = 5):
    # get No Missing Data Rows 
#     print pd.unique(df[missCol].values.ravel('K'))
    if missCol == []:
        return df
    no_missing_df = df.dropna(axis=0, how='any')
#     print(no_missing_df.shape, df.shape)
#     print no_missing_df.head()
    # get Missing Data Rows 
    missing_df = pd.DataFrame(df[~df.isin(no_missing_df).all(1)])
#     print missing_df.head()
    # removed last column
    data = no_missing_df.drop(['label'], axis = 1, inplace = False)
    # Create the knn model.
    y_columns = missCol
    x_columns = data.columns.tolist()
#     print y_columns
    for col in y_columns:
        x_columns.remove(col)
        
    # Look at the five closest neighbors.
    knn = KNeighborsRegressor(n_neighbors=k)
    # Fit the model on the training data.
    knn.fit(no_missing_df[x_columns], no_missing_df[y_columns])
    # Make point predictions on the test set using the fit model.
    predictions = knn.predict(missing_df[x_columns])
    missing_df[y_columns] = predictions
    no_missing_df = no_missing_df.append(missing_df)
    return no_missing_df

In [85]:
df = computeMissing(df, missCol)

In [86]:
print (df.shape)
print (df.head())

(699, 11)
   sampleCodeNumber  clumpThickness  uniformityCellSize  uniformityCellShape  \
0          0.070067        0.444444            0.000000             0.000000   
1          0.070285        0.444444            0.333333             0.333333   
2          0.071217        0.222222            0.000000             0.000000   
3          0.071281        0.555556            0.777778             0.777778   
4          0.071336        0.333333            0.000000             0.000000   

   marginalAdhesion  singleEpithelialCellSize  bareNuclei  blandChromatin  \
0          0.000000                  0.111111    0.000000        0.222222   
1          0.444444                  0.666667    1.000000        0.222222   
2          0.000000                  0.111111    0.111111        0.222222   
3          0.000000                  0.222222    0.333333        0.222222   
4          0.222222                  0.111111    0.000000        0.222222   

   normalNucleoli  mitoses  label  
0        0

In [87]:
def removeOutliers(df):
    maskall = {}
    Q1 = df.quantile(0.25)
    Q3 = df.quantile(0.75)
    
    for col in df.columns[:-1]:
        IQR = Q3[col] - Q1[col]
        mask = df[col].between(Q1[col] - 1.5*IQR, Q3[col] + 1.5*IQR, inclusive=True)
        maskall[col] =mask
        df = df[mask]
#         print df.shape
    return df

In [88]:
df2 = removeOutliers(df)
print (df2.shape)

(491, 11)


In [89]:
label = df['label']
df.drop('label', axis=1, inplace=True)

In [90]:
X, y = df, label
print (X,y)

     sampleCodeNumber  clumpThickness  uniformityCellSize  \
0            0.070067        0.444444            0.000000   
1            0.070285        0.444444            0.333333   
2            0.071217        0.222222            0.000000   
3            0.071281        0.555556            0.777778   
4            0.071336        0.333333            0.000000   
5            0.071344        0.777778            1.000000   
6            0.071417        0.000000            0.000000   
7            0.071451        0.111111            0.000000   
8            0.072535        0.111111            0.000000   
9            0.072535        0.333333            0.111111   
10           0.072700        0.000000            0.000000   
11           0.072766        0.111111            0.000000   
12           0.073187        0.444444            0.222222   
13           0.073351        0.000000            0.000000   
14           0.073393        0.777778            0.666667   
15           0.073622   

In [91]:
for col in df.columns:
    print (sorted(pd.Series(df[col]).unique())[-1])
#     print np.isnan(df[col].any())
#     print np.isfinite(df[col].all()) #and gets True

1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0


In [92]:
def PCA_Compute(X,y, n):
    pca = PCA(n_components= n, svd_solver='full')
    pca.fit(X)
    X = pca.transform(X)
    print(pca.explained_variance_ratio_)
    return X,y

In [93]:
def Fscore_Compute(X,y, n = 5):
    selector = SelectKBest(f_classif, k=n)
    selector.fit(X,y)
    X = selector.transform(X)
    print (selector.scores_, X.shape)
    return X,y

In [94]:
def featureSelection(X,y,k, choice = 0):
    if choice == 1:
        return Fscore_Compute(X,y,k)
    return PCA_Compute(X,y)

In [174]:
X,y = featureSelection(X,y,X.shape[1],1)

[    4.51505927   733.20697841  1408.52721279  1419.30553012   657.79369959
   608.71955539  1417.3134106    933.28729668   717.62804135   152.04023895] (699, 10)


In [248]:
# usage: get_results2(y, predictions, scores.std())
def get_results2(y_true, y_pred, std):
    TN, FP, FN, TP = confusion_matrix(y_true, y_pred).ravel()
    # print(TN, FP, FN, TP)
    sen = (TP / (TP + FN))*100  
    spec = (TN / (TN + FP))*100
    acc = ((TP + TN) / (TP + FP + TN + FN))*100
    prec = TP / (TP + FP)
    recall = TP / (TP + FN) 
    Fscore = ((2*prec*recall)/(prec+recall))*100
    print ("\nAccuracy = " , acc ," (+/-", std, ")","\nSensitivity = ", sen, "\nSpecificity = ", spec , "\nF-measure = " , Fscore)
    return

In [None]:
def get_results(results):
    print("Accuracy: ", results['test_accuracy'].mean()*100)
    print("Sensitivity (Recall): ", results['test_recall'].mean()*100)
    print("Precision: ", results['test_precision'].mean()*100)
    print("F-Measure: ", results['test_f1_score'].mean()*100)

In [223]:
def getQDA(X,y,k=10):
    clf = QDA(priors=None, reg_param=0.0)
    clf.fit(X, y)
    scores = cross_val_score(clf, X, y, cv=k)
    predictions = clf.predict(X)
    return scores, predictions

In [224]:
scores, predictions = getQDA(X,y)
print("Mean = " , scores.mean()*100)
get_results(results)

Mean =  94.7221007203

Accuracy =  95.2789699571  (+/- 0.0296256757468 ) 
Sensitivity =  97.510373444 
Specificity =  94.1048034934 
F-measure =  93.4393638171


In [225]:
def getLR(X,y,k=10):
    regr = LogisticRegressionCV(class_weight='balanced',scoring='roc_auc',n_jobs=10, max_iter=10000, verbose=1,cv=10)
    regr.fit(X, y)
    scores = cross_val_score(regr, X, y, cv=k)
    predictions = regr.predict(X)
#     regr.score(X,y)
    return scores, predictions


In [226]:
# help(KFold)
scores, predictions = getLR(X,y)
print("Mean = " , scores.mean()*100)
get_results(results)

[Parallel(n_jobs=10)]: Done   2 out of  10 | elapsed:    0.2s remaining:    1.0s
[Parallel(n_jobs=10)]: Done  10 out of  10 | elapsed:    0.3s finished
[Parallel(n_jobs=10)]: Done   2 out of  10 | elapsed:    0.3s remaining:    1.0s
[Parallel(n_jobs=10)]: Done  10 out of  10 | elapsed:    0.3s finished
[Parallel(n_jobs=10)]: Done   2 out of  10 | elapsed:    0.3s remaining:    1.0s
[Parallel(n_jobs=10)]: Done  10 out of  10 | elapsed:    0.3s finished
[Parallel(n_jobs=10)]: Done   2 out of  10 | elapsed:    0.2s remaining:    0.8s
[Parallel(n_jobs=10)]: Done  10 out of  10 | elapsed:    0.2s finished
[Parallel(n_jobs=10)]: Done   2 out of  10 | elapsed:    0.1s remaining:    0.5s
[Parallel(n_jobs=10)]: Done  10 out of  10 | elapsed:    0.4s finished
[Parallel(n_jobs=10)]: Done   2 out of  10 | elapsed:    0.2s remaining:    0.9s
[Parallel(n_jobs=10)]: Done  10 out of  10 | elapsed:    0.3s finished
[Parallel(n_jobs=10)]: Done   2 out of  10 | elapsed:    0.1s remaining:    0.4s
[Parall

Mean =  95.8589799668

Accuracy =  97.2818311874  (+/- 0.0288107893611 ) 
Sensitivity =  97.0954356846 
Specificity =  97.3799126638 
F-measure =  96.0985626283


[Parallel(n_jobs=10)]: Done   2 out of  10 | elapsed:    0.2s remaining:    0.8s
[Parallel(n_jobs=10)]: Done  10 out of  10 | elapsed:    0.3s finished


In [227]:
def getSVM(X,y,k=10):
    svc = svm.SVC(C=1, kernel='linear')
    svc.fit(X, y)
    scores = cross_val_score(svc, X, y, cv=k, scoring='precision_macro')
    predictions = svc.predict(X)
    return scores, predictions

In [228]:
scores, predictions = getQDA(X,y)
print("Mean = " , scores.mean()*100)
get_results(results)

Mean =  94.7221007203

Accuracy =  95.2789699571  (+/- 0.0296256757468 ) 
Sensitivity =  97.510373444 
Specificity =  94.1048034934 
F-measure =  93.4393638171


In [229]:
def getKNN(X,y,k_knn,k=10):
    knn = KNeighborsClassifier(n_neighbors = k_knn, algorithm = 'ball_tree', leaf_size=500)
    knn.fit(X, y)
    scores = cross_val_score(knn, X, y, cv=k, scoring='precision_macro')
    predictions = knn.predict(X)
    return scores, predictions

In [103]:
# kr = range(1,40)
# ac = []
# for k in kr:
#     ac.append(getKNN(X,y,k))
# import matplotlib.pyplot as plt
# plt.figure()
# plt.plot(kr,ac)
# plt.show()

In [230]:
scores, predictions = getKNN(X,y,7)
print("Mean = " , scores.mean()*100)
get_results(results)

Mean =  96.5000660709

Accuracy =  97.7110157368  (+/- 0.025214088159 ) 
Sensitivity =  97.510373444 
Specificity =  97.8165938865 
F-measure =  96.70781893


In [231]:
def getNaivebayes(X,y,k=10):    
    clf = GaussianNB()
    clf.fit(X, y)
    scores = cross_val_score(clf, X, y, cv=k)
    predictions = clf.predict(X)
    return scores, predictions

In [232]:
scores, predictions = getNaivebayes(X,y)
print("Mean = " , scores.mean()*100)
get_results(results)

Mean =  95.8609337183

Accuracy =  95.9942775393  (+/- 0.0213516958891 ) 
Sensitivity =  97.510373444 
Specificity =  95.1965065502 
F-measure =  94.3775100402


In [234]:
def getDecisionTree_infogain(X,y,k=10):
    clf = DecisionTreeClassifier(class_weight=None, criterion='entropy', random_state=100, splitter='best')
    clf.fit(X, y)
    scores = cross_val_score(clf, X, y, cv=k)
    predictions = clf.predict(X)
    return scores, predictions

In [237]:
scores, predictions = getDecisionTree_infogain(X,y)
print("Mean = " , scores.mean()*100)
get_results(results)

Mean =  92.4321290059

Accuracy =  100.0  (+/- 0.0333685342286 ) 
Sensitivity =  100.0 
Specificity =  100.0 
F-measure =  100.0


In [238]:
def getDecisionTree_gini(X,y,k=10):
    #X_train, X_test, Y_train, Y_test = train_test_split(X, y, test_size=0.25, random_state=42)
    clf = DecisionTreeClassifier(class_weight=None, criterion='gini',random_state=100, splitter='best')
    clf.fit(X, y)
    scores = cross_val_score(clf, X, y, cv=5)
    predictions = clf.predict(X)
    return scores, predictions

In [239]:
scores, predictions = getDecisionTree_gini(X,y)
print("Mean = " , scores.mean()*100)
get_results(results)

Mean =  91.5640302348

Accuracy =  100.0  (+/- 0.0222636978003 ) 
Sensitivity =  100.0 
Specificity =  100.0 
F-measure =  100.0


In [242]:
def getRandomForest(X,y, numtrees =10,k=10):
    from sklearn.ensemble import RandomForestClassifier
    clf = RandomForestClassifier(n_estimators=10)
    clf.fit(X, y)
    scores = cross_val_score(clf, X, y, cv=10)
    predictions = clf.predict(X)
    return scores, predictions

In [243]:
scores, predictions = getRandomForest(X,y)
print("Mean = " , scores.mean()*100)
get_results(results)

Mean =  96.1467063249

Accuracy =  99.7138769671  (+/- 0.0237727928005 ) 
Sensitivity =  100.0 
Specificity =  99.5633187773 
F-measure =  99.5867768595


In [113]:
def getAdaBoost(X,y, estimators=100):
    from sklearn.ensemble import AdaBoostClassifier
    clf = AdaBoostClassifier(n_estimators=estimators)
    scores = cross_val_score(clf, X, y, cv=5)
    return scores.mean()

In [114]:
from sklearn.neighbors import KNeighborsClassifier
def getBaggingClassifier(X,y, poorclf=KNeighborsClassifier()):
    from sklearn.ensemble import BaggingClassifier
    bagging = BaggingClassifier(poorclf,max_samples=0.5, max_features=0.5)
    scores = cross_val_score(bagging, X, y, cv=5)
    return scores.mean()

In [124]:
def HMV_Layer1():
    clf1 = QDA(priors=None, reg_param=0.0)
    clf2 = LogisticRegression()
    clf3 = GaussianNB()
    clf = VotingClassifier(estimators=[('QDA', clf1), ('LR', clf2), ('NB', clf3)], voting='hard') 
#     clf.fit(X_train, y_train)
#     print
    return clf
    

In [125]:
def HMV_Layer2(clf_prev, k):
    clf1 = KNeighborsClassifier(n_neighbors = k, algorithm = 'ball_tree', leaf_size=500)
    clf2 = svm.SVC(C=1, kernel='linear')
    clf = VotingClassifier(estimators=[('KNN', clf1), ('SVM', clf2), ('LAYER1', clf_prev)], voting='hard') 
#     clf.fit(X_train, y_train)
#     clf.predict(X_test)
    return clf
    

In [126]:
def HMV_Layer3(clf_prev):
    clf1 = DecisionTreeClassifier(class_weight=None, criterion='entropy', random_state=100, splitter='best')
    clf2 = DecisionTreeClassifier(class_weight=None, criterion='gini', random_state=100, splitter='best')
    clf = VotingClassifier(estimators=[('DTIG', clf1), ('DTGI', clf2), ('LAYER2', clf_prev)], voting='hard')
#     clf.fit(X_train, y_train)
    return clf

In [246]:
def run_HMV(X,y, k_knn):
    lay1 = HMV_Layer1()
    lay2 = HMV_Layer2(lay1, k_knn)
    lay3 = HMV_Layer3(lay2)
    scores = cross_val_score(lay3, X, y, cv=10)
    lay3.fit(X, y)
    predictions = lay3.predict(X, y)
    return scores, predictions
    

In [247]:
scores, predictions = run_HMV(X,y,k_knn)
print("Mean = " , scores.mean()*100)
get_results(results)

TypeError: predict() takes 2 positional arguments but 3 were given