In [489]:
import operator
import numpy as np
import pandas as pd
import sklearn

# preprocess
from sklearn.preprocessing import MinMaxScaler
from sklearn.decomposition import PCA
from sklearn.feature_selection import f_classif, SelectKBest

# for classifiers
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis as QDA
from sklearn.linear_model import LinearRegression, LogisticRegressionCV
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsRegressor, KNeighborsClassifier
from sklearn import svm
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from sklearn import tree
from sklearn.tree import DecisionTreeClassifier

# for cross_validate
from sklearn.cross_validation import train_test_split,cross_val_score, cross_val_predict
from sklearn.metrics import accuracy_score

# Ignore Warning
import warnings
warnings.filterwarnings('ignore')

# Inline plots
%matplotlib inline

In [490]:
def chooseDataset(name):
    dataPath = 'datasets/'
    if name == 'diabetes':
        colNames= ['pregnant', 'glucose', 'bp', 'skin', 'insulin', 'bmi', 'pedigree', 'age', 'label']
        df = pd.read_csv(dataPath + 'PIMA_Indiana_diabetes/pima-indians-diabetes.data.csv', names = colNames)
        missCol = ['glucose','bp','skin','insulin','bmi']
        for col in missCol:
            df[col].replace([0,0.0], np.nan, inplace = True)
                   
#       convert 0 as missing values
    elif name == 'breastCancer':
        colNames = ['sampleCodeNumber', 'clumpThickness', 'uniformityCellSize', 'uniformityCellShape', 'marginalAdhesion', 'singleEpithelialCellSize', 'bareNuclei', 'blandChromatin', 'normalNucleoli', 'mitoses', 'label']
        df = pd.read_csv(dataPath + 'BreastCancer/breast-cancer-wisconsin.data.txt', names = colNames)
        df.replace("?", np.nan, inplace= True)
        df['bareNuclei'] = pd.to_numeric(df['bareNuclei'])
        # making class labels as 0 (Benign) and 1 (Malignant)
        df['label'] = df['label'].replace(2, 0)
        df['label'] = df['label'].replace(4, 1)
        missCol = ['bareNuclei']
        k_knn = 7
#         print pd.unique(df[missCol].values.ravel('K'))
    elif name == 'parkinsons':
        colNames= ['name','MDVP:Fo','MDVP:Fhi','MDVP:Flo','MDVP:Jitter(%)','MDVP:Jitter(Abs)','MDVP:RAP','MDVP:PPQ','Jitter:DDP','MDVP:Shimmer','MDVP:Shimmer(dB)','Shimmer:APQ3','Shimmer:APQ5','MDVP:APQ','Shimmer:DDA','NHR','HNR','label','RPDE','DFA','spread1','spread2','D2','PPE']
        df=pd.read_csv(dataPath+'Parkinsons/parkinsons.data.txt',names=colNames)
        missCol=[]
        #no missing values
        
    elif name == 'BUPA':
        #not given which one is infected
        colNames = ['mcv', 'alkphos', 'sgpt','sgot','gammagt','drinks','label']
        df = pd.read_csv(dataPath + 'BUPA(Liver)/bupa.data.txt',names = colNames )
        df.drop_duplicates(subset=None, keep='first', inplace=True)
        df['Class'] = df['Class'].replace(1, 0)
        df['Class'] = df['Class'].replace(2, 1)
        missCol=[]
        
    elif name == 'Cleveland':
        colNames= ['age','sex','cp','trestbps','chol','fbs','restecg'  ,'thalach', 'exang', 'oldpeak','slope','ca', 'thal','label']
        df = pd.read_csv(dataPath + 'Cleveland(Heart)/processed.cleveland.data.txt',names = colNames)
        df.replace ("-9.0",np.nan,inplace=True)
        missCol = df.columns[df.isna().any()].tolist()
        
    elif name == 'Hepatitis':
        colNames =['label','AGE','SEX','STEROID','ANTIVIRALS','FATIGUE','MALAISE','ANOREXIA','LIVER BIG','LIVER FIRM','SPLEEN PALPABLE','SPIDERS','ASCITES','VARICES','BILIRUBIN','ALK PHOSPHATE','SGOT','ALBUMIN','PROTIME','HISTOLOGY']
        df = pd.read_csv(dataPath + 'Hepatitis/hepatitis.data.txt',names= colNames)
        df.replace("?",np.nan,inplace=True)
        missCol= df.columns[df.isna().any()].tolist()
        
    elif name == 'ILPD':
        colNames =['Age','Gender','TB','DB','Alkphos','SGPT','SGOT','TP','ALB','A/G','label']
        df = pd.read_csv(dataPath + 'ILPD(Liver)/ILPD.csv',names= colNames)
        df['Gender'] = df['Gender'].replace('Male', 0)
        df['Gender'] = df['Gender'].replace('Female', 1)
        missCol=['A/G']
        k_knn = 0
    else:
        print ("NOT FOUND")
        return
    return df, missCol, k_knn
        

In [491]:
df, missCol, k_knn = chooseDataset('ILPD')
print (df.columns, df.dtypes, df.shape)
print (df.head())
# print df.loc[np.isnan(df['A/G'])]

(Index([u'Age', u'Gender', u'TB', u'DB', u'Alkphos', u'SGPT', u'SGOT', u'TP',
       u'ALB', u'A/G', u'label'],
      dtype='object'), Age          int64
Gender       int64
TB         float64
DB         float64
Alkphos      int64
SGPT         int64
SGOT         int64
TP         float64
ALB        float64
A/G        float64
label        int64
dtype: object, (583, 11))
   Age  Gender    TB   DB  Alkphos  SGPT  SGOT   TP  ALB   A/G  label
0   65       1   0.7  0.1      187    16    18  6.8  3.3  0.90      1
1   62       0  10.9  5.5      699    64   100  7.5  3.2  0.74      1
2   62       0   7.3  4.1      490    60    68  7.0  3.3  0.89      1
3   58       0   1.0  0.4      182    14    20  6.8  3.4  1.00      1
4   72       0   3.9  2.0      195    27    59  7.3  2.4  0.40      1


In [492]:
# normalizing data
def normalizeData(df):
    scaler = MinMaxScaler(feature_range=(0, 1))
    for i in df.columns:
        null_index = df[i].isnull()
        df.loc[~null_index, [i]] = scaler.fit_transform(df.loc[~null_index, [i]])
    return df

In [493]:
df = normalizeData(df)
print (df.head())

# print pd.unique(df[missCol].values.ravel('K'))

        Age  Gender        TB        DB   Alkphos      SGPT      SGOT  \
0  0.709302     1.0  0.004021  0.000000  0.060576  0.003015  0.001626   
1  0.674419     0.0  0.140751  0.275510  0.310699  0.027136  0.018296   
2  0.674419     0.0  0.092493  0.204082  0.208598  0.025126  0.011791   
3  0.627907     0.0  0.008043  0.015306  0.058134  0.002010  0.002033   
4  0.790698     0.0  0.046917  0.096939  0.064485  0.008543  0.009961   

         TP       ALB    A/G  label  
0  0.594203  0.521739  0.240    0.0  
1  0.695652  0.500000  0.176    0.0  
2  0.623188  0.521739  0.236    0.0  
3  0.594203  0.543478  0.280    0.0  
4  0.666667  0.326087  0.040    0.0  


In [494]:
def computeMissing(df,missCol, k = 5):
    # get No Missing Data Rows 
#     print pd.unique(df[missCol].values.ravel('K'))
    if missCol == []:
        return df
    no_missing_df = df.dropna(axis=0, how='any')
#     print(no_missing_df.shape, df.shape)
#     print no_missing_df.head()
    # get Missing Data Rows 
    missing_df = pd.DataFrame(df[~df.isin(no_missing_df).all(1)])
#     print missing_df.head()
    # removed last column
    data = no_missing_df.drop(['label'], axis = 1, inplace = False)
    # Create the knn model.
    y_columns = missCol
    x_columns = data.columns.tolist()
#     print y_columns
    for col in y_columns:
        x_columns.remove(col)
        
    # Look at the five closest neighbors.
    knn = KNeighborsRegressor(n_neighbors=k)
    # Fit the model on the training data.
    knn.fit(no_missing_df[x_columns], no_missing_df[y_columns])
    # Make point predictions on the test set using the fit model.
    predictions = knn.predict(missing_df[x_columns])
    missing_df[y_columns] = predictions
    no_missing_df = no_missing_df.append(missing_df)
    return no_missing_df

In [495]:
df = computeMissing(df, missCol)

In [496]:
print df.shape
print df.head()

(583, 11)
        Age  Gender        TB        DB   Alkphos      SGPT      SGOT  \
0  0.709302     1.0  0.004021  0.000000  0.060576  0.003015  0.001626   
1  0.674419     0.0  0.140751  0.275510  0.310699  0.027136  0.018296   
2  0.674419     0.0  0.092493  0.204082  0.208598  0.025126  0.011791   
3  0.627907     0.0  0.008043  0.015306  0.058134  0.002010  0.002033   
4  0.790698     0.0  0.046917  0.096939  0.064485  0.008543  0.009961   

         TP       ALB    A/G  label  
0  0.594203  0.521739  0.240    0.0  
1  0.695652  0.500000  0.176    0.0  
2  0.623188  0.521739  0.236    0.0  
3  0.594203  0.543478  0.280    0.0  
4  0.666667  0.326087  0.040    0.0  


In [497]:
def removeOutliers(df):
    maskall = {}
    Q1 = df.quantile(0.25)
    Q3 = df.quantile(0.75)
    
    for col in df.columns[:-1]:
        IQR = Q3[col] - Q1[col]
        mask = df[col].between(Q1[col] - 1.5*IQR, Q3[col] + 1.5*IQR, inclusive=True)
        maskall[col] =mask
        df = df[mask]
#         print df.shape
    return df

In [498]:
df2 = removeOutliers(df)
print (df2.shape)

(288, 11)


In [499]:
label = df['label']
df.drop('label', axis=1, inplace=True)

In [500]:
X, y = df, label
print (X,y)

(          Age  Gender        TB        DB   Alkphos      SGPT      SGOT  \
0    0.709302     1.0  0.004021  0.000000  0.060576  0.003015  0.001626   
1    0.674419     0.0  0.140751  0.275510  0.310699  0.027136  0.018296   
2    0.674419     0.0  0.092493  0.204082  0.208598  0.025126  0.011791   
3    0.627907     0.0  0.008043  0.015306  0.058134  0.002010  0.002033   
4    0.790698     0.0  0.046917  0.096939  0.064485  0.008543  0.009961   
5    0.488372     0.0  0.018767  0.030612  0.070835  0.004523  0.000813   
6    0.255814     1.0  0.006702  0.005102  0.044455  0.003015  0.000407   
7    0.290698     1.0  0.006702  0.010204  0.067904  0.002010  0.000203   
8    0.151163     0.0  0.006702  0.010204  0.067904  0.006030  0.001830   
9    0.593023     0.0  0.004021  0.005102  0.110894  0.021608  0.009758   
10   0.616279     0.0  0.002681  0.000000  0.071812  0.020603  0.009961   
11   0.790698     0.0  0.030831  0.061224  0.096238  0.010553  0.009351   
12   0.697674     0.0  0

In [501]:
for col in df.columns:
    print sorted(pd.Series(df[col]).unique())[-1]
#     print np.isnan(df[col].any())
#     print np.isfinite(df[col].all()) #and gets True

1.0
1.0
1.0
1.0000000000000002
1.0
1.0000000000000002
1.0
1.0
1.0000000000000002
0.9999999999999999


In [502]:
def PCA_Compute(X,y, n):
    pca = PCA(n_components= n, svd_solver='full')
    pca.fit(X)
    X = pca.transform(X)
    print(pca.explained_variance_ratio_)
    return X,y

In [503]:
def Fscore_Compute(X,y, n = 5):
    selector = SelectKBest(f_classif, k=n)
    selector.fit(X,y)
    X = selector.transform(X)
    print (selector.scores_, X.shape)
    return X,y

In [504]:
def featureSelection(X,y,k, choice = 0):
    if choice == 1:
        return Fscore_Compute(X,y,k)
    return PCA_Compute(X,y)

In [505]:
X,y = featureSelection(X,y,X.shape[1],1)

(array([11.1714293 ,  3.97336307, 29.60928154, 37.43959214, 20.55843531,
       15.94121994, 13.72863496,  0.71293374, 15.53743097, 16.16743276]), (583, 10))


In [506]:
def getQDA(X,y,k=10):
    clf = QDA(priors=None, reg_param=0.0)
    scores = cross_val_score(clf, X, y, cv=k)
    return scores.mean()

In [507]:
print  (getQDA(X,y))

0.559722948516821


In [525]:
def getLR(X,y,k=10):
    regr = LogisticRegressionCV(class_weight='balanced',scoring='roc_auc',n_jobs=10, max_iter=10000, verbose=1,cv=10)
    scores = cross_val_score(regr, X, y, cv=k)
#     regr.score(X,y)
    return scores.mean()


In [526]:
# help(KFold)
print (getLR(X,y))

[Parallel(n_jobs=10)]: Done   2 out of  10 | elapsed:    0.1s remaining:    0.4s
[Parallel(n_jobs=10)]: Done  10 out of  10 | elapsed:    0.3s finished
[Parallel(n_jobs=10)]: Done   2 out of  10 | elapsed:    0.1s remaining:    0.5s
[Parallel(n_jobs=10)]: Done  10 out of  10 | elapsed:    0.3s finished
[Parallel(n_jobs=10)]: Done   2 out of  10 | elapsed:    0.2s remaining:    0.6s
[Parallel(n_jobs=10)]: Done  10 out of  10 | elapsed:    0.4s finished
[Parallel(n_jobs=10)]: Done   2 out of  10 | elapsed:    0.2s remaining:    0.6s
[Parallel(n_jobs=10)]: Done  10 out of  10 | elapsed:    0.3s finished
[Parallel(n_jobs=10)]: Done   2 out of  10 | elapsed:    0.1s remaining:    0.5s
[Parallel(n_jobs=10)]: Done  10 out of  10 | elapsed:    0.3s finished
[Parallel(n_jobs=10)]: Done   2 out of  10 | elapsed:    0.1s remaining:    0.5s
[Parallel(n_jobs=10)]: Done  10 out of  10 | elapsed:    0.3s finished
[Parallel(n_jobs=10)]: Done   2 out of  10 | elapsed:    0.1s remaining:    0.4s
[Parall

0.6402165554154233


[Parallel(n_jobs=10)]: Done  10 out of  10 | elapsed:    0.4s finished


In [480]:
def getSVM(X,y,k=10):
    svc = svm.SVC(C=1, kernel='linear')
    scores = cross_val_score(svc, X, y, cv=k, scoring='precision_macro')
    return scores.mean()

In [399]:
print (getSVM(X,y))

0.3165188834154352


In [303]:
def getKNN(X,y,k_knn,k=10):
    knn = KNeighborsClassifier(n_neighbors = k_knn, algorithm = 'ball_tree', leaf_size=500)
    scores = cross_val_score(knn, X, y, cv=k, scoring='precision_macro')
    return scores.mean()

In [304]:
# kr = range(1,40)
# ac = []
# for k in kr:
#     ac.append(getKNN(X,y,k))
# import matplotlib.pyplot as plt
# plt.figure()
# plt.plot(kr,ac)
# plt.show()

In [305]:
getKNN(X,y,7)

0.965000660708806

In [306]:
def getNaivebayes(X,y,k=10):    
    clf = GaussianNB()
    clf.fit(X, y)
    scores = cross_val_score(clf, X, y, cv=k)
    return scores.mean()

In [307]:
print (getNaivebayes(X,y))

0.9443043741298756


In [320]:
def getDecisionTreeinfogain(X,y,k=10):
    clf = DecisionTreeClassifier(class_weight=None, criterion='entropy', random_state=100, splitter='best')
    scores = cross_val_score(clf, X, y, cv=k)
    return scores.mean()

In [321]:
print(getDecisionTreeinfogain(X,y))

0.9485478085906746


In [323]:
def getDecisionTreegini(X,y,k=10):
    #X_train, X_test, Y_train, Y_test = train_test_split(X, y, test_size=0.25, random_state=42)
    clf = DecisionTreeClassifier(class_weight=None, criterion='gini',random_state=100, splitter='best')
    scores = cross_val_score(clf, X, y, cv=5)
    return scores.mean()

In [324]:
print (getDecisionTreegini(X,y))

0.935753063202933


In [327]:
def getRandomForest(X,y, numtrees =10,k=10):
    from sklearn.ensemble import RandomForestClassifier
    clf = RandomForestClassifier(n_estimators=10)
    scores = cross_val_score(clf, X, y, cv=10)
    return scores.mean()

In [328]:
print (getRandomForest(X,y))

0.9599976671623948


In [329]:
def getAdaBoost(X,y, estimators=100):
    from sklearn.ensemble import AdaBoostClassifier
    clf = AdaBoostClassifier(n_estimators=estimators)
    scores = cross_val_score(clf, X, y, cv=5)
    return scores.mean()

In [4]:
from sklearn.neighbors import KNeighborsClassifier
def getBaggingClassifier(X,y, poorclf=KNeighborsClassifier()):
    from sklearn.ensemble import BaggingClassifier
    bagging = BaggingClassifier(poorclf,max_samples=0.5, max_features=0.5)
    scores = cross_val_score(bagging, X, y, cv=5)
    return scores.mean()

In [330]:
def HMV_Layer1():
    clf1 = QDA(priors=None, reg_param=0.0)
    clf2 = LogisticRegression(random_state=1)
    clf3 = GaussianNB(random_state=1)
    clf = VotingClassifier(estimators=[('QDA', clf1), ('LR', clf2), ('NB', clf3)], voting='hard') 
#     clf.fit(X_train, y_train)
#     print
    return clf
    

In [331]:
def HMV_Layer2(clf_prev, k):
    clf1 = KNeighborsClassifier(n_neighbors = k, algorithm = 'ball_tree', leaf_size=500)
    clf2 = svm.SVC(C=1, kernel='linear')
    clf = VotingClassifier(estimators=[('KNN', clf1), ('SVM', clf2), ('LAYER1', clf_prev)], voting='hard') 
#     clf.fit(X_train, y_train)
#     clf.predict(X_test)
    return clf
    

In [332]:
def HMV_Layer3(clf_prev):
    clf1 = DecisionTreeClassifier(class_weight=None, criterion='entropy', 
           random_state=100, splitter='best')
    clf2 = DecisionTreeClassifier(class_weight=None, criterion='gini', 
           random_state=100, splitter='best')
    clf = VotingClassifier(estimators=[('DTIG', clf1), ('DTGI', clf2), ('LAYER2', clf_prev)], voting='hard')
#     clf.fit(X_train, y_train)
    return clf

In [333]:
def run_HMV(X,y, k_knn):
    lay1 = HMV_Layer1()
    lay2 = HMV_Layer2(lay1, k_knn)
    lay3 = HMV_Layer3(lay2)
    scores = cross_val_score(lay3, X, y, cv=10)
    return scores.mean()
    

In [226]:
run_HMV(X,y,k_knn)

0.9542827982387078