In [208]:
import numpy as np
import pandas as pd
import sklearn
from sklearn.preprocessing import MinMaxScaler

from sklearn.decomposition import PCA
from sklearn.feature_selection import f_classif, SelectKBest
from sklearn.neighbors import KNeighborsRegressor

from sklearn.cross_validation import train_test_split

In [227]:
def chooseDataset(name, dataPath='datasets/'):
    """Load one of the supported datasets and list its columns with missing values.

    Parameters
    ----------
    name : str
        One of 'diabetes', 'breastCancer', 'parkinsons', 'BUPA', 'Cleveland',
        'Hepatitis' or 'ILPD'.
    dataPath : str, optional
        Directory prefix that holds the per-dataset sub-folders. It is joined to
        the file names by plain string concatenation, so it must end with a
        path separator.

    Returns
    -------
    (df, missCol) : tuple of (pandas.DataFrame, list of str)
        The loaded frame, with the target column named 'Class', and the names
        of the columns that contain missing values.
        For an unknown ``name`` the function prints "NOT FOUND" and returns
        None, so unpacking the result will fail in that case.
    """
    if name == 'diabetes':
        colNames = ['pregnant', 'glucose', 'bp', 'skin', 'insulin', 'bmi', 'pedigree', 'age', 'Class']
        df = pd.read_csv(dataPath + 'PIMA_Indiana_diabetes/pima-indians-diabetes.data.csv', names=colNames)
        missCol = ['glucose', 'bp', 'skin', 'insulin', 'bmi']
        # Zeros in these columns are treated as missing values. Assigning the
        # result back (instead of an in-place replace on a column selection)
        # guarantees the frame itself is updated.
        df[missCol] = df[missCol].replace(0, np.nan)
    elif name == 'breastCancer':
        colNames = ['sampleCodeNumber', 'clumpThickness', 'uniformityCellSize', 'uniformityCellShape', 'marginalAdhesion', 'singleEpithelialCellSize', 'bareNuclei', 'blandChromatin', 'normalNucleoli', 'mitoses', 'Class']
        df = pd.read_csv(dataPath + 'BreastCancer/breast-cancer-wisconsin.data.txt', names=colNames)
        df = df.replace("?", np.nan)
        # The "?" markers force the column to be read as strings; make it numeric.
        df['bareNuclei'] = pd.to_numeric(df['bareNuclei'])
        # making class labels as 0 (Benign) and 1 (Malignant)
        df['Class'] = df['Class'].replace({2: 0, 4: 1})
        missCol = ['bareNuclei']
    elif name == 'parkinsons':
        colNames = ['name', 'MDVP:Fo', 'MDVP:Fhi', 'MDVP:Flo', 'MDVP:Jitter(%)', 'MDVP:Jitter(Abs)', 'MDVP:RAP', 'MDVP:PPQ', 'Jitter:DDP', 'MDVP:Shimmer', 'MDVP:Shimmer(dB)', 'Shimmer:APQ3', 'Shimmer:APQ5', 'MDVP:APQ', 'Shimmer:DDA', 'NHR', 'HNR', 'Class', 'RPDE', 'DFA', 'spread1', 'spread2', 'D2', 'PPE']
        df = pd.read_csv(dataPath + 'Parkinsons/parkinsons.data.txt', names=colNames)
        # no missing values
        missCol = []
    elif name == 'BUPA':
        # not given which one is infected
        colNames = ['mcv', 'alkphos', 'sgpt', 'sgot', 'gammagt', 'drinks', 'Class']
        df = pd.read_csv(dataPath + 'BUPA(Liver)/bupa.data.txt', names=colNames)
        df = df.drop_duplicates(subset=None, keep='first')
        # Recode the labels 1/2 as 0/1.
        df['Class'] = df['Class'].replace({1: 0, 2: 1})
        missCol = []
    elif name == 'Cleveland':
        colNames = ['age', 'sex', 'cp', 'trestbps', 'chol', 'fbs', 'restecg', 'thalach', 'exang', 'oldpeak', 'slope', 'ca', 'thal', 'Class']
        df = pd.read_csv(dataPath + 'Cleveland(Heart)/processed.cleveland.data.txt', names=colNames)
        # NOTE(review): only the string "-9.0" is treated as missing here; the
        # processed Cleveland file may mark gaps with "?" instead - confirm.
        df = df.replace("-9.0", np.nan)
        missCol = df.columns[df.isna().any()].tolist()
    elif name == 'Hepatitis':
        colNames = ['Class', 'AGE', 'SEX', 'STEROID', 'ANTIVIRALS', 'FATIGUE', 'MALAISE', 'ANOREXIA', 'LIVER BIG', 'LIVER FIRM', 'SPLEEN PALPABLE', 'SPIDERS', 'ASCITES', 'VARICES', 'BILIRUBIN', 'ALK PHOSPHATE', 'SGOT', 'ALBUMIN', 'PROTIME', 'HISTOLOGY']
        df = pd.read_csv(dataPath + 'Hepatitis/hepatitis.data.txt', names=colNames)
        df = df.replace("?", np.nan)
        missCol = df.columns[df.isna().any()].tolist()
    elif name == 'ILPD':
        colNames = ['Age', 'Gender', 'TB', 'DB', 'Alkphos', 'SGPT', 'SGOT', 'TP', 'ALB', 'A/G', 'Class']
        df = pd.read_csv(dataPath + 'ILPD(Liver)/ILPD.csv', names=colNames)
        missCol = []
    else:
        # Single-argument call form prints the same text on Python 2 and 3.
        print("NOT FOUND")
        return None
    return df, missCol
        

In [214]:
# Load the breast-cancer data and inspect its schema before any preprocessing.
df, missCol = chooseDataset('breastCancer')
# One single-argument print per item keeps the cell valid on Python 2 and 3.
print(df.columns)
print(df.dtypes)
print(df.shape)
print(df.head())

Index([u'sampleCodeNumber', u'clumpThickness', u'uniformityCellSize',
       u'uniformityCellShape', u'marginalAdhesion',
       u'singleEpithelialCellSize', u'bareNuclei', u'blandChromatin',
       u'normalNucleoli', u'mitoses', u'label'],
      dtype='object') sampleCodeNumber              int64
clumpThickness                int64
uniformityCellSize            int64
uniformityCellShape           int64
marginalAdhesion              int64
singleEpithelialCellSize      int64
bareNuclei                  float64
blandChromatin                int64
normalNucleoli                int64
mitoses                       int64
label                         int64
dtype: object (699, 11)
   sampleCodeNumber  clumpThickness  uniformityCellSize  uniformityCellShape  \
0           1000025               5                   1                    1   
1           1002945               5                   4                    4   
2           1015425               3                   1                    1 

In [210]:
# normalizing data
def normalizeData(df):
    """Min-max scale each column of ``df`` to the range [0, 1].

    The scaler is refitted for every column using only that column's non-null
    entries; null entries are left as they are. The frame is modified in
    place and the same object is returned.
    """
    minmax = MinMaxScaler(feature_range=(0, 1))
    for column in df.columns:
        present = df[column].notnull()
        observed = df.loc[present, [column]]
        # Write the rescaled values back only where a value was present.
        df.loc[present, [column]] = minmax.fit_transform(observed)
    return df

In [211]:
# Min-max scale every column of the loaded frame. normalizeData writes into the
# frame it is given and returns that same frame.
df = normalizeData(df)

In [None]:
def computeMissing(df, missCol=None, labelCol='Class', n_neighbors=5):
    """Fill missing values with k-nearest-neighbour regression.

    For every column to impute, a KNeighborsRegressor is fitted on the rows
    where that column is observed and used to predict the rows where it is
    missing. The predictors are the numeric columns that have no missing
    values at all, excluding the label column.

    Parameters
    ----------
    df : pandas.DataFrame
        Input data; it is not modified.
    missCol : list of str, optional
        Columns to impute. Defaults to every column that contains a null.
        The columns are expected to be numeric.
    labelCol : str, optional
        Target column, never used as a predictor.
    n_neighbors : int, optional
        Number of neighbours; capped at the number of rows available for
        fitting.

    Returns
    -------
    pandas.DataFrame
        A copy of ``df`` with the missing entries of ``missCol`` filled in.

    Raises
    ------
    ValueError
        If a column needs imputing but there is no fully observed numeric
        predictor column, or no row where the column is observed.
    """
    filled = df.copy()
    if missCol is None:
        missCol = df.columns[df.isnull().any()].tolist()

    numeric_cols = df.select_dtypes(include=[np.number]).columns
    x_columns = [c for c in numeric_cols
                 if c != labelCol and not df[c].isnull().any()]

    for y_column in missCol:
        gaps = df[y_column].isnull()
        if not gaps.any():
            continue  # nothing to impute in this column
        known = ~gaps
        if not x_columns or not known.any():
            raise ValueError(
                "cannot impute %r: no complete predictor columns or no observed values" % (y_column,))
        # Never ask for more neighbours than there are training rows.
        knn = KNeighborsRegressor(n_neighbors=min(n_neighbors, int(known.sum())))
        knn.fit(df.loc[known, x_columns], df.loc[known, y_column])
        filled.loc[gaps, y_column] = knn.predict(df.loc[gaps, x_columns])
    return filled

In [212]:
def removeOutliers(df, labelCol=None):
    """Drop rows that fall outside the 1.5 * IQR fences of any feature column.

    The quartiles are computed once on the unfiltered frame. A row is kept
    only if, for every checked column, its value lies within
    [Q1 - 1.5 * IQR, Q3 + 1.5 * IQR] (bounds included). Rows with a null in a
    checked column are dropped, because a null never lies inside the interval.

    Parameters
    ----------
    df : pandas.DataFrame
        Input data; it is not modified.
    labelCol : str, optional
        Column to leave out of the check. When omitted, the last column is
        left out. Pass the name explicitly for frames whose target is not the
        last column.

    Returns
    -------
    pandas.DataFrame
        The rows of ``df`` that passed the check for every column.

    Notes
    -----
    Non-numeric columns are skipped because quartiles are undefined for them.
    """
    if labelCol is None:
        candidates = list(df.columns[:-1])
    else:
        candidates = [c for c in df.columns if c != labelCol]
    numeric_cols = set(df.select_dtypes(include=[np.number]).columns)

    # Positional boolean mask: immune to index alignment issues.
    keep = np.ones(len(df), dtype=bool)
    for col in candidates:
        if col not in numeric_cols:
            continue
        q1 = df[col].quantile(0.25)
        q3 = df[col].quantile(0.75)
        iqr = q3 - q1
        # between() includes both bounds by default on every pandas version.
        keep &= df[col].between(q1 - 1.5 * iqr, q3 + 1.5 * iqr).values
    return df[keep]

In [213]:
# Filter out rows flagged as outliers and report how many rows survive.
df2 = removeOutliers(df)
# Call form of print: same output on Python 2, and valid syntax on Python 3.
print(df2.shape)

(480, 11)


In [None]:
# chooseDataset names the target column 'Class'.
label = df['Class']

# Build X without mutating df, so re-running this cell does not raise a
# KeyError on an already-dropped column.
X, y = df.drop('Class', axis=1), label

In [190]:
def PCA_Compute(X, y, n=0):
    """Project ``X`` onto its principal components.

    Parameters
    ----------
    X : array-like or DataFrame of shape (n_samples, n_features)
        Feature matrix.
    y : array-like
        Accepted for a uniform call signature with the other feature-reduction
        helpers; it is only forwarded to ``fit``.
    n : int, optional
        Number of components to discard: the projection keeps
        ``X.shape[-1] - n`` components.

    Returns
    -------
    The transformed data returned by ``PCA.transform``. The explained-variance
    ratio of the kept components is printed as a side effect.
    """
    pca = PCA(n_components=X.shape[-1] - n, svd_solver='full')
    pca.fit(X, y)
    # transform() takes the data only; passing y as well raises a TypeError.
    X_reduced = pca.transform(X)
    print(pca.explained_variance_ratio_)
    return X_reduced

In [196]:
def Fscore_Compute(X, y, n=5):
    """Keep the ``n`` features with the highest ANOVA F-score against ``y``.

    Parameters
    ----------
    X : array-like or DataFrame of shape (n_samples, n_features)
        Feature matrix.
    y : array-like
        Class labels.
    n : int, optional
        Number of features to keep (passed to SelectKBest as ``k``).

    Returns
    -------
    The reduced feature matrix returned by ``SelectKBest.transform``. The
    per-feature scores and the reduced shape are printed as a side effect.
    """
    selector = SelectKBest(f_classif, k=n)
    selector.fit(X, y)
    X_selected = selector.transform(X)
    # Formatting into one string gives the same "scores shape" line on both
    # Python 2 and Python 3.
    print("%s %s" % (selector.scores_, X_selected.shape))
    return X_selected

In [197]:
# Run F-score feature selection with the default n = 5; the function prints
# the per-feature scores and the shape of the reduced matrix.
Fscore_Compute(X,y)

[  39.67022739  213.16175218    3.2569504     4.30438091   13.28110753
   71.7720721    23.8713002    46.14061124] (768, 5)
