In [104]:
import numpy as np
import pandas as pd
import sklearn
import operator
from sklearn.preprocessing import MinMaxScaler

from sklearn.decomposition import PCA
from sklearn.feature_selection import f_classif, SelectKBest
from sklearn.neighbors import KNeighborsRegressor

from sklearn.cross_validation import train_test_split

In [105]:
def chooseDataset(name):
    dataPath = 'datasets/'
    if name == 'diabetes':
        colNames= ['pregnant', 'glucose', 'bp', 'skin', 'insulin', 'bmi', 'pedigree', 'age', 'label']
        df = pd.read_csv(dataPath + 'PIMA_Indiana_diabetes/pima-indians-diabetes.data.csv', names = colNames)
        missCol = ['glucose','bp','skin','insulin','bmi']
        for col in missCol:
            df[col].replace([0,0.0], np.nan, inplace = True)
                   
#       convert 0 as missing values
    elif name == 'breastCancer':
        colNames = ['sampleCodeNumber', 'clumpThickness', 'uniformityCellSize', 'uniformityCellShape', 'marginalAdhesion', 'singleEpithelialCellSize', 'bareNuclei', 'blandChromatin', 'normalNucleoli', 'mitoses', 'label']
        df = pd.read_csv(dataPath + 'BreastCancer/breast-cancer-wisconsin.data.txt', names = colNames)
        df.replace("?", np.nan, inplace= True)
        df['bareNuclei'] = pd.to_numeric(df['bareNuclei'])
        # making class labels as 0 (Benign) and 1 (Malignant)
        df['label'] = df['label'].replace(2, 0)
        df['label'] = df['label'].replace(4, 1)
        missCol = ['bareNuclei']
#         print pd.unique(df[missCol].values.ravel('K'))
        
    else:
        print "NOT FOUND"
        return
    return df, missCol
        

In [106]:
df, missCol = chooseDataset('breastCancer')
print df.columns, df.dtypes, df.shape
print df.head()

Index([u'sampleCodeNumber', u'clumpThickness', u'uniformityCellSize',
       u'uniformityCellShape', u'marginalAdhesion',
       u'singleEpithelialCellSize', u'bareNuclei', u'blandChromatin',
       u'normalNucleoli', u'mitoses', u'label'],
      dtype='object') sampleCodeNumber              int64
clumpThickness                int64
uniformityCellSize            int64
uniformityCellShape           int64
marginalAdhesion              int64
singleEpithelialCellSize      int64
bareNuclei                  float64
blandChromatin                int64
normalNucleoli                int64
mitoses                       int64
label                         int64
dtype: object (699, 11)
   sampleCodeNumber  clumpThickness  uniformityCellSize  uniformityCellShape  \
0           1000025               5                   1                    1   
1           1002945               5                   4                    4   
2           1015425               3                   1                    1 

In [107]:
# normalizing data
def normalizeData(df):
    scaler = MinMaxScaler(feature_range=(0, 1))
    for i in df.columns:
        null_index = df[i].isnull()
        df.loc[~null_index, [i]] = scaler.fit_transform(df.loc[~null_index, [i]])
    return df

In [108]:
df = normalizeData(df)
print df.head()

# print pd.unique(df[missCol].values.ravel('K'))

   sampleCodeNumber  clumpThickness  uniformityCellSize  uniformityCellShape  \
0          0.070067        0.444444            0.000000             0.000000   
1          0.070285        0.444444            0.333333             0.333333   
2          0.071217        0.222222            0.000000             0.000000   
3          0.071281        0.555556            0.777778             0.777778   
4          0.071336        0.333333            0.000000             0.000000   

   marginalAdhesion  singleEpithelialCellSize  bareNuclei  blandChromatin  \
0          0.000000                  0.111111    0.000000        0.222222   
1          0.444444                  0.666667    1.000000        0.222222   
2          0.000000                  0.111111    0.111111        0.222222   
3          0.000000                  0.222222    0.333333        0.222222   
4          0.222222                  0.111111    0.000000        0.222222   

   normalNucleoli  mitoses  label  
0        0.000000   

In [109]:
def computeMissing(df,missCol, k = 5):
    # get No Missing Data Rows 
#     print pd.unique(df[missCol].values.ravel('K'))
    no_missing_df = df.dropna(axis=0, how='any')
#     print(no_missing_df.shape, df.shape)
#     print no_missing_df.head()
    # get Missing Data Rows 
    missing_df = pd.DataFrame(df[~df.isin(no_missing_df).all(1)])
#     print missing_df.head()
    # removed last column
    data = no_missing_df.drop(['label'], axis = 1, inplace = False)
    # Create the knn model.
    y_columns = missCol
    x_columns = data.columns.tolist()
#     print y_columns
    for col in y_columns:
        x_columns.remove(col)
        
    # Look at the five closest neighbors.
    knn = KNeighborsRegressor(n_neighbors=k)
    # Fit the model on the training data.
    knn.fit(no_missing_df[x_columns], no_missing_df[y_columns])
    # Make point predictions on the test set using the fit model.
    predictions = knn.predict(missing_df[x_columns])
    missing_df[y_columns] = predictions
    no_missing_df = no_missing_df.append(missing_df)
    return no_missing_df

In [110]:
df = computeMissing(df, missCol)

In [111]:
df.shape
df.head()

Unnamed: 0,sampleCodeNumber,clumpThickness,uniformityCellSize,uniformityCellShape,marginalAdhesion,singleEpithelialCellSize,bareNuclei,blandChromatin,normalNucleoli,mitoses,label
0,0.070067,0.444444,0.0,0.0,0.0,0.111111,0.0,0.222222,0.0,0.0,0
1,0.070285,0.444444,0.333333,0.333333,0.444444,0.666667,1.0,0.222222,0.111111,0.0,0
2,0.071217,0.222222,0.0,0.0,0.0,0.111111,0.111111,0.222222,0.0,0.0,0
3,0.071281,0.555556,0.777778,0.777778,0.0,0.222222,0.333333,0.222222,0.666667,0.0,0
4,0.071336,0.333333,0.0,0.0,0.222222,0.111111,0.0,0.222222,0.0,0.0,0


In [112]:
def removeOutliers(df):
    maskall = {}
    Q1 = df.quantile(0.25)
    Q3 = df.quantile(0.75)
    
    for col in df.columns[:-1]:
        IQR = Q3[col] - Q1[col]
        mask = df[col].between(Q1[col] - 1.5*IQR, Q3[col] + 1.5*IQR, inclusive=True)
        maskall[col] =mask
        df = df[mask]
#         print df.shape
    return df

In [113]:
df2 = removeOutliers(df)
print df2.shape

(491, 11)


In [114]:
label = df['label']
df.drop('label', axis=1, inplace=True)

In [118]:
X, y = df, label
print X,y

     sampleCodeNumber  clumpThickness  uniformityCellSize  \
0            0.070067        0.444444            0.000000   
1            0.070285        0.444444            0.333333   
2            0.071217        0.222222            0.000000   
3            0.071281        0.555556            0.777778   
4            0.071336        0.333333            0.000000   
5            0.071344        0.777778            1.000000   
6            0.071417        0.000000            0.000000   
7            0.071451        0.111111            0.000000   
8            0.072535        0.111111            0.000000   
9            0.072535        0.333333            0.111111   
10           0.072700        0.000000            0.000000   
11           0.072766        0.111111            0.000000   
12           0.073187        0.444444            0.222222   
13           0.073351        0.000000            0.000000   
14           0.073393        0.777778            0.666667   
15           0.073622   

In [122]:
def PCA_Compute(X,y, n = 0):
    pca = PCA(n_components= X.shape[-1] - n, svd_solver='full')
    pca.fit(X,y)
    X = pca.transform(X,y)
    print(pca.explained_variance_ratio_)
    return X,y

In [101]:
def Fscore_Compute(X,y, n = 5):
    selector = SelectKBest(f_classif, k=n)
    selector.fit(X,y)
    X = selector.transform(X)
    print selector.scores_, X.shape
    return X,y

In [102]:
X,y = Fscore_Compute(X,y)

[    4.51505927   733.20697841  1408.52721279  1419.30553012   657.79369959
   608.71955539  1417.3134106    933.28729668   717.62804135   152.04023895] (699, 5)


In [123]:
X,y = PCA_Compute(X,y)

[ 0.68958431  0.07112366  0.06063292  0.04437934  0.03869191  0.03413345
  0.02515129  0.02254191  0.01135833  0.00240289]
