In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import accuracy_score
from sklearn.pipeline import make_pipeline
from sklearn.decomposition import PCA
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC  
from sklearn.linear_model import LogisticRegression
from sklearn.feature_selection import SelectKBest, f_classif
from tqdm import tqdm

#features = [f"V{x}" for x in range(1,2000)]
df = pd.read_csv('Data/TCGAdata.txt', sep=" " ,header=0)
labels_df = pd.read_csv('Data/TCGAlabels', sep=" " ,header=0)

#Set max number of components for PCA
max_num_components = 30

num_components_range = range(1, max_num_components+1)

In [2]:
#Pre-processes the data by splitting and normalizing 
def pre_process(data, labels, train_size):
    #Split data into training and test data
    X_train, X_test, y_train, y_test = train_test_split(df, labels_df.values.ravel(), test_size=1-train_size)
    
    #Standardize the rows (transposing as fit_transform standardizes along columns)
    #Scale after split to avoid data leakage
    scaler = StandardScaler()
    X_train = pd.DataFrame(np.transpose(scaler.fit_transform(X_train.transpose())), columns=X_train.columns)
    X_test = pd.DataFrame(np.transpose(scaler.fit_transform(X_test.transpose())), columns=X_test.columns)
    
    return X_train, X_test, y_train, y_test

In [3]:
##KNN PCA

def KNN_PCA(X_train, X_test, y_train, y_test):

    KNN_mean_scores = np.zeros(max_num_components+1)

    # Loop over different numbers of components
    for n_components in tqdm(num_components_range):

        #PCA

        KNN_pipeline = make_pipeline(PCA(n_components=n_components), KNeighborsClassifier(n_neighbors=5))

        KNN_scores = cross_val_score(KNN_pipeline, X_train, y_train, cv=5)
        KNN_mean_score = KNN_scores.mean()

        #KNN_mean_scores.append(KNN_mean_score)
        KNN_mean_scores[n_components] = KNN_mean_score

    KNN_optimal_n_components = np.where(KNN_mean_scores==KNN_mean_scores.max())[0][0]
    cross_val_err = 1 - max(KNN_mean_scores)

    print("KNN optimal number of PCA components:", KNN_optimal_n_components)

    opt_pipeline = make_pipeline(PCA(n_components=KNN_optimal_n_components), KNeighborsClassifier(n_neighbors=5))

    opt_pipeline.fit(X_train, y_train)
    train_pred = opt_pipeline.predict(X_train)
    train_error = 1 - accuracy_score(y_train, train_pred)

    test_pred = opt_pipeline.predict(X_test)
    test_error = 1 - accuracy_score(y_test, test_pred)

    print("Cross val err: ", cross_val_err)
    print("Train err: ", train_error)
    print("Test err: ", test_error)
    print("\n")

    return [train_error, cross_val_err, test_error, KNN_optimal_n_components]


In [4]:
##KNN features

def KNN_features(X_train, X_test, y_train, y_test):
    max_num_features = 100

    num_features = range(1, max_num_features+1)
    KNN_mean_scores = np.zeros(max_num_features+1)

    # Loop over different numbers of features
    for k in tqdm(num_features):

        feature_selector = SelectKBest(f_classif, k=k)

        X_train_selected = feature_selector.fit_transform(X_train, y_train)

        model = KNeighborsClassifier(n_neighbors=5)

        KNN_scores = cross_val_score(model, X_train_selected, y_train, cv=5)
        KNN_mean_score = KNN_scores.mean()

        KNN_mean_scores[k] = KNN_mean_score

        #KNN_mean_scores.append(KNN_mean_score)

    KNN_optimal_k_features = np.where(KNN_mean_scores==KNN_mean_scores.max())[0][0]
    cross_val_err = 1 - max(KNN_mean_scores)

    print("KNN optimal number of features:", KNN_optimal_k_features)

    feature_selector = SelectKBest(f_classif, k=KNN_optimal_k_features)
    X_train_selected = feature_selector.fit_transform(X_train, y_train)
    model.fit(X_train_selected, y_train)


    # Get the indices of the selected features
    selected_feature_indices = feature_selector.get_support(indices=True)

    # Get the names of the most predictive features
    selected_features = X_train.columns[selected_feature_indices]
    print("Most predictive features:", selected_features)


    train_pred = model.predict(X_train_selected)
    train_error = 1 - accuracy_score(y_train, train_pred)

    test_pred = model.predict(X_test[X_test.columns[selected_feature_indices]])
    test_error = 1 - accuracy_score(y_test, test_pred)
    
    print("Cross val err: ", cross_val_err)
    print("Train err: ", train_error)
    print("Test err: ", test_error)

    return [train_error, cross_val_err, test_error, KNN_optimal_k_features]


In [5]:
##SVC PCA
def SVC_PCA(X_train, X_test, y_train, y_test):
    SVC_mean_scores = np.zeros(max_num_components+1)

    #num_components_range = range(1, max_num_components)

    # Loop over different numbers of components
    for n_components in tqdm(num_components_range):

        #PCA

        SVC_pipeline = make_pipeline(PCA(n_components=n_components), SVC())

        SVC_scores = cross_val_score(SVC_pipeline, X_train, y_train, cv=5)
        SVC_mean_score = SVC_scores.mean()

        #SVC_mean_scores.append(SVC_mean_score)
        SVC_mean_scores[n_components] = SVC_mean_score

    SVC_optimal_n_components = np.where(SVC_mean_scores==SVC_mean_scores.max())[0][0]
    cross_val_err = 1 - max(SVC_mean_scores)

    print("SVC optimal number of PCA components:", SVC_optimal_n_components)

    opt_pipeline = make_pipeline(PCA(n_components=SVC_optimal_n_components), SVC())

    opt_pipeline.fit(X_train, y_train)
    train_pred = opt_pipeline.predict(X_train)
    train_error = 1 - accuracy_score(y_train, train_pred)

    test_pred = opt_pipeline.predict(X_test)
    test_error = 1 - accuracy_score(y_test, test_pred)

    print("Cross val err: ", cross_val_err)
    print("Train err: ", train_error)
    print("Test err: ", test_error)

    return [train_error, cross_val_err, test_error, SVC_optimal_n_components]

In [6]:
##SVC features

def SVC_features(X_train, X_test, y_train, y_test):
    max_num_features = 100
    num_features = range(1, max_num_features+1)
    SVC_mean_scores = np.zeros(max_num_features+1)

    # Loop over different numbers of components
    for k in tqdm(num_features):

        feature_selector = SelectKBest(f_classif, k=k)

        X_train_selected = feature_selector.fit_transform(X_train, y_train)

        model = SVC()

        SVC_scores = cross_val_score(model, X_train_selected, y_train, cv=5)
        SVC_mean_score = SVC_scores.mean()

        #SVC_mean_scores.append(SVC_mean_score)
        SVC_mean_scores[k] = SVC_mean_score


    SVC_optimal_k_features = np.where(SVC_mean_scores==SVC_mean_scores.max())[0][0]
    cross_val_err = 1 - max(SVC_mean_scores)

    print("SVC optimal number of features:", SVC_optimal_k_features)

    feature_selector = SelectKBest(f_classif, k=SVC_optimal_k_features)
    X_train_selected = feature_selector.fit_transform(X_train, y_train)
    model.fit(X_train_selected, y_train)

    # Get the indices of the selected features
    selected_feature_indices = feature_selector.get_support(indices=True)

    # Get the names of the most predictive features
    selected_features = X_train.columns[selected_feature_indices]
    print("Most predictive features:", selected_features)


    train_pred = model.predict(X_train_selected)
    train_error = 1 - accuracy_score(y_train, train_pred)

    test_pred = model.predict(X_test[X_test.columns[selected_feature_indices]])
    test_error = 1 - accuracy_score(y_test, test_pred)

    print("Cross val err: ", cross_val_err)
    print("Train err: ", train_error)
    print("Test err: ", test_error)

    return [train_error, cross_val_err, test_error, SVC_optimal_k_features]

In [7]:
##Logistic regression PCA

def LR_PCA(X_train, X_test, y_train, y_test):
    LR_mean_scores = np.zeros(max_num_components+1)

    #num_components_range = range(1, 25)

    # Loop over different numbers of components
    for n_components in tqdm(num_components_range):

        #PCA

        LR_pipeline = make_pipeline(PCA(n_components=n_components), LogisticRegression(solver='lbfgs', max_iter=10000))

        LR_scores = cross_val_score(LR_pipeline, X_train, y_train, cv=5)
        LR_mean_score = LR_scores.mean()

        #LR_mean_scores.append(LR_mean_score)
        LR_mean_scores[n_components] = LR_mean_score

    LR_optimal_n_components = np.where(LR_mean_scores==LR_mean_scores.max())[0][0]
    cross_val_err = 1 - max(LR_mean_scores)

    print("KNN optimal number of PCA components:", LR_optimal_n_components)

    opt_pipeline = make_pipeline(PCA(n_components=LR_optimal_n_components), LogisticRegression(solver='lbfgs', max_iter=10000))

    opt_pipeline.fit(X_train, y_train)
    train_pred = opt_pipeline.predict(X_train)
    train_error = 1 - accuracy_score(y_train, train_pred)

    test_pred = opt_pipeline.predict(X_test)
    test_error = 1 - accuracy_score(y_test, test_pred)

    print("Cross val err: ", cross_val_err)
    print("Train err: ", train_error)
    print("Test err: ", test_error)

    return [train_error, cross_val_err, test_error, LR_optimal_n_components]

In [8]:
##Logistic Regression features

def LR_features(X_train, X_test, y_train, y_test):
    max_num_features = 100
    num_features = range(1, max_num_features+1)
    LR_mean_scores = np.zeros(max_num_features+1)

    # Loop over different numbers of components
    for k in tqdm(num_features):

        feature_selector = SelectKBest(f_classif, k=k)

        X_train_selected = feature_selector.fit_transform(X_train, y_train)

        model = LogisticRegression(solver='lbfgs', max_iter=10000)

        LR_scores = cross_val_score(model, X_train_selected, y_train, cv=5)
        LR_mean_score = LR_scores.mean()

        #LR_mean_scores.append(LR_mean_score)
        LR_mean_scores[k] = LR_mean_score

    LR_optimal_k_features = np.where(LR_mean_scores==LR_mean_scores.max())[0][0]
    cross_val_err = 1 - max(LR_mean_scores)

    print("LR optimal number of features:", LR_optimal_k_features)

    feature_selector = SelectKBest(f_classif, k=LR_optimal_k_features)
    X_train_selected = feature_selector.fit_transform(X_train, y_train)
    model.fit(X_train_selected, y_train)

    # Get the indices of the selected features
    selected_feature_indices = feature_selector.get_support(indices=True)

    # Get the names of the most predictive features
    selected_features = X_train.columns[selected_feature_indices]
    print("Most predictive features:", selected_features)

    train_pred = model.predict(X_train_selected)
    train_error = 1 - accuracy_score(y_train, train_pred)

    test_pred = model.predict(X_test[X_test.columns[selected_feature_indices]])
    test_error = 1 - accuracy_score(y_test, test_pred)

    print("Cross val err: ", cross_val_err)
    print("Train err: ", train_error)
    print("Test err: ", test_error)

    return [train_error, cross_val_err, test_error, LR_optimal_k_features]

In [9]:
##Creating dictionary to save data of part 1
d = dict()
#a =0.7
#print(f"KNN_PCA_{(1-a)*100:.0f}_{a*100:.0f}")

In [10]:
##Run everything with 70/30 split
split = 0.7
X_train, X_test, y_train, y_test = pre_process(df, labels_df, split)

d[f"KNN_PCA_{(1-split)*100:.0f}_{split*100:.0f}"] = KNN_PCA(X_train, X_test, y_train, y_test)
d[f"KNN_Feat_{(1-split)*100:.0f}_{split*100:.0f}"] = KNN_features(X_train, X_test, y_train, y_test)
d[f"SVC_PCA_{(1-split)*100:.0f}_{split*100:.0f}"] = SVC_PCA(X_train, X_test, y_train, y_test)
d[f"SVC_Feat_{(1-split)*100:.0f}_{split*100:.0f}"] = SVC_features(X_train, X_test, y_train, y_test)
d[f"LR_PCA_{(1-split)*100:.0f}_{split*100:.0f}"] = LR_PCA(X_train, X_test, y_train, y_test)
d[f"LR_Feat_{(1-split)*100:.0f}_{split*100:.0f}"] = LR_features(X_train, X_test, y_train, y_test)


100%|███████████████████████████████████████████| 30/30 [03:36<00:00,  7.23s/it]


KNN optimal number of PCA components: 22
Cross val err:  0.008415841584158423
Train err:  0.007920792079207928
Test err:  0.009227220299884653




100%|█████████████████████████████████████████| 100/100 [00:09<00:00, 10.81it/s]


KNN optimal number of features: 100
Most predictive features: Index(['V3', 'V16', 'V18', 'V29', 'V30', 'V35', 'V63', 'V64', 'V68', 'V72',
       'V83', 'V87', 'V178', 'V200', 'V217', 'V227', 'V274', 'V289', 'V307',
       'V308', 'V350', 'V394', 'V418', 'V462', 'V464', 'V475', 'V494', 'V507',
       'V539', 'V544', 'V568', 'V600', 'V627', 'V657', 'V658', 'V673', 'V687',
       'V691', 'V730', 'V803', 'V818', 'V845', 'V850', 'V855', 'V889', 'V922',
       'V982', 'V1005', 'V1033', 'V1066', 'V1071', 'V1097', 'V1098', 'V1101',
       'V1102', 'V1126', 'V1173', 'V1193', 'V1203', 'V1206', 'V1218', 'V1234',
       'V1249', 'V1256', 'V1263', 'V1293', 'V1315', 'V1443', 'V1478', 'V1517',
       'V1530', 'V1533', 'V1549', 'V1572', 'V1575', 'V1580', 'V1583', 'V1654',
       'V1657', 'V1658', 'V1673', 'V1697', 'V1719', 'V1731', 'V1744', 'V1760',
       'V1772', 'V1787', 'V1799', 'V1812', 'V1813', 'V1827', 'V1829', 'V1846',
       'V1868', 'V1871', 'V1877', 'V1882', 'V1936', 'V1971'],
      dtype='

100%|███████████████████████████████████████████| 30/30 [03:53<00:00,  7.79s/it]


SVC optimal number of PCA components: 20
Cross val err:  0.007920792079207928
Train err:  0.0034653465346534684
Test err:  0.008073817762399127


100%|█████████████████████████████████████████| 100/100 [00:16<00:00,  5.90it/s]


SVC optimal number of features: 100
Most predictive features: Index(['V3', 'V16', 'V18', 'V29', 'V30', 'V35', 'V63', 'V64', 'V68', 'V72',
       'V83', 'V87', 'V178', 'V200', 'V217', 'V227', 'V274', 'V289', 'V307',
       'V308', 'V350', 'V394', 'V418', 'V462', 'V464', 'V475', 'V494', 'V507',
       'V539', 'V544', 'V568', 'V600', 'V627', 'V657', 'V658', 'V673', 'V687',
       'V691', 'V730', 'V803', 'V818', 'V845', 'V850', 'V855', 'V889', 'V922',
       'V982', 'V1005', 'V1033', 'V1066', 'V1071', 'V1097', 'V1098', 'V1101',
       'V1102', 'V1126', 'V1173', 'V1193', 'V1203', 'V1206', 'V1218', 'V1234',
       'V1249', 'V1256', 'V1263', 'V1293', 'V1315', 'V1443', 'V1478', 'V1517',
       'V1530', 'V1533', 'V1549', 'V1572', 'V1575', 'V1580', 'V1583', 'V1654',
       'V1657', 'V1658', 'V1673', 'V1697', 'V1719', 'V1731', 'V1744', 'V1760',
       'V1772', 'V1787', 'V1799', 'V1812', 'V1813', 'V1827', 'V1829', 'V1846',
       'V1868', 'V1871', 'V1877', 'V1882', 'V1936', 'V1971'],
      dtype='

100%|███████████████████████████████████████████| 30/30 [04:31<00:00,  9.05s/it]


KNN optimal number of PCA components: 18
Cross val err:  0.008415841584158423
Train err:  0.0
Test err:  0.01038062283737029


100%|█████████████████████████████████████████| 100/100 [01:25<00:00,  1.17it/s]


LR optimal number of features: 92
Most predictive features: Index(['V3', 'V16', 'V18', 'V29', 'V30', 'V35', 'V63', 'V64', 'V68', 'V72',
       'V87', 'V200', 'V217', 'V227', 'V274', 'V289', 'V307', 'V308', 'V350',
       'V394', 'V418', 'V462', 'V464', 'V475', 'V494', 'V507', 'V539', 'V544',
       'V568', 'V600', 'V627', 'V657', 'V658', 'V673', 'V687', 'V691', 'V730',
       'V803', 'V818', 'V845', 'V850', 'V855', 'V889', 'V922', 'V982', 'V1005',
       'V1033', 'V1066', 'V1071', 'V1097', 'V1098', 'V1101', 'V1102', 'V1126',
       'V1173', 'V1193', 'V1203', 'V1206', 'V1218', 'V1234', 'V1249', 'V1256',
       'V1263', 'V1293', 'V1315', 'V1443', 'V1478', 'V1517', 'V1533', 'V1549',
       'V1580', 'V1654', 'V1657', 'V1673', 'V1697', 'V1719', 'V1731', 'V1744',
       'V1760', 'V1772', 'V1787', 'V1799', 'V1812', 'V1813', 'V1827', 'V1829',
       'V1846', 'V1871', 'V1877', 'V1882', 'V1936', 'V1971'],
      dtype='object')
Cross val err:  0.00544554455445545
Train err:  0.0
Test err:  0.0046



In [11]:
##Run everything with 80/20 split
split = 0.8
X_train, X_test, y_train, y_test = pre_process(df, labels_df, split)

d[f"KNN_PCA_{(1-split)*100:.0f}_{split*100:.0f}"] = KNN_PCA(X_train, X_test, y_train, y_test)
d[f"KNN_Feat_{(1-split)*100:.0f}_{split*100:.0f}"] = KNN_features(X_train, X_test, y_train, y_test)
d[f"SVC_PCA_{(1-split)*100:.0f}_{split*100:.0f}"] = SVC_PCA(X_train, X_test, y_train, y_test)
d[f"SVC_Feat_{(1-split)*100:.0f}_{split*100:.0f}"] = SVC_features(X_train, X_test, y_train, y_test)
d[f"LR_PCA_{(1-split)*100:.0f}_{split*100:.0f}"] = LR_PCA(X_train, X_test, y_train, y_test)
d[f"LR_Feat_{(1-split)*100:.0f}_{split*100:.0f}"] = LR_features(X_train, X_test, y_train, y_test)

100%|███████████████████████████████████████████| 30/30 [04:21<00:00,  8.73s/it]


KNN optimal number of PCA components: 30
Cross val err:  0.004330882421988647
Train err:  0.004330879168471191
Test err:  0.01557093425605538




100%|█████████████████████████████████████████| 100/100 [00:09<00:00, 10.53it/s]


KNN optimal number of features: 75
Most predictive features: Index(['V3', 'V18', 'V29', 'V35', 'V63', 'V64', 'V68', 'V72', 'V87', 'V227',
       'V289', 'V307', 'V308', 'V350', 'V462', 'V464', 'V475', 'V494', 'V507',
       'V539', 'V544', 'V568', 'V600', 'V627', 'V657', 'V658', 'V673', 'V687',
       'V691', 'V730', 'V803', 'V818', 'V845', 'V850', 'V855', 'V889', 'V982',
       'V1005', 'V1033', 'V1066', 'V1071', 'V1097', 'V1098', 'V1101', 'V1173',
       'V1193', 'V1203', 'V1206', 'V1218', 'V1234', 'V1256', 'V1263', 'V1443',
       'V1478', 'V1517', 'V1530', 'V1533', 'V1549', 'V1654', 'V1657', 'V1673',
       'V1697', 'V1719', 'V1744', 'V1787', 'V1799', 'V1812', 'V1827', 'V1829',
       'V1846', 'V1871', 'V1877', 'V1882', 'V1936', 'V1971'],
      dtype='object')
Cross val err:  0.006928285019391245
Train err:  0.0047639670853183436
Test err:  0.01557093425605538


100%|███████████████████████████████████████████| 30/30 [04:31<00:00,  9.05s/it]


SVC optimal number of PCA components: 12
Cross val err:  0.004763782854889209
Train err:  0.0038977912516240387
Test err:  0.01384083044982698


100%|█████████████████████████████████████████| 100/100 [00:19<00:00,  5.05it/s]


SVC optimal number of features: 88
Most predictive features: Index(['V3', 'V18', 'V29', 'V30', 'V35', 'V63', 'V64', 'V68', 'V72', 'V87',
       'V200', 'V217', 'V227', 'V274', 'V289', 'V307', 'V308', 'V350', 'V394',
       'V418', 'V462', 'V464', 'V475', 'V494', 'V507', 'V539', 'V544', 'V568',
       'V600', 'V627', 'V657', 'V658', 'V673', 'V687', 'V691', 'V730', 'V803',
       'V818', 'V845', 'V850', 'V855', 'V889', 'V922', 'V982', 'V1005',
       'V1033', 'V1066', 'V1071', 'V1097', 'V1098', 'V1101', 'V1102', 'V1173',
       'V1193', 'V1203', 'V1206', 'V1218', 'V1234', 'V1249', 'V1256', 'V1263',
       'V1293', 'V1443', 'V1478', 'V1517', 'V1530', 'V1533', 'V1549', 'V1580',
       'V1654', 'V1657', 'V1673', 'V1697', 'V1719', 'V1744', 'V1787', 'V1799',
       'V1812', 'V1813', 'V1827', 'V1829', 'V1846', 'V1868', 'V1871', 'V1877',
       'V1882', 'V1936', 'V1971'],
      dtype='object')
Cross val err:  0.003897042942596074
Train err:  0.0025985275010826925
Test err:  0.01211072664359858


100%|███████████████████████████████████████████| 30/30 [05:36<00:00, 11.23s/it]


KNN optimal number of PCA components: 24
Cross val err:  0.007360246405799464
Train err:  0.0
Test err:  0.0017301038062284002


100%|█████████████████████████████████████████| 100/100 [01:32<00:00,  1.09it/s]


LR optimal number of features: 88
Most predictive features: Index(['V3', 'V18', 'V29', 'V30', 'V35', 'V63', 'V64', 'V68', 'V72', 'V87',
       'V200', 'V217', 'V227', 'V274', 'V289', 'V307', 'V308', 'V350', 'V394',
       'V418', 'V462', 'V464', 'V475', 'V494', 'V507', 'V539', 'V544', 'V568',
       'V600', 'V627', 'V657', 'V658', 'V673', 'V687', 'V691', 'V730', 'V803',
       'V818', 'V845', 'V850', 'V855', 'V889', 'V922', 'V982', 'V1005',
       'V1033', 'V1066', 'V1071', 'V1097', 'V1098', 'V1101', 'V1102', 'V1173',
       'V1193', 'V1203', 'V1206', 'V1218', 'V1234', 'V1249', 'V1256', 'V1263',
       'V1293', 'V1443', 'V1478', 'V1517', 'V1530', 'V1533', 'V1549', 'V1580',
       'V1654', 'V1657', 'V1673', 'V1697', 'V1719', 'V1744', 'V1787', 'V1799',
       'V1812', 'V1813', 'V1827', 'V1829', 'V1846', 'V1868', 'V1871', 'V1877',
       'V1882', 'V1936', 'V1971'],
      dtype='object')
Cross val err:  0.003031242076795282
Train err:  0.0
Test err:  0.00692041522491349




In [12]:
##Run everything with 90/10 split
split = 0.9
X_train, X_test, y_train, y_test = pre_process(df, labels_df, split)

d[f"KNN_PCA_{(1-split)*100:.0f}_{split*100:.0f}"] = KNN_PCA(X_train, X_test, y_train, y_test)
d[f"KNN_Feat_{(1-split)*100:.0f}_{split*100:.0f}"] = KNN_features(X_train, X_test, y_train, y_test)
d[f"SVC_PCA_{(1-split)*100:.0f}_{split*100:.0f}"] = SVC_PCA(X_train, X_test, y_train, y_test)
d[f"SVC_Feat_{(1-split)*100:.0f}_{split*100:.0f}"] = SVC_features(X_train, X_test, y_train, y_test)
d[f"LR_PCA_{(1-split)*100:.0f}_{split*100:.0f}"] = LR_PCA(X_train, X_test, y_train, y_test)
d[f"LR_Feat_{(1-split)*100:.0f}_{split*100:.0f}"] = LR_features(X_train, X_test, y_train, y_test)

100%|███████████████████████████████████████████| 30/30 [04:44<00:00,  9.47s/it]


KNN optimal number of PCA components: 28
Cross val err:  0.006542166888987566
Train err:  0.004234026173979943
Test err:  0.02076124567474047




100%|█████████████████████████████████████████| 100/100 [00:10<00:00,  9.13it/s]


KNN optimal number of features: 76
Most predictive features: Index(['V3', 'V18', 'V29', 'V30', 'V35', 'V63', 'V64', 'V68', 'V72', 'V87',
       'V217', 'V227', 'V289', 'V307', 'V308', 'V350', 'V462', 'V464', 'V475',
       'V494', 'V507', 'V539', 'V544', 'V568', 'V600', 'V627', 'V657', 'V673',
       'V687', 'V691', 'V730', 'V803', 'V818', 'V845', 'V850', 'V855', 'V889',
       'V982', 'V1005', 'V1033', 'V1066', 'V1071', 'V1097', 'V1098', 'V1101',
       'V1173', 'V1193', 'V1206', 'V1218', 'V1234', 'V1249', 'V1256', 'V1263',
       'V1443', 'V1478', 'V1517', 'V1533', 'V1549', 'V1654', 'V1657', 'V1673',
       'V1697', 'V1719', 'V1744', 'V1787', 'V1799', 'V1812', 'V1813', 'V1827',
       'V1829', 'V1846', 'V1871', 'V1877', 'V1882', 'V1936', 'V1971'],
      dtype='object')
Cross val err:  0.008079887357343885
Train err:  0.007698229407236301
Test err:  0.01038062283737029


100%|███████████████████████████████████████████| 30/30 [04:45<00:00,  9.52s/it]


SVC optimal number of PCA components: 22
Cross val err:  0.005001482140210456
Train err:  0.002694380292532772
Test err:  0.01038062283737029


100%|█████████████████████████████████████████| 100/100 [00:23<00:00,  4.22it/s]


SVC optimal number of features: 93
Most predictive features: Index(['V3', 'V16', 'V18', 'V29', 'V30', 'V35', 'V63', 'V64', 'V68', 'V72',
       'V87', 'V200', 'V217', 'V227', 'V274', 'V289', 'V307', 'V308', 'V350',
       'V394', 'V418', 'V462', 'V464', 'V475', 'V494', 'V507', 'V539', 'V541',
       'V544', 'V568', 'V600', 'V627', 'V657', 'V658', 'V673', 'V687', 'V691',
       'V730', 'V803', 'V818', 'V845', 'V850', 'V855', 'V889', 'V922', 'V982',
       'V1005', 'V1033', 'V1066', 'V1071', 'V1097', 'V1098', 'V1101', 'V1126',
       'V1152', 'V1173', 'V1193', 'V1203', 'V1206', 'V1218', 'V1234', 'V1249',
       'V1256', 'V1263', 'V1293', 'V1315', 'V1443', 'V1478', 'V1517', 'V1530',
       'V1533', 'V1549', 'V1580', 'V1654', 'V1657', 'V1673', 'V1697', 'V1719',
       'V1744', 'V1760', 'V1772', 'V1787', 'V1799', 'V1812', 'V1813', 'V1827',
       'V1829', 'V1846', 'V1871', 'V1877', 'V1882', 'V1936', 'V1971'],
      dtype='object')
Cross val err:  0.005387579665036335
Train err:  0.001924557

100%|███████████████████████████████████████████| 30/30 [05:33<00:00, 11.11s/it]


KNN optimal number of PCA components: 22
Cross val err:  0.004233733511190052
Train err:  0.0
Test err:  0.01038062283737029


100%|█████████████████████████████████████████| 100/100 [01:43<00:00,  1.03s/it]


LR optimal number of features: 95
Most predictive features: Index(['V3', 'V16', 'V18', 'V29', 'V30', 'V35', 'V63', 'V64', 'V68', 'V72',
       'V87', 'V200', 'V217', 'V227', 'V274', 'V289', 'V307', 'V308', 'V350',
       'V394', 'V418', 'V462', 'V464', 'V475', 'V494', 'V507', 'V539', 'V541',
       'V544', 'V568', 'V600', 'V627', 'V657', 'V658', 'V673', 'V687', 'V691',
       'V730', 'V803', 'V818', 'V845', 'V850', 'V855', 'V889', 'V922', 'V982',
       'V1005', 'V1033', 'V1066', 'V1071', 'V1097', 'V1098', 'V1101', 'V1102',
       'V1126', 'V1152', 'V1173', 'V1193', 'V1203', 'V1206', 'V1218', 'V1234',
       'V1249', 'V1256', 'V1263', 'V1293', 'V1315', 'V1443', 'V1478', 'V1517',
       'V1530', 'V1533', 'V1549', 'V1575', 'V1580', 'V1654', 'V1657', 'V1673',
       'V1697', 'V1719', 'V1744', 'V1760', 'V1772', 'V1787', 'V1799', 'V1812',
       'V1813', 'V1827', 'V1829', 'V1846', 'V1871', 'V1877', 'V1882', 'V1936',
       'V1971'],
      dtype='object')
Cross val err:  0.004616866755595139



In [13]:
#Save data of part 1
df_1 = pd.DataFrame(data =d, index = ['Train', 'Cross', 'Test', 'Opt_Feat_or_PCA'])
df_1.to_csv('./data.csv', sep=" ")

In [14]:
##Part 2 Theme 1 mislabeling

def mislabel(mislabel_fraction, y_train):
    labels = set(labels_df["x"])

    num_samples = len(y_train)
    num_mislabels = int(mislabel_fraction * num_samples)
    mislabel_indices = np.random.choice(num_samples, num_mislabels, replace=False)

    y_train_noise = y_train.copy()

    for i in mislabel_indices:
        correct = y_train[i]
        y_train_noise[i] = np.random.choice(list(labels - set([correct])))
    
    return y_train_noise

In [15]:
##Creating dictionary to save data of part 2
d = dict()

In [16]:
##Mislabel fraction 0.2, 70/30 split
mislabel_fraction = 0.2
split = 0.7
X_train, X_test, y_train, y_test = pre_process(df, labels_df, split)
y_train_noise = mislabel(mislabel_fraction, y_train)

d[f"KNN_PCA_miss_{mislabel_fraction:.1f}_{(1-split)*100:.0f}_{split*100:.0f}"] = KNN_PCA(X_train, X_test, y_train_noise, y_test)
d[f"KNN_Feat_miss_{mislabel_fraction:.1f}_{(1-split)*100:.0f}_{split*100:.0f}"] = KNN_features(X_train, X_test, y_train_noise, y_test)
d[f"SVC_PCA_miss_{mislabel_fraction:.1f}_{(1-split)*100:.0f}_{split*100:.0f}"] = SVC_PCA(X_train, X_test, y_train_noise, y_test)
d[f"SVC_Feat_miss_{mislabel_fraction:.1f}_{(1-split)*100:.0f}_{split*100:.0f}"] = SVC_features(X_train, X_test, y_train_noise, y_test)
d[f"LR_PCA_miss_{mislabel_fraction:.1f}_{(1-split)*100:.0f}_{split*100:.0f}"] = LR_PCA(X_train, X_test, y_train_noise, y_test)
d[f"LR_Feat_miss_{mislabel_fraction:.1f}_{(1-split)*100:.0f}_{split*100:.0f}"] = LR_features(X_train, X_test, y_train_noise, y_test)



100%|███████████████████████████████████████████| 30/30 [03:46<00:00,  7.53s/it]


KNN optimal number of PCA components: 9
Cross val err:  0.2163366336633663
Train err:  0.20297029702970293
Test err:  0.019607843137254943




100%|█████████████████████████████████████████| 100/100 [00:09<00:00, 10.65it/s]


KNN optimal number of features: 99
Most predictive features: Index(['V3', 'V18', 'V29', 'V30', 'V68', 'V72', 'V87', 'V178', 'V193', 'V200',
       'V217', 'V227', 'V274', 'V289', 'V307', 'V308', 'V350', 'V394', 'V462',
       'V464', 'V475', 'V493', 'V494', 'V504', 'V507', 'V539', 'V541', 'V544',
       'V568', 'V578', 'V600', 'V614', 'V627', 'V657', 'V663', 'V667', 'V673',
       'V676', 'V687', 'V691', 'V730', 'V845', 'V855', 'V871', 'V914', 'V959',
       'V982', 'V1015', 'V1028', 'V1033', 'V1059', 'V1066', 'V1097', 'V1098',
       'V1101', 'V1102', 'V1147', 'V1152', 'V1193', 'V1203', 'V1218', 'V1222',
       'V1230', 'V1249', 'V1256', 'V1263', 'V1290', 'V1300', 'V1315', 'V1478',
       'V1517', 'V1530', 'V1533', 'V1535', 'V1549', 'V1575', 'V1612', 'V1635',
       'V1644', 'V1654', 'V1658', 'V1673', 'V1697', 'V1702', 'V1731', 'V1744',
       'V1772', 'V1787', 'V1799', 'V1812', 'V1827', 'V1829', 'V1846', 'V1871',
       'V1882', 'V1931', 'V1936', 'V1971', 'V1999'],
      dtype='objec

100%|███████████████████████████████████████████| 30/30 [04:28<00:00,  8.95s/it]


SVC optimal number of PCA components: 9
Cross val err:  0.2069306930693069
Train err:  0.2054455445544554
Test err:  0.0046136101499423265


100%|█████████████████████████████████████████| 100/100 [00:55<00:00,  1.80it/s]


SVC optimal number of features: 95
Most predictive features: Index(['V3', 'V18', 'V29', 'V30', 'V68', 'V72', 'V87', 'V178', 'V193', 'V200',
       'V217', 'V227', 'V274', 'V289', 'V307', 'V308', 'V350', 'V394', 'V462',
       'V464', 'V475', 'V493', 'V494', 'V504', 'V507', 'V539', 'V541', 'V544',
       'V568', 'V578', 'V600', 'V614', 'V627', 'V663', 'V667', 'V673', 'V676',
       'V687', 'V691', 'V730', 'V845', 'V855', 'V871', 'V914', 'V959', 'V982',
       'V1028', 'V1033', 'V1059', 'V1066', 'V1097', 'V1098', 'V1101', 'V1102',
       'V1147', 'V1152', 'V1193', 'V1203', 'V1218', 'V1222', 'V1230', 'V1249',
       'V1256', 'V1263', 'V1290', 'V1300', 'V1315', 'V1478', 'V1517', 'V1530',
       'V1533', 'V1535', 'V1549', 'V1575', 'V1612', 'V1635', 'V1654', 'V1658',
       'V1673', 'V1697', 'V1702', 'V1731', 'V1744', 'V1772', 'V1787', 'V1799',
       'V1812', 'V1827', 'V1829', 'V1846', 'V1871', 'V1882', 'V1931', 'V1936',
       'V1971'],
      dtype='object')




Cross val err:  0.21039603960396036
Train err:  0.20247524752475243
Test err:  0.005767012687427964


100%|███████████████████████████████████████████| 30/30 [04:01<00:00,  8.06s/it]


KNN optimal number of PCA components: 22
Cross val err:  0.20841584158415838
Train err:  0.2074257425742574
Test err:  0.0023068050749711633


100%|█████████████████████████████████████████| 100/100 [01:26<00:00,  1.15it/s]

LR optimal number of features: 37
Most predictive features: Index(['V3', 'V18', 'V68', 'V87', 'V289', 'V307', 'V350', 'V475', 'V507',
       'V539', 'V600', 'V627', 'V673', 'V687', 'V845', 'V1066', 'V1097',
       'V1098', 'V1101', 'V1193', 'V1218', 'V1256', 'V1263', 'V1478', 'V1517',
       'V1533', 'V1549', 'V1654', 'V1673', 'V1697', 'V1744', 'V1787', 'V1799',
       'V1812', 'V1846', 'V1882', 'V1936'],
      dtype='object')
Cross val err:  0.2193069306930694
Train err:  0.21089108910891086
Test err:  0.029988465974625123





In [17]:
##Mislabel fraction 0.2, 80/20 split
mislabel_fraction = 0.2
split = 0.8
X_train, X_test, y_train, y_test = pre_process(df, labels_df, split)
y_train_noise = mislabel(mislabel_fraction, y_train)

d[f"KNN_PCA_miss_{mislabel_fraction:.1f}_{(1-split)*100:.0f}_{split*100:.0f}"] = KNN_PCA(X_train, X_test, y_train_noise, y_test)
d[f"KNN_Feat_miss_{mislabel_fraction:.1f}_{(1-split)*100:.0f}_{split*100:.0f}"] = KNN_features(X_train, X_test, y_train_noise, y_test)
d[f"SVC_PCA_miss_{mislabel_fraction:.1f}_{(1-split)*100:.0f}_{split*100:.0f}"] = SVC_PCA(X_train, X_test, y_train_noise, y_test)
d[f"SVC_Feat_miss_{mislabel_fraction:.1f}_{(1-split)*100:.0f}_{split*100:.0f}"] = SVC_features(X_train, X_test, y_train_noise, y_test)
d[f"LR_PCA_miss_{mislabel_fraction:.1f}_{(1-split)*100:.0f}_{split*100:.0f}"] = LR_PCA(X_train, X_test, y_train_noise, y_test)
d[f"LR_Feat_miss_{mislabel_fraction:.1f}_{(1-split)*100:.0f}_{split*100:.0f}"] = LR_features(X_train, X_test, y_train_noise, y_test)

100%|███████████████████████████████████████████| 30/30 [04:34<00:00,  9.14s/it]


KNN optimal number of PCA components: 9
Cross val err:  0.21394953564150965
Train err:  0.19922044174967524
Test err:  0.03287197231833905




100%|█████████████████████████████████████████| 100/100 [00:09<00:00, 10.41it/s]


KNN optimal number of features: 72
Most predictive features: Index(['V3', 'V18', 'V29', 'V30', 'V68', 'V87', 'V193', 'V200', 'V217', 'V227',
       'V289', 'V307', 'V308', 'V350', 'V394', 'V462', 'V464', 'V475', 'V494',
       'V507', 'V539', 'V541', 'V568', 'V578', 'V600', 'V627', 'V657', 'V663',
       'V673', 'V687', 'V730', 'V845', 'V982', 'V1033', 'V1066', 'V1097',
       'V1098', 'V1101', 'V1102', 'V1152', 'V1193', 'V1218', 'V1256', 'V1263',
       'V1290', 'V1300', 'V1478', 'V1517', 'V1530', 'V1533', 'V1535', 'V1549',
       'V1575', 'V1635', 'V1644', 'V1654', 'V1673', 'V1697', 'V1702', 'V1731',
       'V1744', 'V1772', 'V1787', 'V1799', 'V1812', 'V1827', 'V1829', 'V1846',
       'V1871', 'V1882', 'V1936', 'V1971'],
      dtype='object')
Cross val err:  0.21785127381656666
Train err:  0.20398440883499347
Test err:  0.01384083044982698


100%|███████████████████████████████████████████| 30/30 [04:50<00:00,  9.69s/it]


SVC optimal number of PCA components: 11
Cross val err:  0.2061554497563175
Train err:  0.20398440883499347
Test err:  0.00865051903114189


100%|█████████████████████████████████████████| 100/100 [01:11<00:00,  1.40it/s]


SVC optimal number of features: 63
Most predictive features: Index(['V3', 'V18', 'V29', 'V30', 'V68', 'V87', 'V289', 'V307', 'V308', 'V350',
       'V462', 'V464', 'V475', 'V494', 'V507', 'V539', 'V541', 'V568', 'V578',
       'V600', 'V627', 'V673', 'V687', 'V845', 'V982', 'V1033', 'V1066',
       'V1097', 'V1098', 'V1101', 'V1102', 'V1152', 'V1193', 'V1218', 'V1256',
       'V1263', 'V1300', 'V1478', 'V1517', 'V1530', 'V1533', 'V1535', 'V1549',
       'V1575', 'V1635', 'V1644', 'V1654', 'V1673', 'V1697', 'V1702', 'V1731',
       'V1744', 'V1772', 'V1787', 'V1799', 'V1812', 'V1827', 'V1829', 'V1846',
       'V1871', 'V1882', 'V1936', 'V1971'],
      dtype='object')




Cross val err:  0.20658741114272572
Train err:  0.20138588133391078
Test err:  0.0017301038062284002


100%|███████████████████████████████████████████| 30/30 [04:50<00:00,  9.70s/it]


KNN optimal number of PCA components: 23
Cross val err:  0.20702218966861052
Train err:  0.20355132091814643
Test err:  0.00692041522491349


100%|█████████████████████████████████████████| 100/100 [01:36<00:00,  1.04it/s]


LR optimal number of features: 48
Most predictive features: Index(['V3', 'V18', 'V29', 'V30', 'V68', 'V87', 'V289', 'V307', 'V350', 'V462',
       'V475', 'V494', 'V507', 'V539', 'V578', 'V627', 'V673', 'V687', 'V845',
       'V982', 'V1066', 'V1097', 'V1098', 'V1101', 'V1152', 'V1193', 'V1218',
       'V1256', 'V1263', 'V1478', 'V1517', 'V1533', 'V1549', 'V1644', 'V1654',
       'V1673', 'V1697', 'V1744', 'V1772', 'V1787', 'V1799', 'V1812', 'V1829',
       'V1846', 'V1871', 'V1882', 'V1936', 'V1971'],
      dtype='object')
Cross val err:  0.21179254584894502
Train err:  0.2087483759203118
Test err:  0.00519031141868509




In [18]:
##Mislabel fraction 0.2, 90/10 split
mislabel_fraction = 0.2
split = 0.9
X_train, X_test, y_train, y_test = pre_process(df, labels_df, split)
y_train_noise = mislabel(mislabel_fraction, y_train)

d[f"KNN_PCA_miss_{mislabel_fraction:.1f}_{(1-split)*100:.0f}_{split*100:.0f}"] = KNN_PCA(X_train, X_test, y_train_noise, y_test)
d[f"KNN_Feat_miss_{mislabel_fraction:.1f}_{(1-split)*100:.0f}_{split*100:.0f}"] = KNN_features(X_train, X_test, y_train_noise, y_test)
d[f"SVC_PCA_miss_{mislabel_fraction:.1f}_{(1-split)*100:.0f}_{split*100:.0f}"] = SVC_PCA(X_train, X_test, y_train_noise, y_test)
d[f"SVC_Feat_miss_{mislabel_fraction:.1f}_{(1-split)*100:.0f}_{split*100:.0f}"] = SVC_features(X_train, X_test, y_train_noise, y_test)
d[f"LR_PCA_miss_{mislabel_fraction:.1f}_{(1-split)*100:.0f}_{split*100:.0f}"] = LR_PCA(X_train, X_test, y_train_noise, y_test)
d[f"LR_Feat_miss_{mislabel_fraction:.1f}_{(1-split)*100:.0f}_{split*100:.0f}"] = LR_features(X_train, X_test, y_train_noise, y_test)

100%|███████████████████████████████████████████| 30/30 [04:42<00:00,  9.43s/it]


KNN optimal number of PCA components: 21
Cross val err:  0.214782866459167
Train err:  0.20438799076212466
Test err:  0.02076124567474047




100%|█████████████████████████████████████████| 100/100 [00:11<00:00,  8.35it/s]


KNN optimal number of features: 72
Most predictive features: Index(['V3', 'V18', 'V29', 'V30', 'V64', 'V68', 'V87', 'V200', 'V217', 'V289',
       'V307', 'V308', 'V350', 'V462', 'V464', 'V475', 'V494', 'V507', 'V539',
       'V544', 'V568', 'V578', 'V600', 'V627', 'V657', 'V663', 'V673', 'V687',
       'V691', 'V730', 'V803', 'V845', 'V855', 'V982', 'V1005', 'V1033',
       'V1066', 'V1071', 'V1097', 'V1098', 'V1101', 'V1152', 'V1193', 'V1218',
       'V1249', 'V1256', 'V1263', 'V1293', 'V1300', 'V1478', 'V1517', 'V1530',
       'V1533', 'V1549', 'V1572', 'V1635', 'V1654', 'V1673', 'V1697', 'V1719',
       'V1744', 'V1772', 'V1787', 'V1799', 'V1812', 'V1827', 'V1829', 'V1846',
       'V1871', 'V1882', 'V1936', 'V1971'],
      dtype='object')
Cross val err:  0.2174803616422113
Train err:  0.2005388760585065
Test err:  0.02768166089965396


100%|███████████████████████████████████████████| 30/30 [05:32<00:00, 11.07s/it]


SVC optimal number of PCA components: 9
Cross val err:  0.20554616866755582
Train err:  0.2028483448806775
Test err:  0.01384083044982698


100%|█████████████████████████████████████████| 100/100 [01:27<00:00,  1.14it/s]


SVC optimal number of features: 97
Most predictive features: Index(['V3', 'V18', 'V29', 'V30', 'V35', 'V63', 'V64', 'V68', 'V72', 'V87',
       'V178', 'V193', 'V200', 'V217', 'V227', 'V289', 'V307', 'V308', 'V350',
       'V362', 'V392', 'V394', 'V462', 'V464', 'V475', 'V494', 'V507', 'V539',
       'V541', 'V544', 'V568', 'V578', 'V600', 'V627', 'V657', 'V663', 'V673',
       'V687', 'V691', 'V730', 'V803', 'V845', 'V855', 'V889', 'V959', 'V982',
       'V1005', 'V1033', 'V1066', 'V1071', 'V1097', 'V1098', 'V1101', 'V1102',
       'V1147', 'V1152', 'V1173', 'V1193', 'V1203', 'V1206', 'V1218', 'V1249',
       'V1256', 'V1263', 'V1286', 'V1293', 'V1300', 'V1443', 'V1478', 'V1517',
       'V1530', 'V1533', 'V1535', 'V1549', 'V1572', 'V1575', 'V1635', 'V1644',
       'V1654', 'V1658', 'V1673', 'V1697', 'V1702', 'V1719', 'V1744', 'V1772',
       'V1787', 'V1799', 'V1812', 'V1827', 'V1829', 'V1846', 'V1871', 'V1882',
       'V1931', 'V1936', 'V1971'],
      dtype='object')




Cross val err:  0.20593374833259226
Train err:  0.2013086989992302
Test err:  0.01384083044982698


100%|███████████████████████████████████████████| 30/30 [04:56<00:00,  9.89s/it]


KNN optimal number of PCA components: 28
Cross val err:  0.20708685341633326
Train err:  0.20592763664357194
Test err:  0.00692041522491349


100%|█████████████████████████████████████████| 100/100 [01:43<00:00,  1.03s/it]


LR optimal number of features: 71
Most predictive features: Index(['V3', 'V18', 'V29', 'V30', 'V64', 'V68', 'V87', 'V200', 'V217', 'V289',
       'V307', 'V308', 'V350', 'V462', 'V464', 'V475', 'V494', 'V507', 'V539',
       'V544', 'V568', 'V578', 'V600', 'V627', 'V657', 'V663', 'V673', 'V687',
       'V730', 'V803', 'V845', 'V855', 'V982', 'V1005', 'V1033', 'V1066',
       'V1071', 'V1097', 'V1098', 'V1101', 'V1152', 'V1193', 'V1218', 'V1249',
       'V1256', 'V1263', 'V1293', 'V1300', 'V1478', 'V1517', 'V1530', 'V1533',
       'V1549', 'V1572', 'V1635', 'V1654', 'V1673', 'V1697', 'V1719', 'V1744',
       'V1772', 'V1787', 'V1799', 'V1812', 'V1827', 'V1829', 'V1846', 'V1871',
       'V1882', 'V1936', 'V1971'],
      dtype='object')
Cross val err:  0.2167066844523492
Train err:  0.20477290223248656
Test err:  0.02768166089965396




In [19]:
##Mislabel fraction 0.5, 70/30 split
mislabel_fraction = 0.5
split = 0.7
X_train, X_test, y_train, y_test = pre_process(df, labels_df, split)
y_train_noise = mislabel(mislabel_fraction, y_train)

d[f"KNN_PCA_miss_{mislabel_fraction:.1f}_{(1-split)*100:.0f}_{split*100:.0f}"] = KNN_PCA(X_train, X_test, y_train_noise, y_test)
d[f"KNN_Feat_miss_{mislabel_fraction:.1f}_{(1-split)*100:.0f}_{split*100:.0f}"] = KNN_features(X_train, X_test, y_train_noise, y_test)
d[f"SVC_PCA_miss_{mislabel_fraction:.1f}_{(1-split)*100:.0f}_{split*100:.0f}"] = SVC_PCA(X_train, X_test, y_train_noise, y_test)
d[f"SVC_Feat_miss_{mislabel_fraction:.1f}_{(1-split)*100:.0f}_{split*100:.0f}"] = SVC_features(X_train, X_test, y_train_noise, y_test)
d[f"LR_PCA_miss_{mislabel_fraction:.1f}_{(1-split)*100:.0f}_{split*100:.0f}"] = LR_PCA(X_train, X_test, y_train_noise, y_test)
d[f"LR_Feat_miss_{mislabel_fraction:.1f}_{(1-split)*100:.0f}_{split*100:.0f}"] = LR_features(X_train, X_test, y_train_noise, y_test)

100%|███████████████████████████████████████████| 30/30 [03:45<00:00,  7.52s/it]


KNN optimal number of PCA components: 6
Cross val err:  0.5836633663366337
Train err:  0.4346534653465347
Test err:  0.23068050749711644




100%|█████████████████████████████████████████| 100/100 [00:08<00:00, 11.74it/s]


KNN optimal number of features: 96
Most predictive features: Index(['V3', 'V18', 'V30', 'V64', 'V68', 'V77', 'V87', 'V91', 'V96', 'V177',
       'V193', 'V200', 'V217', 'V274', 'V289', 'V307', 'V308', 'V350', 'V392',
       'V394', 'V462', 'V464', 'V475', 'V493', 'V494', 'V507', 'V539', 'V541',
       'V568', 'V578', 'V614', 'V627', 'V687', 'V691', 'V759', 'V845', 'V861',
       'V871', 'V914', 'V922', 'V982', 'V1012', 'V1015', 'V1033', 'V1066',
       'V1097', 'V1098', 'V1101', 'V1126', 'V1147', 'V1152', 'V1193', 'V1198',
       'V1218', 'V1230', 'V1256', 'V1263', 'V1286', 'V1295', 'V1300', 'V1315',
       'V1351', 'V1397', 'V1443', 'V1478', 'V1517', 'V1519', 'V1530', 'V1533',
       'V1535', 'V1549', 'V1572', 'V1575', 'V1644', 'V1646', 'V1654', 'V1658',
       'V1660', 'V1667', 'V1673', 'V1697', 'V1702', 'V1719', 'V1731', 'V1744',
       'V1772', 'V1787', 'V1799', 'V1812', 'V1813', 'V1829', 'V1846', 'V1882',
       'V1936', 'V1971', 'V1999'],
      dtype='object')
Cross val err:  0.5

100%|███████████████████████████████████████████| 30/30 [04:25<00:00,  8.85s/it]


SVC optimal number of PCA components: 7
Cross val err:  0.504950495049505
Train err:  0.501980198019802
Test err:  0.019607843137254943


100%|█████████████████████████████████████████| 100/100 [01:29<00:00,  1.12it/s]


SVC optimal number of features: 17
Most predictive features: Index(['V3', 'V18', 'V68', 'V87', 'V308', 'V494', 'V539', 'V627', 'V1101',
       'V1193', 'V1256', 'V1654', 'V1744', 'V1787', 'V1846', 'V1936', 'V1971'],
      dtype='object')




Cross val err:  0.504950495049505
Train err:  0.5014851485148515
Test err:  0.035755478662053086


100%|███████████████████████████████████████████| 30/30 [04:00<00:00,  8.00s/it]


KNN optimal number of PCA components: 6
Cross val err:  0.504950495049505
Train err:  0.5054455445544555
Test err:  0.035755478662053086


100%|█████████████████████████████████████████| 100/100 [01:18<00:00,  1.28it/s]

LR optimal number of features: 16
Most predictive features: Index(['V3', 'V18', 'V68', 'V87', 'V308', 'V539', 'V627', 'V1101', 'V1193',
       'V1256', 'V1654', 'V1744', 'V1787', 'V1846', 'V1936', 'V1971'],
      dtype='object')
Cross val err:  0.5113861386138614
Train err:  0.5079207920792079
Test err:  0.056516724336793556





In [20]:
##Mislabel fraction 0.5, 80/20 split
mislabel_fraction = 0.5
split = 0.8
X_train, X_test, y_train, y_test = pre_process(df, labels_df, split)
y_train_noise = mislabel(mislabel_fraction, y_train)

d[f"KNN_PCA_miss_{mislabel_fraction:.1f}_{(1-split)*100:.0f}_{split*100:.0f}"] = KNN_PCA(X_train, X_test, y_train_noise, y_test)
d[f"KNN_Feat_miss_{mislabel_fraction:.1f}_{(1-split)*100:.0f}_{split*100:.0f}"] = KNN_features(X_train, X_test, y_train_noise, y_test)
d[f"SVC_PCA_miss_{mislabel_fraction:.1f}_{(1-split)*100:.0f}_{split*100:.0f}"] = SVC_PCA(X_train, X_test, y_train_noise, y_test)
d[f"SVC_Feat_miss_{mislabel_fraction:.1f}_{(1-split)*100:.0f}_{split*100:.0f}"] = SVC_features(X_train, X_test, y_train_noise, y_test)
d[f"LR_PCA_miss_{mislabel_fraction:.1f}_{(1-split)*100:.0f}_{split*100:.0f}"] = LR_PCA(X_train, X_test, y_train_noise, y_test)
d[f"LR_Feat_miss_{mislabel_fraction:.1f}_{(1-split)*100:.0f}_{split*100:.0f}"] = LR_features(X_train, X_test, y_train_noise, y_test)

100%|███████████████████████████████████████████| 30/30 [32:47<00:00, 65.58s/it]


KNN optimal number of PCA components: 5
Cross val err:  0.587707881417209
Train err:  0.43958423559982673
Test err:  0.25951557093425603




100%|█████████████████████████████████████████| 100/100 [00:10<00:00,  9.49it/s]


KNN optimal number of features: 41
Most predictive features: Index(['V3', 'V18', 'V68', 'V87', 'V289', 'V308', 'V350', 'V475', 'V507',
       'V539', 'V578', 'V627', 'V673', 'V687', 'V845', 'V1033', 'V1066',
       'V1097', 'V1098', 'V1101', 'V1147', 'V1193', 'V1218', 'V1256', 'V1263',
       'V1286', 'V1478', 'V1533', 'V1535', 'V1549', 'V1654', 'V1673', 'V1697',
       'V1744', 'V1787', 'V1799', 'V1812', 'V1846', 'V1882', 'V1936', 'V1971'],
      dtype='object')
Cross val err:  0.5894272755444121
Train err:  0.4525768731052404
Test err:  0.22145328719723179


100%|███████████████████████████████████████████| 30/30 [04:44<00:00,  9.48s/it]


SVC optimal number of PCA components: 8
Cross val err:  0.5080222741827948
Train err:  0.5023819835426592
Test err:  0.01903114186851207


100%|█████████████████████████████████████████| 100/100 [01:59<00:00,  1.20s/it]


SVC optimal number of features: 40
Most predictive features: Index(['V3', 'V18', 'V68', 'V87', 'V289', 'V308', 'V350', 'V475', 'V507',
       'V539', 'V578', 'V627', 'V673', 'V687', 'V845', 'V1033', 'V1066',
       'V1097', 'V1098', 'V1101', 'V1193', 'V1218', 'V1256', 'V1263', 'V1286',
       'V1478', 'V1533', 'V1535', 'V1549', 'V1654', 'V1673', 'V1697', 'V1744',
       'V1787', 'V1799', 'V1812', 'V1846', 'V1882', 'V1936', 'V1971'],
      dtype='object')




Cross val err:  0.5093190973885118
Train err:  0.5010827197921178
Test err:  0.03114186851211076


100%|███████████████████████████████████████████| 30/30 [04:24<00:00,  8.83s/it]


KNN optimal number of PCA components: 12
Cross val err:  0.5093228535744804
Train err:  0.5049805110437419
Test err:  0.02595155709342556


100%|█████████████████████████████████████████| 100/100 [01:32<00:00,  1.08it/s]

LR optimal number of features: 14
Most predictive features: Index(['V3', 'V18', 'V68', 'V289', 'V539', 'V627', 'V1101', 'V1193', 'V1549',
       'V1654', 'V1673', 'V1744', 'V1787', 'V1936'],
      dtype='object')
Cross val err:  0.5162511385938717
Train err:  0.5119099177132957
Test err:  0.0709342560553633





In [21]:
##Mislabel fraction 0.5, 90/10 split
mislabel_fraction = 0.5
split = 0.9
X_train, X_test, y_train, y_test = pre_process(df, labels_df, split)
y_train_noise = mislabel(mislabel_fraction, y_train)

d[f"KNN_PCA_miss_{mislabel_fraction:.1f}_{(1-split)*100:.0f}_{split*100:.0f}"] = KNN_PCA(X_train, X_test, y_train_noise, y_test)
d[f"KNN_Feat_miss_{mislabel_fraction:.1f}_{(1-split)*100:.0f}_{split*100:.0f}"] = KNN_features(X_train, X_test, y_train_noise, y_test)
d[f"SVC_PCA_miss_{mislabel_fraction:.1f}_{(1-split)*100:.0f}_{split*100:.0f}"] = SVC_PCA(X_train, X_test, y_train_noise, y_test)
d[f"SVC_Feat_miss_{mislabel_fraction:.1f}_{(1-split)*100:.0f}_{split*100:.0f}"] = SVC_features(X_train, X_test, y_train_noise, y_test)
d[f"LR_PCA_miss_{mislabel_fraction:.1f}_{(1-split)*100:.0f}_{split*100:.0f}"] = LR_PCA(X_train, X_test, y_train_noise, y_test)
d[f"LR_Feat_miss_{mislabel_fraction:.1f}_{(1-split)*100:.0f}_{split*100:.0f}"] = LR_features(X_train, X_test, y_train_noise, y_test)

100%|███████████████████████████████████████████| 30/30 [04:44<00:00,  9.48s/it]


KNN optimal number of PCA components: 3
Cross val err:  0.5935423151030087
Train err:  0.4526558891454965
Test err:  0.2802768166089965




100%|█████████████████████████████████████████| 100/100 [00:12<00:00,  7.92it/s]


KNN optimal number of features: 23
Most predictive features: Index(['V18', 'V68', 'V350', 'V475', 'V539', 'V627', 'V982', 'V1066', 'V1097',
       'V1101', 'V1193', 'V1256', 'V1517', 'V1533', 'V1572', 'V1635', 'V1654',
       'V1673', 'V1744', 'V1787', 'V1799', 'V1846', 'V1936'],
      dtype='object')
Cross val err:  0.5804468652734548
Train err:  0.4588144726712856
Test err:  0.24913494809688586


100%|███████████████████████████████████████████| 30/30 [05:28<00:00, 10.96s/it]


SVC optimal number of PCA components: 13
Cross val err:  0.5042418852823477
Train err:  0.5003849114703618
Test err:  0.00692041522491349


100%|█████████████████████████████████████████| 100/100 [02:31<00:00,  1.52s/it]


SVC optimal number of features: 66
Most predictive features: Index(['V3', 'V18', 'V29', 'V30', 'V68', 'V77', 'V87', 'V178', 'V193', 'V200',
       'V278', 'V289', 'V307', 'V350', 'V475', 'V494', 'V539', 'V605', 'V627',
       'V663', 'V664', 'V673', 'V845', 'V855', 'V959', 'V982', 'V1024',
       'V1028', 'V1033', 'V1066', 'V1078', 'V1097', 'V1101', 'V1126', 'V1193',
       'V1218', 'V1249', 'V1256', 'V1263', 'V1286', 'V1290', 'V1300', 'V1399',
       'V1478', 'V1517', 'V1533', 'V1535', 'V1549', 'V1572', 'V1635', 'V1644',
       'V1654', 'V1673', 'V1697', 'V1744', 'V1767', 'V1772', 'V1787', 'V1799',
       'V1827', 'V1846', 'V1871', 'V1882', 'V1931', 'V1936', 'V1971'],
      dtype='object')




Cross val err:  0.5046272417370684
Train err:  0.4992301770592764
Test err:  0.01038062283737029


100%|███████████████████████████████████████████| 30/30 [04:51<00:00,  9.71s/it]


KNN optimal number of PCA components: 24
Cross val err:  0.5065518008003557
Train err:  0.5057736720554272
Test err:  0.02076124567474047


100%|█████████████████████████████████████████| 100/100 [01:45<00:00,  1.05s/it]


LR optimal number of features: 47
Most predictive features: Index(['V3', 'V18', 'V29', 'V68', 'V87', 'V178', 'V278', 'V289', 'V350',
       'V475', 'V494', 'V539', 'V627', 'V673', 'V845', 'V959', 'V982', 'V1028',
       'V1033', 'V1066', 'V1097', 'V1101', 'V1193', 'V1218', 'V1256', 'V1263',
       'V1290', 'V1300', 'V1399', 'V1478', 'V1517', 'V1533', 'V1549', 'V1572',
       'V1635', 'V1644', 'V1654', 'V1673', 'V1697', 'V1744', 'V1772', 'V1787',
       'V1799', 'V1846', 'V1871', 'V1882', 'V1936'],
      dtype='object')
Cross val err:  0.521562175781829
Train err:  0.5084680523479599
Test err:  0.04152249134948094




In [22]:
##Mislabel fraction 0.9, 70/30 split
mislabel_fraction = 0.9
split = 0.7
X_train, X_test, y_train, y_test = pre_process(df, labels_df, split)
y_train_noise = mislabel(mislabel_fraction, y_train)

d[f"KNN_PCA_miss_{mislabel_fraction:.1f}_{(1-split)*100:.0f}_{split*100:.0f}"] = KNN_PCA(X_train, X_test, y_train_noise, y_test)
d[f"KNN_Feat_miss_{mislabel_fraction:.1f}_{(1-split)*100:.0f}_{split*100:.0f}"] = KNN_features(X_train, X_test, y_train_noise, y_test)
d[f"SVC_PCA_miss_{mislabel_fraction:.1f}_{(1-split)*100:.0f}_{split*100:.0f}"] = SVC_PCA(X_train, X_test, y_train_noise, y_test)
d[f"SVC_Feat_miss_{mislabel_fraction:.1f}_{(1-split)*100:.0f}_{split*100:.0f}"] = SVC_features(X_train, X_test, y_train_noise, y_test)
d[f"LR_PCA_miss_{mislabel_fraction:.1f}_{(1-split)*100:.0f}_{split*100:.0f}"] = LR_PCA(X_train, X_test, y_train_noise, y_test)
d[f"LR_Feat_miss_{mislabel_fraction:.1f}_{(1-split)*100:.0f}_{split*100:.0f}"] = LR_features(X_train, X_test, y_train_noise, y_test)

100%|███████████████████████████████████████████| 30/30 [03:55<00:00,  7.85s/it]


KNN optimal number of PCA components: 1
Cross val err:  0.8257425742574257
Train err:  0.555940594059406
Test err:  0.889273356401384




100%|█████████████████████████████████████████| 100/100 [00:09<00:00, 11.06it/s]


KNN optimal number of features: 40
Most predictive features: Index(['V3', 'V68', 'V75', 'V77', 'V111', 'V187', 'V201', 'V289', 'V350',
       'V541', 'V745', 'V750', 'V795', 'V1015', 'V1017', 'V1089', 'V1101',
       'V1147', 'V1191', 'V1193', 'V1218', 'V1225', 'V1256', 'V1286', 'V1397',
       'V1530', 'V1549', 'V1635', 'V1637', 'V1646', 'V1654', 'V1667', 'V1702',
       'V1738', 'V1772', 'V1787', 'V1846', 'V1862', 'V1882', 'V1971'],
      dtype='object')
Cross val err:  0.810891089108911
Train err:  0.550990099009901
Test err:  0.9123414071510957


100%|███████████████████████████████████████████| 30/30 [04:35<00:00,  9.19s/it]


SVC optimal number of PCA components: 2
Cross val err:  0.8004950495049505
Train err:  0.7900990099009901
Test err:  0.9953863898500577


100%|█████████████████████████████████████████| 100/100 [01:44<00:00,  1.04s/it]


SVC optimal number of features: 2
Most predictive features: Index(['V201', 'V1654'], dtype='object')




Cross val err:  0.7876237623762377
Train err:  0.7806930693069307
Test err:  0.9700115340253749


100%|███████████████████████████████████████████| 30/30 [03:50<00:00,  7.68s/it]


KNN optimal number of PCA components: 3
Cross val err:  0.8014851485148515
Train err:  0.7930693069306931
Test err:  1.0


100%|█████████████████████████████████████████| 100/100 [01:35<00:00,  1.05it/s]

LR optimal number of features: 3
Most predictive features: Index(['V201', 'V795', 'V1654'], dtype='object')
Cross val err:  0.7871287128712872
Train err:  0.7876237623762377
Test err:  0.9746251441753172





In [23]:
##Mislabel fraction 0.9, 80/20 split
mislabel_fraction = 0.9
split = 0.8
X_train, X_test, y_train, y_test = pre_process(df, labels_df, split)
y_train_noise = mislabel(mislabel_fraction, y_train)

d[f"KNN_PCA_miss_{mislabel_fraction:.1f}_{(1-split)*100:.0f}_{split*100:.0f}"] = KNN_PCA(X_train, X_test, y_train_noise, y_test)
d[f"KNN_Feat_miss_{mislabel_fraction:.1f}_{(1-split)*100:.0f}_{split*100:.0f}"] = KNN_features(X_train, X_test, y_train_noise, y_test)
d[f"SVC_PCA_miss_{mislabel_fraction:.1f}_{(1-split)*100:.0f}_{split*100:.0f}"] = SVC_PCA(X_train, X_test, y_train_noise, y_test)
d[f"SVC_Feat_miss_{mislabel_fraction:.1f}_{(1-split)*100:.0f}_{split*100:.0f}"] = SVC_features(X_train, X_test, y_train_noise, y_test)
d[f"LR_PCA_miss_{mislabel_fraction:.1f}_{(1-split)*100:.0f}_{split*100:.0f}"] = LR_PCA(X_train, X_test, y_train_noise, y_test)
d[f"LR_Feat_miss_{mislabel_fraction:.1f}_{(1-split)*100:.0f}_{split*100:.0f}"] = LR_features(X_train, X_test, y_train_noise, y_test)

100%|███████████████████████████████████████████| 30/30 [04:24<00:00,  8.83s/it]


KNN optimal number of PCA components: 21
Cross val err:  0.799044989717441
Train err:  0.5599826764833261
Test err:  0.9117647058823529




100%|█████████████████████████████████████████| 100/100 [00:10<00:00,  9.18it/s]


KNN optimal number of features: 1
Most predictive features: Index(['V1549'], dtype='object')
Cross val err:  0.810304157158821
Train err:  0.5612819402338675
Test err:  0.8477508650519031


100%|███████████████████████████████████████████| 30/30 [05:27<00:00, 10.93s/it]


SVC optimal number of PCA components: 29
Cross val err:  0.8025128884131054
Train err:  0.6470333477695973
Test err:  1.0


100%|█████████████████████████████████████████| 100/100 [02:16<00:00,  1.36s/it]


SVC optimal number of features: 28
Most predictive features: Index(['V3', 'V24', 'V68', 'V87', 'V200', 'V289', 'V392', 'V413', 'V507',
       'V627', 'V871', 'V1012', 'V1089', 'V1101', 'V1193', 'V1256', 'V1293',
       'V1299', 'V1549', 'V1575', 'V1654', 'V1747', 'V1787', 'V1846', 'V1924',
       'V1931', 'V1971', 'V1999'],
      dtype='object')




Cross val err:  0.7968673409020481
Train err:  0.6652230402771763
Test err:  0.9982698961937716


100%|███████████████████████████████████████████| 30/30 [04:28<00:00,  8.95s/it]


KNN optimal number of PCA components: 5
Cross val err:  0.8033815064183828
Train err:  0.7899523603291468
Test err:  1.0


100%|█████████████████████████████████████████| 100/100 [01:42<00:00,  1.02s/it]


LR optimal number of features: 76
Most predictive features: Index(['V3', 'V24', 'V68', 'V77', 'V87', 'V91', 'V96', 'V98', 'V178', 'V200',
       'V201', 'V217', 'V227', 'V289', 'V314', 'V350', 'V381', 'V392', 'V413',
       'V418', 'V464', 'V507', 'V541', 'V555', 'V594', 'V627', 'V704', 'V728',
       'V795', 'V819', 'V822', 'V823', 'V832', 'V871', 'V982', 'V992', 'V1012',
       'V1028', 'V1040', 'V1050', 'V1089', 'V1095', 'V1101', 'V1149', 'V1152',
       'V1193', 'V1208', 'V1256', 'V1273', 'V1286', 'V1290', 'V1293', 'V1299',
       'V1443', 'V1497', 'V1530', 'V1533', 'V1536', 'V1549', 'V1575', 'V1615',
       'V1635', 'V1654', 'V1658', 'V1747', 'V1770', 'V1772', 'V1787', 'V1812',
       'V1846', 'V1882', 'V1884', 'V1924', 'V1931', 'V1971', 'V1999'],
      dtype='object')
Cross val err:  0.7791249964785757
Train err:  0.6829796448679082
Test err:  0.9584775086505191




In [24]:
##Mislabel fraction 0.9, 90/10 split
mislabel_fraction = 0.9
split = 0.9
X_train, X_test, y_train, y_test = pre_process(df, labels_df, split)
y_train_noise = mislabel(mislabel_fraction, y_train)

d[f"KNN_PCA_miss_{mislabel_fraction:.1f}_{(1-split)*100:.0f}_{split*100:.0f}"] = KNN_PCA(X_train, X_test, y_train_noise, y_test)
d[f"KNN_Feat_miss_{mislabel_fraction:.1f}_{(1-split)*100:.0f}_{split*100:.0f}"] = KNN_features(X_train, X_test, y_train_noise, y_test)
d[f"SVC_PCA_miss_{mislabel_fraction:.1f}_{(1-split)*100:.0f}_{split*100:.0f}"] = SVC_PCA(X_train, X_test, y_train_noise, y_test)
d[f"SVC_Feat_miss_{mislabel_fraction:.1f}_{(1-split)*100:.0f}_{split*100:.0f}"] = SVC_features(X_train, X_test, y_train_noise, y_test)
d[f"LR_PCA_miss_{mislabel_fraction:.1f}_{(1-split)*100:.0f}_{split*100:.0f}"] = LR_PCA(X_train, X_test, y_train_noise, y_test)
d[f"LR_Feat_miss_{mislabel_fraction:.1f}_{(1-split)*100:.0f}_{split*100:.0f}"] = LR_features(X_train, X_test, y_train_noise, y_test)

100%|███████████████████████████████████████████| 30/30 [04:49<00:00,  9.64s/it]


KNN optimal number of PCA components: 28
Cross val err:  0.8090810730695124
Train err:  0.5746728252501925
Test err:  0.8927335640138409




100%|█████████████████████████████████████████| 100/100 [00:13<00:00,  7.63it/s]


KNN optimal number of features: 4
Most predictive features: Index(['V68', 'V178', 'V1101', 'V1654'], dtype='object')
Cross val err:  0.8148517859789536
Train err:  0.5585065434949962
Test err:  0.8858131487889274


100%|███████████████████████████████████████████| 30/30 [06:14<00:00, 12.47s/it]


SVC optimal number of PCA components: 1
Cross val err:  0.8291003408922484
Train err:  0.8033102386451116
Test err:  0.9930795847750865


100%|█████████████████████████████████████████| 100/100 [02:51<00:00,  1.71s/it]


SVC optimal number of features: 18
Most predictive features: Index(['V68', 'V128', 'V178', 'V282', 'V295', 'V320', 'V394', 'V469', 'V915',
       'V1045', 'V1097', 'V1101', 'V1193', 'V1654', 'V1725', 'V1787', 'V1971',
       'V1999'],
      dtype='object')




Cross val err:  0.8117896843041352
Train err:  0.7090069284064665
Test err:  0.9930795847750865


100%|███████████████████████████████████████████| 30/30 [04:47<00:00,  9.59s/it]


KNN optimal number of PCA components: 2
Cross val err:  0.8152356602934638
Train err:  0.8148575827559661
Test err:  0.986159169550173


100%|█████████████████████████████████████████| 100/100 [01:44<00:00,  1.05s/it]


LR optimal number of features: 53
Most predictive features: Index(['V3', 'V24', 'V68', 'V87', 'V128', 'V147', 'V178', 'V282', 'V295',
       'V320', 'V394', 'V424', 'V469', 'V494', 'V627', 'V642', 'V667', 'V795',
       'V823', 'V855', 'V915', 'V959', 'V1017', 'V1038', 'V1045', 'V1097',
       'V1101', 'V1193', 'V1217', 'V1218', 'V1290', 'V1315', 'V1423', 'V1487',
       'V1549', 'V1589', 'V1644', 'V1654', 'V1725', 'V1738', 'V1778', 'V1787',
       'V1792', 'V1806', 'V1827', 'V1846', 'V1862', 'V1915', 'V1931', 'V1936',
       'V1962', 'V1971', 'V1999'],
      dtype='object')
Cross val err:  0.8002245442418853
Train err:  0.7340261739799846
Test err:  0.9896193771626297




In [25]:
#Save data of part 2
df_miss = pd.DataFrame(data =d, index = ['Train', 'Cross', 'Test', 'Opt_Feat_or_PCA'])
df_miss.to_csv('./data_miss.csv', sep=" ")

In [26]:
##Print data from both parts
td = pd.read_csv('./data.csv', sep = " ", header=0, index_col=0)
print(td)
td_miss= pd.read_csv('./data_miss.csv', sep = " ", header=0, index_col=0)
print(td_miss)



                 KNN_PCA_30_70  KNN_Feat_30_70  SVC_PCA_30_70  SVC_Feat_30_70  \
Train                 0.007921        0.007921       0.003465        0.001980   
Cross                 0.008416        0.008416       0.007921        0.006931   
Test                  0.009227        0.010381       0.008074        0.005767   
Opt_Feat_or_PCA      22.000000      100.000000      20.000000      100.000000   

                 LR_PCA_30_70  LR_Feat_30_70  KNN_PCA_20_80  KNN_Feat_20_80  \
Train                0.000000       0.000000       0.004331        0.004764   
Cross                0.008416       0.005446       0.004331        0.006928   
Test                 0.010381       0.004614       0.015571        0.015571   
Opt_Feat_or_PCA     18.000000      92.000000      30.000000       75.000000   

                 SVC_PCA_20_80  SVC_Feat_20_80  LR_PCA_20_80  LR_Feat_20_80  \
Train                 0.003898        0.002599       0.00000       0.000000   
Cross                 0.004764        0.