In [None]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import accuracy_score
from sklearn.pipeline import make_pipeline
from sklearn.decomposition import PCA
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC  
from sklearn.linear_model import LogisticRegression
from sklearn.feature_selection import SelectKBest, f_classif
from tqdm import tqdm

# PCA sweep settings. range() excludes its stop value, so the candidate
# component counts actually tried are 1 .. max_num_components - 1.
max_num_components = 25
num_components_range = range(1, max_num_components)

# Load the two space-separated tables; the first line of each file is used as
# the header row. `df` is later passed as the feature matrix and `labels_df`
# as the class labels.
df = pd.read_csv('Data/TCGAdata.txt', sep=" ", header=0)
labels_df = pd.read_csv('Data/TCGAlabels', sep=" ", header=0)

In [None]:
#Pre-processes the data by splitting and normalizing 
def pre_process(data, labels, train_size):
    """Split the data into train/test parts and standardize every sample (row).

    Parameters
    ----------
    data : pandas.DataFrame
        Feature table, one row per sample and one column per feature.
    labels : pandas.DataFrame, pandas.Series or array-like
        Class labels, one per row of ``data``; flattened to a 1-D array.
    train_size : float
        Fraction of the samples kept for training; the remaining
        ``1 - train_size`` fraction becomes the test set.

    Returns
    -------
    X_train, X_test : pandas.DataFrame
        Row-standardized feature tables. The original row index is not kept;
        rows are renumbered from 0.
    y_train, y_test : numpy.ndarray
        Labels matching the rows of ``X_train`` / ``X_test``.
    """
    # Use the arguments that were passed in rather than module-level globals,
    # so the function works on any data set handed to it.
    X_train, X_test, y_train, y_test = train_test_split(
        data, np.asarray(labels).ravel(), test_size=1-train_size, random_state=42
    )

    def _standardize_rows(frame):
        # StandardScaler works column-wise, so transpose to scale each sample
        # with its own mean/std, then transpose back. Every row only uses its
        # own statistics, so nothing is shared between train and test.
        scaled = StandardScaler().fit_transform(frame.transpose())
        return pd.DataFrame(np.transpose(scaled), columns=frame.columns)

    X_train = _standardize_rows(X_train)
    X_test = _standardize_rows(X_test)

    return X_train, X_test, y_train, y_test

In [None]:
##KNN PCA

def KNN_PCA(X_train, X_test, y_train, y_test):
    """Tune the PCA dimensionality for a 5-nearest-neighbour classifier.

    Every value in the module-level ``num_components_range`` is scored with
    5-fold cross-validation on the training data using a PCA -> KNN pipeline.
    The best-scoring number of components is then refit on the full training
    set, and the cross-validation, training and test errors are printed.

    Parameters
    ----------
    X_train, X_test : pandas.DataFrame
        Training and test feature tables.
    y_train, y_test : array-like
        Training and test labels.
    """
    candidates = list(num_components_range)
    KNN_mean_scores = np.zeros(len(candidates))

    # Scores are stored by position in `candidates`, so the best position maps
    # straight back to the component count that produced it.
    for position, n_components in enumerate(tqdm(candidates)):
        KNN_pipeline = make_pipeline(PCA(n_components=n_components), KNeighborsClassifier(n_neighbors=5))
        KNN_scores = cross_val_score(KNN_pipeline, X_train, y_train, cv=5)
        KNN_mean_scores[position] = KNN_scores.mean()

    # argmax returns the first maximum, i.e. the smallest best component count.
    best_position = int(np.argmax(KNN_mean_scores))
    KNN_optimal_n_components = candidates[best_position]
    cross_val_err = 1 - KNN_mean_scores[best_position]

    print("KNN optimal number of PCA components:", KNN_optimal_n_components)

    opt_pipeline = make_pipeline(PCA(n_components=KNN_optimal_n_components), KNeighborsClassifier(n_neighbors=5))

    opt_pipeline.fit(X_train, y_train)
    train_pred = opt_pipeline.predict(X_train)
    train_error = 1 - accuracy_score(y_train, train_pred)

    test_pred = opt_pipeline.predict(X_test)
    test_error = 1 - accuracy_score(y_test, test_pred)

    print("Cross val err: ", cross_val_err)
    print("Train err: ", train_error)
    print("Test err: ", test_error)
    print("\n")

In [None]:
##KNN features

def KNN_features(X_train, X_test, y_train, y_test):
    """Tune the number of ANOVA-selected features for a 5-NN classifier.

    For k = 1 .. 49 a SelectKBest(f_classif, k) -> KNN pipeline is scored with
    5-fold cross-validation on the training data. The best k is refit on the
    full training set; the selected feature names and the cross-validation,
    training and test errors are printed.

    Parameters
    ----------
    X_train, X_test : pandas.DataFrame
        Training and test feature tables (column names are reported).
    y_train, y_test : array-like
        Training and test labels.
    """
    max_num_features = 50
    candidates = list(range(1, max_num_features))
    KNN_mean_scores = np.zeros(len(candidates))

    # The selector lives inside the pipeline so it is refit on the training
    # part of every fold; selecting on all of X_train first would let the
    # validation folds influence which features are kept.
    for position, k in enumerate(tqdm(candidates)):
        KNN_pipeline = make_pipeline(SelectKBest(f_classif, k=k), KNeighborsClassifier(n_neighbors=5))
        KNN_scores = cross_val_score(KNN_pipeline, X_train, y_train, cv=5)
        KNN_mean_scores[position] = KNN_scores.mean()

    # Position -> candidate lookup keeps the reported k equal to the k that
    # actually achieved the best score (first maximum wins on ties).
    best_position = int(np.argmax(KNN_mean_scores))
    KNN_optimal_k_features = candidates[best_position]
    cross_val_err = 1 - KNN_mean_scores[best_position]

    print("KNN optimal number of features:", KNN_optimal_k_features)

    opt_pipeline = make_pipeline(SelectKBest(f_classif, k=KNN_optimal_k_features), KNeighborsClassifier(n_neighbors=5))
    opt_pipeline.fit(X_train, y_train)

    # Get the indices of the selected features (make_pipeline names each step
    # after its lower-cased class name)
    selected_feature_indices = opt_pipeline.named_steps["selectkbest"].get_support(indices=True)

    # Get the names of the most predictive features
    selected_features = X_train.columns[selected_feature_indices]
    print("Most predictive features:", selected_features)

    # The pipeline applies the same fitted selection to train and test data.
    train_pred = opt_pipeline.predict(X_train)
    train_error = 1 - accuracy_score(y_train, train_pred)

    test_pred = opt_pipeline.predict(X_test)
    test_error = 1 - accuracy_score(y_test, test_pred)

    print("Cross val err: ", cross_val_err)
    print("Train err: ", train_error)
    print("Test err: ", test_error)

In [None]:
##SVC PCA
def SVC_PCA(X_train, X_test, y_train, y_test):
    """Tune the PCA dimensionality for a default-parameter SVC.

    Every value in the module-level ``num_components_range`` is scored with
    5-fold cross-validation on the training data using a PCA -> SVC pipeline.
    The best-scoring number of components is then refit on the full training
    set, and the cross-validation, training and test errors are printed.

    Parameters
    ----------
    X_train, X_test : pandas.DataFrame
        Training and test feature tables.
    y_train, y_test : array-like
        Training and test labels.
    """
    candidates = list(num_components_range)
    SVC_mean_scores = np.zeros(len(candidates))

    # Scores are stored by position in `candidates`, so the best position maps
    # straight back to the component count that produced it.
    for position, n_components in enumerate(tqdm(candidates)):
        SVC_pipeline = make_pipeline(PCA(n_components=n_components), SVC())
        SVC_scores = cross_val_score(SVC_pipeline, X_train, y_train, cv=5)
        SVC_mean_scores[position] = SVC_scores.mean()

    # argmax returns the first maximum, i.e. the smallest best component count.
    best_position = int(np.argmax(SVC_mean_scores))
    SVC_optimal_n_components = candidates[best_position]
    cross_val_err = 1 - SVC_mean_scores[best_position]

    print("SVC optimal number of PCA components:", SVC_optimal_n_components)

    opt_pipeline = make_pipeline(PCA(n_components=SVC_optimal_n_components), SVC())

    opt_pipeline.fit(X_train, y_train)
    train_pred = opt_pipeline.predict(X_train)
    train_error = 1 - accuracy_score(y_train, train_pred)

    test_pred = opt_pipeline.predict(X_test)
    test_error = 1 - accuracy_score(y_test, test_pred)

    print("Cross val err: ", cross_val_err)
    print("Train err: ", train_error)
    print("Test err: ", test_error)

In [None]:
##SVC features

def SVC_features(X_train, X_test, y_train, y_test):
    """Tune the number of ANOVA-selected features for a default-parameter SVC.

    For k = 1 .. 49 a SelectKBest(f_classif, k) -> SVC pipeline is scored with
    5-fold cross-validation on the training data. The best k is refit on the
    full training set; the selected feature names and the cross-validation,
    training and test errors are printed.

    Parameters
    ----------
    X_train, X_test : pandas.DataFrame
        Training and test feature tables (column names are reported).
    y_train, y_test : array-like
        Training and test labels.
    """
    max_num_features = 50
    candidates = list(range(1, max_num_features))
    SVC_mean_scores = np.zeros(len(candidates))

    # The selector lives inside the pipeline so it is refit on the training
    # part of every fold; selecting on all of X_train first would let the
    # validation folds influence which features are kept.
    for position, k in enumerate(tqdm(candidates)):
        SVC_pipeline = make_pipeline(SelectKBest(f_classif, k=k), SVC())
        SVC_scores = cross_val_score(SVC_pipeline, X_train, y_train, cv=5)
        SVC_mean_scores[position] = SVC_scores.mean()

    # Position -> candidate lookup keeps the reported k equal to the k that
    # actually achieved the best score (first maximum wins on ties).
    best_position = int(np.argmax(SVC_mean_scores))
    SVC_optimal_k_features = candidates[best_position]
    cross_val_err = 1 - SVC_mean_scores[best_position]

    print("SVC optimal number of features:", SVC_optimal_k_features)

    opt_pipeline = make_pipeline(SelectKBest(f_classif, k=SVC_optimal_k_features), SVC())
    opt_pipeline.fit(X_train, y_train)

    # Get the indices of the selected features (make_pipeline names each step
    # after its lower-cased class name)
    selected_feature_indices = opt_pipeline.named_steps["selectkbest"].get_support(indices=True)

    # Get the names of the most predictive features
    selected_features = X_train.columns[selected_feature_indices]
    print("Most predictive features:", selected_features)

    # The pipeline applies the same fitted selection to train and test data.
    train_pred = opt_pipeline.predict(X_train)
    train_error = 1 - accuracy_score(y_train, train_pred)

    test_pred = opt_pipeline.predict(X_test)
    test_error = 1 - accuracy_score(y_test, test_pred)

    print("Cross val err: ", cross_val_err)
    print("Train err: ", train_error)
    print("Test err: ", test_error)

In [None]:
##Logistic regression PCA

def LR_PCA(X_train, X_test, y_train, y_test):
    """Tune the PCA dimensionality for a logistic-regression classifier.

    Every value in the module-level ``num_components_range`` is scored with
    5-fold cross-validation on the training data using a
    PCA -> LogisticRegression pipeline. The best-scoring number of components
    is then refit on the full training set, and the cross-validation, training
    and test errors are printed.

    Parameters
    ----------
    X_train, X_test : pandas.DataFrame
        Training and test feature tables.
    y_train, y_test : array-like
        Training and test labels.
    """
    candidates = list(num_components_range)
    LR_mean_scores = np.zeros(len(candidates))

    # Scores are stored by position in `candidates`, so the best position maps
    # straight back to the component count that produced it.
    for position, n_components in enumerate(tqdm(candidates)):
        LR_pipeline = make_pipeline(PCA(n_components=n_components), LogisticRegression(random_state=16, max_iter=10000))
        LR_scores = cross_val_score(LR_pipeline, X_train, y_train, cv=5)
        LR_mean_scores[position] = LR_scores.mean()

    # argmax returns the first maximum, i.e. the smallest best component count.
    best_position = int(np.argmax(LR_mean_scores))
    LR_optimal_n_components = candidates[best_position]
    cross_val_err = 1 - LR_mean_scores[best_position]

    # Label the result with the model that was actually tuned here.
    print("LR optimal number of PCA components:", LR_optimal_n_components)

    opt_pipeline = make_pipeline(PCA(n_components=LR_optimal_n_components), LogisticRegression(random_state=16, max_iter=10000))

    opt_pipeline.fit(X_train, y_train)
    train_pred = opt_pipeline.predict(X_train)
    train_error = 1 - accuracy_score(y_train, train_pred)

    test_pred = opt_pipeline.predict(X_test)
    test_error = 1 - accuracy_score(y_test, test_pred)

    print("Cross val err: ", cross_val_err)
    print("Train err: ", train_error)
    print("Test err: ", test_error)

In [None]:
##Logistic Regression features

def LR_features(X_train, X_test, y_train, y_test):
    """Tune the number of ANOVA-selected features for logistic regression.

    For k = 1 .. 99 a SelectKBest(f_classif, k) -> LogisticRegression pipeline
    is scored with 5-fold cross-validation on the training data. The best k is
    refit on the full training set; the selected feature names and the
    cross-validation, training and test errors are printed.

    Parameters
    ----------
    X_train, X_test : pandas.DataFrame
        Training and test feature tables (column names are reported).
    y_train, y_test : array-like
        Training and test labels.
    """
    max_num_features = 100
    candidates = list(range(1, max_num_features))
    LR_mean_scores = np.zeros(len(candidates))

    # The selector lives inside the pipeline so it is refit on the training
    # part of every fold; selecting on all of X_train first would let the
    # validation folds influence which features are kept.
    for position, k in enumerate(tqdm(candidates)):
        LR_pipeline = make_pipeline(SelectKBest(f_classif, k=k), LogisticRegression(random_state=16, max_iter=10000))
        LR_scores = cross_val_score(LR_pipeline, X_train, y_train, cv=5)
        LR_mean_scores[position] = LR_scores.mean()

    # Position -> candidate lookup keeps the reported k equal to the k that
    # actually achieved the best score (first maximum wins on ties).
    best_position = int(np.argmax(LR_mean_scores))
    LR_optimal_k_features = candidates[best_position]
    cross_val_err = 1 - LR_mean_scores[best_position]

    print("LR optimal number of features:", LR_optimal_k_features)

    opt_pipeline = make_pipeline(SelectKBest(f_classif, k=LR_optimal_k_features), LogisticRegression(random_state=16, max_iter=10000))
    opt_pipeline.fit(X_train, y_train)

    # Get the indices of the selected features (make_pipeline names each step
    # after its lower-cased class name)
    selected_feature_indices = opt_pipeline.named_steps["selectkbest"].get_support(indices=True)

    # Get the names of the most predictive features
    selected_features = X_train.columns[selected_feature_indices]
    print("Most predictive features:", selected_features)

    # The pipeline applies the same fitted selection to train and test data.
    train_pred = opt_pipeline.predict(X_train)
    train_error = 1 - accuracy_score(y_train, train_pred)

    test_pred = opt_pipeline.predict(X_test)
    test_error = 1 - accuracy_score(y_test, test_pred)

    print("Cross val err: ", cross_val_err)
    print("Train err: ", train_error)
    print("Test err: ", test_error)

In [None]:
##Run everything with 70/30 split
X_train, X_test, y_train, y_test = pre_process(df, labels_df, 0.7)

# Evaluate each model/reduction combination in turn on the same split.
for evaluate in (KNN_PCA, KNN_features, SVC_PCA, SVC_features, LR_PCA, LR_features):
    evaluate(X_train, X_test, y_train, y_test)

100%|████████████████████████████████████████████████████████████████████| 24/24 [02:58<00:00,  7.42s/it]


KNN optimal number of PCA components: 25
Cross val err:  0.006930693069306826
Train err:  0.00544554455445545
Test err:  0.011534025374855816




100%|████████████████████████████████████████████████████████████████████| 49/49 [00:04<00:00, 11.76it/s]


KNN optimal number of features: 48
Most predictive features: Index(['V3', 'V18', 'V29', 'V64', 'V68', 'V87', 'V289', 'V307', 'V308', 'V350',
       'V462', 'V475', 'V507', 'V539', 'V544', 'V627', 'V657', 'V687', 'V730',
       'V803', 'V845', 'V1005', 'V1066', 'V1071', 'V1097', 'V1098', 'V1101',
       'V1173', 'V1193', 'V1206', 'V1218', 'V1256', 'V1263', 'V1478', 'V1517',
       'V1533', 'V1654', 'V1673', 'V1697', 'V1719', 'V1744', 'V1787', 'V1799',
       'V1812', 'V1829', 'V1871', 'V1882', 'V1936'],
      dtype='object')
Cross val err:  0.011881188118811892
Train err:  0.008415841584158423
Test err:  0.011534025374855816


100%|████████████████████████████████████████████████████████████████████| 24/24 [02:57<00:00,  7.39s/it]


SVC optimal number of PCA components: 8
Cross val err:  0.005940594059405946
Train err:  0.002970297029702973
Test err:  0.009227220299884653


100%|████████████████████████████████████████████████████████████████████| 49/49 [00:07<00:00,  6.41it/s]


SVC optimal number of features: 42
Most predictive features: Index(['V3', 'V18', 'V29', 'V64', 'V68', 'V87', 'V289', 'V308', 'V350', 'V462',
       'V475', 'V507', 'V539', 'V627', 'V657', 'V687', 'V730', 'V803', 'V845',
       'V1005', 'V1071', 'V1097', 'V1098', 'V1101', 'V1173', 'V1193', 'V1206',
       'V1218', 'V1256', 'V1517', 'V1533', 'V1654', 'V1673', 'V1697', 'V1744',
       'V1787', 'V1799', 'V1812', 'V1829', 'V1871', 'V1882', 'V1936'],
      dtype='object')
Cross val err:  0.010396039603960405
Train err:  0.004950495049504955
Test err:  0.01038062283737029


100%|████████████████████████████████████████████████████████████████████| 24/24 [03:13<00:00,  8.08s/it]


KNN optimal number of PCA components: 20
Cross val err:  0.008415841584158423
Train err:  0.0
Test err:  0.0034602076124568004


100%|████████████████████████████████████████████████████████████████████| 99/99 [01:20<00:00,  1.23it/s]


LR optimal number of features: 100
Most predictive features: Index(['V3', 'V16', 'V18', 'V29', 'V30', 'V35', 'V63', 'V64', 'V68', 'V72',
       'V83', 'V87', 'V200', 'V217', 'V227', 'V274', 'V289', 'V307', 'V308',
       'V350', 'V394', 'V418', 'V462', 'V464', 'V475', 'V494', 'V507', 'V539',
       'V541', 'V544', 'V568', 'V578', 'V600', 'V627', 'V657', 'V658', 'V673',
       'V687', 'V691', 'V730', 'V803', 'V818', 'V845', 'V850', 'V855', 'V889',
       'V922', 'V982', 'V1005', 'V1033', 'V1066', 'V1071', 'V1097', 'V1098',
       'V1101', 'V1102', 'V1126', 'V1152', 'V1173', 'V1193', 'V1203', 'V1206',
       'V1218', 'V1234', 'V1249', 'V1256', 'V1263', 'V1290', 'V1293', 'V1443',
       'V1478', 'V1517', 'V1530', 'V1533', 'V1549', 'V1575', 'V1580', 'V1583',
       'V1635', 'V1654', 'V1657', 'V1673', 'V1697', 'V1719', 'V1744', 'V1772',
       'V1787', 'V1799', 'V1812', 'V1813', 'V1827', 'V1829', 'V1846', 'V1868',
       'V1871', 'V1877', 'V1881', 'V1882', 'V1936', 'V1971'],
      dtype='ob



In [None]:
##Run everything with 80/20 split
X_train, X_test, y_train, y_test = pre_process(df, labels_df, 0.8)

# Evaluate each model/reduction combination in turn on the same split.
for evaluate in (KNN_PCA, KNN_features, SVC_PCA, SVC_features, LR_PCA, LR_features):
    evaluate(X_train, X_test, y_train, y_test)

100%|████████████████████████████████████████████████████████████████████| 24/24 [03:10<00:00,  7.93s/it]


KNN optimal number of PCA components: 23
Cross val err:  0.006063423200082574
Train err:  0.0047639670853183436
Test err:  0.01211072664359858




100%|████████████████████████████████████████████████████████████████████| 49/49 [00:04<00:00,  9.91it/s]


KNN optimal number of features: 48
Most predictive features: Index(['V3', 'V18', 'V29', 'V64', 'V68', 'V87', 'V289', 'V307', 'V308', 'V350',
       'V462', 'V475', 'V507', 'V539', 'V544', 'V627', 'V657', 'V687', 'V730',
       'V803', 'V845', 'V1005', 'V1066', 'V1071', 'V1097', 'V1098', 'V1101',
       'V1173', 'V1193', 'V1206', 'V1218', 'V1256', 'V1263', 'V1478', 'V1517',
       'V1533', 'V1654', 'V1673', 'V1697', 'V1719', 'V1744', 'V1787', 'V1799',
       'V1812', 'V1829', 'V1871', 'V1882', 'V1936'],
      dtype='object')
Cross val err:  0.010395244668563564
Train err:  0.008661758336942382
Test err:  0.01384083044982698


100%|████████████████████████████████████████████████████████████████████| 24/24 [03:10<00:00,  7.95s/it]


SVC optimal number of PCA components: 23
Cross val err:  0.005198561380773903
Train err:  0.0017323516673884987
Test err:  0.01384083044982698


100%|████████████████████████████████████████████████████████████████████| 49/49 [00:08<00:00,  5.45it/s]


SVC optimal number of features: 44
Most predictive features: Index(['V3', 'V18', 'V29', 'V64', 'V68', 'V87', 'V289', 'V307', 'V308', 'V462',
       'V475', 'V507', 'V539', 'V544', 'V627', 'V657', 'V687', 'V730', 'V803',
       'V845', 'V1005', 'V1071', 'V1097', 'V1098', 'V1101', 'V1173', 'V1193',
       'V1206', 'V1218', 'V1256', 'V1263', 'V1517', 'V1533', 'V1654', 'V1673',
       'V1697', 'V1744', 'V1787', 'V1799', 'V1812', 'V1829', 'V1871', 'V1882',
       'V1936'],
      dtype='object')
Cross val err:  0.007363063545276161
Train err:  0.004330879168471191
Test err:  0.01211072664359858


100%|████████████████████████████████████████████████████████████████████| 24/24 [04:30<00:00, 11.27s/it]


KNN optimal number of PCA components: 19
Cross val err:  0.005198561380773903
Train err:  0.0
Test err:  0.00519031141868509


100%|████████████████████████████████████████████████████████████████████| 99/99 [01:30<00:00,  1.09it/s]


LR optimal number of features: 94
Most predictive features: Index(['V3', 'V16', 'V18', 'V29', 'V30', 'V35', 'V63', 'V64', 'V68', 'V72',
       'V87', 'V217', 'V227', 'V274', 'V289', 'V307', 'V308', 'V350', 'V394',
       'V418', 'V462', 'V464', 'V475', 'V494', 'V507', 'V539', 'V544', 'V568',
       'V600', 'V627', 'V657', 'V658', 'V673', 'V687', 'V691', 'V730', 'V803',
       'V818', 'V845', 'V850', 'V855', 'V889', 'V922', 'V982', 'V1005',
       'V1033', 'V1066', 'V1071', 'V1078', 'V1097', 'V1098', 'V1101', 'V1102',
       'V1152', 'V1173', 'V1193', 'V1203', 'V1206', 'V1218', 'V1234', 'V1249',
       'V1256', 'V1263', 'V1293', 'V1443', 'V1478', 'V1517', 'V1530', 'V1533',
       'V1549', 'V1575', 'V1580', 'V1583', 'V1635', 'V1654', 'V1657', 'V1673',
       'V1697', 'V1719', 'V1744', 'V1760', 'V1787', 'V1799', 'V1812', 'V1813',
       'V1827', 'V1829', 'V1846', 'V1868', 'V1871', 'V1877', 'V1882', 'V1936',
       'V1971'],
      dtype='object')
Cross val err:  0.003033120169779635
Train 



In [None]:
##Run everything with 90/10 split
X_train, X_test, y_train, y_test = pre_process(df, labels_df, 0.9)

# Evaluate each model/reduction combination in turn on the same split.
for evaluate in (KNN_PCA, KNN_features, SVC_PCA, SVC_features, LR_PCA, LR_features):
    evaluate(X_train, X_test, y_train, y_test)

100%|████████████████████████████████████████████████████████████████████| 24/24 [03:23<00:00,  8.49s/it]


KNN optimal number of PCA components: 10
Cross val err:  0.006543649029198129
Train err:  0.005388760585065433
Test err:  0.01038062283737029




100%|████████████████████████████████████████████████████████████████████| 49/49 [00:05<00:00,  8.67it/s]


KNN optimal number of features: 39
Most predictive features: Index(['V3', 'V18', 'V29', 'V64', 'V68', 'V87', 'V289', 'V308', 'V462', 'V475',
       'V507', 'V539', 'V627', 'V657', 'V687', 'V730', 'V803', 'V845', 'V1005',
       'V1071', 'V1097', 'V1098', 'V1101', 'V1193', 'V1206', 'V1218', 'V1256',
       'V1517', 'V1533', 'V1654', 'V1673', 'V1744', 'V1787', 'V1799', 'V1812',
       'V1829', 'V1871', 'V1882', 'V1936'],
      dtype='object')
Cross val err:  0.01116348006521406
Train err:  0.009622786759045376
Test err:  0.01038062283737029


100%|████████████████████████████████████████████████████████████████████| 24/24 [03:30<00:00,  8.79s/it]


SVC optimal number of PCA components: 9
Cross val err:  0.006543649029198129
Train err:  0.0034642032332563577
Test err:  0.01384083044982698


100%|████████████████████████████████████████████████████████████████████| 49/49 [00:10<00:00,  4.51it/s]


SVC optimal number of features: 33
Most predictive features: Index(['V3', 'V18', 'V29', 'V68', 'V87', 'V462', 'V475', 'V507', 'V539',
       'V627', 'V657', 'V687', 'V730', 'V803', 'V845', 'V1005', 'V1071',
       'V1098', 'V1101', 'V1193', 'V1218', 'V1256', 'V1533', 'V1654', 'V1673',
       'V1744', 'V1787', 'V1799', 'V1812', 'V1829', 'V1871', 'V1882', 'V1936'],
      dtype='object')
Cross val err:  0.008084333777975461
Train err:  0.0038491147036181506
Test err:  0.01038062283737029


100%|████████████████████████████████████████████████████████████████████| 24/24 [04:32<00:00, 11.36s/it]


KNN optimal number of PCA components: 24
Cross val err:  0.00500592856084181
Train err:  0.0
Test err:  0.01384083044982698


100%|████████████████████████████████████████████████████████████████████| 99/99 [01:48<00:00,  1.09s/it]


LR optimal number of features: 82
Most predictive features: Index(['V3', 'V18', 'V29', 'V30', 'V35', 'V63', 'V64', 'V68', 'V72', 'V87',
       'V227', 'V274', 'V289', 'V307', 'V308', 'V350', 'V394', 'V462', 'V464',
       'V475', 'V494', 'V507', 'V539', 'V544', 'V568', 'V600', 'V627', 'V657',
       'V658', 'V673', 'V687', 'V691', 'V730', 'V803', 'V818', 'V845', 'V850',
       'V855', 'V889', 'V982', 'V1005', 'V1033', 'V1066', 'V1071', 'V1097',
       'V1098', 'V1101', 'V1173', 'V1193', 'V1203', 'V1206', 'V1218', 'V1234',
       'V1249', 'V1256', 'V1263', 'V1293', 'V1443', 'V1478', 'V1517', 'V1530',
       'V1533', 'V1549', 'V1580', 'V1654', 'V1657', 'V1673', 'V1697', 'V1719',
       'V1744', 'V1787', 'V1799', 'V1812', 'V1813', 'V1827', 'V1829', 'V1846',
       'V1871', 'V1877', 'V1882', 'V1936', 'V1971'],
      dtype='object')
Cross val err:  0.0038498591966800166
Train err:  0.0
Test err:  0.01384083044982698




In [None]:
##Part 2 Theme 2 mislabeling

def mislabel(mislabel_fraction, y_train, labels=None, random_state=None):
    """Return a copy of ``y_train`` with a fraction of the labels replaced.

    ``int(mislabel_fraction * len(y_train))`` distinct positions are drawn at
    random, and each one receives a label chosen uniformly from the known
    labels other than its current one. ``y_train`` itself is not modified.

    Parameters
    ----------
    mislabel_fraction : float
        Fraction of the samples to corrupt, between 0 and 1 inclusive.
    y_train : numpy.ndarray
        Labels to corrupt; indexed by integer position.
    labels : iterable, optional
        The set of possible class labels. Defaults to the distinct values of
        the module-level ``labels_df["x"]`` column.
    random_state : int, optional
        Seed for a dedicated random generator, making the result repeatable.
        When omitted, NumPy's global random state is used.

    Returns
    -------
    numpy.ndarray
        Copy of ``y_train`` containing the corrupted labels.

    Raises
    ------
    ValueError
        If ``mislabel_fraction`` is outside [0, 1], or a label has to be
        replaced but no alternative label exists.
    """
    if not 0 <= mislabel_fraction <= 1:
        raise ValueError("mislabel_fraction must be between 0 and 1")

    if labels is None:
        labels = labels_df["x"]
    # Sorted so the candidate order is fixed; iterating a set of strings can
    # differ between interpreter runs, which would defeat the seed.
    label_pool = sorted(set(labels))

    rng = np.random if random_state is None else np.random.default_rng(random_state)

    num_samples = len(y_train)
    num_mislabels = int(mislabel_fraction * num_samples)
    # Sampling without replacement guarantees exactly num_mislabels distinct
    # positions are corrupted.
    mislabel_indices = rng.choice(num_samples, num_mislabels, replace=False)

    y_train_noise = y_train.copy()

    for i in mislabel_indices:
        correct = y_train[i]
        wrong_labels = [label for label in label_pool if label != correct]
        if not wrong_labels:
            raise ValueError("no alternative label available to mislabel with")
        y_train_noise[i] = rng.choice(wrong_labels)

    return y_train_noise

In [None]:
##Mislabel fraction 0.2, 70/30 split
mislabel_fraction = 0.2
X_train, X_test, y_train, y_test = pre_process(df, labels_df, 0.7)
y_train_noise = mislabel(mislabel_fraction, y_train)

# Each model is trained on the noisy training labels and scored on the clean test labels.
for evaluate in (KNN_PCA, KNN_features, SVC_PCA, SVC_features, LR_PCA, LR_features):
    evaluate(X_train, X_test, y_train_noise, y_test)

100%|████████████████████████████████████████████████████████████████████| 24/24 [02:49<00:00,  7.07s/it]


KNN optimal number of PCA components: 25
Cross val err:  0.006930693069306826
Train err:  0.00544554455445545
Test err:  0.011534025374855816




100%|████████████████████████████████████████████████████████████████████| 49/49 [00:04<00:00, 11.63it/s]


KNN optimal number of features: 48
Most predictive features: Index(['V3', 'V18', 'V29', 'V64', 'V68', 'V87', 'V289', 'V307', 'V308', 'V350',
       'V462', 'V475', 'V507', 'V539', 'V544', 'V627', 'V657', 'V687', 'V730',
       'V803', 'V845', 'V1005', 'V1066', 'V1071', 'V1097', 'V1098', 'V1101',
       'V1173', 'V1193', 'V1206', 'V1218', 'V1256', 'V1263', 'V1478', 'V1517',
       'V1533', 'V1654', 'V1673', 'V1697', 'V1719', 'V1744', 'V1787', 'V1799',
       'V1812', 'V1829', 'V1871', 'V1882', 'V1936'],
      dtype='object')
Cross val err:  0.011881188118811892
Train err:  0.008415841584158423
Test err:  0.011534025374855816


100%|████████████████████████████████████████████████████████████████████| 24/24 [02:51<00:00,  7.13s/it]


SVC optimal number of PCA components: 8
Cross val err:  0.005940594059405946
Train err:  0.002970297029702973
Test err:  0.009227220299884653


100%|████████████████████████████████████████████████████████████████████| 49/49 [00:08<00:00,  6.04it/s]


SVC optimal number of features: 42
Most predictive features: Index(['V3', 'V18', 'V29', 'V64', 'V68', 'V87', 'V289', 'V308', 'V350', 'V462',
       'V475', 'V507', 'V539', 'V627', 'V657', 'V687', 'V730', 'V803', 'V845',
       'V1005', 'V1071', 'V1097', 'V1098', 'V1101', 'V1173', 'V1193', 'V1206',
       'V1218', 'V1256', 'V1517', 'V1533', 'V1654', 'V1673', 'V1697', 'V1744',
       'V1787', 'V1799', 'V1812', 'V1829', 'V1871', 'V1882', 'V1936'],
      dtype='object')
Cross val err:  0.010396039603960405
Train err:  0.004950495049504955
Test err:  0.01038062283737029


100%|████████████████████████████████████████████████████████████████████| 24/24 [03:41<00:00,  9.21s/it]


KNN optimal number of PCA components: 20
Cross val err:  0.008415841584158423
Train err:  0.0
Test err:  0.0034602076124568004


100%|████████████████████████████████████████████████████████████████████| 99/99 [01:27<00:00,  1.13it/s]


LR optimal number of features: 100
Most predictive features: Index(['V3', 'V16', 'V18', 'V29', 'V30', 'V35', 'V63', 'V64', 'V68', 'V72',
       'V83', 'V87', 'V200', 'V217', 'V227', 'V274', 'V289', 'V307', 'V308',
       'V350', 'V394', 'V418', 'V462', 'V464', 'V475', 'V494', 'V507', 'V539',
       'V541', 'V544', 'V568', 'V578', 'V600', 'V627', 'V657', 'V658', 'V673',
       'V687', 'V691', 'V730', 'V803', 'V818', 'V845', 'V850', 'V855', 'V889',
       'V922', 'V982', 'V1005', 'V1033', 'V1066', 'V1071', 'V1097', 'V1098',
       'V1101', 'V1102', 'V1126', 'V1152', 'V1173', 'V1193', 'V1203', 'V1206',
       'V1218', 'V1234', 'V1249', 'V1256', 'V1263', 'V1290', 'V1293', 'V1443',
       'V1478', 'V1517', 'V1530', 'V1533', 'V1549', 'V1575', 'V1580', 'V1583',
       'V1635', 'V1654', 'V1657', 'V1673', 'V1697', 'V1719', 'V1744', 'V1772',
       'V1787', 'V1799', 'V1812', 'V1813', 'V1827', 'V1829', 'V1846', 'V1868',
       'V1871', 'V1877', 'V1881', 'V1882', 'V1936', 'V1971'],
      dtype='ob



In [None]:
##Mislabel fraction 0.2, 80/20 split
mislabel_fraction = 0.2
X_train, X_test, y_train, y_test = pre_process(df, labels_df, 0.8)
y_train_noise = mislabel(mislabel_fraction, y_train)

# Each model is trained on the noisy training labels and scored on the clean test labels.
for evaluate in (KNN_PCA, KNN_features, SVC_PCA, SVC_features, LR_PCA, LR_features):
    evaluate(X_train, X_test, y_train_noise, y_test)

100%|████████████████████████████████████████████████████████████████████| 24/24 [03:22<00:00,  8.45s/it]


KNN optimal number of PCA components: 23
Cross val err:  0.006063423200082574
Train err:  0.0047639670853183436
Test err:  0.01211072664359858




100%|████████████████████████████████████████████████████████████████████| 49/49 [00:05<00:00,  8.85it/s]


KNN optimal number of features: 48
Most predictive features: Index(['V3', 'V18', 'V29', 'V64', 'V68', 'V87', 'V289', 'V307', 'V308', 'V350',
       'V462', 'V475', 'V507', 'V539', 'V544', 'V627', 'V657', 'V687', 'V730',
       'V803', 'V845', 'V1005', 'V1066', 'V1071', 'V1097', 'V1098', 'V1101',
       'V1173', 'V1193', 'V1206', 'V1218', 'V1256', 'V1263', 'V1478', 'V1517',
       'V1533', 'V1654', 'V1673', 'V1697', 'V1719', 'V1744', 'V1787', 'V1799',
       'V1812', 'V1829', 'V1871', 'V1882', 'V1936'],
      dtype='object')
Cross val err:  0.010395244668563564
Train err:  0.008661758336942382
Test err:  0.01384083044982698


100%|████████████████████████████████████████████████████████████████████| 24/24 [03:24<00:00,  8.54s/it]


SVC optimal number of PCA components: 23
Cross val err:  0.005198561380773903
Train err:  0.0017323516673884987
Test err:  0.01384083044982698


100%|████████████████████████████████████████████████████████████████████| 49/49 [00:09<00:00,  4.95it/s]


SVC optimal number of features: 44
Most predictive features: Index(['V3', 'V18', 'V29', 'V64', 'V68', 'V87', 'V289', 'V307', 'V308', 'V462',
       'V475', 'V507', 'V539', 'V544', 'V627', 'V657', 'V687', 'V730', 'V803',
       'V845', 'V1005', 'V1071', 'V1097', 'V1098', 'V1101', 'V1173', 'V1193',
       'V1206', 'V1218', 'V1256', 'V1263', 'V1517', 'V1533', 'V1654', 'V1673',
       'V1697', 'V1744', 'V1787', 'V1799', 'V1812', 'V1829', 'V1871', 'V1882',
       'V1936'],
      dtype='object')
Cross val err:  0.007363063545276161
Train err:  0.004330879168471191
Test err:  0.01211072664359858


100%|████████████████████████████████████████████████████████████████████| 24/24 [04:09<00:00, 10.39s/it]


KNN optimal number of PCA components: 25
Cross val err:  0.005198561380773903
Train err:  0.0
Test err:  0.00519031141868509


100%|████████████████████████████████████████████████████████████████████| 99/99 [01:36<00:00,  1.03it/s]


LR optimal number of features: 94
Most predictive features: Index(['V3', 'V16', 'V18', 'V29', 'V30', 'V35', 'V63', 'V64', 'V68', 'V72',
       'V87', 'V217', 'V227', 'V274', 'V289', 'V307', 'V308', 'V350', 'V394',
       'V418', 'V462', 'V464', 'V475', 'V494', 'V507', 'V539', 'V544', 'V568',
       'V600', 'V627', 'V657', 'V658', 'V673', 'V687', 'V691', 'V730', 'V803',
       'V818', 'V845', 'V850', 'V855', 'V889', 'V922', 'V982', 'V1005',
       'V1033', 'V1066', 'V1071', 'V1078', 'V1097', 'V1098', 'V1101', 'V1102',
       'V1152', 'V1173', 'V1193', 'V1203', 'V1206', 'V1218', 'V1234', 'V1249',
       'V1256', 'V1263', 'V1293', 'V1443', 'V1478', 'V1517', 'V1530', 'V1533',
       'V1549', 'V1575', 'V1580', 'V1583', 'V1635', 'V1654', 'V1657', 'V1673',
       'V1697', 'V1719', 'V1744', 'V1760', 'V1787', 'V1799', 'V1812', 'V1813',
       'V1827', 'V1829', 'V1846', 'V1868', 'V1871', 'V1877', 'V1882', 'V1936',
       'V1971'],
      dtype='object')
Cross val err:  0.003033120169779635
Train 



In [None]:
##Mislabel fraction 0.2, 90/10 split
mislabel_fraction = 0.2
X_train, X_test, y_train, y_test = pre_process(df, labels_df, 0.9)
y_train_noise = mislabel(mislabel_fraction, y_train)

# Each model is trained on the noisy training labels and scored on the clean test labels.
for evaluate in (KNN_PCA, KNN_features, SVC_PCA, SVC_features, LR_PCA, LR_features):
    evaluate(X_train, X_test, y_train_noise, y_test)

 42%|████████████████████████████▎                                       | 10/24 [01:07<01:34,  6.75s/it]


KeyboardInterrupt: 

In [None]:
##Mislabel fraction 0.4, 70/30 split
mislabel_fraction = 0.4
X_train, X_test, y_train, y_test = pre_process(df, labels_df, 0.7)
y_train_noise = mislabel(mislabel_fraction, y_train)

# Each model is trained on the noisy training labels and scored on the clean test labels.
for evaluate in (KNN_PCA, KNN_features, SVC_PCA, SVC_features, LR_PCA, LR_features):
    evaluate(X_train, X_test, y_train_noise, y_test)

In [None]:
##Mislabel fraction 0.4, 80/20 split
mislabel_fraction = 0.4
X_train, X_test, y_train, y_test = pre_process(df, labels_df, 0.8)
y_train_noise = mislabel(mislabel_fraction, y_train)

# Each model is trained on the noisy training labels and scored on the clean test labels.
for evaluate in (KNN_PCA, KNN_features, SVC_PCA, SVC_features, LR_PCA, LR_features):
    evaluate(X_train, X_test, y_train_noise, y_test)

In [None]:
##Mislabel fraction 0.4, 90/10 split
mislabel_fraction = 0.4
X_train, X_test, y_train, y_test = pre_process(df, labels_df, 0.9)
y_train_noise = mislabel(mislabel_fraction, y_train)

# Each model is trained on the noisy training labels and scored on the clean test labels.
for evaluate in (KNN_PCA, KNN_features, SVC_PCA, SVC_features, LR_PCA, LR_features):
    evaluate(X_train, X_test, y_train_noise, y_test)

In [None]:
##Mislabel fraction 0.6, 70/30 split
mislabel_fraction = 0.6
X_train, X_test, y_train, y_test = pre_process(df, labels_df, 0.7)
y_train_noise = mislabel(mislabel_fraction, y_train)

# Each model is trained on the noisy training labels and scored on the clean test labels.
for evaluate in (KNN_PCA, KNN_features, SVC_PCA, SVC_features, LR_PCA, LR_features):
    evaluate(X_train, X_test, y_train_noise, y_test)

In [None]:
##Mislabel fraction 0.6, 80/20 split
mislabel_fraction = 0.6
X_train, X_test, y_train, y_test = pre_process(df, labels_df, 0.8)
y_train_noise = mislabel(mislabel_fraction, y_train)

# Each model is trained on the noisy training labels and scored on the clean test labels.
for evaluate in (KNN_PCA, KNN_features, SVC_PCA, SVC_features, LR_PCA, LR_features):
    evaluate(X_train, X_test, y_train_noise, y_test)

In [None]:
##Mislabel fraction 0.6, 90/10 split
mislabel_fraction = 0.6
X_train, X_test, y_train, y_test = pre_process(df, labels_df, 0.9)
y_train_noise = mislabel(mislabel_fraction, y_train)

# Each model is trained on the noisy training labels and scored on the clean test labels.
for evaluate in (KNN_PCA, KNN_features, SVC_PCA, SVC_features, LR_PCA, LR_features):
    evaluate(X_train, X_test, y_train_noise, y_test)