In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import accuracy_score
from sklearn.pipeline import make_pipeline
from sklearn.decomposition import PCA
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC  
from sklearn.linear_model import LogisticRegression
from sklearn.feature_selection import SelectKBest, f_classif
from tqdm import tqdm

#features = [f"V{x}" for x in range(1,2000)]
df = pd.read_csv('Data/TCGAdata.txt', sep=" " ,header=0)
labels_df = pd.read_csv('Data/TCGAlabels', sep=" " ,header=0)

#Set max number of components for PCA
max_num_components = 30

num_components_range = range(1, max_num_components)

In [2]:
#Pre-processes the data by splitting and normalizing 
def pre_process(data, labels, train_size):
    #Split data into training and test data
    X_train, X_test, y_train, y_test = train_test_split(df, labels_df.values.ravel(), test_size=1-train_size)
    
    #Standardize the rows (transposing as fit_transform standardizes along columns)
    #Scale after split to avoid data leakage
    scaler = StandardScaler()
    X_train = pd.DataFrame(np.transpose(scaler.fit_transform(X_train.transpose())), columns=X_train.columns)
    X_test = pd.DataFrame(np.transpose(scaler.fit_transform(X_test.transpose())), columns=X_test.columns)
    
    return X_train, X_test, y_train, y_test

In [3]:
##KNN PCA

def KNN_PCA(X_train, X_test, y_train, y_test):

    KNN_mean_scores = np.zeros(max_num_components)

    # Loop over different numbers of components
    for n_components in tqdm(num_components_range):

        #PCA

        KNN_pipeline = make_pipeline(PCA(n_components=n_components), KNeighborsClassifier(n_neighbors=5))

        KNN_scores = cross_val_score(KNN_pipeline, X_train, y_train, cv=5)
        KNN_mean_score = KNN_scores.mean()

        #KNN_mean_scores.append(KNN_mean_score)
        KNN_mean_scores[n_components] = KNN_mean_score

    KNN_optimal_n_components = np.where(KNN_mean_scores==KNN_mean_scores.max())[0][0]+1
    cross_val_err = 1 - max(KNN_mean_scores)

    print("KNN optimal number of PCA components:", KNN_optimal_n_components)

    opt_pipeline = make_pipeline(PCA(n_components=KNN_optimal_n_components), KNeighborsClassifier(n_neighbors=5))

    opt_pipeline.fit(X_train, y_train)
    train_pred = opt_pipeline.predict(X_train)
    train_error = 1 - accuracy_score(y_train, train_pred)

    test_pred = opt_pipeline.predict(X_test)
    test_error = 1 - accuracy_score(y_test, test_pred)

    print("Cross val err: ", cross_val_err)
    print("Train err: ", train_error)
    print("Test err: ", test_error)
    print("\n")

    return [train_error, cross_val_err, test_error, KNN_optimal_n_components]


In [4]:
##KNN features

def KNN_features(X_train, X_test, y_train, y_test):
    max_num_features = 100

    num_features = range(1, max_num_features)
    KNN_mean_scores = np.zeros(max_num_features)

    # Loop over different numbers of features
    for k in tqdm(num_features):

        feature_selector = SelectKBest(f_classif, k=k)

        X_train_selected = feature_selector.fit_transform(X_train, y_train)

        model = KNeighborsClassifier(n_neighbors=5)

        KNN_scores = cross_val_score(model, X_train_selected, y_train, cv=5)
        KNN_mean_score = KNN_scores.mean()

        KNN_mean_scores[k] = KNN_mean_score

        #KNN_mean_scores.append(KNN_mean_score)

    KNN_optimal_k_features = np.where(KNN_mean_scores==KNN_mean_scores.max())[0][0]+1
    cross_val_err = 1 - max(KNN_mean_scores)

    print("KNN optimal number of features:", KNN_optimal_k_features)

    feature_selector = SelectKBest(f_classif, k=KNN_optimal_k_features)
    X_train_selected = feature_selector.fit_transform(X_train, y_train)
    model.fit(X_train_selected, y_train)


    # Get the indices of the selected features
    selected_feature_indices = feature_selector.get_support(indices=True)

    # Get the names of the most predictive features
    selected_features = X_train.columns[selected_feature_indices]
    print("Most predictive features:", selected_features)


    train_pred = model.predict(X_train_selected)
    train_error = 1 - accuracy_score(y_train, train_pred)

    test_pred = model.predict(X_test[X_test.columns[selected_feature_indices]])
    test_error = 1 - accuracy_score(y_test, test_pred)
    
    print("Cross val err: ", cross_val_err)
    print("Train err: ", train_error)
    print("Test err: ", test_error)

    return [train_error, cross_val_err, test_error, KNN_optimal_k_features]


In [5]:
##SVC PCA
def SVC_PCA(X_train, X_test, y_train, y_test):
    SVC_mean_scores = np.zeros(max_num_components)

    #num_components_range = range(1, max_num_components)

    # Loop over different numbers of components
    for n_components in tqdm(num_components_range):

        #PCA

        SVC_pipeline = make_pipeline(PCA(n_components=n_components), SVC())

        SVC_scores = cross_val_score(SVC_pipeline, X_train, y_train, cv=5)
        SVC_mean_score = SVC_scores.mean()

        #SVC_mean_scores.append(SVC_mean_score)
        SVC_mean_scores[n_components] = SVC_mean_score

    SVC_optimal_n_components = np.where(SVC_mean_scores==SVC_mean_scores.max())[0][0]+1
    cross_val_err = 1 - max(SVC_mean_scores)

    print("SVC optimal number of PCA components:", SVC_optimal_n_components)

    opt_pipeline = make_pipeline(PCA(n_components=SVC_optimal_n_components), SVC())

    opt_pipeline.fit(X_train, y_train)
    train_pred = opt_pipeline.predict(X_train)
    train_error = 1 - accuracy_score(y_train, train_pred)

    test_pred = opt_pipeline.predict(X_test)
    test_error = 1 - accuracy_score(y_test, test_pred)

    print("Cross val err: ", cross_val_err)
    print("Train err: ", train_error)
    print("Test err: ", test_error)

    return [train_error, cross_val_err, test_error, SVC_optimal_n_components]

In [6]:
##SVC features

def SVC_features(X_train, X_test, y_train, y_test):
    max_num_features = 100
    num_features = range(1, max_num_features)
    SVC_mean_scores = np.zeros(max_num_features)

    # Loop over different numbers of components
    for k in tqdm(num_features):

        feature_selector = SelectKBest(f_classif, k=k)

        X_train_selected = feature_selector.fit_transform(X_train, y_train)

        model = SVC()

        SVC_scores = cross_val_score(model, X_train_selected, y_train, cv=5)
        SVC_mean_score = SVC_scores.mean()

        #SVC_mean_scores.append(SVC_mean_score)
        SVC_mean_scores[k] = SVC_mean_score


    SVC_optimal_k_features = np.where(SVC_mean_scores==SVC_mean_scores.max())[0][0]+1
    cross_val_err = 1 - max(SVC_mean_scores)

    print("SVC optimal number of features:", SVC_optimal_k_features)

    feature_selector = SelectKBest(f_classif, k=SVC_optimal_k_features)
    X_train_selected = feature_selector.fit_transform(X_train, y_train)
    model.fit(X_train_selected, y_train)

    # Get the indices of the selected features
    selected_feature_indices = feature_selector.get_support(indices=True)

    # Get the names of the most predictive features
    selected_features = X_train.columns[selected_feature_indices]
    print("Most predictive features:", selected_features)


    train_pred = model.predict(X_train_selected)
    train_error = 1 - accuracy_score(y_train, train_pred)

    test_pred = model.predict(X_test[X_test.columns[selected_feature_indices]])
    test_error = 1 - accuracy_score(y_test, test_pred)

    print("Cross val err: ", cross_val_err)
    print("Train err: ", train_error)
    print("Test err: ", test_error)

    return [train_error, cross_val_err, test_error, SVC_optimal_k_features]

In [7]:
##Logistic regression PCA

def LR_PCA(X_train, X_test, y_train, y_test):
    LR_mean_scores = np.zeros(max_num_components)


    # Loop over different numbers of components
    for n_components in tqdm(num_components_range):

        #PCA

        LR_pipeline = make_pipeline(PCA(n_components=n_components), LogisticRegression(solver='lbfgs', max_iter=10000))

        LR_scores = cross_val_score(LR_pipeline, X_train, y_train, cv=5)
        LR_mean_score = LR_scores.mean()

        #LR_mean_scores.append(LR_mean_score)
        LR_mean_scores[n_components] = LR_mean_score

    LR_optimal_n_components = np.where(LR_mean_scores==LR_mean_scores.max())[0][0]+1
    cross_val_err = 1 - max(LR_mean_scores)

    print("KNN optimal number of PCA components:", LR_optimal_n_components)

    opt_pipeline = make_pipeline(PCA(n_components=LR_optimal_n_components), LogisticRegression(solver='lbfgs', max_iter=10000))

    opt_pipeline.fit(X_train, y_train)
    train_pred = opt_pipeline.predict(X_train)
    train_error = 1 - accuracy_score(y_train, train_pred)

    test_pred = opt_pipeline.predict(X_test)
    test_error = 1 - accuracy_score(y_test, test_pred)

    print("Cross val err: ", cross_val_err)
    print("Train err: ", train_error)
    print("Test err: ", test_error)

    return [train_error, cross_val_err, test_error, LR_optimal_n_components]

In [8]:
##Logistic Regression features

def LR_features(X_train, X_test, y_train, y_test):
    max_num_features = 100
    num_features = range(1, max_num_features)
    LR_mean_scores = np.zeros(max_num_features)

    # Loop over different numbers of components
    for k in tqdm(num_features):

        feature_selector = SelectKBest(f_classif, k=k)

        X_train_selected = feature_selector.fit_transform(X_train, y_train)

        model = LogisticRegression(solver='lbfgs', max_iter=10000)

        LR_scores = cross_val_score(model, X_train_selected, y_train, cv=5)
        LR_mean_score = LR_scores.mean()

        #LR_mean_scores.append(LR_mean_score)
        LR_mean_scores[k] = LR_mean_score

    LR_optimal_k_features = np.where(LR_mean_scores==LR_mean_scores.max())[0][0]+1
    cross_val_err = 1 - max(LR_mean_scores)

    print("LR optimal number of features:", LR_optimal_k_features)

    feature_selector = SelectKBest(f_classif, k=LR_optimal_k_features)
    X_train_selected = feature_selector.fit_transform(X_train, y_train)
    model.fit(X_train_selected, y_train)

    # Get the indices of the selected features
    selected_feature_indices = feature_selector.get_support(indices=True)

    # Get the names of the most predictive features
    selected_features = X_train.columns[selected_feature_indices]
    print("Most predictive features:", selected_features)

    train_pred = model.predict(X_train_selected)
    train_error = 1 - accuracy_score(y_train, train_pred)

    test_pred = model.predict(X_test[X_test.columns[selected_feature_indices]])
    test_error = 1 - accuracy_score(y_test, test_pred)

    print("Cross val err: ", cross_val_err)
    print("Train err: ", train_error)
    print("Test err: ", test_error)

    return [train_error, cross_val_err, test_error, LR_optimal_k_features]

In [9]:
##Creating dictionary to save data of part 1
d = dict()
#a =0.7
#print(f"KNN_PCA_{(1-a)*100:.0f}_{a*100:.0f}")

In [10]:
##Run everything with 70/30 split
split = 0.7
X_train, X_test, y_train, y_test = pre_process(df, labels_df, split)


d[f"KNN_PCA_{(1-split)*100:.0f}_{split*100:.0f}"] = KNN_PCA(X_train, X_test, y_train, y_test)
d[f"KNN_Feat_{(1-split)*100:.0f}_{split*100:.0f}"] = KNN_features(X_train, X_test, y_train, y_test)
d[f"SVC_PCA_{(1-split)*100:.0f}_{split*100:.0f}"] = SVC_PCA(X_train, X_test, y_train, y_test)
d[f"SVC_Feat_{(1-split)*100:.0f}_{split*100:.0f}"] = SVC_features(X_train, X_test, y_train, y_test)
d[f"LR_PCA_{(1-split)*100:.0f}_{split*100:.0f}"] = LR_PCA(X_train, X_test, y_train, y_test)
d[f"LR_Feat_{(1-split)*100:.0f}_{split*100:.0f}"] = LR_features(X_train, X_test, y_train, y_test)


100%|██████████| 29/29 [01:41<00:00,  3.49s/it]


KNN optimal number of PCA components: 22
Cross val err:  0.007920792079207928
Train err:  0.005940594059405946
Test err:  0.00692041522491349




100%|██████████| 99/99 [00:12<00:00,  8.01it/s]


KNN optimal number of features: 90
Most predictive features: Index(['V3', 'V16', 'V18', 'V29', 'V30', 'V63', 'V64', 'V68', 'V72', 'V87',
       'V200', 'V217', 'V227', 'V274', 'V289', 'V307', 'V308', 'V350', 'V394',
       'V462', 'V464', 'V475', 'V494', 'V507', 'V539', 'V544', 'V568', 'V600',
       'V627', 'V657', 'V658', 'V673', 'V687', 'V691', 'V730', 'V803', 'V818',
       'V845', 'V850', 'V855', 'V889', 'V922', 'V982', 'V1005', 'V1033',
       'V1066', 'V1071', 'V1097', 'V1098', 'V1101', 'V1102', 'V1152', 'V1173',
       'V1193', 'V1203', 'V1206', 'V1218', 'V1234', 'V1249', 'V1256', 'V1263',
       'V1443', 'V1478', 'V1517', 'V1530', 'V1533', 'V1549', 'V1575', 'V1580',
       'V1635', 'V1654', 'V1657', 'V1673', 'V1697', 'V1719', 'V1744', 'V1760',
       'V1772', 'V1787', 'V1799', 'V1812', 'V1813', 'V1827', 'V1829', 'V1846',
       'V1871', 'V1877', 'V1882', 'V1936', 'V1971'],
      dtype='object')




Cross val err:  0.007425742574257321
Train err:  0.005940594059405946
Test err:  0.01038062283737029


100%|██████████| 29/29 [01:41<00:00,  3.50s/it]


SVC optimal number of PCA components: 10
Cross val err:  0.005940594059406168
Train err:  0.004950495049504955
Test err:  0.008073817762399127


100%|██████████| 99/99 [00:23<00:00,  4.21it/s]


SVC optimal number of features: 87
Most predictive features: Index(['V3', 'V18', 'V29', 'V30', 'V63', 'V64', 'V68', 'V72', 'V87', 'V200',
       'V217', 'V227', 'V274', 'V289', 'V307', 'V308', 'V350', 'V394', 'V462',
       'V464', 'V475', 'V494', 'V507', 'V539', 'V544', 'V568', 'V600', 'V627',
       'V657', 'V658', 'V673', 'V687', 'V691', 'V730', 'V803', 'V818', 'V845',
       'V850', 'V855', 'V889', 'V922', 'V982', 'V1005', 'V1033', 'V1066',
       'V1071', 'V1097', 'V1098', 'V1101', 'V1102', 'V1152', 'V1173', 'V1193',
       'V1203', 'V1206', 'V1218', 'V1234', 'V1249', 'V1256', 'V1263', 'V1443',
       'V1478', 'V1517', 'V1530', 'V1533', 'V1549', 'V1575', 'V1654', 'V1657',
       'V1673', 'V1697', 'V1719', 'V1744', 'V1760', 'V1772', 'V1787', 'V1799',
       'V1812', 'V1813', 'V1827', 'V1829', 'V1846', 'V1871', 'V1877', 'V1882',
       'V1936', 'V1971'],
      dtype='object')
Cross val err:  0.003960396039603964
Train err:  0.0014851485148514865
Test err:  0.008073817762399127


100%|██████████| 29/29 [02:24<00:00,  5.00s/it]


KNN optimal number of PCA components: 30
Cross val err:  0.00544554455445545
Train err:  0.0
Test err:  0.0034602076124568004


100%|██████████| 99/99 [02:32<00:00,  1.54s/it]


LR optimal number of features: 87
Most predictive features: Index(['V3', 'V18', 'V29', 'V30', 'V63', 'V64', 'V68', 'V72', 'V87', 'V200',
       'V217', 'V227', 'V274', 'V289', 'V307', 'V308', 'V350', 'V394', 'V462',
       'V464', 'V475', 'V494', 'V507', 'V539', 'V544', 'V568', 'V600', 'V627',
       'V657', 'V658', 'V673', 'V687', 'V691', 'V730', 'V803', 'V818', 'V845',
       'V850', 'V855', 'V889', 'V922', 'V982', 'V1005', 'V1033', 'V1066',
       'V1071', 'V1097', 'V1098', 'V1101', 'V1102', 'V1152', 'V1173', 'V1193',
       'V1203', 'V1206', 'V1218', 'V1234', 'V1249', 'V1256', 'V1263', 'V1443',
       'V1478', 'V1517', 'V1530', 'V1533', 'V1549', 'V1575', 'V1654', 'V1657',
       'V1673', 'V1697', 'V1719', 'V1744', 'V1760', 'V1772', 'V1787', 'V1799',
       'V1812', 'V1813', 'V1827', 'V1829', 'V1846', 'V1871', 'V1877', 'V1882',
       'V1936', 'V1971'],
      dtype='object')
Cross val err:  0.002970297029702862
Train err:  0.0
Test err:  0.00692041522491349




In [11]:
##Run everything with 80/20 split
split = 0.8
X_train, X_test, y_train, y_test = pre_process(df, labels_df, split)

d[f"KNN_PCA_{(1-split)*100:.0f}_{split*100:.0f}"] = KNN_PCA(X_train, X_test, y_train, y_test)
d[f"KNN_Feat_{(1-split)*100:.0f}_{split*100:.0f}"] = KNN_features(X_train, X_test, y_train, y_test)
d[f"SVC_PCA_{(1-split)*100:.0f}_{split*100:.0f}"] = SVC_PCA(X_train, X_test, y_train, y_test)
d[f"SVC_Feat_{(1-split)*100:.0f}_{split*100:.0f}"] = SVC_features(X_train, X_test, y_train, y_test)
d[f"LR_PCA_{(1-split)*100:.0f}_{split*100:.0f}"] = LR_PCA(X_train, X_test, y_train, y_test)
d[f"LR_Feat_{(1-split)*100:.0f}_{split*100:.0f}"] = LR_features(X_train, X_test, y_train, y_test)

100%|██████████| 29/29 [02:18<00:00,  4.78s/it]


KNN optimal number of PCA components: 11
Cross val err:  0.007361185452291585
Train err:  0.0047639670853183436
Test err:  0.01384083044982698




100%|██████████| 99/99 [00:13<00:00,  7.59it/s]


KNN optimal number of features: 81
Most predictive features: Index(['V3', 'V18', 'V29', 'V35', 'V63', 'V64', 'V68', 'V72', 'V87', 'V227',
       'V274', 'V289', 'V307', 'V308', 'V350', 'V418', 'V462', 'V464', 'V475',
       'V494', 'V507', 'V539', 'V544', 'V568', 'V600', 'V627', 'V657', 'V658',
       'V673', 'V687', 'V691', 'V730', 'V803', 'V818', 'V845', 'V850', 'V855',
       'V889', 'V922', 'V982', 'V1005', 'V1033', 'V1066', 'V1071', 'V1097',
       'V1098', 'V1101', 'V1173', 'V1193', 'V1203', 'V1206', 'V1218', 'V1234',
       'V1249', 'V1256', 'V1263', 'V1293', 'V1443', 'V1478', 'V1517', 'V1533',
       'V1549', 'V1654', 'V1657', 'V1673', 'V1697', 'V1719', 'V1744', 'V1760',
       'V1787', 'V1799', 'V1812', 'V1813', 'V1827', 'V1829', 'V1846', 'V1871',
       'V1877', 'V1882', 'V1936', 'V1971'],
      dtype='object')
Cross val err:  0.007795024931684269
Train err:  0.005197055002165385
Test err:  0.01211072664359858


100%|██████████| 29/29 [02:38<00:00,  5.47s/it]


SVC optimal number of PCA components: 25
Cross val err:  0.005630522767182122
Train err:  0.003031615417929845
Test err:  0.01038062283737029


100%|██████████| 99/99 [00:25<00:00,  3.82it/s]


SVC optimal number of features: 83
Most predictive features: Index(['V3', 'V18', 'V29', 'V35', 'V63', 'V64', 'V68', 'V72', 'V87', 'V217',
       'V227', 'V274', 'V289', 'V307', 'V308', 'V350', 'V418', 'V462', 'V464',
       'V475', 'V494', 'V507', 'V539', 'V544', 'V568', 'V600', 'V627', 'V657',
       'V658', 'V673', 'V687', 'V691', 'V730', 'V803', 'V818', 'V845', 'V850',
       'V855', 'V889', 'V922', 'V982', 'V1005', 'V1033', 'V1066', 'V1071',
       'V1097', 'V1098', 'V1101', 'V1102', 'V1173', 'V1193', 'V1203', 'V1206',
       'V1218', 'V1234', 'V1249', 'V1256', 'V1263', 'V1293', 'V1443', 'V1478',
       'V1517', 'V1533', 'V1549', 'V1654', 'V1657', 'V1673', 'V1697', 'V1719',
       'V1744', 'V1760', 'V1787', 'V1799', 'V1812', 'V1813', 'V1827', 'V1829',
       'V1846', 'V1871', 'V1877', 'V1882', 'V1936', 'V1971'],
      dtype='object')
Cross val err:  0.005630522767182122
Train err:  0.0012992637505413462
Test err:  0.01384083044982698


100%|██████████| 29/29 [02:57<00:00,  6.12s/it]


KNN optimal number of PCA components: 27
Cross val err:  0.005630522767182122
Train err:  0.0
Test err:  0.01038062283737029


100%|██████████| 99/99 [02:25<00:00,  1.47s/it]


LR optimal number of features: 98
Most predictive features: Index(['V3', 'V16', 'V18', 'V29', 'V30', 'V35', 'V63', 'V64', 'V68', 'V72',
       'V87', 'V200', 'V217', 'V227', 'V274', 'V289', 'V307', 'V308', 'V350',
       'V394', 'V418', 'V462', 'V464', 'V475', 'V494', 'V507', 'V539', 'V541',
       'V544', 'V568', 'V600', 'V627', 'V657', 'V658', 'V673', 'V687', 'V691',
       'V730', 'V803', 'V818', 'V845', 'V850', 'V855', 'V889', 'V914', 'V922',
       'V982', 'V1005', 'V1033', 'V1066', 'V1071', 'V1097', 'V1098', 'V1101',
       'V1102', 'V1152', 'V1173', 'V1193', 'V1203', 'V1206', 'V1218', 'V1234',
       'V1249', 'V1256', 'V1263', 'V1293', 'V1315', 'V1443', 'V1478', 'V1517',
       'V1530', 'V1533', 'V1549', 'V1575', 'V1580', 'V1583', 'V1654', 'V1657',
       'V1673', 'V1697', 'V1719', 'V1731', 'V1744', 'V1760', 'V1772', 'V1787',
       'V1799', 'V1812', 'V1813', 'V1827', 'V1829', 'V1846', 'V1868', 'V1871',
       'V1877', 'V1882', 'V1936', 'V1971'],
      dtype='object')
Cross val 



In [12]:
##Run everything with 90/10 split
split = 0.9
X_train, X_test, y_train, y_test = pre_process(df, labels_df, split)

d[f"KNN_PCA_{(1-split)*100:.0f}_{split*100:.0f}"] = KNN_PCA(X_train, X_test, y_train, y_test)
d[f"KNN_Feat_{(1-split)*100:.0f}_{split*100:.0f}"] = KNN_features(X_train, X_test, y_train, y_test)
d[f"SVC_PCA_{(1-split)*100:.0f}_{split*100:.0f}"] = SVC_PCA(X_train, X_test, y_train, y_test)
d[f"SVC_Feat_{(1-split)*100:.0f}_{split*100:.0f}"] = SVC_features(X_train, X_test, y_train, y_test)
d[f"LR_PCA_{(1-split)*100:.0f}_{split*100:.0f}"] = LR_PCA(X_train, X_test, y_train, y_test)
d[f"LR_Feat_{(1-split)*100:.0f}_{split*100:.0f}"] = LR_features(X_train, X_test, y_train, y_test)

100%|██████████| 29/29 [02:14<00:00,  4.65s/it]


KNN optimal number of PCA components: 23
Cross val err:  0.0069290054839188375
Train err:  0.00500384911470364
Test err:  0.00692041522491349




100%|██████████| 99/99 [00:15<00:00,  6.45it/s]


KNN optimal number of features: 97
Most predictive features: Index(['V3', 'V16', 'V18', 'V29', 'V30', 'V35', 'V63', 'V64', 'V68', 'V72',
       'V83', 'V87', 'V200', 'V217', 'V227', 'V274', 'V289', 'V307', 'V308',
       'V350', 'V394', 'V418', 'V462', 'V464', 'V475', 'V494', 'V507', 'V539',
       'V541', 'V544', 'V568', 'V600', 'V627', 'V657', 'V658', 'V673', 'V687',
       'V691', 'V730', 'V803', 'V818', 'V845', 'V850', 'V855', 'V889', 'V914',
       'V922', 'V982', 'V1005', 'V1033', 'V1066', 'V1071', 'V1097', 'V1098',
       'V1101', 'V1126', 'V1152', 'V1173', 'V1193', 'V1203', 'V1206', 'V1218',
       'V1234', 'V1249', 'V1256', 'V1263', 'V1293', 'V1443', 'V1478', 'V1517',
       'V1530', 'V1533', 'V1549', 'V1580', 'V1583', 'V1654', 'V1657', 'V1673',
       'V1697', 'V1719', 'V1731', 'V1744', 'V1760', 'V1772', 'V1787', 'V1799',
       'V1812', 'V1813', 'V1827', 'V1829', 'V1846', 'V1868', 'V1871', 'V1877',
       'V1882', 'V1936', 'V1971'],
      dtype='object')
Cross val err:  0.00

100%|██████████| 29/29 [02:04<00:00,  4.30s/it]


SVC optimal number of PCA components: 23
Cross val err:  0.006543649029198129
Train err:  0.002694380292532772
Test err:  0.01384083044982698


100%|██████████| 99/99 [00:31<00:00,  3.18it/s]


SVC optimal number of features: 96
Most predictive features: Index(['V3', 'V16', 'V18', 'V29', 'V30', 'V35', 'V63', 'V64', 'V68', 'V72',
       'V83', 'V87', 'V200', 'V217', 'V227', 'V274', 'V289', 'V307', 'V308',
       'V350', 'V394', 'V418', 'V462', 'V464', 'V475', 'V494', 'V507', 'V539',
       'V541', 'V544', 'V568', 'V600', 'V627', 'V657', 'V658', 'V673', 'V687',
       'V691', 'V730', 'V803', 'V818', 'V845', 'V850', 'V855', 'V889', 'V922',
       'V982', 'V1005', 'V1033', 'V1066', 'V1071', 'V1097', 'V1098', 'V1101',
       'V1126', 'V1152', 'V1173', 'V1193', 'V1203', 'V1206', 'V1218', 'V1234',
       'V1249', 'V1256', 'V1263', 'V1293', 'V1443', 'V1478', 'V1517', 'V1530',
       'V1533', 'V1549', 'V1580', 'V1583', 'V1654', 'V1657', 'V1673', 'V1697',
       'V1719', 'V1731', 'V1744', 'V1760', 'V1772', 'V1787', 'V1799', 'V1812',
       'V1813', 'V1827', 'V1829', 'V1846', 'V1868', 'V1871', 'V1877', 'V1882',
       'V1936', 'V1971'],
      dtype='object')




Cross val err:  0.006159033644582812
Train err:  0.0015396458814472824
Test err:  0.00692041522491349


100%|██████████| 29/29 [03:18<00:00,  6.86s/it]


KNN optimal number of PCA components: 30
Cross val err:  0.005389802875352068
Train err:  0.0
Test err:  0.01038062283737029


100%|██████████| 99/99 [02:35<00:00,  1.57s/it]


LR optimal number of features: 92
Most predictive features: Index(['V3', 'V16', 'V18', 'V29', 'V30', 'V35', 'V63', 'V64', 'V68', 'V72',
       'V87', 'V200', 'V217', 'V227', 'V274', 'V289', 'V307', 'V308', 'V350',
       'V394', 'V418', 'V462', 'V464', 'V475', 'V494', 'V507', 'V539', 'V544',
       'V568', 'V600', 'V627', 'V657', 'V658', 'V673', 'V687', 'V691', 'V730',
       'V803', 'V818', 'V845', 'V850', 'V855', 'V889', 'V922', 'V982', 'V1005',
       'V1033', 'V1066', 'V1071', 'V1097', 'V1098', 'V1101', 'V1152', 'V1173',
       'V1193', 'V1203', 'V1206', 'V1218', 'V1234', 'V1249', 'V1256', 'V1263',
       'V1293', 'V1443', 'V1478', 'V1517', 'V1530', 'V1533', 'V1549', 'V1580',
       'V1583', 'V1654', 'V1657', 'V1673', 'V1697', 'V1719', 'V1744', 'V1760',
       'V1772', 'V1787', 'V1799', 'V1812', 'V1813', 'V1827', 'V1829', 'V1846',
       'V1868', 'V1871', 'V1877', 'V1882', 'V1936', 'V1971'],
      dtype='object')
Cross val err:  0.004232992441084882
Train err:  0.0
Test err:  0.010



In [13]:
#Save data of part 1
df_1 = pd.DataFrame(data =d, index = ['Train', 'Cross', 'Test', 'Opt_Feat_or_PCA'])
df_1.to_csv('./data.csv', sep=" ")

In [14]:
##Part 2 Theme 1 mislabeling

def mislabel(mislabel_fraction, y_train):
    labels = set(labels_df["x"])

    num_samples = len(y_train)
    num_mislabels = int(mislabel_fraction * num_samples)
    mislabel_indices = np.random.choice(num_samples, num_mislabels, replace=False)

    y_train_noise = y_train.copy()

    for i in mislabel_indices:
        correct = y_train[i]
        y_train_noise[i] = np.random.choice(list(labels - set([correct])))
    
    return y_train_noise

In [15]:
##Creating dictionary to save data of part 2
d = dict()

In [16]:
##Mislabel fraction 0.2, 70/30 split
mislabel_fraction = 0.2
split = 0.7
X_train, X_test, y_train, y_test = pre_process(df, labels_df, split)
y_train_noise = mislabel(mislabel_fraction, y_train)

d[f"KNN_PCA_miss_{mislabel_fraction:.1f}_{(1-split)*100:.0f}_{split*100:.0f}"] = KNN_PCA(X_train, X_test, y_train_noise, y_test)
d[f"KNN_Feat_miss_{mislabel_fraction:.1f}_{(1-split)*100:.0f}_{split*100:.0f}"] = KNN_features(X_train, X_test, y_train_noise, y_test)
d[f"SVC_PCA_miss_{mislabel_fraction:.1f}_{(1-split)*100:.0f}_{split*100:.0f}"] = SVC_PCA(X_train, X_test, y_train_noise, y_test)
d[f"SVC_Feat_miss_{mislabel_fraction:.1f}_{(1-split)*100:.0f}_{split*100:.0f}"] = SVC_features(X_train, X_test, y_train_noise, y_test)
d[f"LR_PCA_miss_{mislabel_fraction:.1f}_{(1-split)*100:.0f}_{split*100:.0f}"] = LR_PCA(X_train, X_test, y_train_noise, y_test)
d[f"LR_Feat_miss_{mislabel_fraction:.1f}_{(1-split)*100:.0f}_{split*100:.0f}"] = LR_features(X_train, X_test, y_train_noise, y_test)



100%|██████████| 29/29 [01:46<00:00,  3.67s/it]


KNN optimal number of PCA components: 11
Cross val err:  0.21237623762376223
Train err:  0.20198019801980194
Test err:  0.02076124567474047




100%|██████████| 99/99 [00:11<00:00,  8.33it/s]


KNN optimal number of features: 81
Most predictive features: Index(['V3', 'V18', 'V30', 'V68', 'V87', 'V200', 'V217', 'V227', 'V289',
       'V307', 'V308', 'V350', 'V392', 'V394', 'V462', 'V464', 'V475', 'V494',
       'V507', 'V539', 'V544', 'V568', 'V578', 'V600', 'V627', 'V663', 'V664',
       'V667', 'V673', 'V687', 'V691', 'V845', 'V855', 'V982', 'V1033',
       'V1066', 'V1097', 'V1098', 'V1101', 'V1102', 'V1126', 'V1147', 'V1152',
       'V1193', 'V1218', 'V1249', 'V1256', 'V1263', 'V1290', 'V1293', 'V1300',
       'V1478', 'V1517', 'V1530', 'V1533', 'V1535', 'V1549', 'V1572', 'V1575',
       'V1635', 'V1644', 'V1654', 'V1658', 'V1667', 'V1673', 'V1697', 'V1702',
       'V1719', 'V1744', 'V1772', 'V1787', 'V1799', 'V1812', 'V1827', 'V1829',
       'V1846', 'V1871', 'V1882', 'V1896', 'V1936', 'V1971'],
      dtype='object')
Cross val err:  0.2168316831683168
Train err:  0.2069306930693069
Test err:  0.03114186851211076


100%|██████████| 29/29 [02:01<00:00,  4.18s/it]


SVC optimal number of PCA components: 26
Cross val err:  0.2049504950495049
Train err:  0.20247524752475243
Test err:  0.00692041522491349


100%|██████████| 99/99 [01:07<00:00,  1.46it/s]


SVC optimal number of features: 52
Most predictive features: Index(['V3', 'V18', 'V68', 'V87', 'V200', 'V217', 'V289', 'V307', 'V350',
       'V462', 'V464', 'V475', 'V494', 'V507', 'V539', 'V568', 'V600', 'V627',
       'V663', 'V673', 'V687', 'V845', 'V855', 'V982', 'V1033', 'V1066',
       'V1097', 'V1098', 'V1101', 'V1193', 'V1218', 'V1256', 'V1263', 'V1478',
       'V1517', 'V1530', 'V1533', 'V1549', 'V1575', 'V1654', 'V1673', 'V1697',
       'V1744', 'V1787', 'V1799', 'V1812', 'V1827', 'V1829', 'V1846', 'V1882',
       'V1936', 'V1971'],
      dtype='object')




Cross val err:  0.2074257425742574
Train err:  0.20346534653465342
Test err:  0.018454440599769306


100%|██████████| 29/29 [01:57<00:00,  4.06s/it]


KNN optimal number of PCA components: 22
Cross val err:  0.20792079207920788
Train err:  0.2054455445544554
Test err:  0.011534025374855816


100%|██████████| 99/99 [02:08<00:00,  1.30s/it]


LR optimal number of features: 33
Most predictive features: Index(['V3', 'V18', 'V68', 'V87', 'V217', 'V307', 'V350', 'V475', 'V494',
       'V539', 'V627', 'V687', 'V845', 'V1066', 'V1097', 'V1101', 'V1193',
       'V1218', 'V1256', 'V1263', 'V1478', 'V1517', 'V1533', 'V1549', 'V1654',
       'V1673', 'V1697', 'V1744', 'V1787', 'V1799', 'V1882', 'V1936', 'V1971'],
      dtype='object')
Cross val err:  0.22326732673267313
Train err:  0.21386138613861383
Test err:  0.025374855824682796




In [17]:
##Mislabel fraction 0.2, 80/20 split
mislabel_fraction = 0.2
split = 0.8
X_train, X_test, y_train, y_test = pre_process(df, labels_df, split)
y_train_noise = mislabel(mislabel_fraction, y_train)

d[f"KNN_PCA_miss_{mislabel_fraction:.1f}_{(1-split)*100:.0f}_{split*100:.0f}"] = KNN_PCA(X_train, X_test, y_train_noise, y_test)
d[f"KNN_Feat_miss_{mislabel_fraction:.1f}_{(1-split)*100:.0f}_{split*100:.0f}"] = KNN_features(X_train, X_test, y_train_noise, y_test)
d[f"SVC_PCA_miss_{mislabel_fraction:.1f}_{(1-split)*100:.0f}_{split*100:.0f}"] = SVC_PCA(X_train, X_test, y_train_noise, y_test)
d[f"SVC_Feat_miss_{mislabel_fraction:.1f}_{(1-split)*100:.0f}_{split*100:.0f}"] = SVC_features(X_train, X_test, y_train_noise, y_test)
d[f"LR_PCA_miss_{mislabel_fraction:.1f}_{(1-split)*100:.0f}_{split*100:.0f}"] = LR_PCA(X_train, X_test, y_train_noise, y_test)
d[f"LR_Feat_miss_{mislabel_fraction:.1f}_{(1-split)*100:.0f}_{split*100:.0f}"] = LR_features(X_train, X_test, y_train_noise, y_test)

100%|██████████| 29/29 [02:00<00:00,  4.16s/it]


KNN optimal number of PCA components: 17
Cross val err:  0.216976082485844
Train err:  0.20311823300129928
Test err:  0.01730103806228378




100%|██████████| 99/99 [00:13<00:00,  7.29it/s]


KNN optimal number of features: 44
Most predictive features: Index(['V3', 'V18', 'V30', 'V68', 'V87', 'V289', 'V307', 'V308', 'V350',
       'V475', 'V494', 'V507', 'V539', 'V568', 'V627', 'V673', 'V687', 'V845',
       'V855', 'V982', 'V1066', 'V1097', 'V1098', 'V1101', 'V1193', 'V1218',
       'V1256', 'V1263', 'V1478', 'V1517', 'V1533', 'V1549', 'V1654', 'V1673',
       'V1697', 'V1744', 'V1787', 'V1799', 'V1812', 'V1829', 'V1846', 'V1882',
       'V1936', 'V1971'],
      dtype='object')
Cross val err:  0.221308843000817
Train err:  0.20441749675184062
Test err:  0.02422145328719727


100%|██████████| 29/29 [02:23<00:00,  4.96s/it]


SVC optimal number of PCA components: 20
Cross val err:  0.20615075452385645
Train err:  0.20268514508445212
Test err:  0.00692041522491349


100%|██████████| 99/99 [01:32<00:00,  1.07it/s]


SVC optimal number of features: 97
Most predictive features: Index(['V3', 'V18', 'V29', 'V30', 'V68', 'V72', 'V87', 'V178', 'V200', 'V217',
       'V274', 'V289', 'V307', 'V308', 'V350', 'V392', 'V394', 'V462', 'V464',
       'V475', 'V494', 'V507', 'V539', 'V541', 'V544', 'V568', 'V578', 'V600',
       'V614', 'V627', 'V657', 'V663', 'V667', 'V673', 'V687', 'V691', 'V730',
       'V795', 'V803', 'V845', 'V855', 'V915', 'V959', 'V982', 'V1033',
       'V1066', 'V1097', 'V1098', 'V1101', 'V1102', 'V1126', 'V1147', 'V1152',
       'V1193', 'V1203', 'V1218', 'V1249', 'V1256', 'V1263', 'V1290', 'V1293',
       'V1300', 'V1443', 'V1478', 'V1517', 'V1530', 'V1533', 'V1535', 'V1549',
       'V1572', 'V1575', 'V1580', 'V1635', 'V1644', 'V1646', 'V1654', 'V1658',
       'V1667', 'V1673', 'V1697', 'V1702', 'V1719', 'V1731', 'V1744', 'V1772',
       'V1787', 'V1799', 'V1812', 'V1827', 'V1829', 'V1846', 'V1871', 'V1882',
       'V1896', 'V1931', 'V1936', 'V1971'],
      dtype='object')




Cross val err:  0.20918293564714374
Train err:  0.20138588133391078
Test err:  0.00692041522491349


100%|██████████| 29/29 [02:08<00:00,  4.42s/it]


KNN optimal number of PCA components: 28
Cross val err:  0.2061526326168408
Train err:  0.20485058466868777
Test err:  0.0034602076124568004


100%|██████████| 99/99 [02:31<00:00,  1.53s/it]


LR optimal number of features: 56
Most predictive features: Index(['V3', 'V18', 'V30', 'V68', 'V87', 'V217', 'V289', 'V307', 'V308',
       'V350', 'V462', 'V475', 'V494', 'V507', 'V539', 'V568', 'V578', 'V627',
       'V673', 'V687', 'V845', 'V855', 'V982', 'V1033', 'V1066', 'V1097',
       'V1098', 'V1101', 'V1102', 'V1152', 'V1193', 'V1218', 'V1256', 'V1263',
       'V1290', 'V1300', 'V1478', 'V1517', 'V1530', 'V1533', 'V1535', 'V1549',
       'V1654', 'V1673', 'V1697', 'V1744', 'V1772', 'V1787', 'V1799', 'V1812',
       'V1827', 'V1829', 'V1846', 'V1882', 'V1936', 'V1971'],
      dtype='object')
Cross val err:  0.21784282239813701
Train err:  0.2087483759203118
Test err:  0.02422145328719727




In [18]:
##Mislabel fraction 0.2, 90/10 split
mislabel_fraction = 0.2
split = 0.9
X_train, X_test, y_train, y_test = pre_process(df, labels_df, split)
y_train_noise = mislabel(mislabel_fraction, y_train)

d[f"KNN_PCA_miss_{mislabel_fraction:.1f}_{(1-split)*100:.0f}_{split*100:.0f}"] = KNN_PCA(X_train, X_test, y_train_noise, y_test)
d[f"KNN_Feat_miss_{mislabel_fraction:.1f}_{(1-split)*100:.0f}_{split*100:.0f}"] = KNN_features(X_train, X_test, y_train_noise, y_test)
d[f"SVC_PCA_miss_{mislabel_fraction:.1f}_{(1-split)*100:.0f}_{split*100:.0f}"] = SVC_PCA(X_train, X_test, y_train_noise, y_test)
d[f"SVC_Feat_miss_{mislabel_fraction:.1f}_{(1-split)*100:.0f}_{split*100:.0f}"] = SVC_features(X_train, X_test, y_train_noise, y_test)
d[f"LR_PCA_miss_{mislabel_fraction:.1f}_{(1-split)*100:.0f}_{split*100:.0f}"] = LR_PCA(X_train, X_test, y_train_noise, y_test)
d[f"LR_Feat_miss_{mislabel_fraction:.1f}_{(1-split)*100:.0f}_{split*100:.0f}"] = LR_features(X_train, X_test, y_train_noise, y_test)

100%|██████████| 29/29 [02:14<00:00,  4.62s/it]


KNN optimal number of PCA components: 11
Cross val err:  0.21592559656143473
Train err:  0.2009237875288684
Test err:  0.01730103806228378




100%|██████████| 99/99 [00:15<00:00,  6.25it/s]


KNN optimal number of features: 48
Most predictive features: Index(['V3', 'V18', 'V29', 'V68', 'V87', 'V217', 'V289', 'V307', 'V350',
       'V462', 'V475', 'V494', 'V507', 'V539', 'V568', 'V627', 'V673', 'V687',
       'V845', 'V855', 'V982', 'V1033', 'V1066', 'V1097', 'V1098', 'V1101',
       'V1152', 'V1193', 'V1218', 'V1256', 'V1263', 'V1478', 'V1517', 'V1533',
       'V1549', 'V1575', 'V1654', 'V1673', 'V1697', 'V1744', 'V1787', 'V1799',
       'V1812', 'V1829', 'V1846', 'V1882', 'V1936', 'V1971'],
      dtype='object')
Cross val err:  0.2174714688009487
Train err:  0.20631254811393385
Test err:  0.02422145328719727


100%|██████████| 29/29 [02:28<00:00,  5.12s/it]


SVC optimal number of PCA components: 22
Cross val err:  0.20476508077664146
Train err:  0.201693610469592
Test err:  0.0


100%|██████████| 99/99 [01:57<00:00,  1.19s/it]


SVC optimal number of features: 97
Most predictive features: Index(['V3', 'V18', 'V29', 'V30', 'V68', 'V87', 'V178', 'V193', 'V200', 'V217',
       'V227', 'V274', 'V289', 'V307', 'V308', 'V350', 'V392', 'V394', 'V462',
       'V464', 'V475', 'V493', 'V494', 'V507', 'V539', 'V541', 'V544', 'V568',
       'V578', 'V600', 'V627', 'V657', 'V663', 'V667', 'V673', 'V687', 'V691',
       'V730', 'V759', 'V795', 'V803', 'V845', 'V850', 'V855', 'V914', 'V959',
       'V982', 'V1033', 'V1066', 'V1071', 'V1097', 'V1098', 'V1101', 'V1102',
       'V1126', 'V1147', 'V1152', 'V1193', 'V1203', 'V1218', 'V1249', 'V1256',
       'V1263', 'V1290', 'V1293', 'V1300', 'V1315', 'V1478', 'V1517', 'V1530',
       'V1533', 'V1549', 'V1572', 'V1575', 'V1635', 'V1644', 'V1654', 'V1658',
       'V1667', 'V1673', 'V1697', 'V1702', 'V1719', 'V1744', 'V1772', 'V1787',
       'V1799', 'V1812', 'V1813', 'V1827', 'V1829', 'V1846', 'V1871', 'V1882',
       'V1931', 'V1936', 'V1971'],
      dtype='object')




Cross val err:  0.20438120646213132
Train err:  0.19938414164742113
Test err:  0.0


100%|██████████| 29/29 [02:05<00:00,  4.33s/it]


KNN optimal number of PCA components: 20
Cross val err:  0.20630576552541868
Train err:  0.20438799076212466
Test err:  0.01038062283737029


100%|██████████| 99/99 [02:43<00:00,  1.65s/it]


LR optimal number of features: 55
Most predictive features: Index(['V3', 'V18', 'V29', 'V68', 'V87', 'V217', 'V289', 'V307', 'V308',
       'V350', 'V394', 'V462', 'V464', 'V475', 'V494', 'V507', 'V539', 'V568',
       'V600', 'V627', 'V673', 'V687', 'V845', 'V855', 'V982', 'V1033',
       'V1066', 'V1097', 'V1098', 'V1101', 'V1147', 'V1152', 'V1193', 'V1218',
       'V1256', 'V1263', 'V1478', 'V1517', 'V1533', 'V1549', 'V1575', 'V1654',
       'V1673', 'V1697', 'V1744', 'V1787', 'V1799', 'V1812', 'V1827', 'V1829',
       'V1846', 'V1871', 'V1882', 'V1936', 'V1971'],
      dtype='object')
Cross val err:  0.2136208685341634
Train err:  0.20592763664357194
Test err:  0.02076124567474047




In [19]:
##Mislabel fraction 0.5, 70/30 split
mislabel_fraction = 0.5 
split = 0.7
X_train, X_test, y_train, y_test = pre_process(df, labels_df, split)
y_train_noise = mislabel(mislabel_fraction, y_train)

d[f"KNN_PCA_miss_{mislabel_fraction:.1f}_{(1-split)*100:.0f}_{split*100:.0f}"] = KNN_PCA(X_train, X_test, y_train_noise, y_test)
d[f"KNN_Feat_miss_{mislabel_fraction:.1f}_{(1-split)*100:.0f}_{split*100:.0f}"] = KNN_features(X_train, X_test, y_train_noise, y_test)
d[f"SVC_PCA_miss_{mislabel_fraction:.1f}_{(1-split)*100:.0f}_{split*100:.0f}"] = SVC_PCA(X_train, X_test, y_train_noise, y_test)
d[f"SVC_Feat_miss_{mislabel_fraction:.1f}_{(1-split)*100:.0f}_{split*100:.0f}"] = SVC_features(X_train, X_test, y_train_noise, y_test)
d[f"LR_PCA_miss_{mislabel_fraction:.1f}_{(1-split)*100:.0f}_{split*100:.0f}"] = LR_PCA(X_train, X_test, y_train_noise, y_test)
d[f"LR_Feat_miss_{mislabel_fraction:.1f}_{(1-split)*100:.0f}_{split*100:.0f}"] = LR_features(X_train, X_test, y_train_noise, y_test)

100%|██████████| 29/29 [01:46<00:00,  3.68s/it]


KNN optimal number of PCA components: 27
Cross val err:  0.598019801980198
Train err:  0.46336633663366333
Test err:  0.2364475201845444




100%|██████████| 99/99 [00:12<00:00,  7.83it/s]


KNN optimal number of features: 73
Most predictive features: Index(['V3', 'V18', 'V68', 'V77', 'V87', 'V200', 'V217', 'V289', 'V307',
       'V350', 'V362', 'V375', 'V392', 'V394', 'V475', 'V494', 'V539', 'V541',
       'V568', 'V578', 'V600', 'V627', 'V663', 'V667', 'V673', 'V676', 'V687',
       'V759', 'V795', 'V845', 'V855', 'V982', 'V1028', 'V1033', 'V1066',
       'V1097', 'V1101', 'V1147', 'V1193', 'V1218', 'V1256', 'V1263', 'V1286',
       'V1293', 'V1295', 'V1300', 'V1399', 'V1478', 'V1517', 'V1533', 'V1535',
       'V1549', 'V1575', 'V1635', 'V1644', 'V1654', 'V1658', 'V1667', 'V1673',
       'V1697', 'V1702', 'V1744', 'V1772', 'V1787', 'V1799', 'V1827', 'V1846',
       'V1882', 'V1896', 'V1931', 'V1936', 'V1971', 'V1999'],
      dtype='object')
Cross val err:  0.5925742574257427
Train err:  0.4579207920792079
Test err:  0.273356401384083


100%|██████████| 29/29 [02:28<00:00,  5.10s/it]


SVC optimal number of PCA components: 20
Cross val err:  0.504950495049505
Train err:  0.5004950495049505
Test err:  0.014994232987312617


100%|██████████| 99/99 [02:02<00:00,  1.23s/it]


SVC optimal number of features: 96
Most predictive features: Index(['V3', 'V18', 'V30', 'V68', 'V77', 'V87', 'V177', 'V178', 'V193', 'V200',
       'V217', 'V227', 'V278', 'V289', 'V307', 'V350', 'V362', 'V372', 'V375',
       'V392', 'V394', 'V464', 'V475', 'V493', 'V494', 'V507', 'V539', 'V541',
       'V568', 'V578', 'V600', 'V627', 'V663', 'V664', 'V667', 'V673', 'V676',
       'V687', 'V728', 'V759', 'V795', 'V845', 'V855', 'V871', 'V982', 'V1015',
       'V1028', 'V1033', 'V1045', 'V1066', 'V1097', 'V1101', 'V1102', 'V1147',
       'V1152', 'V1193', 'V1218', 'V1222', 'V1256', 'V1263', 'V1286', 'V1290',
       'V1293', 'V1295', 'V1300', 'V1397', 'V1399', 'V1478', 'V1517', 'V1530',
       'V1533', 'V1535', 'V1549', 'V1575', 'V1604', 'V1635', 'V1644', 'V1646',
       'V1654', 'V1658', 'V1667', 'V1673', 'V1697', 'V1702', 'V1744', 'V1772',
       'V1787', 'V1799', 'V1827', 'V1846', 'V1882', 'V1896', 'V1931', 'V1936',
       'V1971', 'V1999'],
      dtype='object')




Cross val err:  0.5089108910891089
Train err:  0.49653465346534653
Test err:  0.028835063437139596


100%|██████████| 29/29 [02:03<00:00,  4.25s/it]


KNN optimal number of PCA components: 11
Cross val err:  0.5084158415841584
Train err:  0.5099009900990099
Test err:  0.028835063437139596


100%|██████████| 99/99 [02:09<00:00,  1.30s/it]


LR optimal number of features: 15
Most predictive features: Index(['V3', 'V18', 'V68', 'V87', 'V217', 'V539', 'V627', 'V1101', 'V1256',
       'V1533', 'V1654', 'V1673', 'V1744', 'V1787', 'V1936'],
      dtype='object')
Cross val err:  0.5193069306930693
Train err:  0.5143564356435644
Test err:  0.05536332179930792


In [20]:
##Mislabel fraction 0.5, 80/20 split
mislabel_fraction = 0.5
split = 0.8
X_train, X_test, y_train, y_test = pre_process(df, labels_df, split)
y_train_noise = mislabel(mislabel_fraction, y_train)

d[f"KNN_PCA_miss_{mislabel_fraction:.1f}_{(1-split)*100:.0f}_{split*100:.0f}"] = KNN_PCA(X_train, X_test, y_train_noise, y_test)
d[f"KNN_Feat_miss_{mislabel_fraction:.1f}_{(1-split)*100:.0f}_{split*100:.0f}"] = KNN_features(X_train, X_test, y_train_noise, y_test)
d[f"SVC_PCA_miss_{mislabel_fraction:.1f}_{(1-split)*100:.0f}_{split*100:.0f}"] = SVC_PCA(X_train, X_test, y_train_noise, y_test)
d[f"SVC_Feat_miss_{mislabel_fraction:.1f}_{(1-split)*100:.0f}_{split*100:.0f}"] = SVC_features(X_train, X_test, y_train_noise, y_test)
d[f"LR_PCA_miss_{mislabel_fraction:.1f}_{(1-split)*100:.0f}_{split*100:.0f}"] = LR_PCA(X_train, X_test, y_train_noise, y_test)
d[f"LR_Feat_miss_{mislabel_fraction:.1f}_{(1-split)*100:.0f}_{split*100:.0f}"] = LR_features(X_train, X_test, y_train_noise, y_test)

100%|██████████| 29/29 [02:07<00:00,  4.39s/it]


KNN optimal number of PCA components: 6
Cross val err:  0.5855292935553239
Train err:  0.45127760935469896
Test err:  0.24221453287197237




100%|██████████| 99/99 [00:14<00:00,  7.05it/s]


KNN optimal number of features: 40
Most predictive features: Index(['V3', 'V18', 'V68', 'V87', 'V193', 'V289', 'V307', 'V350', 'V475',
       'V494', 'V507', 'V539', 'V627', 'V667', 'V845', 'V1066', 'V1097',
       'V1101', 'V1126', 'V1152', 'V1193', 'V1218', 'V1256', 'V1263', 'V1290',
       'V1478', 'V1533', 'V1549', 'V1635', 'V1654', 'V1673', 'V1697', 'V1702',
       'V1744', 'V1772', 'V1787', 'V1799', 'V1846', 'V1882', 'V1936'],
      dtype='object')
Cross val err:  0.5842268360706538
Train err:  0.4482459939367691
Test err:  0.22491349480968859


100%|██████████| 29/29 [02:42<00:00,  5.59s/it]


SVC optimal number of PCA components: 11
Cross val err:  0.5032378323050775
Train err:  0.5002165439584236
Test err:  0.00865051903114189


100%|██████████| 99/99 [02:27<00:00,  1.49s/it]


SVC optimal number of features: 81
Most predictive features: Index(['V3', 'V18', 'V30', 'V68', 'V77', 'V87', 'V178', 'V193', 'V217', 'V289',
       'V307', 'V308', 'V350', 'V394', 'V475', 'V482', 'V494', 'V507', 'V539',
       'V541', 'V568', 'V600', 'V627', 'V663', 'V667', 'V673', 'V687', 'V691',
       'V845', 'V871', 'V959', 'V982', 'V1015', 'V1017', 'V1033', 'V1066',
       'V1097', 'V1098', 'V1101', 'V1102', 'V1126', 'V1147', 'V1152', 'V1193',
       'V1218', 'V1222', 'V1230', 'V1249', 'V1256', 'V1263', 'V1286', 'V1290',
       'V1300', 'V1315', 'V1397', 'V1478', 'V1517', 'V1530', 'V1533', 'V1535',
       'V1549', 'V1601', 'V1635', 'V1654', 'V1673', 'V1697', 'V1702', 'V1731',
       'V1744', 'V1772', 'V1787', 'V1799', 'V1812', 'V1827', 'V1829', 'V1846',
       'V1849', 'V1882', 'V1936', 'V1971', 'V1999'],
      dtype='object')




Cross val err:  0.5041026941243861
Train err:  0.49415331312256383
Test err:  0.01730103806228378


100%|██████████| 29/29 [01:54<00:00,  3.96s/it]


KNN optimal number of PCA components: 24
Cross val err:  0.5054023344695795
Train err:  0.5028150714595063
Test err:  0.01730103806228378


100%|██████████| 99/99 [02:37<00:00,  1.59s/it]

LR optimal number of features: 16
Most predictive features: Index(['V18', 'V68', 'V539', 'V627', 'V1097', 'V1101', 'V1193', 'V1256',
       'V1549', 'V1654', 'V1673', 'V1744', 'V1787', 'V1846', 'V1882', 'V1936'],
      dtype='object')
Cross val err:  0.5170981585298289
Train err:  0.5153746210480727
Test err:  0.05536332179930792





In [21]:
##Mislabel fraction 0.5, 90/10 split
mislabel_fraction = 0.5
split = 0.9
X_train, X_test, y_train, y_test = pre_process(df, labels_df, split)
y_train_noise = mislabel(mislabel_fraction, y_train)

d[f"KNN_PCA_miss_{mislabel_fraction:.1f}_{(1-split)*100:.0f}_{split*100:.0f}"] = KNN_PCA(X_train, X_test, y_train_noise, y_test)
d[f"KNN_Feat_miss_{mislabel_fraction:.1f}_{(1-split)*100:.0f}_{split*100:.0f}"] = KNN_features(X_train, X_test, y_train_noise, y_test)
d[f"SVC_PCA_miss_{mislabel_fraction:.1f}_{(1-split)*100:.0f}_{split*100:.0f}"] = SVC_PCA(X_train, X_test, y_train_noise, y_test)
d[f"SVC_Feat_miss_{mislabel_fraction:.1f}_{(1-split)*100:.0f}_{split*100:.0f}"] = SVC_features(X_train, X_test, y_train_noise, y_test)
d[f"LR_PCA_miss_{mislabel_fraction:.1f}_{(1-split)*100:.0f}_{split*100:.0f}"] = LR_PCA(X_train, X_test, y_train_noise, y_test)
d[f"LR_Feat_miss_{mislabel_fraction:.1f}_{(1-split)*100:.0f}_{split*100:.0f}"] = LR_features(X_train, X_test, y_train_noise, y_test)

100%|██████████| 29/29 [02:08<00:00,  4.44s/it]


KNN optimal number of PCA components: 8
Cross val err:  0.5912361049355269
Train err:  0.45650500384911474
Test err:  0.22491349480968859




100%|██████████| 99/99 [00:17<00:00,  5.80it/s]


KNN optimal number of features: 35
Most predictive features: Index(['V3', 'V18', 'V68', 'V87', 'V289', 'V350', 'V475', 'V539', 'V568',
       'V627', 'V845', 'V982', 'V1033', 'V1066', 'V1097', 'V1101', 'V1193',
       'V1218', 'V1256', 'V1263', 'V1300', 'V1478', 'V1517', 'V1533', 'V1549',
       'V1572', 'V1654', 'V1673', 'V1697', 'V1744', 'V1772', 'V1787', 'V1799',
       'V1882', 'V1936'],
      dtype='object')
Cross val err:  0.5869949607232844
Train err:  0.4526558891454965
Test err:  0.2802768166089965


100%|██████████| 29/29 [03:02<00:00,  6.28s/it]


SVC optimal number of PCA components: 13
Cross val err:  0.5038661627389951
Train err:  0.5003849114703618
Test err:  0.01038062283737029


100%|██████████| 99/99 [03:08<00:00,  1.91s/it]


SVC optimal number of features: 91
Most predictive features: Index(['V3', 'V18', 'V30', 'V68', 'V77', 'V87', 'V178', 'V193', 'V200', 'V217',
       'V227', 'V278', 'V289', 'V307', 'V350', 'V359', 'V372', 'V392', 'V464',
       'V475', 'V493', 'V494', 'V539', 'V541', 'V568', 'V600', 'V614', 'V627',
       'V663', 'V673', 'V676', 'V687', 'V759', 'V795', 'V845', 'V855', 'V871',
       'V893', 'V959', 'V982', 'V1024', 'V1028', 'V1033', 'V1066', 'V1078',
       'V1089', 'V1097', 'V1101', 'V1102', 'V1147', 'V1152', 'V1175', 'V1193',
       'V1218', 'V1222', 'V1249', 'V1256', 'V1263', 'V1290', 'V1300', 'V1397',
       'V1399', 'V1478', 'V1517', 'V1530', 'V1533', 'V1535', 'V1549', 'V1572',
       'V1575', 'V1589', 'V1604', 'V1635', 'V1646', 'V1654', 'V1667', 'V1673',
       'V1697', 'V1702', 'V1744', 'V1772', 'V1787', 'V1799', 'V1812', 'V1827',
       'V1846', 'V1871', 'V1882', 'V1931', 'V1936', 'V1971'],
      dtype='object')




Cross val err:  0.5065584704313029
Train err:  0.49538106235565815
Test err:  0.03114186851211076


100%|██████████| 29/29 [02:09<00:00,  4.47s/it]


KNN optimal number of PCA components: 29
Cross val err:  0.5054031421372462
Train err:  0.5026943802925328
Test err:  0.02768166089965396


100%|██████████| 99/99 [01:32<00:00,  1.07it/s]


LR optimal number of features: 37
Most predictive features: Index(['V3', 'V18', 'V68', 'V87', 'V289', 'V350', 'V475', 'V539', 'V568',
       'V627', 'V845', 'V982', 'V1033', 'V1066', 'V1097', 'V1101', 'V1193',
       'V1218', 'V1256', 'V1263', 'V1300', 'V1478', 'V1517', 'V1533', 'V1549',
       'V1572', 'V1654', 'V1673', 'V1697', 'V1744', 'V1772', 'V1787', 'V1799',
       'V1846', 'V1882', 'V1936', 'V1971'],
      dtype='object')
Cross val err:  0.5142544834741367
Train err:  0.5076982294072363
Test err:  0.03460207612456745




In [22]:
##Mislabel fraction 0.9, 70/30 split
mislabel_fraction = 0.9
split = 0.7
X_train, X_test, y_train, y_test = pre_process(df, labels_df, split)
y_train_noise = mislabel(mislabel_fraction, y_train)

d[f"KNN_PCA_miss_{mislabel_fraction:.1f}_{(1-split)*100:.0f}_{split*100:.0f}"] = KNN_PCA(X_train, X_test, y_train_noise, y_test)
d[f"KNN_Feat_miss_{mislabel_fraction:.1f}_{(1-split)*100:.0f}_{split*100:.0f}"] = KNN_features(X_train, X_test, y_train_noise, y_test)
d[f"SVC_PCA_miss_{mislabel_fraction:.1f}_{(1-split)*100:.0f}_{split*100:.0f}"] = SVC_PCA(X_train, X_test, y_train_noise, y_test)
d[f"SVC_Feat_miss_{mislabel_fraction:.1f}_{(1-split)*100:.0f}_{split*100:.0f}"] = SVC_features(X_train, X_test, y_train_noise, y_test)
d[f"LR_PCA_miss_{mislabel_fraction:.1f}_{(1-split)*100:.0f}_{split*100:.0f}"] = LR_PCA(X_train, X_test, y_train_noise, y_test)
d[f"LR_Feat_miss_{mislabel_fraction:.1f}_{(1-split)*100:.0f}_{split*100:.0f}"] = LR_features(X_train, X_test, y_train_noise, y_test)

  3%|▎         | 1/29 [00:01<00:45,  1.64s/it]

100%|██████████| 29/29 [01:09<00:00,  2.40s/it]


KNN optimal number of PCA components: 17
Cross val err:  0.8103960396039604
Train err:  0.5787128712871288
Test err:  0.8961937716262975




100%|██████████| 99/99 [00:10<00:00,  9.26it/s]


KNN optimal number of features: 85
Most predictive features: Index(['V3', 'V19', 'V30', 'V61', 'V68', 'V77', 'V128', 'V163', 'V178', 'V201',
       'V231', 'V289', 'V308', 'V316', 'V350', 'V379', 'V424', 'V453', 'V469',
       'V475', 'V514', 'V541', 'V570', 'V594', 'V624', 'V631', 'V667', 'V763',
       'V764', 'V795', 'V822', 'V826', 'V831', 'V914', 'V915', 'V922', 'V939',
       'V942', 'V982', 'V1028', 'V1033', 'V1046', 'V1062', 'V1083', 'V1089',
       'V1101', 'V1171', 'V1175', 'V1193', 'V1195', 'V1198', 'V1217', 'V1296',
       'V1299', 'V1300', 'V1351', 'V1354', 'V1380', 'V1397', 'V1398', 'V1443',
       'V1478', 'V1487', 'V1494', 'V1530', 'V1565', 'V1589', 'V1590', 'V1604',
       'V1615', 'V1646', 'V1654', 'V1731', 'V1752', 'V1769', 'V1772', 'V1787',
       'V1813', 'V1846', 'V1882', 'V1919', 'V1936', 'V1962', 'V1966', 'V1985'],
      dtype='object')
Cross val err:  0.804950495049505
Train err:  0.5603960396039604
Test err:  0.889273356401384


100%|██████████| 29/29 [01:52<00:00,  3.90s/it]


SVC optimal number of PCA components: 11
Cross val err:  0.806930693069307
Train err:  0.7217821782178218
Test err:  0.9907727797001153


100%|██████████| 99/99 [02:02<00:00,  1.23s/it]


SVC optimal number of features: 83
Most predictive features: Index(['V3', 'V19', 'V30', 'V61', 'V68', 'V77', 'V128', 'V163', 'V178', 'V201',
       'V231', 'V289', 'V308', 'V316', 'V350', 'V379', 'V424', 'V453', 'V469',
       'V475', 'V514', 'V541', 'V570', 'V594', 'V624', 'V631', 'V667', 'V763',
       'V764', 'V795', 'V822', 'V826', 'V831', 'V914', 'V915', 'V922', 'V939',
       'V942', 'V982', 'V1033', 'V1046', 'V1062', 'V1083', 'V1089', 'V1101',
       'V1171', 'V1175', 'V1193', 'V1195', 'V1198', 'V1217', 'V1296', 'V1299',
       'V1300', 'V1354', 'V1380', 'V1397', 'V1398', 'V1443', 'V1478', 'V1487',
       'V1494', 'V1530', 'V1565', 'V1589', 'V1590', 'V1604', 'V1615', 'V1646',
       'V1654', 'V1731', 'V1752', 'V1769', 'V1772', 'V1787', 'V1813', 'V1846',
       'V1882', 'V1919', 'V1936', 'V1962', 'V1966', 'V1985'],
      dtype='object')




Cross val err:  0.7930693069306931
Train err:  0.5866336633663367
Test err:  0.9896193771626297


100%|██████████| 29/29 [01:09<00:00,  2.39s/it]


KNN optimal number of PCA components: 18
Cross val err:  0.8084158415841585
Train err:  0.7608910891089109
Test err:  0.9792387543252595


100%|██████████| 99/99 [01:23<00:00,  1.18it/s]


LR optimal number of features: 28
Most predictive features: Index(['V3', 'V68', 'V128', 'V178', 'V308', 'V379', 'V469', 'V541', 'V594',
       'V624', 'V631', 'V763', 'V822', 'V826', 'V915', 'V1193', 'V1300',
       'V1380', 'V1397', 'V1478', 'V1494', 'V1654', 'V1731', 'V1752', 'V1772',
       'V1787', 'V1882', 'V1966'],
      dtype='object')
Cross val err:  0.7950495049504951
Train err:  0.7485148514851485
Test err:  0.9700115340253749




In [23]:
##Mislabel fraction 0.9, 80/20 split
mislabel_fraction = 0.9
split = 0.8
X_train, X_test, y_train, y_test = pre_process(df, labels_df, split)
y_train_noise = mislabel(mislabel_fraction, y_train)

d[f"KNN_PCA_miss_{mislabel_fraction:.1f}_{(1-split)*100:.0f}_{split*100:.0f}"] = KNN_PCA(X_train, X_test, y_train_noise, y_test)
d[f"KNN_Feat_miss_{mislabel_fraction:.1f}_{(1-split)*100:.0f}_{split*100:.0f}"] = KNN_features(X_train, X_test, y_train_noise, y_test)
d[f"SVC_PCA_miss_{mislabel_fraction:.1f}_{(1-split)*100:.0f}_{split*100:.0f}"] = SVC_PCA(X_train, X_test, y_train_noise, y_test)
d[f"SVC_Feat_miss_{mislabel_fraction:.1f}_{(1-split)*100:.0f}_{split*100:.0f}"] = SVC_features(X_train, X_test, y_train_noise, y_test)
d[f"LR_PCA_miss_{mislabel_fraction:.1f}_{(1-split)*100:.0f}_{split*100:.0f}"] = LR_PCA(X_train, X_test, y_train_noise, y_test)
d[f"LR_Feat_miss_{mislabel_fraction:.1f}_{(1-split)*100:.0f}_{split*100:.0f}"] = LR_features(X_train, X_test, y_train_noise, y_test)

  0%|          | 0/29 [00:00<?, ?it/s]

100%|██████████| 29/29 [01:21<00:00,  2.81s/it]


KNN optimal number of PCA components: 12
Cross val err:  0.8150641838277413
Train err:  0.570376786487657
Test err:  0.8927335640138409




100%|██████████| 99/99 [00:11<00:00,  8.38it/s]


KNN optimal number of features: 67
Most predictive features: Index(['V3', 'V50', 'V61', 'V68', 'V77', 'V91', 'V128', 'V145', 'V200', 'V201',
       'V217', 'V350', 'V354', 'V375', 'V392', 'V424', 'V474', 'V482', 'V493',
       'V514', 'V526', 'V570', 'V595', 'V627', 'V714', 'V819', 'V826', 'V843',
       'V845', 'V846', 'V871', 'V915', 'V1045', 'V1046', 'V1050', 'V1075',
       'V1089', 'V1097', 'V1101', 'V1126', 'V1152', 'V1193', 'V1217', 'V1256',
       'V1299', 'V1300', 'V1355', 'V1396', 'V1408', 'V1478', 'V1519', 'V1533',
       'V1554', 'V1575', 'V1635', 'V1646', 'V1654', 'V1658', 'V1667', 'V1772',
       'V1787', 'V1846', 'V1854', 'V1882', 'V1920', 'V1924', 'V1966'],
      dtype='object')
Cross val err:  0.8185395948953433
Train err:  0.5729753139887397
Test err:  0.9359861591695502


100%|██████████| 29/29 [02:08<00:00,  4.43s/it]


SVC optimal number of PCA components: 7
Cross val err:  0.8228695382708398
Train err:  0.7431788653096578
Test err:  1.0


100%|██████████| 99/99 [02:33<00:00,  1.55s/it]


SVC optimal number of features: 6
Most predictive features: Index(['V819', 'V1075', 'V1101', 'V1193', 'V1396', 'V1667'], dtype='object')




Cross val err:  0.8220065545445155
Train err:  0.7488090082286705
Test err:  0.9653979238754326


100%|██████████| 29/29 [01:14<00:00,  2.56s/it]


KNN optimal number of PCA components: 4
Cross val err:  0.8124705374163075
Train err:  0.7990472065829364
Test err:  0.9965397923875432


100%|██████████| 99/99 [01:36<00:00,  1.02it/s]


LR optimal number of features: 25
Most predictive features: Index(['V3', 'V77', 'V200', 'V201', 'V482', 'V514', 'V819', 'V871', 'V915',
       'V1050', 'V1075', 'V1089', 'V1097', 'V1101', 'V1193', 'V1217', 'V1299',
       'V1396', 'V1519', 'V1635', 'V1654', 'V1658', 'V1667', 'V1772', 'V1966'],
      dtype='object')
Cross val err:  0.8094449296184654
Train err:  0.7605023819835427
Test err:  0.972318339100346




In [24]:
##Mislabel fraction 0.9, 90/10 split
mislabel_fraction = 0.9
split = 0.9
X_train, X_test, y_train, y_test = pre_process(df, labels_df, split)
y_train_noise = mislabel(mislabel_fraction, y_train)

d[f"KNN_PCA_miss_{mislabel_fraction:.1f}_{(1-split)*100:.0f}_{split*100:.0f}"] = KNN_PCA(X_train, X_test, y_train_noise, y_test)
d[f"KNN_Feat_miss_{mislabel_fraction:.1f}_{(1-split)*100:.0f}_{split*100:.0f}"] = KNN_features(X_train, X_test, y_train_noise, y_test)
d[f"SVC_PCA_miss_{mislabel_fraction:.1f}_{(1-split)*100:.0f}_{split*100:.0f}"] = SVC_PCA(X_train, X_test, y_train_noise, y_test)
d[f"SVC_Feat_miss_{mislabel_fraction:.1f}_{(1-split)*100:.0f}_{split*100:.0f}"] = SVC_features(X_train, X_test, y_train_noise, y_test)
d[f"LR_PCA_miss_{mislabel_fraction:.1f}_{(1-split)*100:.0f}_{split*100:.0f}"] = LR_PCA(X_train, X_test, y_train_noise, y_test)
d[f"LR_Feat_miss_{mislabel_fraction:.1f}_{(1-split)*100:.0f}_{split*100:.0f}"] = LR_features(X_train, X_test, y_train_noise, y_test)

  0%|          | 0/29 [00:00<?, ?it/s]

100%|██████████| 29/29 [01:21<00:00,  2.81s/it]


KNN optimal number of PCA components: 2
Cross val err:  0.814476063435601
Train err:  0.565050038491147
Test err:  0.9204152249134948




100%|██████████| 99/99 [00:14<00:00,  6.90it/s]


KNN optimal number of features: 32
Most predictive features: Index(['V3', 'V68', 'V128', 'V178', 'V264', 'V469', 'V594', 'V627', 'V673',
       'V691', 'V826', 'V845', 'V861', 'V892', 'V915', 'V1033', 'V1097',
       'V1101', 'V1175', 'V1193', 'V1249', 'V1263', 'V1300', 'V1530', 'V1572',
       'V1589', 'V1604', 'V1654', 'V1846', 'V1882', 'V1965', 'V1971'],
      dtype='object')
Cross val err:  0.8179279679857714
Train err:  0.560816012317167
Test err:  0.9204152249134948


100%|██████████| 29/29 [02:22<00:00,  4.91s/it]


SVC optimal number of PCA components: 3
Cross val err:  0.8117815325329776
Train err:  0.7829099307159353
Test err:  0.9930795847750865


100%|██████████| 99/99 [03:12<00:00,  1.94s/it]


SVC optimal number of features: 25
Most predictive features: Index(['V3', 'V68', 'V178', 'V264', 'V469', 'V594', 'V627', 'V673', 'V691',
       'V826', 'V861', 'V892', 'V915', 'V1033', 'V1097', 'V1175', 'V1193',
       'V1249', 'V1263', 'V1300', 'V1572', 'V1589', 'V1654', 'V1846', 'V1965'],
      dtype='object')




Cross val err:  0.8148547502593745
Train err:  0.710161662817552
Test err:  1.0


100%|██████████| 29/29 [01:19<00:00,  2.75s/it]


KNN optimal number of PCA components: 11
Cross val err:  0.8056173113976582
Train err:  0.770977675134719
Test err:  0.9930795847750865


100%|██████████| 99/99 [01:38<00:00,  1.01it/s]


LR optimal number of features: 25
Most predictive features: Index(['V3', 'V68', 'V178', 'V264', 'V469', 'V594', 'V627', 'V673', 'V691',
       'V826', 'V861', 'V892', 'V915', 'V1033', 'V1097', 'V1175', 'V1193',
       'V1249', 'V1263', 'V1300', 'V1572', 'V1589', 'V1654', 'V1846', 'V1965'],
      dtype='object')
Cross val err:  0.8009908107306951
Train err:  0.7590454195535027
Test err:  0.9896193771626297




In [25]:
#Save data of part 2
df_miss = pd.DataFrame(data =d, index = ['Train', 'Cross', 'Test', 'Opt_Feat_or_PCA'])
df_miss.to_csv('./data_miss.csv', sep=" ")

In [26]:
##Print data from both parts
td = pd.read_csv('./data.csv', sep = " ", header=0, index_col=0)
print(td)
td_miss= pd.read_csv('./data_miss.csv', sep = " ", header=0, index_col=0)
print(td_miss)



                 KNN_PCA_30_70  KNN_Feat_30_70  SVC_PCA_30_70  SVC_Feat_30_70   
Train                 0.005941        0.005941       0.004950        0.001485  \
Cross                 0.007921        0.007426       0.005941        0.003960   
Test                  0.006920        0.010381       0.008074        0.008074   
Opt_Feat_or_PCA      22.000000       90.000000      10.000000       87.000000   

                 LR_PCA_30_70  LR_Feat_30_70  KNN_PCA_20_80  KNN_Feat_20_80   
Train                0.000000        0.00000       0.004764        0.005197  \
Cross                0.005446        0.00297       0.007361        0.007795   
Test                 0.003460        0.00692       0.013841        0.012111   
Opt_Feat_or_PCA     30.000000       87.00000      11.000000       81.000000   

                 SVC_PCA_20_80  SVC_Feat_20_80  LR_PCA_20_80  LR_Feat_20_80   
Train                 0.003032        0.001299      0.000000       0.000000  \
Cross                 0.005631        0.