In [2]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()

#features = [f"V{x}" for x in range(1,2000)]
df = pd.read_csv('Data/TCGAdata.txt', sep=" " ,header=0)
labels_df = pd.read_csv('Data/TCGAlabels', sep=" " ,header=0)

#Standardizing the rows (transposing as fit_transform standardizes along columns)
scaled_df = pd.DataFrame(np.transpose(scaler.fit_transform(df.transpose())), columns=df.columns)

In [6]:
# PCA

from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import accuracy_score
from sklearn.pipeline import make_pipeline
from sklearn.decomposition import PCA
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC  
from sklearn.linear_model import LogisticRegression
from sklearn.feature_selection import SelectKBest, f_classif

from tqdm import tqdm

# Range of PCA-components

X_train, X_test, y_train, y_test = train_test_split(scaled_df, labels_df.values.ravel(), test_size=0.2, random_state=42)

In [13]:
##KNN PCA

max_num_components = 25
num_components_range = range(1, max_num_components)

KNN_mean_scores = np.zeros(max_num_components)

# Loop over different numbers of components
for n_components in tqdm(num_components_range):
    
    #PCA

    KNN_pipeline = make_pipeline(PCA(n_components=n_components), KNeighborsClassifier(n_neighbors=5))
    
    KNN_scores = cross_val_score(KNN_pipeline, X_train, y_train, cv=5)
    KNN_mean_score = KNN_scores.mean()

    #KNN_mean_scores.append(KNN_mean_score)
    KNN_mean_scores[n_components] = KNN_mean_score
    
KNN_optimal_n_components = np.where(KNN_mean_scores==KNN_mean_scores.max())[0][0]+1
cross_val_err = 1 - max(KNN_mean_scores)

print("KNN optimal number of PCA components:", KNN_optimal_n_components)

opt_pipeline = make_pipeline(PCA(n_components=KNN_optimal_n_components), KNeighborsClassifier(n_neighbors=5))

opt_pipeline.fit(X_train, y_train)
train_pred = opt_pipeline.predict(X_train)
train_error = 1 - accuracy_score(y_train, train_pred)

test_pred = opt_pipeline.predict(X_test)
test_error = 1 - accuracy_score(y_test, test_pred)

print("Cross val err: ", cross_val_err)
print("Train err: ", train_error)
print("Test err: ", test_error)

100%|███████████████████████████████████████████| 24/24 [02:57<00:00,  7.39s/it]


KNN optimal number of PCA components: 23
Cross val err:  0.006063423200082574
Train err:  0.0047639670853183436
Test err:  0.01211072664359858


In [14]:
##KNN features
max_num_features = 30

num_features = range(1, max_num_features)
KNN_mean_scores = np.zeros(max_num_features)

# Loop over different numbers of features
for k in tqdm(num_features):
    
    feature_selector = SelectKBest(f_classif, k=k)

    X_train_selected = feature_selector.fit_transform(X_train, y_train)
    
    model = KNeighborsClassifier(n_neighbors=5)
    
    KNN_scores = cross_val_score(model, X_train_selected, y_train, cv=5)
    KNN_mean_score = KNN_scores.mean()

    #KNN_mean_scores.append(KNN_mean_score)

KNN_optimal_k_features = np.where(KNN_mean_scores==KNN_mean_scores.max())[0][0]+1
cross_val_err = 1 - max(KNN_mean_scores)

print("KNN optimal number of features:", KNN_optimal_k_features)

feature_selector = SelectKBest(f_classif, k=KNN_optimal_k_features)
X_train_selected = feature_selector.fit_transform(X_train, y_train)
model.fit(X_train_selected, y_train)


# Get the indices of the selected features
selected_feature_indices = feature_selector.get_support(indices=True)

# Get the names of the most predictive features
selected_features = X_train.columns[selected_feature_indices]
print("Most predictive features:", selected_features)


train_pred = model.predict(X_train_selected)
train_error = 1 - accuracy_score(y_train, train_pred)

test_pred = model.predict(X_test[X_test.columns[selected_feature_indices]])
test_error = 1 - accuracy_score(y_test, test_pred)

print("Cross val err: ", cross_val_err)
print("Train err: ", train_error)
print("Test err: ", test_error)



  0%|                                                    | 0/29 [00:00<?, ?it/s]


AttributeError: 'numpy.ndarray' object has no attribute 'append'

In [5]:
##SVC PCA

SVC_mean_scores = np.zeros(max_num_components)

#num_components_range = range(1, max_num_components)

# Loop over different numbers of components
for n_components in tqdm(num_components_range):
    
    #PCA

    SVC_pipeline = make_pipeline(PCA(n_components=n_components), SVC())
    
    SVC_scores = cross_val_score(SVC_pipeline, X_train, y_train, cv=5)
    SVC_mean_score = SVC_scores.mean()

    #SVC_mean_scores.append(SVC_mean_score)

SVC_optimal_n_components = np.where(SVC_mean_scores==SVC_mean_scores.max())[0][0]+1
cross_val_err = 1 - max(SVC_mean_scores)

print("SVC optimal number of PCA components:", SVC_optimal_n_components)

opt_pipeline = make_pipeline(PCA(n_components=SVC_optimal_n_components), SVC())

opt_pipeline.fit(X_train, y_train)
train_pred = opt_pipeline.predict(X_train)
train_error = 1 - accuracy_score(y_train, train_pred)

test_pred = opt_pipeline.predict(X_test)
test_error = 1 - accuracy_score(y_test, test_pred)

print("Cross val err: ", cross_val_err)
print("Train err: ", train_error)
print("Test err: ", test_error)

SVC optimal number of PCA components: 24
Cross val err:  0.005197622334281782
Train err:  0.0017323516673884987
Test err:  0.01384083044982698


In [6]:
##SVC features
max_num_features = 50
num_features = range(1, max_num_features)
SVC_mean_scores = np.zeros(max_num_features)

# Loop over different numbers of components
for k in tqdm(num_features):
    
    feature_selector = SelectKBest(f_classif, k=k)

    X_train_selected = feature_selector.fit_transform(X_train, y_train)
    
    model = SVC()
    
    SVC_scores = cross_val_score(model, X_train_selected, y_train, cv=5)
    SVC_mean_score = SVC_scores.mean()

    #SVC_mean_scores.append(SVC_mean_score)

SVC_optimal_k_features = np.where(SVC_mean_scores==SVC_mean_scores.max())[0][0]+1
cross_val_err = 1 - max(SVC_mean_scores)

print("SVC optimal number of features:", SVC_optimal_k_features)

feature_selector = SelectKBest(f_classif, k=SVC_optimal_k_features)
X_train_selected = feature_selector.fit_transform(X_train, y_train)
model.fit(X_train_selected, y_train)

# Get the indices of the selected features
selected_feature_indices = feature_selector.get_support(indices=True)

# Get the names of the most predictive features
selected_features = X_train.columns[selected_feature_indices]
print("Most predictive features:", selected_features)


train_pred = model.predict(X_train_selected)
train_error = 1 - accuracy_score(y_train, train_pred)

test_pred = model.predict(X_test[X_test.columns[selected_feature_indices]])
test_error = 1 - accuracy_score(y_test, test_pred)

print("Cross val err: ", cross_val_err)
print("Train err: ", train_error)
print("Test err: ", test_error)

SVC optimal number of features: 43
Most predictive features: Index(['V3', 'V18', 'V29', 'V64', 'V68', 'V87', 'V307', 'V308', 'V462', 'V475',
       'V507', 'V539', 'V544', 'V627', 'V657', 'V687', 'V730', 'V803', 'V845',
       'V1005', 'V1071', 'V1097', 'V1098', 'V1101', 'V1173', 'V1193', 'V1206',
       'V1218', 'V1256', 'V1263', 'V1517', 'V1533', 'V1654', 'V1673', 'V1697',
       'V1744', 'V1787', 'V1799', 'V1812', 'V1829', 'V1871', 'V1882', 'V1936'],
      dtype='object')
Cross val err:  0.007363063545276161
Train err:  0.004330879168471191
Test err:  0.01211072664359858




In [7]:
##Logistic regression PCA

LR_mean_scores = np.zeros(max_num_components)

#num_components_range = range(1, 25)

# Loop over different numbers of components
for n_components in tqdm(num_components_range):
    
    #PCA

    LR_pipeline = make_pipeline(PCA(n_components=n_components), LogisticRegression(random_state=16, max_iter=10000))
    
    LR_scores = cross_val_score(LR_pipeline, X_train, y_train, cv=5)
    LR_mean_score = LR_scores.mean()

    #LR_mean_scores.append(LR_mean_score)

LR_optimal_n_components = np.where(LR_mean_scores==LR_mean_scores.max())[0][0]+1
cross_val_err = 1 - max(LR_mean_scores)

print("KNN optimal number of PCA components:", LR_optimal_n_components)

opt_pipeline = make_pipeline(PCA(n_components=LR_optimal_n_components), LogisticRegression(random_state=16, max_iter=10000))

opt_pipeline.fit(X_train, y_train)
train_pred = opt_pipeline.predict(X_train)
train_error = 1 - accuracy_score(y_train, train_pred)

test_pred = opt_pipeline.predict(X_test)
test_error = 1 - accuracy_score(y_test, test_pred)

print("Cross val err: ", cross_val_err)
print("Train err: ", train_error)
print("Test err: ", test_error)

KNN optimal number of PCA components: 18
Cross val err:  0.005198561380773903
Train err:  0.0
Test err:  0.00519031141868509


In [8]:
##Logistic Regression features
max_num_features = 30
num_features = range(1, max_num_features)
LR_mean_scores = np.zeros(max_num_features)

# Loop over different numbers of components
for k in tqdm(num_features):
    
    feature_selector = SelectKBest(f_classif, k=k)

    X_train_selected = feature_selector.fit_transform(X_train, y_train)
    
    model = LogisticRegression(random_state=16, max_iter=10000)
    
    LR_scores = cross_val_score(model, X_train_selected, y_train, cv=5)
    LR_mean_score = LR_scores.mean()

    #LR_mean_scores.append(LR_mean_score)

LR_optimal_k_features = np.where(LR_mean_scores==LR_mean_scores.max())[0][0]+1
cross_val_err = 1 - max(LR_mean_scores)

print("LR optimal number of features:", LR_optimal_k_features)

feature_selector = SelectKBest(f_classif, k=LR_optimal_k_features)
X_train_selected = feature_selector.fit_transform(X_train, y_train)
model.fit(X_train_selected, y_train)

# Get the indices of the selected features
selected_feature_indices = feature_selector.get_support(indices=True)

# Get the names of the most predictive features
selected_features = X_train.columns[selected_feature_indices]
print("Most predictive features:", selected_features)

train_pred = model.predict(X_train_selected)
train_error = 1 - accuracy_score(y_train, train_pred)

test_pred = model.predict(X_test[X_test.columns[selected_feature_indices]])
test_error = 1 - accuracy_score(y_test, test_pred)

print("Cross val err: ", cross_val_err)
print("Train err: ", train_error)
print("Test err: ", test_error)

LR optimal number of features: 28
Most predictive features: Index(['V18', 'V68', 'V462', 'V507', 'V539', 'V627', 'V657', 'V687', 'V730',
       'V803', 'V845', 'V1005', 'V1071', 'V1098', 'V1101', 'V1193', 'V1256',
       'V1533', 'V1654', 'V1673', 'V1744', 'V1787', 'V1799', 'V1812', 'V1829',
       'V1871', 'V1882', 'V1936'],
      dtype='object')
Cross val err:  0.016894385441023063
Train err:  0.0038977912516240387
Test err:  0.02422145328719727




In [9]:
##Part 2 Theme 2 mislabeling

mislabel_fraction = 0.2
labels = set(labels_df["x"])

num_samples = len(y_train)
num_mislabels = int(mislabel_fraction * num_samples)
mislabel_indices = np.random.choice(num_samples, num_mislabels, replace=False)

y_train_noise = y_train.copy()

for i in mislabel_indices:
    correct = y_train[i]
    y_train_noise[i] = np.random.choice(list(labels - set([correct])))


In [10]:
##KNN PCA

num_components_range = range(1, max_num_components)

KNN_mean_scores = np.zeros(max_num_components)

# Loop over different numbers of components
for n_components in tqdm(num_components_range):
    
    #PCA

    KNN_pipeline = make_pipeline(PCA(n_components=n_components), KNeighborsClassifier(n_neighbors=5))
    # Kanske borde göras så att den kör cross validation på korrekt data?
    KNN_scores = cross_val_score(KNN_pipeline, X_train, y_train_noise, cv=5)
    KNN_mean_score = KNN_scores.mean()

    #KNN_mean_scores.append(KNN_mean_score)

KNN_optimal_n_components = num_components_range[KNN_mean_scores.index(max(KNN_mean_scores))]
cross_val_err = 1 - max(KNN_mean_scores)

print("KNN optimal number of PCA components with mislabels:", KNN_optimal_n_components)

opt_pipeline = make_pipeline(PCA(n_components=KNN_optimal_n_components), KNeighborsClassifier(n_neighbors=5))

opt_pipeline.fit(X_train, y_train_noise)
train_pred = opt_pipeline.predict(X_train)
train_error = 1 - accuracy_score(y_train, train_pred)

test_pred = opt_pipeline.predict(X_test)
test_error = 1 - accuracy_score(y_test, test_pred)

print("Cross val err: ", cross_val_err)
print("Train err: ", train_error)
print("Test err: ", test_error)

KNN optimal number of PCA components with mislabels: 6
Cross val err:  0.21914903606877567
Train err:  0.03508012126461668
Test err:  0.03633217993079585


In [11]:
###KNN features
max_num_features = 30

num_features = range(1, max_num_features)
KNN_mean_scores = np.zeros(max_num_features)

# Loop over different numbers of features
for k in tqdm(num_features):
    
    feature_selector = SelectKBest(f_classif, k=k)

    X_train_selected = feature_selector.fit_transform(X_train, y_train)
    
    model = KNeighborsClassifier(n_neighbors=5)
    
    KNN_scores = cross_val_score(model, X_train_selected, y_train, cv=5)
    KNN_mean_score = KNN_scores.mean()

    #KNN_mean_scores.append(KNN_mean_score)

KNN_optimal_k_features = np.where(KNN_mean_scores==KNN_mean_scores.max())[0][0]+1
cross_val_err = 1 - max(KNN_mean_scores)

print("KNN optimal number of features:", KNN_optimal_k_features)

feature_selector = SelectKBest(f_classif, k=KNN_optimal_k_features)
X_train_selected = feature_selector.fit_transform(X_train, y_train)
model.fit(X_train_selected, y_train)


# Get the indices of the selected features
selected_feature_indices = feature_selector.get_support(indices=True)

# Get the names of the most predictive features
selected_features = X_train.columns[selected_feature_indices]
print("Most predictive features:", selected_features)


train_pred = model.predict(X_train_selected)
train_error = 1 - accuracy_score(y_train, train_pred)

test_pred = model.predict(X_test[X_test.columns[selected_feature_indices]])
test_error = 1 - accuracy_score(y_test, test_pred)

print("Cross val err: ", cross_val_err)
print("Train err: ", train_error)
print("Test err: ", test_error)


KNN optimal number of features: 28
Most predictive features with noise: Index(['V3', 'V18', 'V68', 'V87', 'V289', 'V350', 'V475', 'V494', 'V507',
       'V539', 'V627', 'V1097', 'V1101', 'V1193', 'V1218', 'V1256', 'V1263',
       'V1517', 'V1533', 'V1549', 'V1654', 'V1673', 'V1744', 'V1787', 'V1799',
       'V1846', 'V1882', 'V1936'],
      dtype='object')
Cross val err:  0.22478049788245025
Train err:  0.029449978345604144
Test err:  0.02941176470588236




In [12]:
##SVC PCA

SVC_mean_scores = np.zeros(max_num_components)

#num_components_range = range(1, max_num_components)

# Loop over different numbers of components
for n_components in tqdm(num_components_range):
    
    #PCA

    SVC_pipeline = make_pipeline(PCA(n_components=n_components), SVC())
    
    SVC_scores = cross_val_score(SVC_pipeline, X_train, y_train, cv=5)
    SVC_mean_score = SVC_scores.mean()

    #SVC_mean_scores.append(SVC_mean_score)

SVC_optimal_n_components = np.where(SVC_mean_scores==SVC_mean_scores.max())[0][0]+1
cross_val_err = 1 - max(SVC_mean_scores)

print("SVC optimal number of PCA components:", SVC_optimal_n_components)

opt_pipeline = make_pipeline(PCA(n_components=SVC_optimal_n_components), SVC())

opt_pipeline.fit(X_train, y_train)
train_pred = opt_pipeline.predict(X_train)
train_error = 1 - accuracy_score(y_train, train_pred)

test_pred = opt_pipeline.predict(X_test)
test_error = 1 - accuracy_score(y_test, test_pred)

print("Cross val err: ", cross_val_err)
print("Train err: ", train_error)
print("Test err: ", test_error)

SVC optimal number of PCA components: 9
Cross val err:  0.2048586265506006
Train err:  0.005197055002165385
Test err:  0.01730103806228378


In [13]:
##SVC features
max_num_features = 50
num_features = range(1, max_num_features)
SVC_mean_scores = np.zeros(max_num_features)

# Loop over different numbers of components
for k in tqdm(num_features):
    
    feature_selector = SelectKBest(f_classif, k=k)

    X_train_selected = feature_selector.fit_transform(X_train, y_train)
    
    model = SVC()
    
    SVC_scores = cross_val_score(model, X_train_selected, y_train, cv=5)
    SVC_mean_score = SVC_scores.mean()

    #SVC_mean_scores.append(SVC_mean_score)

SVC_optimal_k_features = np.where(SVC_mean_scores==SVC_mean_scores.max())[0][0]+1
cross_val_err = 1 - max(SVC_mean_scores)

print("SVC optimal number of features:", SVC_optimal_k_features)

feature_selector = SelectKBest(f_classif, k=SVC_optimal_k_features)
X_train_selected = feature_selector.fit_transform(X_train, y_train)
model.fit(X_train_selected, y_train)

# Get the indices of the selected features
selected_feature_indices = feature_selector.get_support(indices=True)

# Get the names of the most predictive features
selected_features = X_train.columns[selected_feature_indices]
print("Most predictive features:", selected_features)


train_pred = model.predict(X_train_selected)
train_error = 1 - accuracy_score(y_train, train_pred)

test_pred = model.predict(X_test[X_test.columns[selected_feature_indices]])
test_error = 1 - accuracy_score(y_test, test_pred)

print("Cross val err: ", cross_val_err)
print("Train err: ", train_error)
print("Test err: ", test_error)

SVC optimal number of features: 49
Most predictive features: Index(['V3', 'V18', 'V68', 'V87', 'V217', 'V289', 'V307', 'V308', 'V350',
       'V462', 'V475', 'V494', 'V507', 'V539', 'V568', 'V627', 'V673', 'V687',
       'V803', 'V845', 'V855', 'V982', 'V1033', 'V1066', 'V1097', 'V1098',
       'V1101', 'V1102', 'V1193', 'V1218', 'V1256', 'V1263', 'V1478', 'V1517',
       'V1533', 'V1549', 'V1654', 'V1673', 'V1697', 'V1744', 'V1787', 'V1799',
       'V1812', 'V1829', 'V1846', 'V1871', 'V1882', 'V1936', 'V1971'],
      dtype='object')
Cross val err:  0.21049008836427496
Train err:  0.00822867042009523
Test err:  0.01903114186851207




In [14]:
##Logistic regression PCA

LR_mean_scores = np.zeros(max_num_components)

#num_components_range = range(1, 25)

# Loop over different numbers of components
for n_components in tqdm(num_components_range):
    
    #PCA

    LR_pipeline = make_pipeline(PCA(n_components=n_components), LogisticRegression(random_state=16, max_iter=10000))
    
    LR_scores = cross_val_score(LR_pipeline, X_train, y_train, cv=5)
    LR_mean_score = LR_scores.mean()

    #LR_mean_scores.append(LR_mean_score)

LR_optimal_n_components = np.where(LR_mean_scores==LR_mean_scores.max())[0][0]+1
cross_val_err = 1 - max(LR_mean_scores)

print("KNN optimal number of PCA components:", LR_optimal_n_components)

opt_pipeline = make_pipeline(PCA(n_components=LR_optimal_n_components), LogisticRegression(random_state=16, max_iter=10000))

opt_pipeline.fit(X_train, y_train)
train_pred = opt_pipeline.predict(X_train)
train_error = 1 - accuracy_score(y_train, train_pred)

test_pred = opt_pipeline.predict(X_test)
test_error = 1 - accuracy_score(y_test, test_pred)

print("Cross val err: ", cross_val_err)
print("Train err: ", train_error)
print("Test err: ", test_error)

KNN optimal number of PCA components: 23
Cross val err:  0.2052934050764853
Train err:  0.00606323083585969
Test err:  0.01557093425605538


In [15]:
##Logistic Regression features
max_num_features = 30
num_features = range(1, max_num_features)
LR_mean_scores = np.zeros(max_num_features)

# Loop over different numbers of components
for k in tqdm(num_features):
    
    feature_selector = SelectKBest(f_classif, k=k)

    X_train_selected = feature_selector.fit_transform(X_train, y_train)
    
    model = LogisticRegression(random_state=16, max_iter=10000)
    
    LR_scores = cross_val_score(model, X_train_selected, y_train, cv=5)
    LR_mean_score = LR_scores.mean()

    #LR_mean_scores.append(LR_mean_score)

LR_optimal_k_features = np.where(LR_mean_scores==LR_mean_scores.max())[0][0]+1
cross_val_err = 1 - max(LR_mean_scores)

print("LR optimal number of features:", LR_optimal_k_features)

feature_selector = SelectKBest(f_classif, k=LR_optimal_k_features)
X_train_selected = feature_selector.fit_transform(X_train, y_train)
model.fit(X_train_selected, y_train)

# Get the indices of the selected features
selected_feature_indices = feature_selector.get_support(indices=True)

# Get the names of the most predictive features
selected_features = X_train.columns[selected_feature_indices]
print("Most predictive features:", selected_features)

train_pred = model.predict(X_train_selected)
train_error = 1 - accuracy_score(y_train, train_pred)

test_pred = model.predict(X_test[X_test.columns[selected_feature_indices]])
test_error = 1 - accuracy_score(y_test, test_pred)

print("Cross val err: ", cross_val_err)
print("Train err: ", train_error)
print("Test err: ", test_error)

LR optimal number of features: 26
Most predictive features: Index(['V3', 'V18', 'V68', 'V87', 'V289', 'V475', 'V507', 'V539', 'V627',
       'V1097', 'V1101', 'V1193', 'V1218', 'V1256', 'V1263', 'V1517', 'V1533',
       'V1549', 'V1654', 'V1673', 'V1744', 'V1787', 'V1799', 'V1846', 'V1882',
       'V1936'],
      dtype='object')
Cross val err:  0.22435041458902627
Train err:  0.025119099177132953
Test err:  0.02941176470588236


