EC503 Project by Xulun Huang

Data Preparation

In [16]:
import pandas as pd
import numpy as np

from sklearn.linear_model import Lasso, ElasticNet, LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.model_selection import StratifiedKFold

# Column labels are passed to read_csv explicitly via `names=`.
# After the two leading columns (ID, Diagnosis) the same ten measurements
# repeat three times, prefixed with 'Mean', 'SE' and 'Worst' in that order,
# so the 30 feature labels are generated rather than typed out by hand.
measurement_names = [
    'Radius', 'Texture', 'Perimeter', 'Area', 'Smoothness',
    'Compactness', 'Concavity', 'Concave Points', 'Symmetry', 'Fractal Dimension',
]
column_names = ['ID', 'Diagnosis'] + [
    f'{statistic} {measurement}'
    for statistic in ('Mean', 'SE', 'Worst')
    for measurement in measurement_names
]

# Read the data file, labelling its columns with the names built above
data = pd.read_csv('../datas/wdbc.data', names=column_names)

# Preview the first five rows
print(data.head())


         ID Diagnosis  Mean Radius  Mean Texture  Mean Perimeter  Mean Area  \
0    842302         M        17.99         10.38          122.80     1001.0   
1    842517         M        20.57         17.77          132.90     1326.0   
2  84300903         M        19.69         21.25          130.00     1203.0   
3  84348301         M        11.42         20.38           77.58      386.1   
4  84358402         M        20.29         14.34          135.10     1297.0   

   Mean Smoothness  Mean Compactness  Mean Concavity  Mean Concave Points  \
0          0.11840           0.27760          0.3001              0.14710   
1          0.08474           0.07864          0.0869              0.07017   
2          0.10960           0.15990          0.1974              0.12790   
3          0.14250           0.28390          0.2414              0.10520   
4          0.10030           0.13280          0.1980              0.10430   

   ...  Worst Radius  Worst Texture  Worst Perimeter  Worst Ar

Data Processing

In [17]:
# Encode the target as integers: malignant ('M') -> 1, benign ('B') -> 0.
# The dtype guard keeps this cell safe to re-run: mapping an already-encoded
# column a second time would turn every label into NaN.
if not pd.api.types.is_numeric_dtype(data['Diagnosis']):
    diagnosis_encoded = data['Diagnosis'].map({'M': 1, 'B': 0})
    # .map() silently yields NaN for unknown labels; fail loudly instead,
    # and do so before `data` is modified.
    if diagnosis_encoded.isna().any():
        raise ValueError("Diagnosis column contains labels other than 'M' and 'B'")
    data['Diagnosis'] = diagnosis_encoded

# Drop the ID column (errors='ignore' makes a second run of the cell a no-op)
data = data.drop(columns='ID', errors='ignore')

# Separate features and target variable
X = data.drop(columns='Diagnosis')
y = data['Diagnosis']

# Feature scaling: zero mean / unit variance per column, fitted on all rows
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)


Set up stratified k-fold cross-validation

In [18]:
# from sklearn.model_selection import train_test_split

# X_train, X_test, y_train, y_test= train_test_split( X_scaled, y, test_size=0.2)

In [19]:
k = 5  # Number of folds

# Fixed seed for the shuffle. Without it the folds change on every run and on
# every call to skf.split(), so LASSO, Group LASSO and Elastic Net would each
# be scored on a different partition of the data. With an integer seed every
# skf.split() call yields the same folds, making the runs reproducible and the
# three methods directly comparable.
RANDOM_STATE = 42
skf = StratifiedKFold(n_splits=k, shuffle=True, random_state=RANDOM_STATE)



Apply Lasso

In [20]:
# Per-fold scores and the feature subset LASSO keeps in each fold
accuracy_scores = []
precision_scores = []
recall_scores = []
f1_scores = []
selected_features_list = []

# Unscaled feature matrix. Standardisation is refit inside every fold (below)
# so the held-out rows never influence the scaling statistics; scaling the
# whole dataset once before splitting leaks test-fold information.
X_values = X.to_numpy()

# For each fold
for fold, (train_index, test_index) in enumerate(skf.split(X_values, y), 1):
    y_train_fold, y_test_fold = y.iloc[train_index], y.iloc[test_index]

    # Fit the scaler on the training rows only, then apply it to both parts
    fold_scaler = StandardScaler().fit(X_values[train_index])
    X_train_fold = fold_scaler.transform(X_values[train_index])
    X_test_fold = fold_scaler.transform(X_values[test_index])

    # Perform feature selection using LASSO on the training set
    lasso = Lasso(alpha=0.01)
    lasso.fit(X_train_fold, y_train_fold)

    # Get coefficients
    lasso_coef = lasso.coef_

    # Select features where coefficients are not zero
    selected_features_mask = lasso_coef != 0
    selected_features = X.columns[selected_features_mask]
    selected_features_list.append(selected_features)

    # Keep only the selected columns in the training and test sets
    X_train_selected = X_train_fold[:, selected_features_mask]
    X_test_selected = X_test_fold[:, selected_features_mask]

    # Train the downstream classifier on the reduced feature set
    clf = LogisticRegression(max_iter=1000)
    clf.fit(X_train_selected, y_train_fold)

    # Predict on test set
    y_pred = clf.predict(X_test_selected)

    # Calculate metrics
    acc = accuracy_score(y_test_fold, y_pred)
    prec = precision_score(y_test_fold, y_pred)
    rec = recall_score(y_test_fold, y_pred)
    f1 = f1_score(y_test_fold, y_pred)

    # Store metrics
    accuracy_scores.append(acc)
    precision_scores.append(prec)
    recall_scores.append(rec)
    f1_scores.append(f1)

    # Print selected features for the fold
    print(f"Fold {fold} selected features (LASSO):")
    print(selected_features.tolist())
    print("\n")

# At the end, report mean and standard deviation of each metric over the folds
print("LASSO Feature Selection Results:")
print(f"Accuracy: {np.mean(accuracy_scores):.4f} ± {np.std(accuracy_scores):.4f}")
print(f"Precision: {np.mean(precision_scores):.4f} ± {np.std(precision_scores):.4f}")
print(f"Recall: {np.mean(recall_scores):.4f} ± {np.std(recall_scores):.4f}")
print(f"F1-Score: {np.mean(f1_scores):.4f} ± {np.std(f1_scores):.4f}")


Fold 1 selected features (LASSO):
['Mean Texture', 'Mean Concave Points', 'SE Radius', 'SE Compactness', 'Worst Radius', 'Worst Texture', 'Worst Smoothness', 'Worst Concavity', 'Worst Concave Points', 'Worst Symmetry']


Fold 2 selected features (LASSO):
['Mean Texture', 'Mean Concave Points', 'Mean Fractal Dimension', 'SE Radius', 'SE Smoothness', 'SE Concavity', 'SE Symmetry', 'Worst Radius', 'Worst Texture', 'Worst Smoothness', 'Worst Concavity', 'Worst Concave Points', 'Worst Symmetry', 'Worst Fractal Dimension']


Fold 3 selected features (LASSO):
['Mean Texture', 'Mean Concave Points', 'Mean Fractal Dimension', 'SE Radius', 'SE Smoothness', 'SE Concavity', 'Worst Radius', 'Worst Texture', 'Worst Smoothness', 'Worst Concavity', 'Worst Concave Points', 'Worst Symmetry']


Fold 4 selected features (LASSO):
['Mean Texture', 'Mean Concave Points', 'Mean Fractal Dimension', 'SE Radius', 'SE Smoothness', 'SE Concavity', 'Worst Radius', 'Worst Texture', 'Worst Smoothness', 'Worst Concavi

In [21]:
# from sklearn.linear_model import Lasso
# from sklearn.metrics import accuracy_score
# from sklearn.linear_model import LogisticRegression

# # Fit LASSO model
# lasso = Lasso(alpha=0.01)  # You can adjust alpha as needed
# lasso.fit(X_train, y_train)

# # Get coefficients
# lasso_coef = lasso.coef_

# # Select features where coefficients are not zero
# selected_features = X.columns[(lasso_coef != 0)]
# print("Selected features by LASSO:")
# print(selected_features)

# # Reduce the dataset to selected features
# X_train_lasso = X_train[:, (lasso_coef != 0)]
# X_test_lasso = X_test[:, (lasso_coef != 0)]

# # Train a classifier on the selected features
# clf = LogisticRegression()
# clf.fit(X_train_lasso, y_train)
# y_pred = clf.predict(X_test_lasso)

# # Evaluate the model
# accuracy = accuracy_score(y_test, y_pred)
# print(f"LASSO Feature Selection Accuracy: {accuracy}")


Apply Group-Lasso

In [24]:

from group_lasso import GroupLasso

# Assign every feature column to a group by its name prefix
# (0: Mean features, 1: SE features, 2: Worst features, 3: anything else)
groups = []
for col in X.columns:
    if 'Mean' in col:
        groups.append(0)
    elif 'SE' in col:
        groups.append(1)
    elif 'Worst' in col:
        groups.append(2)
    else:
        groups.append(3)  # Just in case

groups = np.array(groups)

# Per-fold scores and the feature subset Group LASSO keeps in each fold
accuracy_scores_gl = []
precision_scores_gl = []
recall_scores_gl = []
f1_scores_gl = []
selected_features_list_gl = []

# Unscaled feature matrix. Standardisation is refit inside every fold (below)
# so the held-out rows never influence the scaling statistics; scaling the
# whole dataset once before splitting leaks test-fold information.
X_values = X.to_numpy()

# For each fold
for fold, (train_index, test_index) in enumerate(skf.split(X_values, y), 1):
    y_train_fold, y_test_fold = y.iloc[train_index], y.iloc[test_index]

    # Fit the scaler on the training rows only, then apply it to both parts
    fold_scaler = StandardScaler().fit(X_values[train_index])
    X_train_fold = fold_scaler.transform(X_values[train_index])
    X_test_fold = fold_scaler.transform(X_values[test_index])

    # Perform feature selection using Group LASSO on the training set
    group_lasso = GroupLasso(
        groups=groups,
        group_reg=0.05,
        l1_reg=0.01,
        n_iter=1000,
        scale_reg="group_size",
        supress_warning=True,  # sic: the package spells this keyword with one 'p'
        fit_intercept=True,
    )

    group_lasso.fit(X_train_fold, y_train_fold.values)

    # Boolean mask of the features the fitted model kept (flattened to 1-D
    # so it can index columns)
    mask = group_lasso.sparsity_mask_.flatten()
    selected_features = X.columns[mask]
    selected_features_list_gl.append(selected_features)

    # Keep only the selected columns in the training and test sets
    X_train_selected = X_train_fold[:, mask]
    X_test_selected = X_test_fold[:, mask]

    # Train the downstream classifier on the reduced feature set
    clf = LogisticRegression(max_iter=1000)
    clf.fit(X_train_selected, y_train_fold)

    # Predict on test set
    y_pred = clf.predict(X_test_selected)

    # Calculate metrics
    acc = accuracy_score(y_test_fold, y_pred)
    prec = precision_score(y_test_fold, y_pred)
    rec = recall_score(y_test_fold, y_pred)
    f1 = f1_score(y_test_fold, y_pred)

    # Store metrics
    accuracy_scores_gl.append(acc)
    precision_scores_gl.append(prec)
    recall_scores_gl.append(rec)
    f1_scores_gl.append(f1)

    # Print selected features for the fold
    print(f"Fold {fold} selected features (Group LASSO):")
    print(selected_features.tolist())
    print("\n")

# At the end, report mean and standard deviation of each metric over the folds
print("Group LASSO Feature Selection Results:")
print(f"Accuracy: {np.mean(accuracy_scores_gl):.4f} ± {np.std(accuracy_scores_gl):.4f}")
print(f"Precision: {np.mean(precision_scores_gl):.4f} ± {np.std(precision_scores_gl):.4f}")
print(f"Recall: {np.mean(recall_scores_gl):.4f} ± {np.std(recall_scores_gl):.4f}")
print(f"F1-Score: {np.mean(f1_scores_gl):.4f} ± {np.std(f1_scores_gl):.4f}")


Fold 1 selected features (Group LASSO):
['Worst Radius', 'Worst Texture', 'Worst Perimeter', 'Worst Area', 'Worst Smoothness', 'Worst Compactness', 'Worst Concavity', 'Worst Concave Points', 'Worst Symmetry', 'Worst Fractal Dimension']


Fold 2 selected features (Group LASSO):
['Worst Radius', 'Worst Texture', 'Worst Perimeter', 'Worst Area', 'Worst Smoothness', 'Worst Compactness', 'Worst Concavity', 'Worst Concave Points', 'Worst Symmetry', 'Worst Fractal Dimension']


Fold 3 selected features (Group LASSO):
['Worst Radius', 'Worst Texture', 'Worst Perimeter', 'Worst Area', 'Worst Smoothness', 'Worst Compactness', 'Worst Concavity', 'Worst Concave Points', 'Worst Symmetry', 'Worst Fractal Dimension']


Fold 4 selected features (Group LASSO):
['Worst Radius', 'Worst Texture', 'Worst Perimeter', 'Worst Area', 'Worst Smoothness', 'Worst Compactness', 'Worst Concavity', 'Worst Concave Points', 'Worst Symmetry', 'Worst Fractal Dimension']


Fold 5 selected features (Group LASSO):
['Worst 

In [None]:
# from group_lasso import GroupLasso
# import numpy as np

# # Define groups (0: Mean features, 1: SE features, 2: Worst features)
# groups = []
# for i, col in enumerate(X.columns):
#     if 'Mean' in col:
#         groups.append(0)
#     elif 'SE' in col:
#         groups.append(1)
#     elif 'Worst' in col:
#         groups.append(2)
#     else:
#         groups.append(3)  # Just in case

# groups = np.array(groups)

# # Fit Group LASSO model
# group_lasso = GroupLasso(
#     groups=groups,
#     group_reg=0.05,
#     l1_reg=0.01,
#     n_iter=1000,
#     scale_reg="group_size",
#     supress_warning=True,  # Note: the group-lasso package itself spells this keyword 'supress_warning'
#     fit_intercept=True
# )
# group_lasso.fit(X_train, y_train)

# # Get the mask of selected features
# mask = group_lasso.sparsity_mask_

# # Ensure mask is 1-D
# mask = mask.flatten()

# selected_features_gl = X.columns[mask]
# print("Selected features by Group LASSO:")
# print(selected_features_gl)

# # Reduce the dataset to selected features
# X_train_gl = X_train[:, mask]
# X_test_gl = X_test[:, mask]

# # Train a classifier on the selected features
# clf_gl = LogisticRegression()
# clf_gl.fit(X_train_gl, y_train)
# y_pred_gl = clf_gl.predict(X_test_gl)

# # Evaluate the model
# accuracy_gl = accuracy_score(y_test, y_pred_gl)
# print(f"Group LASSO Feature Selection Accuracy: {accuracy_gl}")



Selected features by Group LASSO:
Index(['Worst Radius', 'Worst Texture', 'Worst Perimeter', 'Worst Area',
       'Worst Smoothness', 'Worst Compactness', 'Worst Concavity',
       'Worst Concave Points', 'Worst Symmetry', 'Worst Fractal Dimension'],
      dtype='object')
Group LASSO Feature Selection Accuracy: 0.9736842105263158


Apply Elastic Net

In [25]:
# Per-fold scores and the feature subset Elastic Net keeps in each fold
accuracy_scores_en = []
precision_scores_en = []
recall_scores_en = []
f1_scores_en = []
selected_features_list_en = []

# Unscaled feature matrix. Standardisation is refit inside every fold (below)
# so the held-out rows never influence the scaling statistics; scaling the
# whole dataset once before splitting leaks test-fold information.
X_values = X.to_numpy()

# For each fold
for fold, (train_index, test_index) in enumerate(skf.split(X_values, y), 1):
    y_train_fold, y_test_fold = y.iloc[train_index], y.iloc[test_index]

    # Fit the scaler on the training rows only, then apply it to both parts
    fold_scaler = StandardScaler().fit(X_values[train_index])
    X_train_fold = fold_scaler.transform(X_values[train_index])
    X_test_fold = fold_scaler.transform(X_values[test_index])

    # Perform feature selection using Elastic Net on the training set
    elastic_net = ElasticNet(alpha=0.01, l1_ratio=0.5)
    elastic_net.fit(X_train_fold, y_train_fold)

    # Get coefficients
    en_coef = elastic_net.coef_

    # Select features where coefficients are not zero
    selected_features_mask = en_coef != 0
    selected_features = X.columns[selected_features_mask]
    selected_features_list_en.append(selected_features)

    # Keep only the selected columns in the training and test sets
    X_train_selected = X_train_fold[:, selected_features_mask]
    X_test_selected = X_test_fold[:, selected_features_mask]

    # Train the downstream classifier on the reduced feature set
    clf = LogisticRegression(max_iter=1000)
    clf.fit(X_train_selected, y_train_fold)

    # Predict on test set
    y_pred = clf.predict(X_test_selected)

    # Calculate metrics
    acc = accuracy_score(y_test_fold, y_pred)
    prec = precision_score(y_test_fold, y_pred)
    rec = recall_score(y_test_fold, y_pred)
    f1 = f1_score(y_test_fold, y_pred)

    # Store metrics
    accuracy_scores_en.append(acc)
    precision_scores_en.append(prec)
    recall_scores_en.append(rec)
    f1_scores_en.append(f1)

    # Print selected features for the fold
    print(f"Fold {fold} selected features (Elastic Net):")
    print(selected_features.tolist())
    print("\n")

# At the end, report mean and standard deviation of each metric over the folds
print("Elastic Net Feature Selection Results:")
print(f"Accuracy: {np.mean(accuracy_scores_en):.4f} ± {np.std(accuracy_scores_en):.4f}")
print(f"Precision: {np.mean(precision_scores_en):.4f} ± {np.std(precision_scores_en):.4f}")
print(f"Recall: {np.mean(recall_scores_en):.4f} ± {np.std(recall_scores_en):.4f}")
print(f"F1-Score: {np.mean(f1_scores_en):.4f} ± {np.std(f1_scores_en):.4f}")


Fold 1 selected features (Elastic Net):
['Mean Radius', 'Mean Texture', 'Mean Compactness', 'Mean Concave Points', 'Mean Fractal Dimension', 'SE Radius', 'SE Area', 'SE Smoothness', 'SE Compactness', 'SE Concavity', 'SE Concave Points', 'Worst Radius', 'Worst Texture', 'Worst Smoothness', 'Worst Concavity', 'Worst Concave Points', 'Worst Symmetry', 'Worst Fractal Dimension']


Fold 2 selected features (Elastic Net):
['Mean Radius', 'Mean Texture', 'Mean Compactness', 'Mean Concave Points', 'Mean Fractal Dimension', 'SE Radius', 'SE Area', 'SE Smoothness', 'SE Compactness', 'SE Concavity', 'SE Concave Points', 'Worst Radius', 'Worst Texture', 'Worst Smoothness', 'Worst Concavity', 'Worst Concave Points', 'Worst Symmetry', 'Worst Fractal Dimension']


Fold 3 selected features (Elastic Net):
['Mean Texture', 'Mean Compactness', 'Mean Concave Points', 'Mean Fractal Dimension', 'SE Radius', 'SE Area', 'SE Smoothness', 'SE Compactness', 'SE Concavity', 'Worst Radius', 'Worst Texture', 'Worst

In [None]:
# from sklearn.linear_model import ElasticNet

# # Fit Elastic Net model
# elastic_net = ElasticNet(alpha=0.01, l1_ratio=0.5)  # Adjust alpha and l1_ratio as needed
# elastic_net.fit(X_train, y_train)

# # Get coefficients
# en_coef = elastic_net.coef_

# # Select features where coefficients are not zero
# selected_features_en = X.columns[(en_coef != 0)]
# print("Selected features by Elastic Net:")
# print(selected_features_en)

# # Reduce the dataset to selected features
# X_train_en = X_train[:, (en_coef != 0)]
# X_test_en = X_test[:, (en_coef != 0)]

# # Train a classifier on the selected features
# clf_en = LogisticRegression()
# clf_en.fit(X_train_en, y_train)
# y_pred_en = clf_en.predict(X_test_en)

# # Evaluate the model
# accuracy_en = accuracy_score(y_test, y_pred_en)
# print(f"Elastic Net Feature Selection Accuracy: {accuracy_en}")


Selected features by Elastic Net:
Index(['Mean Radius', 'Mean Texture', 'Mean Compactness',
       'Mean Concave Points', 'Mean Fractal Dimension', 'SE Radius', 'SE Area',
       'SE Smoothness', 'SE Compactness', 'SE Concavity', 'SE Concave Points',
       'Worst Radius', 'Worst Texture', 'Worst Smoothness', 'Worst Concavity',
       'Worst Concave Points', 'Worst Symmetry', 'Worst Fractal Dimension'],
      dtype='object')
Elastic Net Feature Selection Accuracy: 0.9736842105263158


Comparing results

In [26]:
# print("Number of features selected by LASSO:", len(selected_features))
# print("LASSO Accuracy:", accuracy)

# print("\nNumber of features selected by Elastic Net:", len(selected_features_en))
# print("Elastic Net Accuracy:", accuracy_en)

# print("\nNumber of features selected by Group LASSO:", len(selected_features_gl))
# print("Group LASSO Accuracy:", accuracy_gl)


Conclusion