In [14]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score

# Load the dataset
data = pd.read_csv('Cancer Dataset.csv')

# Split the dataset into features (X) and labels (y)
X = data.drop(columns=['diagnosis'])
y = data['diagnosis']

# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Initialize an empty set to store selected features
selected_features = set()

# Create an empty variable to store the best accuracy and the best feature
best_accuracy = 0.0
best_feature = None

# Create an SVM classifier with a Linear kernel
svm_linear = SVC(kernel='linear')
sub = ()
# Loop through each feature in X_train.columns
for feature in X_train.columns:
    # If the feature is already in selected_features, continue to the next feature
    if feature in selected_features:
        continue

    # Create a features_subset by combining selected_features and the current feature
    features_subset = list(selected_features) + [feature]

    # Fit the svm_linear model on X_train with features_subset and y_train
    svm_linear.fit(X_train[features_subset], y_train)

    # Make predictions on X_test with features_subset
    y_pred = svm_linear.predict(X_test[features_subset])

    # Calculate accuracy by comparing y_test and the predictions
    accuracy = accuracy_score(y_test, y_pred)

    # Print the current features_subset and accuracy
    print(f'Features Subset: {features_subset}, Accuracy: {accuracy}')

    # If accuracy is better than best_accuracy, update best_accuracy
    if accuracy > best_accuracy:
        best_accuracy = accuracy
        best_feature = feature
        sub = features_subset

    # Add the current feature to selected_features
    selected_features.add(feature)

# Print the best_feature and best_accuracy
print(f'Best Feature: {sub}, Best Accuracy: {best_accuracy}')


Features Subset: ['radius_mean'], Accuracy: 0.9298245614035088
Features Subset: ['radius_mean', 'texture_mean'], Accuracy: 0.9035087719298246
Features Subset: ['texture_mean', 'radius_mean', 'perimeter_mean'], Accuracy: 0.9385964912280702
Features Subset: ['texture_mean', 'radius_mean', 'perimeter_mean', 'area_mean'], Accuracy: 0.9298245614035088
Features Subset: ['texture_mean', 'radius_mean', 'area_mean', 'perimeter_mean', 'smoothness_mean'], Accuracy: 0.9385964912280702
Features Subset: ['texture_mean', 'perimeter_mean', 'smoothness_mean', 'area_mean', 'radius_mean', 'compactness_mean'], Accuracy: 0.9385964912280702
Features Subset: ['texture_mean', 'perimeter_mean', 'smoothness_mean', 'area_mean', 'compactness_mean', 'radius_mean', 'concavity_mean'], Accuracy: 0.9385964912280702
Features Subset: ['texture_mean', 'perimeter_mean', 'smoothness_mean', 'concavity_mean', 'area_mean', 'compactness_mean', 'radius_mean', 'concave points_mean'], Accuracy: 0.9385964912280702
Features Subset:

Q2. Download the following dataset and perform the specified operations:

A.	Load and preprocess the dataset.

In [6]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Load the dataset (assuming you have already downloaded it)
data = pd.read_csv('Cancer Dataset.csv')

# Split the data into features (X) and labels (y)
X = data.drop(columns=['diagnosis'])  # Replace 'target_column_name' with the actual target column name
y = data['diagnosis']

# Split the dataset into training and testing sets (e.g., 80% train, 20% test)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Standardize the features (optional but often recommended for SVM)
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)

X_test = scaler.transform(X_test)


B.	Perform the 10-fold cross validation for training SVM with Linear Kernel

In [7]:
from sklearn.svm import SVC
from sklearn.model_selection import cross_val_score

# Create an SVM classifier with a Linear kernel
svm_linear = SVC(kernel='linear')

# Perform 10-fold cross-validation on the training data
scores = cross_val_score(svm_linear, X_train, y_train, cv=10)

# Print the average accuracy of the cross-validation
print("Average Accuracy (Linear Kernel):", scores.mean())


Average Accuracy (Linear Kernel): 0.9714009661835747


C.	Change Kernel Function (RBF, Polynomial) and compute the performance (Accuracy and F-Score).

In [16]:
# Create SVM classifiers with different kernel functions
svm_rbf = SVC(kernel='rbf')
svm_poly = SVC(kernel='poly')

# Fit the classifiers on the training data
svm_rbf.fit(X_train, y_train)
svm_poly.fit(X_train, y_train)

# Make predictions on the test data
y_pred_rbf = svm_rbf.predict(X_test)
y_pred_poly = svm_poly.predict(X_test)


from sklearn.metrics import f1_score

# Compute performance metrics for RBF kernel with 'macro' average
f1_rbf = f1_score(y_test, y_pred_rbf, average='macro')

# Compute performance metrics for Polynomial kernel with 'macro' average
f1_poly = f1_score(y_test, y_pred_poly, average='macro')

# Assuming you have already trained and predicted with the Polynomial kernel
from sklearn.metrics import accuracy_score

# Compute accuracy for Polynomial kernel
accuracy_poly = accuracy_score(y_test, y_pred_poly)


# Print the performance metrics
print("Performance Metrics for RBF Kernel:")
print("Accuracy:", accuracy_rbf)
print("F-Score:", f1_rbf)

# Print accuracy
print("\nPerformance Metrics for Polynomial Kernel:")
print("Accuracy:", accuracy_poly)
print("F-Score:", f1_poly)


Performance Metrics for RBF Kernel:
Accuracy: 0.9824561403508771
F-Score: 0.9422297297297297

Performance Metrics for Polynomial Kernel:
Accuracy: 0.9473684210526315
F-Score: 0.9422297297297297
