In [1]:
import pandas as pd
import sklearn.datasets as datasets
from sklearn.model_selection import cross_val_score
from sklearn.svm import SVC
import seaborn as sns

# Pull the bundled breast cancer dataset and split it into a feature table
# (one named column per feature) and a label series.
cancer_data = datasets.load_breast_cancer()
X = pd.DataFrame(cancer_data["data"], columns=cancer_data["feature_names"])
y = pd.Series(cancer_data["target"])

In [10]:
# Text overview of the feature table: summary statistics, the first five rows,
# and the dtype of each column. The sections are joined with newlines and
# emitted by a single print call.
print(
    "\n".join(
        f"{heading}{content}"
        for heading, content in (
            ("Descriptive statistics for features:\n", X.describe()),
            ("Initial 5 data points of features:\n", X.head()),
            ("Data types of features\n", X.dtypes),
        )
    )
)

Descriptive statistics for features:
       mean radius  mean texture  mean perimeter    mean area  \
count   569.000000    569.000000      569.000000   569.000000   
mean     14.127292     19.289649       91.969033   654.889104   
std       3.524049      4.301036       24.298981   351.914129   
min       6.981000      9.710000       43.790000   143.500000   
25%      11.700000     16.170000       75.170000   420.300000   
50%      13.370000     18.840000       86.240000   551.100000   
75%      15.780000     21.800000      104.100000   782.700000   
max      28.110000     39.280000      188.500000  2501.000000   

       mean smoothness  mean compactness  mean concavity  mean concave points  \
count       569.000000        569.000000      569.000000           569.000000   
mean          0.096360          0.104341        0.088799             0.048919   
std           0.014064          0.052813        0.079720             0.038803   
min           0.052630          0.019380        0.000

In [12]:
# Text overview of the label series: summary statistics, the first five
# values, and its dtype. One print call; sep="\n" puts each section on its
# own line.
print(
    f"Descriptive statistics for a label:\n{y.describe()}",
    f"Initial 5 data points of a label:\n{y.head()}",
    f"Data types of a label\n{y.dtypes}",
    sep="\n",
)

Descriptive statistics for a label:
count    569.000000
mean       0.627417
std        0.483918
min        0.000000
25%        0.000000
50%        1.000000
75%        1.000000
max        1.000000
dtype: float64
Initial 5 data points of a label:
0    0
1    0
2    0
3    0
4    0
dtype: int32
Data types of a label
int32


# Plotting features

In [None]:
import matplotlib.pyplot as plt

# Exploratory plots. Each plot gets its own figure: seaborn's axes-level
# functions (heatmap, histplot) draw on the *current* Axes when no `ax` is
# given, so calling them back to back paints the histogram on top of the
# heatmap.

# 1) Pairwise feature correlations. The matrix has one row/column per feature,
#    so the figure is enlarged and the annotations shrunk to stay legible.
fig, ax = plt.subplots(figsize=(22, 18))
sns.heatmap(X.corr(), annot=True, fmt=".2f", annot_kws={"size": 7}, ax=ax)
ax.set_title("Feature correlation matrix")
plt.show()

# 2) Class balance of the label. `discrete=True` centres one bar on each
#    integer label value instead of binning a continuous range.
fig, ax = plt.subplots(figsize=(6, 4))
sns.histplot(y, discrete=True, ax=ax)
ax.set(title="Label distribution", xlabel="target class", ylabel="count")
plt.show()

# 3) Pairwise scatter plots for a handful of size-related features.
#    pairplot is figure-level: it creates and manages its own figure.
sns.pairplot(data=X.loc[:, ["mean area", "area error", "worst perimeter", "worst area"]])
plt.show()

# 4) Per-feature box plots; tick labels are rotated so the feature names
#    do not overlap.
fig, ax = plt.subplots(figsize=(14, 6))
X.boxplot(ax=ax, rot=90)
ax.set_title("Feature value ranges")
plt.show()

# Modeling

In [15]:
# Compare five classifiers on the full feature table using k-fold
# cross-validation (default scorer of each estimator), keeping the per-fold
# scores and their mean for every model.
from sklearn.ensemble import RandomForestClassifier
import xgboost as xgb

# Fixed seed so the stochastic models give the same scores on every
# Restart & Run All; fold count shared by every evaluation below.
RANDOM_STATE = 42
CV_FOLDS = 5

# NOTE(review): the SVC models are trained on the raw, unscaled features,
# whose magnitudes differ by orders of magnitude (see the describe() output).
# Consider wrapping them in a Pipeline with StandardScaler -- left unchanged
# here so the models being compared stay the same.

# SVM with a linear kernel
svm_lin = SVC(kernel="linear")
scores1 = cross_val_score(svm_lin, X, y, cv=CV_FOLDS)
average_score1 = scores1.mean()

# SVM with an RBF kernel
svm_rbf = SVC(kernel="rbf")
scores2 = cross_val_score(svm_rbf, X, y, cv=CV_FOLDS)
average_score2 = scores2.mean()

# SVM with a polynomial kernel
svm_poly = SVC(kernel="poly")
scores3 = cross_val_score(svm_poly, X, y, cv=CV_FOLDS)
average_score3 = scores3.mean()

# Random forest: bootstrap sampling and feature sub-sampling are random,
# so the seed is pinned.
RFC = RandomForestClassifier(random_state=RANDOM_STATE)
scores4 = cross_val_score(RFC, X, y, cv=CV_FOLDS)
average_score4 = scores4.mean()

# Gradient-boosted trees (binary logistic objective), seeded for consistency.
XGB = xgb.XGBClassifier(objective="binary:logistic", random_state=RANDOM_STATE)
scores5 = cross_val_score(XGB, X, y, cv=CV_FOLDS)
average_score5 = scores5.mean()


In [16]:
# Report the mean cross-validation score of each model, one line per model,
# rounded to three decimals.
print(
    "\n".join(
        f"Average cross-validation score ({model_label}): {mean_score:.3f}"
        for model_label, mean_score in (
            ("linear SVM", average_score1),
            ("RBF SVM", average_score2),
            ("Polynomial SVM", average_score3),
            ("Random Forest", average_score4),
            ("XGBoost", average_score5),
        )
    )
)


Average cross-validation score (linear SVM): 0.946
Average cross-validation score (RBF SVM): 0.912
Average cross-validation score (Polynomial SVM): 0.909
Average cross-validation score (Random Forest): 0.963
Average cross-validation score (XGBoost): 0.970


In [None]:
import matplotlib.pyplot as plt
import numpy as np

# Kept from the original cell: fit the linear SVM on the full feature table,
# so any later cell that relies on a fitted `svm_lin` still works.
svm_lin.fit(X, y)

# A decision surface can only be drawn over two features, and a model fitted
# on every column of X cannot predict on a two-column mesh (scikit-learn
# raises a feature-count ValueError). A dedicated linear SVM is therefore
# trained on just the two plotted features.
plot_features = ["mean area", "area error"]
X_2d = X[plot_features].to_numpy()
svm_lin_2d = SVC(kernel="linear").fit(X_2d, y)

# Mesh over the plotted range with a small proportional margin. A fixed
# number of points per axis keeps the grid at GRID_POINTS**2 samples; a
# fixed step of 0.02 over ranges this wide would need billions of points.
GRID_POINTS = 300
x_lo, x_hi = X_2d[:, 0].min(), X_2d[:, 0].max()
y_lo, y_hi = X_2d[:, 1].min(), X_2d[:, 1].max()
x_pad = 0.05 * (x_hi - x_lo)
y_pad = 0.05 * (y_hi - y_lo)
x_min, x_max = x_lo - x_pad, x_hi + x_pad
y_min, y_max = y_lo - y_pad, y_hi + y_pad
xx, yy = np.meshgrid(np.linspace(x_min, x_max, GRID_POINTS),
                     np.linspace(y_min, y_max, GRID_POINTS))

# Predict a class for every mesh point, then fold the flat predictions back
# into the mesh shape for contourf.
Z = svm_lin_2d.predict(np.c_[xx.ravel(), yy.ravel()])
Z = Z.reshape(xx.shape)

# Shaded regions show the predicted class; points are the actual samples
# coloured by their true label (same colormap as the regions).
plt.figure(figsize=(10, 6))
plt.contourf(xx, yy, Z, alpha=0.3, cmap=plt.cm.coolwarm)
plt.scatter(X_2d[:, 0], X_2d[:, 1], c=y,
            edgecolors='k', cmap=plt.cm.coolwarm)
plt.xlabel('mean area')
plt.ylabel('area error')
plt.xlim(xx.min(), xx.max())
plt.ylim(yy.min(), yy.max())
plt.title('SVM with Linear Kernel')
plt.show()
