In [None]:
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score, precision_recall_curve, auc
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC

#### Loading data, exploratory analysis & preparing data

In [None]:
# Read the breast-cancer dataset from its CSV file into a DataFrame.
csv_path = "Data/breast-cancer.csv"
data = pd.read_csv(csv_path)

# Show the first record as a quick check of the column layout.
print(data.iloc[:1])


In [None]:
# Summarise the dataset: dimensions, descriptive statistics, then column dtypes.
for summary in (data.shape, data.describe(), data.dtypes):
    print(summary)

In [None]:
# Count the missing values in every column.
missing_per_column = data.isna().sum()
print(missing_per_column)

In [None]:
# Bar chart of the class counts of the target (dependent) variable, diagnosis.
diagnosis_counts = data["diagnosis"].value_counts()

fig, ax = plt.subplots(figsize=(6, 4))
diagnosis_counts.plot.bar(ax=ax)
ax.set_xlabel("Diagnosis")
ax.set_ylabel("Count")
ax.set_title("Distribution of Diagnosis")
plt.show()

In [None]:
# Plot a histogram (with a KDE overlay) of each feature.
# Every column from the third one onward is treated as a feature.
hist_columns = data.columns[2:]

# Size the grid from the number of features instead of hard-coding 5 x 6,
# so the cell does not fail when the dataset has more than 30 feature columns.
n_grid_cols = 6
n_grid_rows = max(1, -(-len(hist_columns) // n_grid_cols))  # ceiling division

sns.set(style="whitegrid")
fig, axes = plt.subplots(
    n_grid_rows, n_grid_cols, figsize=(12, 2 * n_grid_rows), squeeze=False
)
flat_axes = axes.ravel()

for ax, column in zip(flat_axes, hist_columns):
    sns.histplot(data[column], kde=True, ax=ax)
    ax.set_xlabel(column)

# Hide the grid slots that have no feature to show.
for ax in flat_axes[len(hist_columns):]:
    ax.set_visible(False)

fig.tight_layout()
plt.show()

In [None]:
# Plot pairwise relationships between the features, coloured by diagnosis.
# pairplot is a figure-level function that creates its own figure, so no
# plt.figure() call is made first (it would only leave an empty extra figure).
sns.pairplot(data, hue="diagnosis", vars=data.columns[2:])
plt.show()

#### Preprocessing data

In [None]:
# Separate the features (X) from the target variable (y).
# The plotting cells treat data.columns[2:] as the feature set, so the same
# slice is used here. This keeps the leading non-feature column (presumably a
# record identifier -- confirm against the CSV) out of the model inputs.
feature_columns = data.columns[2:].drop("diagnosis", errors="ignore")
X_features = data[feature_columns]
y = data["diagnosis"]

# Split the data into training and testing sets BEFORE computing any
# statistics, so the held-out rows cannot influence imputation or scaling.
X_train, X_test, y_train, y_test = train_test_split(
    X_features, y, test_size=0.2, random_state=42
)

# Handle missing values: fill them with the per-column mean of the training rows.
train_means = X_train.mean()

# Scale the features (standardization); the scaler is fitted on training rows only.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train.fillna(train_means))
X_test = scaler.transform(X_test.fillna(train_means))

# Full feature matrix on the same scale, kept because later cells read X.
# Re-splitting X with the same test_size and random_state reproduces the
# partition above, since the split depends only on the number of rows.
X = scaler.transform(X_features.fillna(train_means))

#### Modeling based on Logistic Regression and SVM models

In [None]:
# Candidate estimators; their hyperparameters are set per grid point below.
logreg_model = LogisticRegression()
svm_model = SVC()

# Hyperparameter values to try for each model.
logreg_params = {'C': [0.1, 1, 10]}
svm_params = {'C': [0.1, 1, 10], 'gamma': [0.1, 1, 10]}

# Number of folds for cross-validation.
k = 5

# Cross-validation runs on the training split only, so the held-out test rows
# play no part in choosing hyperparameters.

# Logistic Regression: mean CV accuracy for every C.
logreg_scores = []
for param in logreg_params['C']:
    logreg_model.set_params(C=param)
    scores = cross_val_score(logreg_model, X_train, y_train, cv=k)
    logreg_scores.append(scores.mean())

# SVM: mean CV accuracy for every (C, gamma) pair, with C varying slowest.
svm_grid = [
    (param_C, param_gamma)
    for param_C in svm_params['C']
    for param_gamma in svm_params['gamma']
]
svm_scores = []
for param_C, param_gamma in svm_grid:
    svm_model.set_params(C=param_C, gamma=param_gamma)
    scores = cross_val_score(svm_model, X_train, y_train, cv=k)
    svm_scores.append(scores.mean())

# Pick the best setting per model. The score lists are aligned with their
# grids, so the position of the top score identifies the winning setting
# (the first one wins on ties).
best_logreg_param = logreg_params['C'][logreg_scores.index(max(logreg_scores))]
best_svm_param_C, best_svm_param_gamma = svm_grid[svm_scores.index(max(svm_scores))]

# Print results (the cross-validation scores and the best hyperparameter values)
print("Logistic Regression cross-validation scores:", logreg_scores)
print("Best Logistic Regression hyperparameter (C) value:", best_logreg_param)
print("SVM cross-validation scores:", svm_scores)
print("Best SVM hyperparameter (C, gamma) values:", best_svm_param_C, ",", best_svm_param_gamma)

In [None]:
# Convert the categorical labels to binary values.
# The encoder is always fitted on the original diagnosis column rather than on
# y itself, so re-running this cell does not re-encode already-encoded labels
# (which would replace label_encoder.classes_ with integers).
label_encoder = LabelEncoder()
y = label_encoder.fit_transform(data["diagnosis"])

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Create and train the Logistic Regression model with the C value selected by
# cross-validation in the tuning cell.
# NOTE(review): that choice is only unbiased if tuning did not see the test rows.
model = LogisticRegression(C=best_logreg_param)
model.fit(X_train, y_train)

# Make predictions on the testing set
y_pred = model.predict(X_test)
# Column 1 holds the probability of the class that the encoder mapped to 1.
y_prob = model.predict_proba(X_test)[:, 1]

In [None]:

# Per-class precision / recall / F1, labelled with the encoder's class names.
print("Classification Report:")
report_text = classification_report(
    y_test,
    y_pred,
    target_names=label_encoder.classes_,
)
print(report_text)

# Confusion matrix of true labels against predicted labels.
print("Confusion Matrix\n")
conf_matrix = confusion_matrix(y_test, y_pred)
print(conf_matrix)

# Area under the ROC curve, computed from the predicted probabilities.
auc_roc = roc_auc_score(y_test, y_prob)
print("AUC ROC:", auc_roc)

# Precision-recall curve points (thresholds are not needed) and the area under it.
precision, recall = precision_recall_curve(y_test, y_prob)[:2]
pr_auc = auc(recall, precision)

In [None]:
# Draw the precision-recall curve as a shaded step plot, with its AUC in the title.
fig, ax = plt.subplots()
ax.step(recall, precision, color='b', alpha=0.2, where='post')
ax.fill_between(recall, precision, alpha=0.2, color='b', step='post')
ax.set_xlabel('Recall')
ax.set_ylabel('Precision')
ax.set_ylim([0.0, 1.05])
ax.set_xlim([0.0, 1.0])
ax.set_title('Precision-Recall curve (AUC={:.2f})'.format(pr_auc))
plt.show()