In [None]:
import numpy as np
import pandas as pd
import sklearn.datasets
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

In [None]:
from sklearn.metrics import classification_report, confusion_matrix
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import cross_val_score

In [None]:
# Load scikit-learn's bundled breast-cancer dataset; later cells read its
# .data, .feature_names and .target attributes.
breast_cancer_ds = sklearn.datasets.load_breast_cancer()

In [None]:
# Tabulate the raw feature matrix, one named column per feature.
df = pd.DataFrame(
    data=breast_cancer_ds.data,
    columns=breast_cancer_ds.feature_names,
)

In [None]:
# Attach the target vector as a 'label' column (1 = benign, 0 = malignant).
df['label'] = breast_cancer_ds.target

In [None]:
# Bar chart of class frequencies, to see how balanced the two diagnoses are.
fig, ax = plt.subplots(figsize=(10, 6))
df['label'].value_counts().plot(kind='bar', ax=ax)
ax.set_title('Distribution of Benign vs Malignant Cases')
ax.set_xlabel('Diagnosis (1: Benign, 0: Malignant)')
ax.set_ylabel('Count')
plt.show()

In [None]:
# Pairwise correlations across every column of df, drawn as a diverging
# heatmap centred on zero.
correlation_matrix = df.corr()

fig, ax = plt.subplots(figsize=(12, 8))
sns.heatmap(correlation_matrix, cmap='coolwarm', center=0, ax=ax)
ax.set_title('Feature Correlation Matrix')
plt.show()

In [None]:
# Preview the first rows with the notebook's rich table display instead of
# printing the whole frame as plain text.
df.head()

In [None]:
# Dimensions of the frame as (rows, columns).
df.shape

In [None]:
# Per-column dtype and non-null count summary.
df.info()

In [None]:
# Summary statistics for each numeric column.
df.describe()

In [None]:
# Number of samples in each class.
df['label'].value_counts()

- 1 represents benign
- 0 represents malignant

In [None]:
# Mean of every feature within each class, to compare the two diagnoses.
df.groupby('label').mean()

In [None]:
# Separate the predictors (every column except 'label') from the target.
X = df.drop(columns=['label'])
Y = df['label']

In [None]:
# Preview the first target values rather than printing the whole Series.
Y.head()

In [None]:
# Preview the first feature rows with the notebook's rich table display
# instead of printing the whole frame as plain text.
X.head()

Split into Training Data and Testing Data

In [None]:
# Hold out 20% of the rows for testing; the fixed random_state makes the
# split reproducible across runs.
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=0.2,
    random_state=2,
)

In [None]:
# Sanity-check the split: shapes of the full, training and test feature sets.
print(X.shape, X_train.shape, X_test.shape)

Model Training

Logistic Regression

In [None]:
# The features are not scaled, so the default solver budget (max_iter=100)
# stops before convergence and emits a ConvergenceWarning. Raise the cap so
# the optimiser can finish.
model = LogisticRegression(max_iter=10000)

In [None]:
# Fit the classifier on the training split only.
model.fit(X_train, Y_train)

In [None]:
# Cross-validation: 5 folds, computed on the training split only.
cv_scores = cross_val_score(estimator=model, X=X_train, y=Y_train, cv=5)
print('\nCross-validation scores:', cv_scores)
print('Average CV score:', np.mean(cv_scores))

In [None]:
# Rank features by the absolute size of their fitted coefficient, largest first.
# NOTE(review): coefficient magnitudes depend on each feature's scale, so this
# ranking is presumably only comparable across features once inputs are
# standardised — confirm before interpreting it.
feature_importance = (
    pd.DataFrame({'feature': X.columns, 'importance': np.abs(model.coef_[0])})
    .sort_values(by='importance', ascending=False)
)

In [None]:
# Horizontal bar chart of the ten highest-ranked features.
fig, ax = plt.subplots(figsize=(10, 6))
top_features = feature_importance.head(10)
sns.barplot(data=top_features, x='importance', y='feature', ax=ax)
ax.set_title('Top 10 Most Important Features')
plt.show()

# Model Evaluation

Accuracy Score

In [None]:
# Predict on the training split, then score against the true training labels.
X_train_prediction = model.predict(X_train)
training_data_accuracy = accuracy_score(
    y_true=Y_train,
    y_pred=X_train_prediction,
)

In [None]:
# Report the accuracy measured on the training split.
print('Accuracy on trained data' ,training_data_accuracy)

In [None]:
# Predict on the held-out split, then score against the true test labels.
X_test_prediction = model.predict(X_test)
test_data_accuracy = accuracy_score(
    y_true=Y_test,
    y_pred=X_test_prediction,
)

In [None]:
# Report the accuracy measured on the held-out test split.
print('accuracy on test data', test_data_accuracy)

In [None]:
# Text report of classification metrics for the test-set predictions.
print('\nClassification Report:')
print(classification_report(y_true=Y_test, y_pred=X_test_prediction))

In [None]:
# Confusion matrix for the test split, drawn as an annotated heatmap with
# true labels on the y-axis and predicted labels on the x-axis.
cm = confusion_matrix(y_true=Y_test, y_pred=X_test_prediction)

fig, ax = plt.subplots(figsize=(8, 6))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', ax=ax)
ax.set(title='Confusion Matrix', ylabel='True Label', xlabel='Predicted Label')
plt.show()

In [None]:
# Probability estimates for the test split; show only the first five rows.
# NOTE(review): column order presumably follows model.classes_ — confirm.
prediction_proba = model.predict_proba(X_test)
print('\nPrediction probabilities for first 5 test samples:')
print(prediction_proba[:5])

Building a predictive system

In [None]:
# One hand-entered sample with 30 feature values.
# NOTE(review): the order must match the columns of X; the three rows below
# presumably follow the dataset's mean / error / worst grouping — confirm.
input_data = [
    17.99, 10.38, 122.8, 1001.0, 0.1184, 0.2776, 0.3001, 0.1471, 0.2419, 0.07871,
    1.095, 0.9053, 8.589, 153.4, 0.006399, 0.04904, 0.05373, 0.01587, 0.03003, 0.006193,
    25.38, 17.33, 184.6, 2019.0, 0.1622, 0.6656, 0.7119, 0.2654, 0.4601, 0.1189,
]

In [None]:
# Classify the single hand-entered sample.
# The model was fitted on a DataFrame, so the sample is wrapped in a one-row
# DataFrame with the same column names. This avoids scikit-learn's
# "X does not have valid feature names" warning, and pandas raises a
# ValueError if the number of values does not match the training columns.
input_data_as_numpy_array = np.asarray(input_data)
input_data_reshaped = input_data_as_numpy_array.reshape(1, -1)
input_data_frame = pd.DataFrame(input_data_reshaped, columns=X.columns)

prediction = model.predict(input_data_frame)
print(prediction)

# Label encoding used throughout the notebook: 0 = malignant, 1 = benign.
if prediction[0] == 0:
    print('The Breast cancer is Malignant')
else:
    print('The Breast Cancer is Benign')
