In [3]:
import pandas as pd
import numpy as np
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score
import joblib

In [2]:
# Load the scikit-learn breast cancer dataset bundle.
# NOTE(review): this import repeats the one in the top import cell, and the
# same load is performed again in the next cell; this cell looks like leftover
# scratch work and is a candidate for removal.
from sklearn.datasets import load_breast_cancer
data = load_breast_cancer()

In [4]:
# Fetch the dataset bundle and unpack it into a labelled feature frame (X)
# and a target series (y) that share the same default integer index.
data = load_breast_cancer()
X = pd.DataFrame(data=data["data"], columns=data["feature_names"])
y = pd.Series(data["target"])

In [5]:
# Plain-text dump of the feature frame; pandas abbreviates the middle rows
# (shown as ".." in the captured output).
print(X)

     mean radius  mean texture  mean perimeter  mean area  mean smoothness  \
0          17.99         10.38          122.80     1001.0          0.11840   
1          20.57         17.77          132.90     1326.0          0.08474   
2          19.69         21.25          130.00     1203.0          0.10960   
3          11.42         20.38           77.58      386.1          0.14250   
4          20.29         14.34          135.10     1297.0          0.10030   
..           ...           ...             ...        ...              ...   
564        21.56         22.39          142.00     1479.0          0.11100   
565        20.13         28.25          131.20     1261.0          0.09780   
566        16.60         28.08          108.30      858.1          0.08455   
567        20.60         29.33          140.10     1265.0          0.11780   
568         7.76         24.54           47.92      181.0          0.05263   

     mean compactness  mean concavity  mean concave points  mea

In [6]:
# Preview the first five rows of the feature frame using the rich table display.
X.head(5)

Unnamed: 0,mean radius,mean texture,mean perimeter,mean area,mean smoothness,mean compactness,mean concavity,mean concave points,mean symmetry,mean fractal dimension,...,worst radius,worst texture,worst perimeter,worst area,worst smoothness,worst compactness,worst concavity,worst concave points,worst symmetry,worst fractal dimension
0,17.99,10.38,122.8,1001.0,0.1184,0.2776,0.3001,0.1471,0.2419,0.07871,...,25.38,17.33,184.6,2019.0,0.1622,0.6656,0.7119,0.2654,0.4601,0.1189
1,20.57,17.77,132.9,1326.0,0.08474,0.07864,0.0869,0.07017,0.1812,0.05667,...,24.99,23.41,158.8,1956.0,0.1238,0.1866,0.2416,0.186,0.275,0.08902
2,19.69,21.25,130.0,1203.0,0.1096,0.1599,0.1974,0.1279,0.2069,0.05999,...,23.57,25.53,152.5,1709.0,0.1444,0.4245,0.4504,0.243,0.3613,0.08758
3,11.42,20.38,77.58,386.1,0.1425,0.2839,0.2414,0.1052,0.2597,0.09744,...,14.91,26.5,98.87,567.7,0.2098,0.8663,0.6869,0.2575,0.6638,0.173
4,20.29,14.34,135.1,1297.0,0.1003,0.1328,0.198,0.1043,0.1809,0.05883,...,22.54,16.67,152.2,1575.0,0.1374,0.205,0.4,0.1625,0.2364,0.07678


In [7]:
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split
# identical from run to run.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [8]:
# Learn the scaling statistics from the training split only, then apply the
# same fitted transform to both splits so no test information leaks into them.
scaler = StandardScaler()
scaler.fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [9]:
# Train the random forest on the standardized training features.
# A fixed random_state makes the bootstrap sampling and feature selection
# repeatable, so the saved model and the reported accuracy are reproducible
# under Restart & Run All (same seed as the train/test split).
model = RandomForestClassifier(random_state=42)
model.fit(X_train_scaled, y_train)

In [10]:
# Train a k-nearest-neighbours classifier (library default settings) on the
# same standardized training features used for the random forest.
model2 = KNeighborsClassifier()
model2.fit(X=X_train_scaled, y=y_train)

In [11]:
# Score the random forest on the held-out split.
y_pred = model.predict(X_test_scaled)
rf_accuracy = accuracy_score(y_test, y_pred)
print("Accuracy for Random Forest :", rf_accuracy)

# Persist the classifier together with the fitted scaler; the Streamlit app
# written later in this notebook loads both files by these names.
for artifact, path in ((model, 'breast_cancer_model.pkl'), (scaler, 'scaler.pkl')):
    joblib.dump(artifact, path)

print("✅ Model and scaler saved successfully!")

Accuracy for Random Forest : 0.9649122807017544
✅ Model and scaler saved successfully!


In [12]:
# Score the KNN classifier on the held-out split.
y_pred = model2.predict(X_test_scaled)
knn_accuracy = accuracy_score(y_test, y_pred)
print("Accuracy for KNN :", knn_accuracy)

# Persist the KNN model and (re)write the shared scaler file so this cell
# leaves a complete set of artifacts even when run on its own.
for artifact, path in ((model2, 'breast_cancer_knn.pkl'), (scaler, 'scaler.pkl')):
    joblib.dump(artifact, path)

print("✅ Model and scaler saved successfully!")

Accuracy for KNN : 0.9473684210526315
✅ Model and scaler saved successfully!


In [13]:
pip install streamlit




In [14]:
%%writefile app.py

import streamlit as st
import numpy as np
import pandas as pd
import joblib
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.datasets import load_breast_cancer
from sklearn.decomposition import PCA

# Load breast cancer dataset
data = load_breast_cancer()
features = data.feature_names
X = pd.DataFrame(data.data, columns=features)
y = pd.Series(data.target)

# Feature Descriptions (short)
feature_descriptions = {
    "radius": "Mean distance from center to points on the perimeter",
    "texture": "Standard deviation of gray-scale values",
    "perimeter": "Perimeter of the nucleus",
    "area": "Area of the nucleus",
    "smoothness": "Local variation in radius lengths",
    "compactness": "(Perimeter² / Area) - 1.0",
    "concavity": "Severity of concave portions of the contour",
    "concave points": "Number of concave portions of the contour",
    "symmetry": "Symmetry of the cell nuclei",
    "fractal_dimension": "Roughness or complexity of the contour"
}

# -------- SIDEBAR --------
st.sidebar.title("⚙️ Settings")
model_option = st.sidebar.selectbox("Choose a Model", ["Random Forest", "K-Nearest Neighbors"])

# Load model and scaler
model_file = "breast_cancer_model.pkl" if model_option == "Random Forest" else "breast_cancer_knn.pkl"
model = joblib.load(model_file)
scaler = joblib.load("scaler.pkl")

# Sliders for feature input (all in sidebar)
st.sidebar.markdown("### 🧮 Input Feature Sliders")
user_input = []
for feature in features:
    val = st.sidebar.slider(
        label=feature,
        min_value=float(X[feature].min()),
        max_value=float(X[feature].max()),
        value=float(X[feature].mean()),
        step=0.01
    )
    user_input.append(val)

# -------- MAIN PAGE --------
st.title("🔬 Breast Cancer Prediction Dashboard")
st.markdown("Use this app to predict whether a tumor is **malignant** (cancerous) or **benign** (non-cancerous) using a trained machine learning model.")

with st.expander("ℹ️ How to Use"):
    st.markdown("""
    - Use the sidebar to choose a model and input feature values.
    - The app will predict the tumor type and show confidence.
    - Visualizations below show how your input compares to real cases.
    """)

# Graphs and Analysis
st.subheader("📈 Feature Distribution")
selected_feature = st.selectbox("Select a feature to visualize", features)

fig1, ax1 = plt.subplots(figsize=(8, 3))
sns.histplot(X[selected_feature], kde=True, color='skyblue', ax=ax1)
ax1.axvline(user_input[features.tolist().index(selected_feature)], color='red', linestyle='--', label="Your Input")
ax1.set_title(f"Distribution of {selected_feature}")
ax1.legend()
st.pyplot(fig1)

st.subheader("🌐 PCA Projection (2D)")
pca = PCA(n_components=2)
X_pca = pca.fit_transform(X)
input_array = np.array([user_input])
input_pca = pca.transform(input_array)

fig2, ax2 = plt.subplots()
ax2.scatter(X_pca[:, 0], X_pca[:, 1], c=y, cmap='coolwarm', alpha=0.5, label="Dataset")
ax2.scatter(input_pca[:, 0], input_pca[:, 1], color='black', marker='X', s=200, label="Your Input")
ax2.set_xlabel("PCA 1")
ax2.set_ylabel("PCA 2")
ax2.set_title("2D PCA - Tumor Dataset")
ax2.legend()
st.pyplot(fig2)

# 📘 Tumor Type Info
st.markdown("### 📘 Understanding Tumor Types")
st.info("""
🔴 **Malignant** tumors are cancerous and can grow/spread quickly.
🟢 **Benign** tumors are non-cancerous and usually not life-threatening.
""")

# 📗 Feature Prefix Descriptions
st.markdown("### 📗 Feature Explanation (Prefix-based)")
desc_df = pd.DataFrame.from_dict(feature_descriptions, orient='index', columns=['Description']).reset_index()
desc_df.columns = ['Feature Prefix', 'Description']
st.dataframe(desc_df, use_container_width=True)

# Prediction
st.markdown("---")
st.subheader("🔍 Prediction Result")

input_scaled = scaler.transform([user_input])
prediction = model.predict(input_scaled)[0]
prob = model.predict_proba(input_scaled)[0]

if prediction == 1:
    st.success(f"🟢 The tumor is likely **Benign** with `{prob[1]*100:.2f}%` confidence.")
else:
    st.error(f"🔴 The tumor is likely **Malignant** with `{prob[0]*100:.2f}%` confidence.")

with st.expander("📊 Show Raw Prediction Probabilities"):
    st.write(f"Malignant: `{prob[0]:.4f}`")
    st.write(f"Benign: `{prob[1]:.4f}`")

# Footer
st.markdown("---")
st.caption("Developed by Rishi Karmakar • 2025")


Writing app.py
