

**Step 1: Load and Explore the Dataset**



In [None]:
import pandas as pd
from google.colab import files

# Upload CSV file
print("📂 Upload CSV file containing breast cancer data")
uploaded = files.upload()

# Fail with a readable message instead of a bare IndexError when nothing was
# uploaded (for example when the upload dialog is cancelled).
if not uploaded:
    raise ValueError("⚠️ No file was uploaded. Re-run this cell and choose a CSV file.")
file_name = next(iter(uploaded))

# Load dataset
df = pd.read_csv(file_name)
print("✅ Dataset Loaded Successfully!")

# Display first few rows
print(df.head())

# Show dataset info (column types, non-null counts).
# DataFrame.info() prints its report itself and returns None, so it must not
# be wrapped in print() -- that would add a stray "None" line to the output.
df.info()

# Check for missing values
print("Missing Values in Dataset:")
print(df.isnull().sum())


📂 Upload CSV file containing breast cancer data


Saving cleaned_breast_cancer dataset.csv to cleaned_breast_cancer dataset.csv
✅ Dataset Loaded Successfully!
   diagnosis  radius_mean  texture_mean  perimeter_mean  area_mean  \
0          1        17.99         10.38          122.80     1001.0   
1          1        20.57         17.77          132.90     1326.0   
2          1        19.69         21.25          130.00     1203.0   
3          1        11.42         20.38           77.58      386.1   
4          1        20.29         14.34          135.10     1297.0   

   smoothness_mean  compactness_mean  concavity_mean  concave points_mean  \
0          0.11840           0.27760          0.3001              0.14710   
1          0.08474           0.07864          0.0869              0.07017   
2          0.10960           0.15990          0.1974              0.12790   
3          0.14250           0.28390          0.2414              0.10520   
4          0.10030           0.13280          0.1980              0.10430   

   symm

**Step 2: Data Preprocessing**

In [None]:
from sklearn.preprocessing import LabelEncoder, StandardScaler

# Fill missing values with column mean.
# numeric_only=True keeps this working when a column holds text (for example
# a "diagnosis" column of "B"/"M" strings): averaging such a column raises a
# TypeError on pandas 2.x. Reassigning instead of inplace=True keeps the cell
# safe to re-run.
df = df.fillna(df.mean(numeric_only=True))
print("✅ Missing values handled!")

# Encode the "diagnosis" column (Benign → 0, Malignant → 1)
label_column = "diagnosis"
# Rename only while the original column is still present, so running this
# cell a second time does not fail with a KeyError.
if label_column in df.columns:
    df = df.rename(columns={label_column: "label"})
label_encoder = LabelEncoder()
df["label"] = label_encoder.fit_transform(df["label"])
print("✅ Label Encoding Done!")
print(df["label"].value_counts())  # Check label distribution

# Separate features (X) and labels (y)
X = df.drop(columns=["label"])  # Feature columns
y = df["label"]  # Target column

# Normalize features using StandardScaler.
# NOTE(review): the scaler is fitted on the full dataset before the
# train/test split, so test-set statistics influence the scaling. Consider
# fitting it on the training rows only.
scaler = StandardScaler()
X = scaler.fit_transform(X)
print("✅ Feature Normalization Completed!")


✅ Missing values handled!
✅ Label Encoding Done!
label
0    357
1    212
Name: count, dtype: int64
✅ Feature Normalization Completed!


**Step 3: Split Data into Training and Testing Sets**

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing. Stratifying on y keeps the class
# ratio the same in both parts, and the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

print("✅ Train-Test Split Done!")
for split_label, split_rows in (("Training", X_train), ("Testing", X_test)):
    print(f"{split_label} Samples: {split_rows.shape[0]}")


✅ Train-Test Split Done!
Training Samples: 455
Testing Samples: 114


**Step 4: Train the Machine Learning Model**

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Build a 100-tree random forest (seeded so results are repeatable) and fit
# it on the training split; fit() returns the fitted estimator itself.
rf_model = RandomForestClassifier(
    n_estimators=100,
    random_state=42,
).fit(X_train, y_train)

print("✅ Model Training Completed!")


✅ Model Training Completed!


**Step 5: Evaluate the Model**

In [None]:
from sklearn.metrics import accuracy_score, classification_report

# Predict the held-out rows once, then derive both metrics from that result.
y_pred = rf_model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
report_text = classification_report(y_test, y_pred)

# Overall accuracy first, then the per-class precision/recall/F1 table.
print(f"✅ Model Accuracy: {accuracy:.4f}")
print("🔹 Classification Report:\n", report_text)


✅ Model Accuracy: 0.9737
🔹 Classification Report:
               precision    recall  f1-score   support

           0       0.96      1.00      0.98        72
           1       1.00      0.93      0.96        42

    accuracy                           0.97       114
   macro avg       0.98      0.96      0.97       114
weighted avg       0.97      0.97      0.97       114



**Step 6: Make Predictions from User-Provided TXT File**

In [None]:
import numpy as np
from google.colab import files

def predict_cancer_from_txt(input_filename, output_filename="prediction_output.txt"):
    """Predict malignancy for the patient described by the first line of a TXT file.

    The first line of ``input_filename`` must hold comma-separated numeric
    values, one per feature column of the global ``df`` (every column except
    ``"label"``), in column order. The values are scaled with the global
    ``scaler`` and classified with the global ``rf_model``. The outcome is
    printed, written to ``output_filename`` and handed to ``files.download``.

    Args:
        input_filename: Path of the text file to read.
        output_filename: Path of the report file to write.

    Returns:
        None. Any failure is caught and printed as ``⚠️ Error: ...`` instead of
        being raised.
    """
    try:
        # "utf-8-sig" reads plain UTF-8 and also drops a leading byte-order
        # mark, which would otherwise stick to the first value and break float().
        with open(input_filename, "r", encoding="utf-8-sig") as file:
            line = file.readline().strip()

        if not line:
            raise ValueError("⚠️ Input file is empty.")
        input_values = [float(value) for value in line.split(",")]

        # Extract feature names
        feature_names = df.drop(columns=["label"]).columns.tolist()

        # Ensure input length matches feature length
        if len(input_values) != len(feature_names):
            raise ValueError(f"⚠️ Mismatch: Expected {len(feature_names)} values, got {len(input_values)}.")

        # The scaler was fitted on a DataFrame, so give it the same named
        # columns; an unnamed array makes scikit-learn warn about feature names.
        input_frame = pd.DataFrame([input_values], columns=feature_names)
        input_array = scaler.transform(input_frame)

        # Make prediction
        prediction = rf_model.predict(input_array)
        probability = rf_model.predict_proba(input_array)[0][1] * 100  # Malignancy probability

        # Show result
        result = "Malignant (Cancer Detected)" if prediction[0] == 1 else "Benign (No Cancer)"
        print(f"\n✅ Prediction Result: {result}")
        print(f"🩸 Probability of Cancer: {probability:.2f}%")

        # Save result to TXT file. The report contains emoji, so the encoding
        # is pinned to UTF-8 rather than left to the platform default.
        with open(output_filename, "w", encoding="utf-8") as file:
            file.write("🔍 Breast Cancer Prediction Result\n")
            file.write("=" * 40 + "\n")
            for feature, value in zip(feature_names, input_values):
                file.write(f"{feature}: {value}\n")
            file.write(f"\n🔹 Prediction: {result}\n")
            file.write(f"🩸 Probability of Cancer: {probability:.2f}%\n")

        print(f"\n📂 Prediction result saved to '{output_filename}'")
        files.download(output_filename)

    except Exception as e:
        # Deliberate best-effort: report the problem without a traceback.
        print(f"⚠️ Error: {e}")

# Upload user input TXT file
print("📂 Please upload a TXT file with feature values...")
uploaded = files.upload()

if uploaded:
    # Predict cancer from the (first) uploaded TXT file
    input_filename = next(iter(uploaded))
    predict_cancer_from_txt(input_filename)
else:
    # Nothing was uploaded (for example the dialog was cancelled): say so
    # instead of crashing with an IndexError on the empty result.
    print("⚠️ Error: No file was uploaded.")


📂 Please upload a TXT file with feature values...


Saving userinputdata.txt to userinputdata (1).txt

✅ Prediction Result: Malignant (Cancer Detected)
🩸 Probability of Cancer: 95.00%

📂 Prediction result saved to 'prediction_output.txt'




<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

**Step 7: Make Predictions for Multiple Patients from a TXT File (with Probability %)**

In [None]:
import numpy as np
import pandas as pd
from google.colab import files

def predict_cancer_from_txt(input_filename, output_filename="prediction_output.txt"):
    """Predict malignancy for every patient row of a TXT file.

    Each non-blank line of ``input_filename`` must hold comma-separated numeric
    values, one per feature column of the global ``df`` (every column except
    ``"label"``), in column order. Every row is scaled with the global
    ``scaler`` and classified with the global ``rf_model``. A row that cannot
    be processed is reported and skipped; the remaining rows are still
    predicted. Successful predictions are printed, written to
    ``output_filename`` and the file is handed to ``files.download``.

    Args:
        input_filename: Path of the text file to read (one patient per line).
        output_filename: Path of the report file to write.

    Returns:
        None. Failures are caught and printed instead of being raised.
    """
    try:
        # Ensure model and scaler are available
        if any(name not in globals() for name in ("df", "scaler", "rf_model")):
            raise ValueError("⚠️ Model, scaler, or dataset is not loaded. Ensure they are defined.")

        # Read the TXT file (multiple lines). "utf-8-sig" reads plain UTF-8 and
        # also drops a leading byte-order mark that would break float() on the
        # first value.
        with open(input_filename, "r", encoding="utf-8-sig") as file:
            lines = file.readlines()

        # Extract feature names from dataset
        feature_names = df.drop(columns=["label"]).columns.tolist()
        expected_count = len(feature_names)

        results = []  # (row number, verdict, malignancy probability in %)
        for row_number, raw_line in enumerate(lines, start=1):
            line = raw_line.strip()
            if not line:
                # Blank lines (such as a trailing newline) are not patients;
                # skip them but keep numbering rows by their line in the file.
                continue

            try:
                input_values = [float(value) for value in line.split(",")]

                # Ensure input length matches feature length
                if len(input_values) != expected_count:
                    raise ValueError(f"⚠️ Row {row_number}: Expected {expected_count} values, got {len(input_values)}.")

                # The scaler was fitted on a DataFrame, so give it the same
                # named columns; an unnamed array makes scikit-learn warn.
                input_frame = pd.DataFrame([input_values], columns=feature_names)
                input_array = scaler.transform(input_frame)

                # Make prediction
                prediction = rf_model.predict(input_array)
                probability = rf_model.predict_proba(input_array)[0][1] * 100  # Malignancy probability

                # Store result
                result = "Malignant (Cancer Detected)" if prediction[0] == 1 else "Benign (No Cancer)"
                results.append((row_number, result, probability))

                print(f"\n✅ Row {row_number} Prediction: {result}")
                print(f"🩸 Probability of Cancer: {probability:.2f}%")

            except Exception as e:
                # Deliberate best-effort: one bad row must not stop the others.
                print(f"⚠️ Error in row {row_number}: {e}")

        # Save results to TXT file. The report contains emoji, so the encoding
        # is pinned to UTF-8 rather than left to the platform default.
        with open(output_filename, "w", encoding="utf-8") as file:
            file.write("🔍 Breast Cancer Prediction Results\n")
            file.write("=" * 50 + "\n")
            for row_id, result, probability in results:
                file.write(f"Patient {row_id}: {result}\n")
                file.write(f"🩸 Probability of Cancer: {probability:.2f}%\n")
                file.write("-" * 50 + "\n")

        print(f"\n📂 Prediction results saved to '{output_filename}'")
        files.download(output_filename)

    except Exception as e:
        print(f"⚠️ Error: {e}")

# Upload user input TXT file
print("📂 Please upload a TXT file with multiple feature sets (one per line)...")
uploaded = files.upload()

if uploaded:
    # Predict for all patients in the (first) uploaded file
    input_filename = next(iter(uploaded))
    predict_cancer_from_txt(input_filename)
else:
    # Nothing was uploaded (for example the dialog was cancelled): say so
    # instead of crashing with an IndexError on the empty result.
    print("⚠️ Error: No file was uploaded.")


📂 Please upload a TXT file with multiple feature sets (one per line)...


Saving ml random BM mixed data.txt to ml random BM mixed data (1).txt

✅ Row 1 Prediction: Benign (No Cancer)
🩸 Probability of Cancer: 3.00%

✅ Row 2 Prediction: Benign (No Cancer)
🩸 Probability of Cancer: 0.00%

✅ Row 3 Prediction: Malignant (Cancer Detected)
🩸 Probability of Cancer: 99.00%

✅ Row 4 Prediction: Malignant (Cancer Detected)
🩸 Probability of Cancer: 92.00%

📂 Prediction results saved to 'prediction_output.txt'




<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>