In [9]:
# Import necessary libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import joblib



In [6]:
# Step 1: Load the CSV file
df = pd.read_csv('breast_cancer_dataset.csv')

# Step 2: Check for missing values
# print("\nMissing values:\n", df.isnull().sum())

# Step 3: Handle missing values (if any)
# Impute the feature columns only, each with its own column mean. The class
# label is deliberately excluded: filling 'target' with its mean would create
# fractional labels that are not valid classes. numeric_only=True keeps the
# mean well-defined even if a non-numeric column is present.
feature_means = df.drop(columns=['target']).mean(numeric_only=True)
df = df.fillna(feature_means)
# Rows with a missing label cannot be used for supervised training.
df = df.dropna(subset=['target'])

# Check for duplicate rows
# print("\nNumber of duplicate rows:", df.duplicated().sum())

# Step 4: Split features and target
X = df.drop(columns=['target'])  # Features
y = df['target']  # Target variable (0 = malignant, 1 = benign)

# Step 5: Split data into training and testing sets
# The split happens BEFORE scaling so that no test-set statistics leak into
# the scaler. The row partition depends only on the number of rows and
# random_state, so it is the same partition as splitting pre-scaled data.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Step 6: Standardize the features (scale data)
# Fit on the training rows only, then apply that same transform everywhere.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train_raw)
X_test = scaler.transform(X_test_raw)
# Kept so any code that still expects the fully scaled matrix keeps working.
X_scaled = scaler.transform(X)

# DataFrame.info() prints its report itself and returns None, so it must not
# be wrapped in print() (that would emit an extra "None" line).
df.info()
print("\nData successfully preprocessed and split into training and testing sets.")



<class 'pandas.core.frame.DataFrame'>
RangeIndex: 569 entries, 0 to 568
Data columns (total 31 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   mean radius              569 non-null    float64
 1   mean texture             569 non-null    float64
 2   mean perimeter           569 non-null    float64
 3   mean area                569 non-null    float64
 4   mean smoothness          569 non-null    float64
 5   mean compactness         569 non-null    float64
 6   mean concavity           569 non-null    float64
 7   mean concave points      569 non-null    float64
 8   mean symmetry            569 non-null    float64
 9   mean fractal dimension   569 non-null    float64
 10  radius error             569 non-null    float64
 11  texture error            569 non-null    float64
 12  perimeter error          569 non-null    float64
 13  area error               569 non-null    float64
 14  smoothness error         5

In [16]:

# Step 4: Split data into training and testing sets
# Split the raw features first; scaling is fitted afterwards on the training
# portion only so the held-out rows never influence the preprocessing.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Step 5: Standardize the features (scaling)
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train_raw)  # learn mean/std from training rows
X_test = scaler.transform(X_test_raw)        # reuse the training statistics
# Kept so any code that still expects the fully scaled matrix keeps working.
X_scaled = scaler.transform(X)
# Persist the fitted scaler so inference can apply the identical transform.
joblib.dump(scaler, 'scaler.pkl')

# Step 6: Initialize and train the Random Forest Classifier
# random_state is fixed so repeated runs build the same forest.
model = RandomForestClassifier(n_estimators=100, random_state=42)
model.fit(X_train, y_train)  # Train the model

# Step 7: Make predictions on the test set
y_pred = model.predict(X_test)

# Step 8: Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

# Print detailed classification report
print("\nClassification Report:")
print(classification_report(y_test, y_pred))

# Print confusion matrix (rows = true class, columns = predicted class)
print("\nConfusion Matrix:")
print(confusion_matrix(y_test, y_pred))


Accuracy: 0.9649122807017544

Classification Report:
              precision    recall  f1-score   support

           0       0.98      0.93      0.95        43
           1       0.96      0.99      0.97        71

    accuracy                           0.96       114
   macro avg       0.97      0.96      0.96       114
weighted avg       0.97      0.96      0.96       114


Confusion Matrix:
[[40  3]
 [ 1 70]]


In [10]:


# Persist the trained classifier to disk so it can be reloaded later
# without retraining.
joblib.dump(
    value=model,
    filename='random_forest_cancer_cell_classification.joblib',
)

print("Model saved successfully!")


Model saved successfully!


In [12]:
# Restore the classifier that was written to disk by the save step.
loaded_model = joblib.load(
    filename='random_forest_cancer_cell_classification.joblib'
)

In [14]:
# Example: New data (make sure it has the same number of features as the training data)
new_data = [[17.99,10.38,122.8,1001.0,0.1184,0.2776,0.3001,0.1471,0.2419,0.07871,1.095,0.9053,8.589,153.4,0.006399,0.04904,0.05373,0.01587,0.03003,0.006193,25.38,17.33,184.6,2019.0,0.1622,0.6656,0.7119,0.2654,0.4601,0.1189]]  # This is just an example; use real values

# Scale the new data using the same scaler as the training data.
# The scaler was fitted on a DataFrame, so it recorded the feature names.
# Wrapping the sample in a DataFrame with those names avoids scikit-learn's
# "X does not have valid feature names" warning, and a row with the wrong
# number of values fails here with a clear error instead of further down.
# If the scaler carries no feature names, default integer columns are used.
feature_names = getattr(scaler, 'feature_names_in_', None)
new_data_df = pd.DataFrame(new_data, columns=feature_names)
new_data_scaled = scaler.transform(new_data_df)

# Predict using the loaded model
new_prediction = loaded_model.predict(new_data_scaled)

print("Prediction for new data:", new_prediction)


Prediction for new data: [0]


