In [1]:
# train_and_save_model.py
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import xgboost as xgb
from imblearn.over_sampling import SMOTE
import joblib
import json

# Train an XGBoost classifier on the most target-correlated features of the
# breast-cancer CSV and persist the model, the fitted scaler and the selected
# feature names for later use.

# Load the data
data = pd.read_csv('/Users/ajibolaoluwatobiloba/Desktop/personal project/BREAST CANCER/breast-cancer.csv')

# Separate candidate features and target ('M' -> 1, 'B' -> 0)
all_features = data.drop(['id', 'diagnosis'], axis=1)
y = data['diagnosis'].map({'M': 1, 'B': 0})

# Split BEFORE selecting features: ranking columns by their correlation with
# the target on the full dataset lets the held-out rows influence which
# columns the model sees (data leakage). The split arguments are unchanged,
# and the row assignment depends only on the sample count, the stratify
# labels and random_state, so the same rows land in train/test as before.
all_train, all_test, y_train, y_test = train_test_split(
    all_features, y, test_size=0.2, random_state=42, stratify=y
)

# Calculate correlation with target, using training rows only
correlation = all_train.apply(lambda col: col.corr(y_train))
correlation_sorted = correlation.abs().sort_values(ascending=False)

# Select top 6 features based on absolute correlation
top_features = correlation_sorted.head(6).index.tolist()
print("Top 6 features based on correlation:")
print(top_features)

# Use top features for modeling
X = data[top_features]
X_train = all_train[top_features]
X_test = all_test[top_features]

# Scale the features (statistics are fitted on the training partition only)
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Apply SMOTE to the training partition only; the test set is left untouched
smote = SMOTE(random_state=42)
X_train_resampled, y_train_resampled = smote.fit_resample(X_train_scaled, y_train)

# Train XGBoost model
# NOTE(review): the training data has already been oversampled by SMOTE and
# scale_pos_weight=2 additionally up-weights the positive class -- confirm
# that emphasising class 1 twice is intended.
# NOTE(review): use_label_encoder is deprecated in recent XGBoost releases and
# may only trigger a warning there -- verify against the installed version.
model = xgb.XGBClassifier(random_state=42, scale_pos_weight=2, eval_metric='auc', use_label_encoder=False)
model.fit(X_train_resampled, y_train_resampled)

# Save the model
joblib.dump(model, 'xgboost_breast_cancer__model.joblib')

# Save the scaler
joblib.dump(scaler, 'scalerr.joblib')

# Save the feature names (must be reloaded together with the model so that
# inference uses the same columns in the same order)
with open('feature__names.json', 'w') as f:
    json.dump(top_features, f)

print("Model, scaler, and feature names saved successfully.")

Top 6 features based on correlation:
['concave points_worst', 'perimeter_worst', 'concave points_mean', 'radius_worst', 'perimeter_mean', 'area_worst']
Model, scaler, and feature names saved successfully.
