In [None]:
pip install xgboost

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler 
from sklearn.pipeline import Pipeline
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
import pickle 
# One-time conversion of the raw breast-cancer-wisconsin.data file into
# data.csv (kept for reference; adjust the path before re-running):
# df = pd.read_csv("/Users/varshinisakthi/Downloads/breast-cancer-wisconsin.data")
# df.columns = ["id", "ClumpThick", "UniSize", "UniShape", "MargAd", "SingEpiCelSize", "Bare Nuc", "BlandChr", "NormalNuc", "Mito", "Class"]
# df.to_csv("data.csv", index=None, header=True)

# Preprocessing
# In the Class column, 2 represents benign and 4 represents malignant.

# Load the prepared dataset and discard the row identifier column.
data = pd.read_csv("data.csv").drop(columns=["id"])

# '?' placeholders become NaN so every column can be cast to float; the
# remaining gaps are then filled with the median of their own column.
# NOTE(review): the medians are computed on the full dataset, i.e. before the
# train/test split below, so held-out rows influence the imputed values.
data = data.replace("?", np.nan).astype(float)
data = data.fillna(data.median())

# Binarise the target: 4 (malignant) -> 1, every other value -> 0.
data["Class"] = data["Class"].eq(4).astype("int64")

# Feature matrix (all columns except the target) and label vector.
X = data.drop(columns=["Class"]).to_numpy()
y = data["Class"].to_numpy()

# Training and testing
# Hold out 10% of the rows, keeping the class ratio equal in both splits.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.1,
    stratify=y,
    random_state=0,
)

# Logistic Regression
# Scaling lives inside the pipeline, so the scaler is fitted together with the
# classifier on the training split and is stored in the same pickle.
logreg_pipeline = Pipeline([
    ("scaler", StandardScaler()),
    ("classifier", LogisticRegression(solver="liblinear")),
])
logreg_pipeline.fit(X_train, y_train)

print("Logistic Regression Accuracy:", logreg_pipeline.score(X_test, y_test))

# Persist the fitted pipeline; the context manager closes the file handle
# deterministically, even if pickling raises.
with open("LogisticRegression.m", "wb") as model_file:
    pickle.dump(logreg_pipeline, model_file)

# SVM
# RBF-kernel support vector classifier on standardised features.
# probability=True makes predict_proba available on the fitted model.
svm_pipeline = Pipeline([
    ("scaler", StandardScaler()),
    ("classifier", SVC(
        kernel="rbf",
        C=1.0,
        gamma="scale",
        probability=True,
    )),
])
svm_pipeline.fit(X_train, y_train)
print("SVM Accuracy:", svm_pipeline.score(X_test, y_test))

# Persist the fitted pipeline; the context manager closes the file handle
# deterministically, even if pickling raises.
with open("SVM.m", "wb") as model_file:
    pickle.dump(svm_pipeline, model_file)

# Random Forest
# 200 unrestricted-depth trees; random_state pins the result for reruns.
rf_model = RandomForestClassifier(
    n_estimators=200,
    max_depth=None,
    random_state=0,
)

rf_model.fit(X_train, y_train)
print("Random Forest Accuracy:", rf_model.score(X_test, y_test))

# Persist the fitted model; the context manager closes the file handle
# deterministically, even if pickling raises.
with open("RandomForest.m", "wb") as model_file:
    pickle.dump(rf_model, model_file)

# Report per-feature importances. The names come straight from the column
# index (minus the target), which is the same column order X was built from.
feature_names = data.columns.drop("Class")
for name, importance in zip(feature_names, rf_model.feature_importances_):
    print(f"{name}: {importance:.3f}")

# XGBoost
# Gradient-boosted trees with mild row/column subsampling; random_state pins
# the result for reruns.
xgb_model = XGBClassifier(
    n_estimators=200,
    max_depth=4,
    learning_rate=0.1,
    subsample=0.9,
    colsample_bytree=0.9,
    eval_metric="logloss",
    random_state=0,
)

xgb_model.fit(X_train, y_train)
print("XGBoost Accuracy:", xgb_model.score(X_test, y_test))

# Persist the fitted model; the context manager closes the file handle
# deterministically, even if pickling raises.
with open("XGBoost.m", "wb") as model_file:
    pickle.dump(xgb_model, model_file)

# Prediction
# Index i of `classes` is the display name for encoded label i
# (0 -> benign, 1 -> malignant).
classes = ["Benign", "Malignant"]

# A single observation with nine feature values.
# NOTE(review): presumably in the same column order used for training -
# confirm against the columns of data.csv.
sample = np.array([[5, 10, 10, 10, 7, 7, 3, 8, 9]])

# Reload the persisted model; the context manager closes the file handle.
# NOTE: unpickling executes arbitrary code - only load files from a trusted
# source, such as the one written by this script.
with open("XGBoost.m", "rb") as model_file:
    model = pickle.load(model_file)

# Convert the predicted label to a plain int before using it as a list index.
prediction = int(model.predict(sample)[0])
print("Prediction:", classes[prediction])
