# Load in dependencies

In [None]:
import pandas as pd
import numpy as np
import warnings

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, confusion_matrix, f1_score
import matplotlib.pyplot as plt
from sklearn import svm

Load in the data

In [None]:
filePath = "../Res/wdbc.data"

# The file is read without a header row, so the column names are supplied here.
# Ten measurements appear three times each, with the suffixes _mean, _se and _worst.
measurements = [
    "radius",
    "texture",
    "perimeter",
    "area",
    "smoothness",
    "compactness",
    "concavity",
    "concave_points",
    "symmetry",
    "fractal_dimension",
]
suffixes = ["mean", "se", "worst"]

cols = (
    ["id", "diagnosis"]
    + [f"{name}_{suffix}" for suffix in suffixes for name in measurements]
    + ["Unnamed: 32"]
)

# Read the file into a pandas DataFrame using the names built above.
data = pd.read_csv(filePath, names=cols)

# Discard the columns that are not used for the predictions: the identifier,
# the "Unnamed: 32" column, two of the *_mean columns, and every *_se / *_worst column.
unused_columns = (
    ["Unnamed: 32", "id", "concave_points_mean", "fractal_dimension_mean"]
    + [f"{name}_{suffix}" for suffix in ("se", "worst") for name in measurements]
)
data = data.drop(columns=unused_columns)

Encoding the diagnosis (M = malignant → 1, B = benign → 0)

In [None]:
# Encode the diagnosis as integers: "M" -> 1, "B" -> 0.
diagnosis_codes = {"M": 1, "B": 0}

# Only encode when the column is not already encoded. Re-running a plain
# .map() on 0/1 values would silently turn every label into NaN.
if not data["diagnosis"].isin(list(diagnosis_codes.values())).all():
    encoded_diagnosis = data["diagnosis"].map(diagnosis_codes)
    # .map() yields NaN for any value that is not a key of the mapping;
    # fail loudly instead of training on missing labels.
    if encoded_diagnosis.isna().any():
        raise ValueError(
            "diagnosis column contains values other than 'M' and 'B'"
        )
    data["diagnosis"] = encoded_diagnosis

#splitting feature set from target
feature_data = data.drop("diagnosis",axis = 1)
label =data["diagnosis"]

Split data for training and testing

In [None]:
# Hold out 20% of the rows for testing; the fixed random_state keeps the
# split the same on every run.
split_parts = train_test_split(
    feature_data,
    label,
    test_size=0.2,
    random_state=122,
)
feature_data_train, feature_data_test, label_train, label_test = split_parts

Scaling the features

In [None]:
# Fit the scaler on the training rows only, then apply that same fitted
# transform to both the training and the testing rows.
scaler = StandardScaler()
scaler.fit(feature_data_train)
x_train, x_test = (
    scaler.transform(split) for split in (feature_data_train, feature_data_test)
)

We will try several models and pick the one with the highest accuracy, so we first define a helper function that reports each model's training and testing accuracy.

In [None]:
def evaluateModels(model,x_train,x_test,y_train,y_test):
    """Print the training and testing accuracy of an already-fitted model.

    Parameters
    ----------
    model : object exposing ``predict(X)``; it must already be fitted.
    x_train, x_test : feature sets passed to ``model.predict``.
    y_train, y_test : true labels compared against the predictions.

    Returns
    -------
    None. The model's repr and both accuracies (as percentages) are printed.
    """
    train_prediction = model.predict(x_train)
    test_prediction = model.predict(x_test)

    # accuracy_score returns a fraction; scale it to a percentage for display.
    train_accuracy = accuracy_score(y_train, train_prediction) * 100
    test_accuracy = accuracy_score(y_test, test_prediction) * 100

    print(f"\nModel: {model}")
    print(f"Training Accuracy: {train_accuracy}")
    print(f"Testing Accuracy: {test_accuracy}")


In [None]:
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.neighbors import KNeighborsClassifier
# No category, message or module restriction is passed, so this filter
# matches every warning raised after this cell runs.
# NOTE(review): this also hides warnings emitted while fitting the models
# below; consider narrowing the filter to the specific category intended.
warnings.filterwarnings("ignore")

In [None]:
#Let's consider the models.
# One candidate per line; seeds are fixed where the estimator accepts one.
models = [
    SVC(kernel="linear", C=1, random_state=0),
    DecisionTreeClassifier(criterion="entropy", random_state=0),
    GaussianNB(),
    LogisticRegression(max_iter=200),
    RandomForestClassifier(n_estimators=100, random_state=0),
    # The XGBoost parameter is `eval_metric` (singular); the previous
    # `eval_metrics` spelling is not a recognised parameter.
    # NOTE(review): `use_label_encoder` is kept as in the original; newer
    # XGBoost releases may no longer use it — confirm against the installed version.
    XGBClassifier(use_label_encoder=False, eval_metric="logloss"),
    KNeighborsClassifier(n_neighbors=9),
]

In [None]:
# Fit and score every candidate on the standardized features (x_train / x_test)
# produced by the scaler, rather than on the raw, unscaled frames. Several of
# these estimators (linear SVC, logistic regression, k-NN) are sensitive to
# feature scale, and the scaled arrays were otherwise never used.
for model in models:
    model.fit(x_train,label_train)
    evaluateModels(model,x_train,x_test,label_train,label_test)

Select the best of the models

In [None]:
#Log reg's accuracy is consistent with the train and test values.

model = LogisticRegression(max_iter=200)

# Train on the standardized training features. New samples are passed through
# `scaler.transform` before `model.predict`, and the scaler is saved alongside
# the model, so the model must be fitted on the same scaled representation.
model.fit(x_train, label_train)

Testing the model

In [None]:
# Testing the model:
# One hand-entered sample, listed in the column order of feature_data_train
# (the frame the scaler was fitted on).
# NOTE(review): concave_points_mean is dropped when the data is loaded, so the
# 8th slot corresponds to symmetry_mean; 0.02923 looks more like a
# concave-points reading — confirm the sample against the kept columns.
features = [
    13.03,
    18.42,
    82.61,
    523.8,
    0.08983,
    0.03766,
    0.02562,
    0.02923,
]

features = np.array(features).reshape(1,-1)

# Label the sample with the training column names before scaling. A sample
# with the wrong number of values now fails here with a clear error, and the
# scaler receives the same named layout it was fitted on.
features_frame = pd.DataFrame(features, columns=feature_data_train.columns)
features_scaled = scaler.transform(features_frame)

Predictions

In [None]:
prediction = model.predict(features_scaled)
# The diagnosis was encoded as "M" -> 1 and "B" -> 0 before training,
# so a predicted 1 is reported as malignant and anything else as benign.
if prediction[0] ==1:
    print("Prediction is: Malignant")
else:
    print("Prediction is: Benign")

Save the model

In [None]:
# Now save both model and scaler
import os
import joblib

model_path = '../models/logistic_regression_model.pkl'
scaler_path = '../models/scaler.pkl'

# Create the target directory first so saving does not fail with
# FileNotFoundError on a fresh checkout where it does not exist yet.
os.makedirs(os.path.dirname(model_path), exist_ok=True)

joblib.dump(model, model_path)
joblib.dump(scaler, scaler_path)