In [1]:
%reload_ext nb_black

<IPython.core.display.Javascript object>

In [30]:
import pandas as pd

from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix, classification_report
import matplotlib.pyplot as plt


%matplotlib inline

<IPython.core.display.Javascript object>

In [13]:
# Mammographic-masses data set hosted by the UCI archive. The file is read
# with header=None, so the column names are supplied explicitly here.
data_url = "http://archive.ics.uci.edu/ml/machine-learning-databases/mammographic-masses/mammographic_masses.data"
df = pd.read_csv(
    data_url,
    names=["BI-RADS", "Age", "Shape", "Margin", "Density", "Severity"],
    header=None,
)
# Peek at the first record to confirm the columns line up.
df.head(n=1)

Unnamed: 0,BI-RADS,Age,Shape,Margin,Density,Severity
0,5,67,3,5,3,1


<IPython.core.display.Javascript object>

Some columns, such as 'Age', contain '?' values. Replace them with NaN by coercing every column to a numeric type.

In [14]:
# Coerce every column to a numeric dtype; entries that cannot be parsed
# (the "?" placeholders in the raw file) become NaN.
# pd.to_numeric is applied once per column (the default axis of
# DataFrame.apply) instead of once per row, which avoids one Python-level
# call per record. The trailing astype keeps every column float64, which is
# what the previous row-wise version produced.
df = df.apply(pd.to_numeric, errors="coerce").astype("float64")

<IPython.core.display.Javascript object>

Check data types

In [5]:
# Show the dtype pandas assigned to each column of df.
df.dtypes

BI-RADS     float64
Age         float64
Shape       float64
Margin      float64
Density     float64
Severity    float64
dtype: object

<IPython.core.display.Javascript object>

Determine how many 'nan' values there are in each column.

In [6]:
# Count the missing (NaN) entries in each column.
df.isna().sum(axis="index")

BI-RADS      2
Age          5
Shape       31
Margin      48
Density     76
Severity     0
dtype: int64

<IPython.core.display.Javascript object>

It appears there are few enough missing values in each column to just drop the affected rows.

In [15]:
# Discard every row that has at least one missing value.
df = df.dropna(axis="index", how="any")

<IPython.core.display.Javascript object>

Prepare the data: separate it into features (X) and target (y), then make the train/test split.

In [16]:
# Target vector: the "Severity" column.
y = df.loc[:, "Severity"]
# Feature matrix: everything except the target and the "BI-RADS" column.
X = df.drop(["Severity", "BI-RADS"], axis="columns")

<IPython.core.display.Javascript object>

In [17]:
# Hold out 20% of the rows for testing; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=2021,
    test_size=0.2,
)

<IPython.core.display.Javascript object>

Complete the preprocessing steps

In [18]:
# Columns that get one-hot encoded.
cat_cols = ["Shape", "Margin"]
# Category value dropped from each column in cat_cols (same order); this list
# is handed to OneHotEncoder through its `drop` argument.
drop_cats = [1, 1]
# Remaining numeric feature columns. They are not referenced by name in the
# encoder set-up; the transformer forwards them via remainder="passthrough".
num_cols = ["Age", "Density"]


<IPython.core.display.Javascript object>

In [20]:
# One-hot encode the categorical columns (dropping the configured category of
# each) and pass every other column through unchanged.
preprocessing = ColumnTransformer(
    transformers=[
        ("encode_cats", OneHotEncoder(drop=drop_cats), cat_cols),
    ],
    remainder="passthrough",
)

<IPython.core.display.Javascript object>

In [21]:
# Learn the encoding from the training features only, then apply that same
# fitted transformer to the held-out features.
X_train = preprocessing.fit_transform(X_train)
X_test = preprocessing.transform(X_test)


<IPython.core.display.Javascript object>

In [31]:
# Depth-limited decision tree used as the first classifier.
# random_state is pinned (same seed as the train/test split) so that the
# fitted tree, and therefore the scores and confusion matrix computed from it,
# are reproducible after a kernel restart.
model = DecisionTreeClassifier(max_depth=5, random_state=2021)
model.fit(X_train, y_train)

DecisionTreeClassifier(max_depth=5)

<IPython.core.display.Javascript object>

In [32]:
# Mean accuracy of the current model on the training and held-out sets.
train_score = model.score(X_train, y_train)
test_score = model.score(X_test, y_test)
print("training score: {}".format(train_score))
print("testing score: {}".format(test_score))

training score: 0.8358433734939759
testing score: 0.7469879518072289


<IPython.core.display.Javascript object>

In [33]:
# Predict on the held-out features and tabulate actual (rows) against
# predicted (columns) classes.
y_pred = model.predict(X_test)

pd.DataFrame(
    data=confusion_matrix(y_true=y_test, y_pred=y_pred),
    columns=["pred_not_severe", "pred_severe"],
    index=["actual_not_severe", "actual_severe"],
)

Unnamed: 0,pred_not_severe,pred_severe
actual_not_severe,61,24
actual_severe,18,63


<IPython.core.display.Javascript object>

In [34]:
# Random forest of 30 shallow trees; this rebinds `model`, so the scoring and
# confusion-matrix cells that follow now evaluate the forest.
# random_state is pinned (same seed as the train/test split) because the
# forest is built with random sampling; without a seed the reported scores
# change on every run.
model = RandomForestClassifier(n_estimators=30, max_depth=3, random_state=2021)
model.fit(X_train, y_train)

RandomForestClassifier(max_depth=3, n_estimators=30)

<IPython.core.display.Javascript object>

In [35]:
# Mean accuracy of the current model on the training and held-out sets.
train_score = model.score(X_train, y_train)
test_score = model.score(X_test, y_test)
print("training score: {}".format(train_score))
print("testing score: {}".format(test_score))

training score: 0.8117469879518072
testing score: 0.8012048192771084


<IPython.core.display.Javascript object>

In [36]:
# Predict on the held-out features and tabulate actual (rows) against
# predicted (columns) classes.
y_pred = model.predict(X_test)

pd.DataFrame(
    data=confusion_matrix(y_true=y_test, y_pred=y_pred),
    columns=["pred_not_severe", "pred_severe"],
    index=["actual_not_severe", "actual_severe"],
)

Unnamed: 0,pred_not_severe,pred_severe
actual_not_severe,63,22
actual_severe,11,70


<IPython.core.display.Javascript object>

For this data set, a decision tree performs about as well as the Random Forest model without doing any further tuning. On larger datasets with more predictors, Doing a grid seach of the Random Forest Classifier could get qui