## Data Comprehension

In [53]:
import pandas as pd
import numpy as np


from sklearn.linear_model import LogisticRegression ,LogisticRegressionCV

from sklearn.model_selection import train_test_split, cross_val_score, cross_validate

from sklearn.metrics import recall_score

# Load the dataset from a path relative to the notebook; `index_col=0` makes the
# first CSV column the row index instead of a regular data column.
df = pd.read_csv('./data/cancer_data.csv', index_col=0)

Is the target categorical or continuous? 

What does a row represent?


In [8]:
# Structural overview: index range, column names, non-null counts and dtypes.
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 569 entries, 0 to 568
Data columns (total 31 columns):
mean radius                569 non-null float64
mean texture               569 non-null float64
mean perimeter             569 non-null float64
mean area                  569 non-null float64
mean smoothness            569 non-null float64
mean compactness           569 non-null float64
mean concavity             569 non-null float64
mean concave points        569 non-null float64
mean symmetry              569 non-null float64
mean fractal dimension     569 non-null float64
radius error               569 non-null float64
texture error              569 non-null float64
perimeter error            569 non-null float64
area error                 569 non-null float64
smoothness error           569 non-null float64
compactness error          569 non-null float64
concavity error            569 non-null float64
concave points error       569 non-null float64
symmetry error             569 

In [9]:
# Summary statistics (count, mean, std, min, quartiles, max) for each numeric column.
df.describe()

Unnamed: 0,mean radius,mean texture,mean perimeter,mean area,mean smoothness,mean compactness,mean concavity,mean concave points,mean symmetry,mean fractal dimension,...,worst texture,worst perimeter,worst area,worst smoothness,worst compactness,worst concavity,worst concave points,worst symmetry,worst fractal dimension,malignant
count,569.0,569.0,569.0,569.0,569.0,569.0,569.0,569.0,569.0,569.0,...,569.0,569.0,569.0,569.0,569.0,569.0,569.0,569.0,569.0,569.0
mean,14.127292,19.289649,91.969033,654.889104,0.09636,0.104341,0.088799,0.048919,0.181162,0.062798,...,25.677223,107.261213,880.583128,0.132369,0.254265,0.272188,0.114606,0.290076,0.083946,0.697715
std,3.524049,4.301036,24.298981,351.914129,0.014064,0.052813,0.07972,0.038803,0.027414,0.00706,...,6.146258,33.602542,569.356993,0.022832,0.157336,0.208624,0.065732,0.061867,0.018061,0.459652
min,6.981,9.71,43.79,143.5,0.05263,0.01938,0.0,0.0,0.106,0.04996,...,12.02,50.41,185.2,0.07117,0.02729,0.0,0.0,0.1565,0.05504,0.0
25%,11.7,16.17,75.17,420.3,0.08637,0.06492,0.02956,0.02031,0.1619,0.0577,...,21.08,84.11,515.3,0.1166,0.1472,0.1145,0.06493,0.2504,0.07146,0.0
50%,13.37,18.84,86.24,551.1,0.09587,0.09263,0.06154,0.0335,0.1792,0.06154,...,25.41,97.66,686.5,0.1313,0.2119,0.2267,0.09993,0.2822,0.08004,1.0
75%,15.78,21.8,104.1,782.7,0.1053,0.1304,0.1307,0.074,0.1957,0.06612,...,29.72,125.4,1084.0,0.146,0.3391,0.3829,0.1614,0.3179,0.09208,1.0
max,28.11,39.28,188.5,2501.0,0.1634,0.3454,0.4268,0.2012,0.304,0.09744,...,49.54,251.2,4254.0,0.2226,1.058,1.252,0.291,0.6638,0.2075,1.0


The target variable is `malignant`, with 0 = benign and 1 = malignant.

Each row is a specific mass

Each feature is a characteristic of that mass

For example,
`mean_radius` = the mean radius of the mass, i.e. the average distance from each point on the mass to its center.

In [10]:
# Largest value in each column; a quick way to see how much the column scales differ.
df.max()

mean radius                  28.11000
mean texture                 39.28000
mean perimeter              188.50000
mean area                  2501.00000
mean smoothness               0.16340
mean compactness              0.34540
mean concavity                0.42680
mean concave points           0.20120
mean symmetry                 0.30400
mean fractal dimension        0.09744
radius error                  2.87300
texture error                 4.88500
perimeter error              21.98000
area error                  542.20000
smoothness error              0.03113
compactness error             0.13540
concavity error               0.39600
concave points error          0.05279
symmetry error                0.07895
fractal dimension error       0.02984
worst radius                 36.04000
worst texture                49.54000
worst perimeter             251.20000
worst area                 4254.00000
worst smoothness              0.22260
worst compactness             1.05800
worst concav

## Data Prep

Get rid of spaces from column headings

In [13]:
# Swap every space in the column labels for an underscore.
# `regex=False` forces a literal match, so the result does not depend on the
# pandas version's default for `regex` (which differs between major versions).
df.columns = df.columns.str.replace(" ", "_", regex=False)

## EDA

## Modeling 

- train test split


In [29]:
# Separate the label from the predictors: `y` is the `malignant` column,
# `X` is every other column of the frame.
y = df['malignant']
X = df.drop(columns='malignant')

# Preview the first rows of the feature matrix.
X.head()

Unnamed: 0,mean radius,mean texture,mean perimeter,mean area,mean smoothness,mean compactness,mean concavity,mean concave points,mean symmetry,mean fractal dimension,...,worst radius,worst texture,worst perimeter,worst area,worst smoothness,worst compactness,worst concavity,worst concave points,worst symmetry,worst fractal dimension
0,17.99,10.38,122.8,1001.0,0.1184,0.2776,0.3001,0.1471,0.2419,0.07871,...,25.38,17.33,184.6,2019.0,0.1622,0.6656,0.7119,0.2654,0.4601,0.1189
1,20.57,17.77,132.9,1326.0,0.08474,0.07864,0.0869,0.07017,0.1812,0.05667,...,24.99,23.41,158.8,1956.0,0.1238,0.1866,0.2416,0.186,0.275,0.08902
2,19.69,21.25,130.0,1203.0,0.1096,0.1599,0.1974,0.1279,0.2069,0.05999,...,23.57,25.53,152.5,1709.0,0.1444,0.4245,0.4504,0.243,0.3613,0.08758
3,11.42,20.38,77.58,386.1,0.1425,0.2839,0.2414,0.1052,0.2597,0.09744,...,14.91,26.5,98.87,567.7,0.2098,0.8663,0.6869,0.2575,0.6638,0.173
4,20.29,14.34,135.1,1297.0,0.1003,0.1328,0.198,0.1043,0.1809,0.05883,...,22.54,16.67,152.2,1575.0,0.1374,0.205,0.4,0.1625,0.2364,0.07678


The cells below fit an initial model using all of the features and score it on recall, because in the context of cancer we are more worried about false negatives than about false positives.

In [31]:

# Hold out 30% of the rows as a test set; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=1
)

In [36]:
# Baseline logistic regression: only the seed and the iteration cap are set,
# everything else is left at the estimator's defaults.
# NOTE(review): the large max_iter is presumably needed for convergence on the
# unscaled features - confirm.
log_reg_vanilla = LogisticRegression(max_iter=10**4, random_state=1).fit(X_train, y_train)

# Predictions on the rows the model was fitted on and on the held-out rows.
y_hat_test = log_reg_vanilla.predict(X_test)
y_hat_train = log_reg_vanilla.predict(X_train)

# Recall on each split; comparing the two shows whether the model over- or under-fits.
recall_train = recall_score(y_train, y_hat_train)
recall_test = recall_score(y_test, y_hat_test)

recall_test, recall_train

In [37]:
# Recall on the training rows and on the held-out rows, computed from the
# predictions `y_hat_train` / `y_hat_test` produced by the fitted baseline model.
recall_train = recall_score(y_true=y_train, y_pred=y_hat_train)
recall_test = recall_score(y_true=y_test, y_pred=y_hat_test)

In [38]:
# Display the two scores as a (test recall, train recall) tuple.
recall_test, recall_train

(0.925, 0.924187725631769)

We will use cross-validation to ensure that the above scores are generalizable.

In [49]:
# Fresh, unfitted estimator with the same settings as the baseline model.
logReg = LogisticRegression(max_iter=10**4, random_state=1)

# 10-fold cross-validated recall, computed on the training split only.
cross_val_vanilla = cross_val_score(
    estimator=logReg,
    X=X_train,
    y=y_train,
    scoring='recall',
    cv=10,
)

In [50]:
# (mean, standard deviation) of the per-fold recall scores.
tuple(stat(cross_val_vanilla) for stat in (np.mean, np.std))

(0.9206349206349206, 0.057560204331646825)

This is good, but we still do not have training metrics to diagnose over- and underfitting, so we will use
`cross_validate`.


In [59]:
# Same 10-fold recall evaluation as before, but also keeping the score on each
# fold's training portion so train and validation recall can be compared.
cv_dict = cross_validate(
    estimator=logReg,
    X=X_train,
    y=y_train,
    cv=10,
    scoring='recall',
    return_train_score=True,
)

In [61]:
# (mean train recall, mean validation recall) across the folds.
tuple(np.mean(cv_dict[key]) for key in ('train_score', 'test_score'))

(0.9205783132530121, 0.9206349206349206)