In [1]:
from sklearn.datasets import load_breast_cancer
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier

data=load_breast_cancer()
#pd.DataFrame(data['data'])
#pd.DataFrame(data['target'])

#NK: it seems like data['data'] is raw data, data['target'] is actual value (cancerous v. noncancerous)

In [2]:
# print(data.DESCR) #prints description of dataset 
help(train_test_split)

Help on function train_test_split in module sklearn.model_selection._split:

train_test_split(*arrays, test_size=None, train_size=None, random_state=None, shuffle=True, stratify=None)
    Split arrays or matrices into random train and test subsets
    
    Quick utility that wraps input validation and
    ``next(ShuffleSplit().split(X, y))`` and application to input data
    into a single call for splitting (and optionally subsampling) data in a
    oneliner.
    
    Read more in the :ref:`User Guide <cross_validation>`.
    
    Parameters
    ----------
    *arrays : sequence of indexables with same length / shape[0]
        Allowed inputs are lists, numpy arrays, scipy-sparse
        matrices or pandas dataframes.
    
    test_size : float or int, default=None
        If float, should be between 0.0 and 1.0 and represent the proportion
        of the dataset to include in the test split. If int, represents the
        absolute number of test samples. If None, the value is set to t

Supervised Learning with Decision Trees: Unstandardized Data

In [3]:
#NK: split the data into a train and test set
X_train, X_test, y_train, y_test = train_test_split(data.data, data.target, test_size=0.25, random_state=0)
#NK: instancing the estimator. "criterion" is a hyperparameter setting
model = DecisionTreeClassifier(criterion='entropy')

#NK: use the fit method with instance estimator for training. Send the training data and label data together as an
#argument to supervised learning algorithm. 
model.fit(X_train,y_train) #sup

#NK: the instance estimator that HAS COMPLETED training with fitting can be applied with the predict method. 
# "Predict" converts the estimated results of the model regarding the entered data. 
y_pred=model.predict(X_test) #is an estimated value, so the actual values for X_test may vary. Measure the accuracy by comparing the two values.
y_pred

array([0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0,
       1, 1, 0, 1, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 0, 1, 0, 1,
       1, 0, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 1, 0, 1,
       0, 0, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 0, 0, 1, 0, 1, 1, 1,
       0, 0, 1, 1, 1, 0, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 0, 0,
       1, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 0, 1, 1, 1, 0, 1,
       1, 0, 1, 1, 1, 0, 0, 1, 1, 1, 0])

In [4]:
#this is comparing the predicted test data with what the actual y values should be 
pred_2d = y_pred.reshape(len(y_pred),1)
df1=pd.DataFrame(pred_2d) 
y_test_2d=y_test.reshape(len(y_test),1)
df1=pd.DataFrame(pred_2d)
df2=pd.DataFrame(y_test_2d) 
df_concat=pd.concat([df1,df2],axis=1) 
df_concat.columns=['pred','real']

df_size=df_concat[df_concat['pred']!=df_concat['real']]
#df_size shows results where the predicted value and actual value differ

mf_size=df_concat[df_concat['pred']==df_concat['real']]
#mf_size shows results where the predicted value and actual value were the same. 

y_len = len(y_test) #test size 
u_size=len(df_size) #unmatched
m_size=len(mf_size) #matched 
print("size y_test:",y_len)
print("unmatch size:",u_size)
print("matched size:",m_size)
print("unmatch rate:",u_size/y_len)
print("match rate:",m_size/y_len)

size y_test: 143
unmatch size: 7
matched size: 136
unmatch rate: 0.04895104895104895
match rate: 0.951048951048951


Supervised Learning with Decision Trees: Standardized Data

In [5]:
from sklearn.datasets import load_breast_cancer
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.preprocessing import StandardScaler 

scaler=StandardScaler() #class for standardization. Standardization can increase the data accuracy. 

In [6]:
data=load_breast_cancer()
#pd.DataFrame(data['data'])
#pd.DataFrame(data['target'])
X_train, X_test, y_train, y_test = train_test_split(data.data, data.target, random_state=10)

In [7]:
#BEFORE PRE_PROCESSING. differences among column values are huge before standardization 
df_scale_previous=pd.DataFrame(data['data'])

#PRE_PROCESSING STARTS HERE
#.fit computes the mean and std to be used for later scaling
scaler.fit(X_train)

#.transform performs standardization by centering and scaling. 
X_train = scaler.transform(X_train)

df_scale_after=pd.DataFrame(X_train)

#after standardization, the column values do not significantly deviate from 0. 
#Better performance would be possible compared to before standardization. 

Supervised Learning with Decision Trees: Standardized Data (fit_transform)

In [8]:
from sklearn.datasets import load_breast_cancer
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.preprocessing import StandardScaler 

scaler=StandardScaler() #class for standardization. Standardization can increase the data accuracy. 

In [9]:
data=load_breast_cancer()
#pd.DataFrame(data['data'])
#pd.DataFrame(data['target'])
X_train, X_test, y_train, y_test = train_test_split(data.data, data.target, random_state=10)

In [10]:
# scale X_train & X_test
X_train = scaler.fit_transform(X_train)
X_test = scaler.fit_transform(X_test)
# removes the mean and scales to unit variance. Fit and Transform is combined as fit_transform. 


# We ignore the shape of the distribution and just transform the data to center it by removing the mean value of each feature. 
# We then scale it by dividing non-constant features by their standard deviation. 

# Scaled data using StandardScaler() has zero mean and unit variance. 




In [11]:
model = DecisionTreeClassifier(criterion='entropy')
model.fit(X_train,y_train)
y_pred=model.predict(X_test)
pred_2d = y_pred.reshape(len(y_pred),1)
df1=pd.DataFrame(pred_2d) 
y_test_2d=y_test.reshape(len(y_test),1)
df1=pd.DataFrame(pred_2d)
df2=pd.DataFrame(y_test_2d) 
df_concat=pd.concat([df1,df2],axis=1) 
df_concat.columns=['pred','real']
df_size=df_concat[df_concat['pred']!=df_concat['real']]
mf_size=df_concat[df_concat['pred']==df_concat['real']]
y_len = len(y_test)
u_size=len(df_size)
m_size=len(mf_size)
print("size y_test:",y_len)
print("unmatch size:",u_size)
print("matched size:",m_size)
print("unmatch rate:",u_size/y_len)
print("match rate:",m_size/y_len)

size y_test: 143
unmatch size: 14
matched size: 129
unmatch rate: 0.0979020979020979
match rate: 0.9020979020979021


In [None]:
from sklearn.datasets import load_iris
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.preprocessing import StandardScaler 
scaler=StandardScaler() 

data=load_iris()
#pd.DataFrame(data['data'])
#pd.DataFrame(data['target'])
X_train, X_test, y_train, y_test = train_test_split(data.data, data.target, random_state=10)
# scale X_train & X_test
X_train = scaler.fit_transform(X_train)
X_test = scaler.fit_transform(X_test)

model = DecisionTreeClassifier(criterion='entropy')
model.fit(X_train,y_train)
y_pred=model.predict(X_test)
pred_2d = y_pred.reshape(len(y_pred),1)
df1=pd.DataFrame(pred_2d) 
y_test_2d=y_test.reshape(len(y_test),1)
df1=pd.DataFrame(pred_2d)
df2=pd.DataFrame(y_test_2d) 
df_concat=pd.concat([df1,df2],axis=1) 
df_concat.columns=['pred','real']
df_size=df_concat[df_concat['pred']!=df_concat['real']]
mf_size=df_concat[df_concat['pred']==df_concat['real']]
y_len = len(y_test)
u_size=len(df_size)
m_size=len(mf_size)
print("size y_test:",y_len)
print("unmatch size:",u_size)
print("matched size:",m_size)
print("unmatch rate:",u_size/y_len)
print("match rate:",m_size/y_len)