# Create ML pipeline(workflow) using python libraries

In [1]:
import pandas as pd
from sklearn.datasets import load_breast_cancer
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import train_test_split
from sklearn.decomposition import PCA
from sklearn.pipeline import Pipeline

In [2]:
dataset = load_breast_cancer()

In [3]:
df = pd.DataFrame(dataset.data, columns = dataset.feature_names)
df.head()

Unnamed: 0,mean radius,mean texture,mean perimeter,mean area,mean smoothness,mean compactness,mean concavity,mean concave points,mean symmetry,mean fractal dimension,...,worst radius,worst texture,worst perimeter,worst area,worst smoothness,worst compactness,worst concavity,worst concave points,worst symmetry,worst fractal dimension
0,17.99,10.38,122.8,1001.0,0.1184,0.2776,0.3001,0.1471,0.2419,0.07871,...,25.38,17.33,184.6,2019.0,0.1622,0.6656,0.7119,0.2654,0.4601,0.1189
1,20.57,17.77,132.9,1326.0,0.08474,0.07864,0.0869,0.07017,0.1812,0.05667,...,24.99,23.41,158.8,1956.0,0.1238,0.1866,0.2416,0.186,0.275,0.08902
2,19.69,21.25,130.0,1203.0,0.1096,0.1599,0.1974,0.1279,0.2069,0.05999,...,23.57,25.53,152.5,1709.0,0.1444,0.4245,0.4504,0.243,0.3613,0.08758
3,11.42,20.38,77.58,386.1,0.1425,0.2839,0.2414,0.1052,0.2597,0.09744,...,14.91,26.5,98.87,567.7,0.2098,0.8663,0.6869,0.2575,0.6638,0.173
4,20.29,14.34,135.1,1297.0,0.1003,0.1328,0.198,0.1043,0.1809,0.05883,...,22.54,16.67,152.2,1575.0,0.1374,0.205,0.4,0.1625,0.2364,0.07678


In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 569 entries, 0 to 568
Data columns (total 30 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   mean radius              569 non-null    float64
 1   mean texture             569 non-null    float64
 2   mean perimeter           569 non-null    float64
 3   mean area                569 non-null    float64
 4   mean smoothness          569 non-null    float64
 5   mean compactness         569 non-null    float64
 6   mean concavity           569 non-null    float64
 7   mean concave points      569 non-null    float64
 8   mean symmetry            569 non-null    float64
 9   mean fractal dimension   569 non-null    float64
 10  radius error             569 non-null    float64
 11  texture error            569 non-null    float64
 12  perimeter error          569 non-null    float64
 13  area error               569 non-null    float64
 14  smoothness error         5

In [5]:
df.shape

(569, 30)

In [6]:
df['class'] = dataset.target

In [7]:
logit_ml = Pipeline([('scaling', StandardScaler()),
                     ('pca', PCA(n_components = 3)),
                     ('model_logit', LogisticRegression())])

In [8]:
dtree_ml = Pipeline([('scaling', StandardScaler()),
                     ('pca', PCA(n_components = 3)),
                     ('model_logit', DecisionTreeClassifier())])

In [9]:
nb_ml = Pipeline([('scaling', StandardScaler()),
                     ('pca', PCA(n_components = 3)),
                     ('model_logit', GaussianNB())])

In [10]:
from sklearn.ensemble import RandomForestClassifier
Random_ml = Pipeline([('scaling',StandardScaler()),
                      ('pca',PCA(n_components = 3 )),
                     ('model_logit',RandomForestClassifier())])

In [11]:
X = df.iloc[:,:-1]
X.shape

(569, 30)

In [12]:
X.head()

Unnamed: 0,mean radius,mean texture,mean perimeter,mean area,mean smoothness,mean compactness,mean concavity,mean concave points,mean symmetry,mean fractal dimension,...,worst radius,worst texture,worst perimeter,worst area,worst smoothness,worst compactness,worst concavity,worst concave points,worst symmetry,worst fractal dimension
0,17.99,10.38,122.8,1001.0,0.1184,0.2776,0.3001,0.1471,0.2419,0.07871,...,25.38,17.33,184.6,2019.0,0.1622,0.6656,0.7119,0.2654,0.4601,0.1189
1,20.57,17.77,132.9,1326.0,0.08474,0.07864,0.0869,0.07017,0.1812,0.05667,...,24.99,23.41,158.8,1956.0,0.1238,0.1866,0.2416,0.186,0.275,0.08902
2,19.69,21.25,130.0,1203.0,0.1096,0.1599,0.1974,0.1279,0.2069,0.05999,...,23.57,25.53,152.5,1709.0,0.1444,0.4245,0.4504,0.243,0.3613,0.08758
3,11.42,20.38,77.58,386.1,0.1425,0.2839,0.2414,0.1052,0.2597,0.09744,...,14.91,26.5,98.87,567.7,0.2098,0.8663,0.6869,0.2575,0.6638,0.173
4,20.29,14.34,135.1,1297.0,0.1003,0.1328,0.198,0.1043,0.1809,0.05883,...,22.54,16.67,152.2,1575.0,0.1374,0.205,0.4,0.1625,0.2364,0.07678


In [13]:
y = df.iloc[:,-1]
y.shape

(569,)

In [14]:
y.head()

0    0
1    0
2    0
3    0
4    0
Name: class, dtype: int32

In [31]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.25, random_state = 1) #random_state: save the testing set 

In [32]:
my_pipeline = [logit_ml, dtree_ml, nb_ml]

In [33]:
pipeline_dict = {0:'Logistic_Regression', 1:'Decision_Tree', 2:'Naive_Bayes'}

In [34]:
for i in my_pipeline:
    i.fit(X_train, y_train)

In [35]:
for i, model in enumerate(my_pipeline):
    print(f"{pipeline_dict[i]}'s training accuracy is: {model.score(X_train, y_train)}") #performs the model.predict step 

Logistic_Regression's training accuracy is: 0.9671361502347418
Decision_Tree's training accuracy is: 1.0
Naive_Bayes's training accuracy is: 0.9225352112676056


In [36]:
for i, model in enumerate(my_pipeline):
    print(f"{pipeline_dict[i]}'s testing accuracy is: {model.score(X_test, y_test)}") 

Logistic_Regression's testing accuracy is: 0.9370629370629371
Decision_Tree's testing accuracy is: 0.9440559440559441
Naive_Bayes's testing accuracy is: 0.9090909090909091


### Logistic Regression

In [37]:
logit_ml.fit(X_train, y_train)

In [38]:
y_pred = logit_ml.predict(X_test)

In [39]:
acc = accuracy_score(y_test, y_pred)
print("Accuracy:",acc)

cf = classification_report(y_test, y_pred)
print("\nClassification Report:")
print(cf)

Accuracy: 0.9370629370629371

Classification Report:
              precision    recall  f1-score   support

           0       0.93      0.91      0.92        55
           1       0.94      0.95      0.95        88

    accuracy                           0.94       143
   macro avg       0.93      0.93      0.93       143
weighted avg       0.94      0.94      0.94       143



### Decision Tree

In [40]:
dtree_ml.fit(X_train, y_train)

In [41]:
y_pred_dt = dtree_ml.predict(X_test)

In [42]:
acc = accuracy_score(y_test, y_pred_dt)
print("Accuracy:",acc)

cf = classification_report(y_test, y_pred_dt)
print("\nClassification Report:")
print(cf)

Accuracy: 0.9300699300699301

Classification Report:
              precision    recall  f1-score   support

           0       0.92      0.89      0.91        55
           1       0.93      0.95      0.94        88

    accuracy                           0.93       143
   macro avg       0.93      0.92      0.93       143
weighted avg       0.93      0.93      0.93       143



### Naive Bayes

In [43]:
nb_ml.fit(X_train, y_train)

In [44]:
y_pred_nb = nb_ml.predict(X_test)
acc = accuracy_score(y_test, y_pred_nb)
print("Accuracy:",acc)

cf = classification_report(y_test, y_pred_nb)
print("\nClassification Report:")
print(cf)

Accuracy: 0.9090909090909091

Classification Report:
              precision    recall  f1-score   support

           0       0.92      0.84      0.88        55
           1       0.90      0.95      0.93        88

    accuracy                           0.91       143
   macro avg       0.91      0.90      0.90       143
weighted avg       0.91      0.91      0.91       143



### Randform Forest

In [45]:
Random_ml.fit(X_train, y_train)

In [46]:
y_pred_rn = Random_ml.predict(X_test)
acc = accuracy_score(y_test, y_pred_rn)
print("Accuracy:",acc)

cf = classification_report(y_test, y_pred_rn)
print("\nClassification Report:")
print(cf)

Accuracy: 0.9230769230769231

Classification Report:
              precision    recall  f1-score   support

           0       0.91      0.89      0.90        55
           1       0.93      0.94      0.94        88

    accuracy                           0.92       143
   macro avg       0.92      0.92      0.92       143
weighted avg       0.92      0.92      0.92       143

