In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn import datasets
import pandas as pd
import numpy as np

# Load iris into a DataFrame. Each scikit-learn feature name loses its last
# five characters (the " (cm)" unit suffix) to give shorter column labels.
dataset = datasets.load_iris()
feature_names = [name[:-5] for name in dataset.feature_names]
frame = pd.DataFrame(data=dataset.data, columns=feature_names).assign(
  target=dataset.target)

# Split features/labels with a fixed seed; test_size is left at its default (0.25).
X_train, X_test, y_train, y_test = train_test_split(
  frame[feature_names],
  frame.target,
  random_state=42,
)

# Fit the classifier (fit() returns the estimator itself).
model = LogisticRegression(
  max_iter=200, multi_class='auto', solver='lbfgs'
).fit(X_train, y_train)

# Mean accuracy on each split.
print('Train accuracy: ', model.score(X_train, y_train))
print('Test accuracy: ', model.score(X_test, y_test))

Train accuracy:  0.9642857142857143
Test accuracy:  1.0


In [None]:
Train accuracy:  0.9642857142857143
Test accuracy:  1.0

In [None]:
# Show (rows, columns) for the train split, then the test split, one per line.
print(X_train.shape, X_test.shape, sep='\n')

(112, 4)
(38, 4)


# Create Model Pipeline

In [None]:
class SimplePipeline:
  """Minimal iris workflow: load the data, split it, fit a logistic regression.

  `load_dataset` populates `frame`, `feature_names` and the four train/test
  splits; `train` populates `model`. All state lives on the instance, so the
  class does not depend on any notebook-level variables.
  """

  def __init__(self):
    self.frame = None
    self.feature_names = None
    self.X_train, self.X_test, self.y_train, self.y_test = None, None, None, None
    self.model = None
    # Load eagerly so the splits exist before run_pipeline() is called
    # (subclasses read self.X_train right after super().__init__()).
    self.load_dataset()

  def load_dataset(self):
    """Load iris into `self.frame` and (re)create the train/test splits."""
    dataset = datasets.load_iris()
    # Drop the last five characters of each name (the " (cm)" unit suffix).
    self.feature_names = [fn[:-5] for fn in dataset.feature_names]
    self.frame = pd.DataFrame(dataset.data, columns=self.feature_names)
    self.frame['target'] = dataset.target
    # Split this instance's own frame; fixed seed, default test_size (0.25).
    self.X_train, self.X_test, self.y_train, self.y_test = train_test_split(
        self.frame[self.feature_names], self.frame.target, random_state=42)

  def run_pipeline(self):
    """Reload the data and fit the model."""
    self.load_dataset()
    self.train()

  def train(self):
    """Fit a LogisticRegression on the training split and store it in `self.model`."""
    self.model = LogisticRegression(solver='lbfgs', multi_class='auto', max_iter=200)
    self.model.fit(self.X_train, self.y_train)

  def get_accuracy(self):
    """Return the fitted model's score on the held-out test split."""
    return self.model.score(X=self.X_test, y=self.y_test)

  def predict(self, input_data):
    """Return the fitted model's predictions for `input_data`."""
    return self.model.predict(input_data)


In [None]:
pipeline = SimplePipeline()
pipeline.run_pipeline()
pipeline.get_accuracy()

1.0

# Unit Testing ML Model
**Unit testing -- inputs**

In [None]:
pipeline.frame.describe()

Unnamed: 0,sepal length,sepal width,petal length,petal width,target
count,150.0,150.0,150.0,150.0,150.0
mean,5.843333,3.057333,3.758,1.199333,1.0
std,0.828066,0.435866,1.765298,0.762238,0.819232
min,4.3,2.0,1.0,0.1,0.0
25%,5.1,2.8,1.6,0.3,0.0
50%,5.8,3.0,4.35,1.3,1.0
75%,6.4,3.3,5.1,1.8,2.0
max,7.9,4.4,6.9,2.5,2.0


In [None]:
# Expected value range and dtype for each iris feature column.
# Every feature shares the same layout: {'range': {'min', 'max'}, 'dtype'}.
iris_schema = {
    feature: {
        'range': {'min': lower, 'max': upper},
        'dtype': float,
    }
    for feature, lower, upper in (
        ('sepal length', 4.0, 8.0),
        ('sepal width', 2.0, 5.0),
        ('petal length', 1.0, 7.0),
        ('petal width', 0.1, 3.0),
    )
}

In [None]:
import unittest
import sys

class TestIrisInputData(unittest.TestCase):
    def setUp(self):
      self.pipeline = SimplePipeline()
      self.pipeline.run_pipeline()

    def test_input_data_range(self):
      max_values = self.pipeline.frame.max()
      min_values = self.pipeline.frame.min()
      for feature in self.pipeline.feature_names:
        self.assertTrue(max_values[feature] <= iris_schema[feature]['range']['max'],
                        feature + "[Max] >= " + str(iris_schema[feature]['range']['max']))
        self.assertTrue(min_values[feature] >= iris_schema[feature]['range']['min'],
                        feature + "[Min] <= " + str(iris_schema[feature]['range']['min']))

    def test_input_data_types(self):
      data_types = self.pipeline.frame.dtypes

      for feature in self.pipeline.feature_names:
        self.assertEqual(data_types[feature], iris_schema[feature]['dtype'])



In [None]:
# Collect the input-data test case into a suite and run it, reporting on stderr.
# The final expression is the TestResult, which the notebook displays.
suite = unittest.TestLoader().loadTestsFromTestCase(
    TestIrisInputData
)
unittest.TextTestRunner(
    stream=sys.stderr,
    verbosity=1,
).run(suite)

..
----------------------------------------------------------------------
Ran 2 tests in 0.097s

OK


<unittest.runner.TextTestResult run=2 errors=0 failures=0>



**Unit test -- Model Config**

In [None]:
class PipelineWithConfig(SimplePipeline):
  """SimplePipeline whose LogisticRegression hyperparameters come from a config dict.

  Recognised keys are 'solver', 'multi_class' and 'max_iter'. A missing key
  falls back to the value SimplePipeline.train uses, rather than passing
  None through to the estimator.
  """

  def __init__(self, config):
    super().__init__()
    # Tolerate config=None so that train() can still apply the defaults.
    self.config = config if config is not None else {}

  def train(self):
    """Fit a LogisticRegression configured from `self.config` on the training split."""
    self.model = LogisticRegression(solver=self.config.get('solver', 'lbfgs'),
                                    multi_class=self.config.get('multi_class', 'auto'),
                                    max_iter=self.config.get('max_iter', 200))
    self.model.fit(self.X_train, self.y_train)

In [None]:
# Train the config-driven pipeline with the same settings SimplePipeline hard-codes.
config = dict(solver='lbfgs', multi_class='auto', max_iter=200)
pipeline2 = PipelineWithConfig(config)
pipeline2.run_pipeline()
print(
    'Test Accuracy:',
    pipeline2.get_accuracy(),
)

Test Accuracy: 1.0


In [None]:
ENABLED_MODEL_SOLVERS = {'lbfgs', 'newton-cg'}

class TestIrisConfig(unittest.TestCase):
  def setUp(self):
    config = test_config #{'solver': 'newton-cg', 'multi_class': 'auto','max_iter': 200}
    self.pipeline = PipelineWithConfig(config)
    self.pipeline.run_pipeline()

  def test_pipeline_config(self):
    model_params = self.pipeline.model.get_params()
    self.assertTrue(model_params['solver'] in ENABLED_MODEL_SOLVERS, model_params['solver'] + " incorrect parameter" )

In [None]:
# 'saga' is not in ENABLED_MODEL_SOLVERS, so this run is expected to report a failure.
test_config = dict(solver='saga', multi_class='auto', max_iter=200)
suite = unittest.TestLoader().loadTestsFromTestCase(
    TestIrisConfig
)
unittest.TextTestRunner(
    stream=sys.stderr,
    verbosity=1,
).run(suite)

F
FAIL: test_pipeline_config (__main__.TestIrisConfig)
----------------------------------------------------------------------
Traceback (most recent call last):
  File "<ipython-input-34-6863462ff384>", line 11, in test_pipeline_config
    self.assertTrue(model_params['solver'] in ENABLED_MODEL_SOLVERS, model_params['solver'] + " incorrect parameter" )
AssertionError: False is not true : saga incorrect parameter

----------------------------------------------------------------------
Ran 1 test in 0.031s

FAILED (failures=1)


<unittest.runner.TextTestResult run=1 errors=0 failures=1>

In [None]:
from sklearn.preprocessing import StandardScaler

class PipelineWithDataEngineering(SimplePipeline):
    """SimplePipeline variant that standardizes the features before training."""

    def __init__(self):
        super().__init__()
        # The parent constructor has already created the splits; fit() returns
        # the scaler itself, so it can be built and fitted in one expression.
        self.scaler = StandardScaler().fit(self.X_train)

    def run_pipeline(self):
        """Reload the data, scale both splits, then train on the scaled features."""
        for stage in (self.load_dataset, self.apply_scaler, self.train):
            stage()

    def apply_scaler(self):
        """Replace X_train and X_test with their scaled versions."""
        scaled_train = self.scaler.transform(self.X_train)
        scaled_test = self.scaler.transform(self.X_test)
        self.X_train, self.X_test = scaled_train, scaled_test

    def predict(self, input_data):
        """Scale `input_data` with the fitted scaler, then predict with the model."""
        return self.model.predict(self.scaler.transform(input_data))

In [None]:
import unittest
from sklearn.metrics import mean_squared_error, accuracy_score

class TestIrisPredictions(unittest.TestCase):
    def setUp(self):
        self.pipeline_v1 = SimplePipeline()
        self.pipeline_v2 = PipelineWithDataEngineering()
        self.pipeline_v1.run_pipeline()
        self.pipeline_v2.run_pipeline()
        self.benchmark_predictions = [1.0] * len(self.pipeline_v1.y_test)

    def test_accuracy_higher_than_benchmark(self):
        benchmark_accuracy = accuracy_score(
            y_true=self.pipeline_v1.y_test,
            y_pred=self.benchmark_predictions)

        predictions = self.pipeline_v1.predict(self.pipeline_v1.X_test)

        actual_accuracy = accuracy_score(
            y_true=self.pipeline_v2.y_test,
            y_pred=predictions)

        print(f'model accuracy: {actual_accuracy}, benchmark accuracy: {benchmark_accuracy}')
        self.assertTrue(actual_accuracy > benchmark_accuracy)

    def test_accuracy_compared_to_previous_version(self):
        v1_accuracy = self.pipeline_v1.get_accuracy()
        v2_accuracy = self.pipeline_v2.get_accuracy()
        print(f'pipeline v1 accuracy: {v1_accuracy}')
        print(f'pipeline v2 (with normalization) accuracy: {v2_accuracy}')

        self.assertTrue(v2_accuracy >= v1_accuracy)

In [None]:
# Collect the prediction-quality test case into a suite and run it, reporting on stderr.
# The final expression is the TestResult, which the notebook displays.
suite = unittest.TestLoader().loadTestsFromTestCase(
    TestIrisPredictions
)
unittest.TextTestRunner(
    stream=sys.stderr,
    verbosity=1,
).run(suite)

..

pipeline v1 accuracy: 1.0
pipeline v2 (with normalization) accuracy: 1.0
model accuracy: 1.0, benchmark accuracy: 0.2894736842105263



----------------------------------------------------------------------
Ran 2 tests in 0.134s

OK


<unittest.runner.TextTestResult run=2 errors=0 failures=0>