In [None]:
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.svm import LinearSVC

# `Imputer` was removed from sklearn.preprocessing in scikit-learn 0.22. Fall back to its
# successor so this cell still runs on a fresh kernel with a current install.
# NOTE: SimpleImputer has no `axis` argument, so the alias is not a drop-in replacement.
try:
    from sklearn.preprocessing import Imputer
except ImportError:
    from sklearn.impute import SimpleImputer as Imputer

%matplotlib inline
import matplotlib.pyplot as plt

import pandas as pd

# Fetch Data

In [None]:
# Load the iris dataset bundled with scikit-learn; later cells read its
# 'data', 'target' and 'feature_names' entries.
iris = load_iris()

In [None]:
# Lookup table for the discrete target values: integer class code -> species name.
flower_designation_reference = dict(enumerate(('setosa', 'versicolor', 'virginica')))

In [None]:
# Features: pair each feature name with its column of the data matrix
# (row i of the transpose is column i of iris['data']).
X = pd.DataFrame(dict(zip(iris['feature_names'], iris['data'].T)))

# Labels: single-column frame holding the integer class codes.
y = pd.DataFrame(
    {'flower designation': iris['target']}
)

# Missing Values

In [None]:
# Put features and labels side by side (joined on the row index) so the
# null check in the next cell covers both at once.
Xy = X.join(y)

In [None]:
# Keep only the rows that contain a null in at least one column.
missing = Xy.loc[Xy.isna().any(axis="columns")]
missing  # No missing -> proceed to training split

# Random Sampling

In [None]:
# Hold out 20% of the rows as a test set. The fixed random_state acts as the seed,
# so the same split is produced on every run.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [None]:
# Re-attach the labels to their features (joined on the row index) for exploration.
training_set = X_train.join(y_train)
test_set = X_test.join(y_test)

# Descriptive Statistics

In [None]:
# Summary statistics for every training-set column, including the
# integer-coded 'flower designation' label.
training_set.describe()

# Visualization

In [None]:
# One histogram per column of the training set.
training_set.hist(figsize=(10, 8))
plt.show()  # render the figure instead of echoing the returned Axes objects

In [None]:
# Petal width vs. sepal width for the training rows, coloured by integer class code.
training_set.plot.scatter(
    x="petal width (cm)",
    y="sepal width (cm)",
    c="flower designation",
    cmap=plt.get_cmap("jet"),
    colorbar=True,
    s=100,
    alpha=1,
    label="Type",
    figsize=(40, 30),
)
plt.legend()

# Normalization / Sample Classification

In [None]:
# After a brief literature review, most sources applied standardization/transformations at
# the classification/regression step, so the scaler lives inside the pipeline.
# We include a classifier only to show the normalization step.
# Not sure if this serves a purpose beyond not making numerous copies of amended data frames.
# TODO: StandardScaler and zscore may not be identical; may create a new class inheriting
# from Pipeline that has a zscore method.

Iris_classifier = Pipeline([
        ("scalar", StandardScaler()),  # Normalize all input features ≈ replace w/ zscore
        # Sample classification method. random_state pins the solver's internal random
        # number generation so re-running the notebook yields the same fitted model,
        # matching the seeded train/test split.
        ("linear_svc", LinearSVC(C=1, loss="hinge", random_state=42)),
    ])

In [None]:
# Fit the scaler and classifier on the training split. y_train is a single-column
# DataFrame, so flatten it to the 1-D label array scikit-learn expects; passing the
# column vector directly triggers a DataConversionWarning.
Iris_classifier.fit(X_train, y_train.values.ravel())

In [None]:
# Predict class codes for the held-out features. The new frame gets a fresh 0..n-1
# index rather than the row labels of X_test.
predictions = pd.DataFrame({'predicted value': Iris_classifier.predict(X_test)})

In [None]:
# Drop y_test's row labels so the true labels share the predictions' 0..n-1 index,
# then place predicted and true labels side by side.
y_test_realigned = y_test.reset_index(drop=True)
results = predictions.join(y_test_realigned)
results