In [46]:
import numpy as np
import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
from sklearn import datasets
from sklearn import model_selection
from sklearn.preprocessing import StandardScaler
from sklearn import preprocessing
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split, cross_validate
from sklearn.pipeline import make_pipeline

def load_data():
    #load the dataset
    #return the dataset
    return datasets.load_iris()


def dataset_to_pandas():
    #put the dataset into a pandas DF using the feature names as columnsç
    #rename the column name so the dont include the '(cm)'
    #add 2 columns one with the target and another with the class
    
    # First attempt was columns=new_columns, but then I remembered list comprehensions exist
    #     new_columns = [feature.strip(" (cm)") for feature in dataset.feature_names]

    # Initially the dataframe is just the sepal and petal data
    df = pd.DataFrame(dataset.data.tolist(), columns=[feature.strip(" (cm)") for feature in dataset.feature_names])
    # For some reason numpy ndarrays can't use .replace() so I put the data into a series.
    target_series = pd.Series(dataset.target)
    class_series = target_series.replace({0: "setosa", 1:"versicolor",2:"virginica"})
    # Finally add these new series to the dataframe
    df["target"] = target_series
    df["class"] = class_series
    return df

dataset = load_data()
df = dataset_to_pandas()

def target_to_numpy():
    target_array = np.array(df.target)
    return target_array
def data_to_numpy():
    data_array = np.array([ df["sepal length"],df["sepal width"] ] ).transpose()
    return data_array
# Create the data arrays
Y = target_to_numpy()
X = data_to_numpy()

"""
############################################## Pipeline below
"""



# Pipeline
# X is the data/parameters/info etc, the things we make predictions FROM
# Y is the result we are attempting to predict. type of flower, price, weather etc.


# Split up the data, leaving some to test on
X_train, X_test, Y_train, Y_test = train_test_split(X,Y,train_size=0.8)

# Data doesn't always need to be normalised!
# # Normalise the data to make it all the same range
# X_train = preprocessing.normalize(X_train)
# X_test = preprocessing.normalize(X_test)

#create and fit the scaler object on the training data
scaler = StandardScaler()
# Fit the model to the training data
scaler.fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)


# Make predictions using logistic regression, on the training data
clf = LogisticRegression(solver="lbfgs",multi_class="multinomial").fit(X_train, Y_train)
# Score using the test data

predicitions = clf.predict(X_test)
score = clf.score(X_test, Y_test)
print("Here are some predictions...")
print(predicitions)
print("These predictions are {}% accurate!".format(round(score*100,2)))


cv = cross_validate(clf, X_train, Y_train)

def print_scores(cv):
    #print out cross validation scores
    print("-----")
    print("CROSS VALIDATION")
    [print('Crossvalidation fold: {}  Accruacy: {}'.format(n, score)) for n, score in enumerate(cv['test_score'])]
    #print out the mean of the cross validation
    print('Mean train cross validation score {}'.format(cv['test_score'].mean()))

print_scores(cv)

Here are some predictions...
[0 0 1 2 0 2 2 2 2 1 0 1 2 1 1 2 1 1 1 0 0 0 1 0 0 2 2 1 0 2]
These predictions are 76.67% accurate!
-----
CROSS VALIDATION
Crossvalidation fold: 0  Accruacy: 0.8333333333333334
Crossvalidation fold: 1  Accruacy: 0.9166666666666666
Crossvalidation fold: 2  Accruacy: 0.75
Crossvalidation fold: 3  Accruacy: 0.75
Crossvalidation fold: 4  Accruacy: 0.7916666666666666
Mean train cross validation score 0.8083333333333333
