## This notebook compares several classification models for classifying Iris flowers based on their sepal and petal dimensions.

### The data set is in iris_flowers.csv.  The columns in the data set are:
    sepal_length: Sepal length, in centimeters, used as input.
    sepal_width: Sepal width, in centimeters, used as input.
    petal_length: Petal length, in centimeters, used as input.
    petal_width: Petal width, in centimeters, used as input.
    class: Iris Setosa, Versicolor or Virginica, used as target.

In [1]:
# data preprocessing
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn import preprocessing
# estimators
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC

# performance metrics
from sklearn.metrics import f1_score
from sklearn.metrics import classification_report


In [2]:
def get_class_labels(row):
    '''Function to translate a row's iris class name into its integer label
    Args:
        row (mapping or pd.Series): record exposing the class name under the 'class' key
    Returns:
        (int) 0 for 'iris_setosa', 1 for 'iris_versicolor', 2 for 'iris_virginica'
    Raises:
        KeyError: if 'class' is missing from row or its value is not one of the three names
    '''
    # position in this tuple is the integer label assigned to the species
    species = ('iris_setosa', 'iris_versicolor', 'iris_virginica')
    label_by_species = {name: idx for idx, name in enumerate(species)}
    return label_by_species[row['class']]

In [3]:
def load_data(path='iris_flowers.csv'):
    '''Function to load in data set and attach an integer class label to every row
    Args:
        path (str or path-like): CSV file to read, defaults to 'iris_flowers.csv'
            (resolved relative to the current working directory)
    Returns:
        iris_df (df) df holding every column read from the CSV plus 'class_label',
            the integer code get_class_labels derives from each row's 'class' value
    Raises:
        KeyError: propagated from get_class_labels when a row has no 'class'
            entry or its value is not a known class name
    '''
    iris_df = pd.read_csv(path)
    # row-wise apply keeps the name -> integer mapping defined in one place (get_class_labels)
    iris_df['class_label'] = iris_df.apply(get_class_labels, axis=1)

    return iris_df

In [4]:
def get_train_test(iris_df, test_size=0.33, random_state=42):
    '''Function to return train and test data, features are scaled to 0 mean and unit variance
    Also prints how many samples of each class ended up in the train and test sets.
    Args:
        iris_df (df): df with iris data, must contain the four feature columns
            selected below and the integer 'class_label' column
        test_size (float or int): forwarded to train_test_split, defaults to 0.33
        random_state (int or None): seed forwarded to train_test_split so the
            split is repeatable, defaults to 42
    Returns:
        X_train (2D np.array): scaled train features
        X_test (2D np.array): scaled test features
        Y_train (1D np.array): train labels
        Y_test (1D np.array): test labels
    '''
    # NOTE(review): ' petal_width' carries a leading space; presumably it matches the
    # CSV header exactly as pandas reads it -- confirm before normalising the name
    feature_cols = ['sepal_length', 'sepal_width', 'petal_length', ' petal_width']

    # get features and labels
    Y = iris_df['class_label'].values
    X = iris_df[feature_cols].values
    # perform train test split
    X_train, X_test, Y_train, Y_test = train_test_split(
        X, Y, test_size=test_size, random_state=random_state)
    # fit the scaler on the train split only, then apply the same train-derived
    # mean/variance to both splits so no test statistics leak into the transform
    scaler = preprocessing.StandardScaler().fit(X_train)
    X_train = scaler.transform(X_train)
    X_test = scaler.transform(X_test)

    class_dict = {0:'iris_setosa', 1:'iris_versicolor', 2:'iris_virginica'}

    # print train and test set distributions
    for label, name in class_dict.items():
        print(f'{name}: train {(Y_train==label).sum()} test {(Y_test==label).sum()}')

    return X_train, X_test, Y_train, Y_test

#X_train, X_test, Y_train, Y_test = get_train_test(iris_df)

In [5]:
def fit_predict(estimator, X_train, X_test, Y_train):
    '''Function to train an estimator and predict labels for the test features
    The estimator object passed in is fitted directly (no copy is made), so it
    is in its fitted state for the caller once this function returns.
    Args:
        estimator (sklearn estimator): object exposing fit(X, y) and predict(X)
        X_train (2D np.array): train features
        X_test (2D np.array): test features
        Y_train (np.array): train labels
    Returns:
        predictions: whatever estimator.predict(X_test) returns
    '''
    # train on the training split
    estimator.fit(X_train, Y_train)
    # score the held-out features with the freshly fitted estimator
    return estimator.predict(X_test)


In [16]:
def main():
    '''Function to run the full comparison of the three classifiers
    Loads the iris data, builds the scaled train/test split, then fits each
    model on the train set and prints its classification report on the test set.
    Args:
        None
    Returns:
        None
    '''
    # load data
    iris_df = load_data()
    # perform train test split and feature scaling
    X_train, X_test, Y_train, Y_test = get_train_test(iris_df)
    # the random forest is stochastic; a fixed seed makes its report repeatable
    models_dict = {'RandomForestClassifier': RandomForestClassifier(random_state=42),
                   'LogisticRegression': LogisticRegression(),
                   'svm': SVC()}

    # every model, SVC included, is fitted and reported exactly once
    for name, estimator in models_dict.items():
        print(f'Evaluating {name} model.')
        # fit and predict
        Y_pred = fit_predict(estimator, X_train, X_test, Y_train)
        # evaluate performance
        print(classification_report(Y_test, Y_pred))
    

In [17]:
main()

iris_setosa: train 31 test 19
iris_versicolor: train 35 test 15
iris_virginica: train 34 test 16
              precision    recall  f1-score   support

           0       1.00      1.00      1.00        19
           1       0.94      1.00      0.97        15
           2       1.00      0.94      0.97        16

    accuracy                           0.98        50
   macro avg       0.98      0.98      0.98        50
weighted avg       0.98      0.98      0.98        50

