In [22]:
# data manipulation
import numpy as np
import pandas as pd
# data transfer
import joblib
# raw dataset
from sklearn.datasets import load_iris
# modelling: classifier, train/test split, feature scaling
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

In [23]:
# Load the Iris dataset object that the rest of the notebook reads from.
iris = load_iris()

# List the fields the loaded object exposes (shown below as dict keys).
iris.keys()

dict_keys(['data', 'target', 'frame', 'target_names', 'DESCR', 'feature_names', 'filename', 'data_module'])

In [43]:
# Show the dataset's description text as a list with one entry per line.
iris.DESCR.split("\n")

['.. _iris_dataset:',
 '',
 'Iris plants dataset',
 '--------------------',
 '',
 '**Data Set Characteristics:**',
 '',
 '    :Number of Instances: 150 (50 in each of three classes)',
 '    :Number of Attributes: 4 numeric, predictive attributes and the class',
 '    :Attribute Information:',
 '        - sepal length in cm',
 '        - sepal width in cm',
 '        - petal length in cm',
 '        - petal width in cm',
 '        - class:',
 '                - Iris-Setosa',
 '                - Iris-Versicolour',
 '                - Iris-Virginica',
 '                ',
 '    :Summary Statistics:',
 '',
 '                    Min  Max   Mean    SD   Class Correlation',
 '    sepal length:   4.3  7.9   5.84   0.83    0.7826',
 '    sepal width:    2.0  4.4   3.05   0.43   -0.4194',
 '    petal length:   1.0  6.9   3.76   1.76    0.9490  (high!)',
 '    petal width:    0.1  2.5   1.20   0.76    0.9565  (high!)',
 '',
 '    :Missing Attribute Values: None',
 '    :Class Distribution: 33.3% 

In [88]:
# Wrap the raw measurement matrix in a DataFrame, labelling each column with
# the corresponding entry of iris.feature_names.
data = pd.DataFrame(data=iris.data, columns=iris.feature_names)
data.head(n=5)

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm)
0,5.1,3.5,1.4,0.2
1,4.9,3.0,1.4,0.2
2,4.7,3.2,1.3,0.2
3,4.6,3.1,1.5,0.2
4,5.0,3.6,1.4,0.2


In [89]:
# (rows, columns) of the feature table.
data.shape

(150, 4)

In [90]:
# Column labels as they come from iris.feature_names, before renaming.
data.columns

Index(['sepal length (cm)', 'sepal width (cm)', 'petal length (cm)',
       'petal width (cm)'],
      dtype='object')

In [91]:
# Swap the "<measurement> (cm)" headers for short snake_case names that are
# easier to reference in later cells.
data = data.rename(
    mapper={
        'sepal length (cm)': 'sepal_length',
        'sepal width (cm)': 'sepal_width',
        'petal length (cm)': 'petal_length',
        'petal width (cm)': 'petal_width',
    },
    axis='columns',
)
data.head()

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width
0,5.1,3.5,1.4,0.2
1,4.9,3.0,1.4,0.2
2,4.7,3.2,1.3,0.2
3,4.6,3.1,1.5,0.2
4,5.0,3.6,1.4,0.2


In [92]:
# Integer class label for every row of the dataset.
iris.target

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2])

In [93]:
# Append the integer class labels as a 'Target' column next to the features.
data = data.assign(Target=iris.target)
data.head()

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,Target
0,5.1,3.5,1.4,0.2,0
1,4.9,3.0,1.4,0.2,0
2,4.7,3.2,1.3,0.2,0
3,4.6,3.1,1.5,0.2,0
4,5.0,3.6,1.4,0.2,0


In [94]:
# Number of rows per class label.
data['Target'].value_counts()

0    50
1    50
2    50
Name: Target, dtype: int64

In [95]:
# Per-column flag: True if the column holds at least one missing value.
data.isna().any()

sepal_length    False
sepal_width     False
petal_length    False
petal_width     False
Target          False
dtype: bool

In [96]:
# Separate the label column (y) from the measurement columns (X).
y = data['Target']
# `columns=` already names the axis being dropped from, so no additional
# `axis` argument is passed.
X = data.drop(columns=['Target'])

In [97]:
# Hold out 25% of the rows for evaluation; the fixed random_state keeps the
# partition reproducible between runs.
# NOTE(review): no `stratify` argument is passed, so class proportions may
# differ between the two partitions -- consider stratify=y if that matters.
train_X, test_X, train_y, test_y = train_test_split(
    X, y, test_size=0.25, random_state=7
)
# Every row must land in exactly one of the two partitions.
assert len(train_X) + len(test_X) == len(X)
print(train_X.shape, test_X.shape, train_y.shape, test_y.shape)

(112, 4) (38, 4) (112,) (38,)


In [98]:
# Put the training labels back beside the training features so the split can
# be stored as a single table.
training_data = pd.concat(objs=[train_X, train_y], axis='columns')
print(training_data.shape)
training_data.head()

(112, 5)


Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,Target
17,5.1,3.5,1.4,0.3,0
102,7.1,3.0,5.9,2.1,2
124,6.7,3.3,5.7,2.1,2
76,6.8,2.8,4.8,1.4,1
132,6.4,2.8,5.6,2.2,2


In [99]:
# Put the test labels back beside the test features so the split can be
# stored as a single table.
testing_data = pd.concat(objs=[test_X, test_y], axis='columns')
print(testing_data.shape)
testing_data.head()

(38, 5)


Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,Target
149,5.9,3.0,5.1,1.8,2
84,5.4,3.0,4.5,1.5,1
40,5.0,3.5,1.3,0.3,0
66,5.6,3.0,4.5,1.5,1
106,4.9,2.5,4.5,1.7,2


In [100]:
# Write both splits to CSV files in the working directory; the row index is
# left out of the files.
training_data.to_csv(path_or_buf='trainingdata.csv', index=False)
testing_data.to_csv(path_or_buf='testingdata.csv', index=False)

In [101]:
# Standardise the features. The scaler is fitted on the training split only
# and then applied unchanged to the test split, so the held-out rows do not
# influence the transformation.
scaler = StandardScaler()
train_X_scaled = scaler.fit_transform(train_X)
test_X_scaled = scaler.transform(test_X)

# Persist the fitted scaler. Kept as the cell's last expression so that the
# value joblib.dump returns (the list of written file names) is what the cell
# displays.
joblib.dump(scaler, 'standard_scaler.sav')

['standard_scaler.sav']

In [128]:
# Classifier created with no arguments, i.e. with the library's default
# hyperparameters.
model = LogisticRegression()

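Note on the multiclass strategy, quoted from the scikit-learn `LogisticRegression` documentation: "'auto' selects 'ovr' if the data is binary, or if solver='liblinear', and otherwise selects 'multinomial'."

Source: https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.LogisticRegression.html#:~:text=%E2%80%98auto%E2%80%99%20selects%20%E2%80%98ovr%E2%80%99%20if%20the%20data%20is%20binary%2C%20or%20if%20solver%3D%E2%80%99liblinear%E2%80%99%2C%20and%20otherwise%20selects%20%E2%80%98multinomial%E2%80%99.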

In [129]:
# Train the classifier on the standardised training features and their labels.
model.fit(X=train_X_scaled, y=train_y)

LogisticRegression()

In [130]:
# Predict a class label for every row of the standardised test features.
y_pred = model.predict(X=test_X_scaled)

In [131]:
from sklearn.metrics import accuracy_score, auc

In [132]:
# Compare the predictions against the held-out labels using accuracy_score.
accuracy = accuracy_score(y_true=test_y, y_pred=y_pred)
accuracy

0.8947368421052632

In [133]:
# Persist the trained classifier to disk with joblib.
joblib.dump(value=model, filename='model.sav')

['model.sav']