In [56]:
import pandas as pd
import numpy as np
import yaml

In [57]:
SEED: int = 666
TEST_SIZE: float = 0.2
PENALTY: str = 'l2'
C: float = 1.0
MULTI_CLASS: str = 'multinomial'
DATA: str = 'data.csv'

In [58]:
df = pd.read_csv(DATA, index_col=0)
df

Unnamed: 0,target,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm)
0,setosa,5.1,3.5,1.4,0.2
1,setosa,4.9,3.0,1.4,0.2
2,setosa,4.7,3.2,1.3,0.2
3,setosa,4.6,3.1,1.5,0.2
4,setosa,5.0,3.6,1.4,0.2
...,...,...,...,...,...
145,virginica,6.7,3.0,5.2,2.3
146,virginica,6.3,2.5,5.0,1.9
147,virginica,6.5,3.0,5.2,2.0
148,virginica,6.2,3.4,5.4,2.3


In [59]:
target = 'target'
features = list(df.columns)
features.remove(target)
X = df[features]
y = df[target]

In [60]:
from sklearn.preprocessing import LabelEncoder

le = LabelEncoder()
le.fit(y)
y = le.fit_transform(y)

le.classes_

array(['setosa', 'versicolor', 'virginica'], dtype=object)

In [61]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=TEST_SIZE, random_state=SEED)

In [62]:
from sklearn.preprocessing import StandardScaler

sc = StandardScaler()
sc.fit(X_train)
X_train_std = sc.transform(X_train)
X_test_std = sc.transform(X_test)

In [63]:
from sklearn.linear_model import LogisticRegression

lr = LogisticRegression(C=0.01, random_state=SEED, penalty=PENALTY, multi_class=MULTI_CLASS)
lr.fit(X_train_std, y_train)

In [64]:
preds = lr.predict(X_test_std)
preds

array([2, 2, 1, 2, 0, 1, 1, 2, 1, 1, 2, 0, 0, 0, 2, 1, 0, 2, 2, 2, 2, 0,
       2, 0, 2, 2, 0, 1, 2, 2])

In [65]:
y_test

array([1, 2, 1, 2, 0, 1, 1, 2, 1, 1, 1, 0, 0, 0, 2, 1, 0, 2, 2, 2, 1, 0,
       2, 0, 1, 1, 0, 1, 2, 2])

In [66]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix

accuracy = accuracy_score(y_test, preds)
precision = precision_score(y_test, preds, average='micro')
recall = recall_score(y_test, preds, average='micro')
f1 = f1_score(y_test, preds, average='micro')

In [73]:
metrics = {}

metrics['accuracy lr'] = float(accuracy)
metrics['presicion lr'] = float(precision)
metrics['recall lr'] = float(recall)
metrics['f1 lr'] = float(f1)

with open('./outputs/metrics_lr.yaml', 'w') as file:
    yaml.dump(metrics, file, default_flow_style=False)

In [68]:
metrics

{'accuracy': 0.8333333333333334,
 'presicion': 0.8333333333333334,
 'recall': 0.8333333333333334,
 'f1': 0.8333333333333334}

In [70]:
test_classes = pd.DataFrame()

test_classes['actual_class'] = le.inverse_transform(y_test)
test_classes['predicted_class'] = le.inverse_transform(preds)
test_classes

Unnamed: 0,actual_class,predicted_class
0,versicolor,virginica
1,virginica,virginica
2,versicolor,versicolor
3,virginica,virginica
4,setosa,setosa
5,versicolor,versicolor
6,versicolor,versicolor
7,virginica,virginica
8,versicolor,versicolor
9,versicolor,versicolor


In [71]:
train_classes = pd.DataFrame()

train_classes['actual_class'] = le.inverse_transform(y_train)
train_classes['predicted_class'] = le.inverse_transform(lr.predict(X_train_std))
train_classes

Unnamed: 0,actual_class,predicted_class
0,setosa,setosa
1,setosa,setosa
2,versicolor,versicolor
3,virginica,virginica
4,versicolor,versicolor
...,...,...
115,versicolor,virginica
116,versicolor,virginica
117,versicolor,virginica
118,versicolor,versicolor


In [75]:
test_classes.to_csv('outputs/test_classes_lr.csv', index=False)
train_classes.to_csv('outputs/train_classes_lr.csv', index=False)

In [77]:
import pickle

with open('./outputs/model_lr.pckl', 'wb') as file:
    pickle.dump(lr, file)