In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from vowpalwabbit import pyvw
from vowpalwabbit.DFtoVW import (
    DFtoVW,
    Feature,
    MulticlassLabel,
)
import tensorboardX as tx
from datetime import datetime
from vowpalwabbit.DFtoVWtoTensorboard import DFtoVWtoTensorboard

In [2]:
df = pd.read_csv('./Iris.csv')

In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 150 entries, 0 to 149
Data columns (total 6 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Id             150 non-null    int64  
 1   SepalLengthCm  150 non-null    float64
 2   SepalWidthCm   150 non-null    float64
 3   PetalLengthCm  150 non-null    float64
 4   PetalWidthCm   150 non-null    float64
 5   Species        150 non-null    object 
dtypes: float64(4), int64(1), object(1)
memory usage: 7.2+ KB


In [4]:
# converting "Species" categorical column to integer
def categorical_to_int(specie):
    if specie == 'Iris-setosa':
        return 1
    elif specie == 'Iris-versicolor':
        return 2
    elif specie == 'Iris-virginica':
        return 3
    
df['Species'] = df['Species'].apply(categorical_to_int)
df.head()

Unnamed: 0,Id,SepalLengthCm,SepalWidthCm,PetalLengthCm,PetalWidthCm,Species
0,1,5.1,3.5,1.4,0.2,1
1,2,4.9,3.0,1.4,0.2,1
2,3,4.7,3.2,1.3,0.2,1
3,4,4.6,3.1,1.5,0.2,1
4,5,5.0,3.6,1.4,0.2,1


In [5]:
# Splitting train test
train, test = train_test_split(df, test_size=0.1)   

In [6]:
print("Length of test:", len(train))
test

Length of test: 135


Unnamed: 0,Id,SepalLengthCm,SepalWidthCm,PetalLengthCm,PetalWidthCm,Species
78,79,6.0,2.9,4.5,1.5,2
5,6,5.4,3.9,1.7,0.4,1
46,47,5.1,3.8,1.6,0.2,1
36,37,5.5,3.5,1.3,0.2,1
24,25,4.8,3.4,1.9,0.2,1
128,129,6.4,2.8,5.6,2.1,3
15,16,5.7,4.4,1.5,0.4,1
68,69,6.2,2.2,4.5,1.5,2
70,71,5.9,3.2,4.8,1.8,2
4,5,5.0,3.6,1.4,0.2,1


In [7]:
print("Length of test:", len(test))
test

Length of test: 15


Unnamed: 0,Id,SepalLengthCm,SepalWidthCm,PetalLengthCm,PetalWidthCm,Species
78,79,6.0,2.9,4.5,1.5,2
5,6,5.4,3.9,1.7,0.4,1
46,47,5.1,3.8,1.6,0.2,1
36,37,5.5,3.5,1.3,0.2,1
24,25,4.8,3.4,1.9,0.2,1
128,129,6.4,2.8,5.6,2.1,3
15,16,5.7,4.4,1.5,0.4,1
68,69,6.2,2.2,4.5,1.5,2
70,71,5.9,3.2,4.8,1.8,2
4,5,5.0,3.6,1.4,0.2,1


In [8]:
target_col = "Species"

# features = [Feature(col) for col in df.columns if col != target_col]  # target column not part of Features
features = [Feature(col) for col in df.columns if col != target_col and col != 'Id']  # 'Id' column also not part of Features along with target col
label = MulticlassLabel(label=target_col)  # target column is a multi class label column
tag = 'Id'

df_to_vw = DFtoVW(df=df, features=features, label=label, tag=tag)

In [9]:
vw = pyvw.vw('--oaa 3 -P 1')  # -oaa is One Agent All algo for multi class problem (seems supervised) -P 1 outputs metrics for each example

In [10]:
# Before running this run command `rm -rf ./logs` in this directory to remove any previous logs for tensorboard
df_to_tb = DFtoVWtoTensorboard(df_to_vw, vw)
df_to_tb.fit(train)  # by default metrics are logged to tensorboard, now run in the current directory command `tensorboard --logdir ./logs`

average_loss:0.000000	since_last:0.000000	label: 1	prediction: 1	num_features: 5
average_loss:0.500000	since_last:1.000000	label: 3	prediction: 1	num_features: 5
average_loss:0.666667	since_last:1.000000	label: 1	prediction: 3	num_features: 5
average_loss:0.500000	since_last:0.000000	label: 1	prediction: 1	num_features: 5
average_loss:0.600000	since_last:1.000000	label: 3	prediction: 1	num_features: 5
average_loss:0.500000	since_last:0.000000	label: 1	prediction: 1	num_features: 5
average_loss:0.571429	since_last:1.000000	label: 3	prediction: 1	num_features: 5
average_loss:0.500000	since_last:0.000000	label: 3	prediction: 3	num_features: 5
average_loss:0.555556	since_last:1.000000	label: 2	prediction: 3	num_features: 5
average_loss:0.600000	since_last:1.000000	label: 2	prediction: 3	num_features: 5
average_loss:0.636364	since_last:1.000000	label: 3	prediction: 2	num_features: 5
average_loss:0.666667	since_last:1.000000	label: 2	prediction: 3	num_features: 5
average_loss:0.615385	since_