In [None]:
import pathlib
import os
import pandas as pd
import joblib
import yaml
from sklearn.pipeline import Pipeline
from sklearn.svm import SVC
from sklearn.feature_extraction.text import TfidfVectorizer

In [None]:
# Progress marker so the component's start is visible in the run log.
print("Running train component")

In [None]:
# Run parameters for the training component.
# NOTE(review): these look like defaults that a pipeline runner overrides
# per run (the directory-creation cell mentions kubeflow passing temp
# filenames) — confirm how they are injected.

# data:          CSV file read with pandas to obtain the training rows.
# output_model:  destination file for the fitted pipeline (written by joblib).
# output_params: destination YAML file recording the parameters used.
data = "data_train.csv"
output_model = "model.joblib"
output_params = "params_used.yml"

# Hyperparameters forwarded to sklearn's SVC as `gamma` and `C`.
svm_gamma = 0.1
svm_c = 100

# When True, parent directories of the output files are created up front.
create_output_directories = True

# Could add TfidfVectorizer settings here too

In [None]:
# Make sure every output file has an existing parent directory.
# Kubeflow pipelines may hand us a temporary filename whose parent
# directories have not been created yet, so build them here.
if create_output_directories:
    for fname in (output_model, output_params):
        print(f"Creating directory for output file {fname}")
        # Resolve to an absolute, symlink-free path, then take its parent.
        dir_path = pathlib.Path(os.path.realpath(fname)).parent
        # exist_ok=True: an already-present directory is not an error.
        dir_path.mkdir(parents=True, exist_ok=True)
        print(f"Directory created: {dir_path}")

In [None]:
# Record the settings used for this run in a YAML file so the run can be
# traced back later.
params = dict(
    data=data,
    output_model=output_model,
    svm_gamma=svm_gamma,
    svm_c=svm_c,
)
with open(output_params, mode='w') as params_file:
    yaml.dump(params, params_file)

In [None]:
# Column names looked up in the training DataFrame:
# FEATURE selects the model input column, TARGET the label column.
FEATURE = 'text_feature'
TARGET = 'target'

In [None]:
# Load the training set from the CSV path configured in `data`.
print("Reading training data")
df = pd.read_csv(data)

In [None]:
# Two-stage text classifier: TF-IDF features (English stop words removed)
# feeding a support vector classifier configured from the run parameters.
tfidf_step = ("tfidf", TfidfVectorizer(stop_words='english'))
svm_step = ("svm", SVC(C=svm_c, gamma=svm_gamma, random_state=42))

pl = Pipeline(steps=[tfidf_step, svm_step])

In [None]:
# Build the array of raw documents fed to the TF-IDF step.
# pd.read_csv turns empty cells into NaN, and a column consisting only of
# numeric-looking values is parsed as numbers; TfidfVectorizer rejects such
# non-string documents. Blank entries therefore become empty strings and
# every value is coerced to str. Row count and order are unchanged, so the
# result stays aligned row-for-row with the target column.
X = df[FEATURE].fillna("").astype(str).to_numpy()

In [None]:
# Labels for training, taken from the TARGET column as a NumPy array.
y = df[TARGET].to_numpy()

In [None]:
# Fit the whole pipeline (both steps) on the training documents and labels.
print("training pipeline")
pl.fit(X, y)

In [None]:
# Serialize the pipeline object to `output_model` with joblib;
# compress=3 requests a compressed file.
print(f"Saving output model to {output_model}")
joblib.dump(pl, output_model, compress=3)