# Imports

In [1]:
import sys
from pathlib import Path
from pprint import pprint

import joblib

In [2]:
# Register the project's source folders so the modules imported in the next cell can be found.
# NOTE(review): these paths are relative, so they resolve against the kernel's working
# directory — confirm the notebook is always launched from its own folder.
for module_dir in ("../scr/data", "../scr/models"):
    # Skip folders that are already registered so re-running this cell
    # does not pile up duplicate entries in sys.path.
    if module_dir not in sys.path:
        sys.path.append(module_dir)

In [3]:
from data_pipeline import clean_data, split_data
from data_preprocess import preprocess_data
from train import train_model
from evaluate import evaluate_model

# Data preparation

## Cleaning (from *bronze* to *silver*)

### Run `clean_data`

In [4]:
# Bronze to silver: run the cleaning step on the bronze JSONL file, then preview the first rows.
bronze_path = "../data/01_bronze/Books_10k.jsonl"
df = clean_data(bronze_path)
df.head()

Unnamed: 0,text_aug,scaled_rating
0,Not a watercolor book! Seems like copies imo. ...,0.0
1,Missing the sketch pad Missing the sketch pad....,0.0
2,Crease down entire side of every page!!! Every...,0.0
3,Written From a Lens of Fear. Only read and bel...,0.0
4,Good if your little one is unsure/scared of th...,0.0


### Saving *silver* data

In [5]:
# Persist the cleaned (silver) frame as JSON Lines, one record per line.
silver_path = '../data/02_silver/books_cleaned.jsonl'
# `to_json` does not create missing folders, so make sure the target directory exists first.
Path(silver_path).parent.mkdir(parents=True, exist_ok=True)
df.to_json(silver_path, orient='records', lines=True)

## Preprocessing (from *silver* to *gold*)

### Run `preprocess_data`

In [6]:
# Silver to gold: pass the cleaned frame through `preprocess_data` and preview the first rows.
# NOTE(review): `df` is rebound to the preprocessed frame here, so re-running this cell on
# its own would presumably preprocess the data a second time — confirm that is harmless.
# NOTE(review): in the recorded previews, "unsure/scared" (cleaning output) becomes
# "unsurescared" after this step; it looks like punctuation is dropped without inserting
# a space — confirm this is intended.
df = preprocess_data(df)
df.head()

Unnamed: 0,text_aug,scaled_rating
0,not a watercolor book seems like copies imo it...,0.0
1,missing the sketch pad missing the sketch pad ...,0.0
2,crease down entire side of every page every pa...,0.0
3,written from a lens of fear only read and beli...,0.0
4,good if your little one is unsurescared of the...,0.0


### Saving *gold* data

In [7]:
# Persist the preprocessed (gold) frame as JSON Lines, one record per line.
gold_path = '../data/03_gold/books_preprocessed.jsonl'
# `to_json` does not create missing folders, so make sure the target directory exists first.
Path(gold_path).parent.mkdir(parents=True, exist_ok=True)
df.to_json(gold_path, orient='records', lines=True)

# Model training

## Split data

In [8]:
# Unpack the four outputs of `split_data`; by their names, presumably the train/test
# inputs (X_*) and the matching targets (y_*) — verify against the helper.
# NOTE(review): no seed or test size is passed here — confirm `split_data` fixes them
# internally, otherwise the metrics recorded below will change between runs.
X_train, X_test, y_train, y_test = split_data(df)

## Train model

In [9]:
# Train on the training split. `train_model` returns a pair that is unpacked into the
# model object (reused by the evaluation and save cells) and a dict of training metrics;
# the recorded output shows 'mae' and 'mse' keys.
model_pipeline, train_metrics_dict = train_model(X_train, y_train)
pprint(train_metrics_dict)

{'mae': 0.02181883742919363, 'mse': 0.001907966575185082}


## Perform model evaluation

In [10]:
# Evaluate the trained pipeline on the test split and print the resulting metrics dict.
# NOTE(review): the recorded outputs show test MAE of about 0.197 (MSE about 0.062) against
# train MAE of about 0.022 (MSE about 0.002). A gap this large suggests overfitting and is
# worth investigating before relying on the saved model.
test_metrics_dict = evaluate_model(X_test, y_test, model_pipeline)
pprint(test_metrics_dict)

{'mae': 0.19684615148816859, 'mse': 0.06181051628915917}


## Save model

In [11]:
# Serialize the trained pipeline to disk with joblib.
artifact_name = "model_pipeline"
artifact_path = f"../scr/models/artifacts/{artifact_name}.pkl"
# joblib.dump does not create missing folders, so make sure the artifacts directory exists.
Path(artifact_path).parent.mkdir(parents=True, exist_ok=True)
# Kept as the last expression so the cell echoes the list of files that were written.
joblib.dump(model_pipeline, artifact_path)

['../scr/models/artifacts/model_pipeline.pkl']