In [2]:
import pickle
import numpy as np
import pandas as pd

from evidently import ColumnMapping
from evidently.report import Report
from evidently.metrics import ColumnDriftMetric, DatasetDriftMetric, DatasetMissingValuesMetric

from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import mean_squared_error
from sklearn.pipeline import make_pipeline

import spacy

nlp = spacy.load("en_core_web_lg")

Read and download data

In [3]:
def read_data(filename: str) -> pd.DataFrame:
    """Read data into dataframe"""
    df = pd.read_csv(filename, index_col=0)
    df['vector'] = df['vector'].apply(convert)
    return df

def convert(item):
    item = item.strip()  # remove spaces at the end
    item = item[1:-1]    # remove `[ ]`
    item = np.fromstring(item, sep=' ')  # convert string to `numpy.array`
    return item

In [5]:
df_train = read_data("../data/train_data.csv")
df_val = read_data("../data/valid_data.csv")

In [6]:
def add_features(df_train, df_val):
    """"""
    X_train = df_train['vector']
    y_train = df_train['label']

    X_val = df_val['vector']
    y_val = df_val['label']

    X_train = np.stack(X_train)
    y_train = np.stack(y_train)

    X_val = np.stack(X_val)
    y_val = np.stack(y_val)
    return X_train, y_train, X_val, y_val

In [23]:
df_train['vector'] = np.stack(df_train['vector'])

In [24]:
df_val['vector'] = np.stack(df_val['vector'])

In [7]:
X_train, y_train, X_val, y_val = add_features(df_train, df_val)

In [8]:
params = dict(max_depth=20, n_estimators=100, min_samples_leaf=10, random_state=0)

model = RandomForestClassifier(**params, n_jobs=-1)

model.fit(X_train, y_train)
y_pred = model.predict(X_val)

rmse = mean_squared_error(y_pred, y_val, squared=False)
print(params, rmse)

{'max_depth': 20, 'n_estimators': 100, 'min_samples_leaf': 10, 'random_state': 0} 0.10151922665097515


In [18]:
train_preds = model.predict(X_train)
df_train['prediction'] = train_preds

In [15]:
df_val['prediction'] = y_pred

In [11]:
with open('models/rf.bin', 'wb') as f_out:
    pickle.dump(model, f_out)

In [13]:
df_val.to_parquet('data/reference.parquet')

Evidently report

In [35]:
column_mapping = ColumnMapping(
    target = 'label',
    prediction = 'prediction',
    text_features = 'vector',
    numerical_features = [],
    categorical_features = []
)

In [36]:
report = Report(metrics=[
    ColumnDriftMetric(column_name="prediction"),
    DatasetDriftMetric(),
    DatasetMissingValuesMetric()
])

In [20]:
df_train.head(3)

Unnamed: 0,vector,label,prediction
22178,"[0.7785875, -2.5283103, 2.5746224, 0.547385, 3...",0,0
13814,"[0.70389611, -2.2027533, 1.3356761, -0.9061375...",0,0
21765,"[0.55377287, -1.3223277, 0.03835047, -1.590861...",0,0


In [37]:
report.run(reference_data = df_train, current_data = df_val, column_mapping = column_mapping)

ValidationError: 1 validation error for DatasetColumns
text_feature_names
  value is not a valid list (type=type_error.list)

In [28]:
report.as_dict()

ValueError: could not broadcast input array from shape (300,9897) into shape (1,9897)