In [1]:
from datasets import load_dataset
import pandas as pd
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report

In [10]:
def load_train_test_dataset_pd(train_split="train[:15%]+train[-15%:]", test_split="test[:15%]+test[-15%:]", random_state=None):
    """Load train/test slices of the ``rotten_tomatoes`` dataset as DataFrames.

    Args:
        train_split: Split expression passed to ``load_dataset`` for the
            training data.
        test_split: Split expression passed to ``load_dataset`` for the
            test data.
        random_state: Optional seed forwarded to ``DataFrame.sample`` so the
            training shuffle can be reproduced. ``None`` (the default) gives a
            different order on every call.

    Returns:
        A ``(train_df, test_df)`` tuple. ``train_df`` has its rows shuffled and
        its index reset to ``0..n-1``; ``test_df`` keeps the loaded row order.
    """
    train_dataset = load_dataset("rotten_tomatoes", split=train_split)
    test_dataset = load_dataset("rotten_tomatoes", split=test_split)
    # DataFrame.sample returns a *new* frame; the result has to be kept,
    # otherwise the shuffle silently does nothing.
    train_df = (
        train_dataset.to_pandas()
        .sample(frac=1, random_state=random_state)
        .reset_index(drop=True)
    )
    test_df = test_dataset.to_pandas()
    return (train_df, test_df)

In [1]:
def create_train_test_data(train_df, test_df, vectorize, column_name="text"):
    """Vectorize a text column and build feature/label arrays for both frames.

    Both DataFrames are modified in place: each gains a "vector" column that
    holds ``vectorize(value)`` for every value in ``column_name``.

    Args:
        train_df: Training frame; must contain ``column_name`` and "label".
        test_df: Test frame; must contain ``column_name`` and "label".
        vectorize: Callable applied to each value of ``column_name``.
        column_name: Name of the column to vectorize.

    Returns:
        ``(X_train, X_test, y_train, y_test)`` where the ``X`` arrays are the
        per-row vectors stacked along axis 0 and the ``y`` arrays are the
        "label" columns as numpy arrays.
    """
    frames = (train_df, test_df)

    # First pass: attach the per-row vectors to each frame (train, then test).
    for frame in frames:
        frame["vector"] = frame[column_name].apply(lambda text: vectorize(text))

    # Second pass: collapse the per-row vectors into one array per frame.
    X_train, X_test = [np.stack(frame["vector"].values, axis=0) for frame in frames]
    y_train, y_test = [frame["label"].to_numpy() for frame in frames]
    return (X_train, X_test, y_train, y_test)

In [12]:
def train_classifier(X_train, y_train, C=0.1, **classifier_kwargs):
    """Fit a logistic-regression classifier on the given training data.

    Args:
        X_train: Training features, passed straight to ``fit``.
        y_train: Training labels, passed straight to ``fit``.
        C: Value for ``LogisticRegression``'s ``C`` argument. Defaults to
            0.1, the value this function previously hard-coded.
        **classifier_kwargs: Any further keyword arguments forwarded to the
            ``LogisticRegression`` constructor (for example ``max_iter``).

    Returns:
        The fitted ``LogisticRegression`` instance.
    """
    clf = LogisticRegression(C=C, **classifier_kwargs)
    clf = clf.fit(X_train, y_train)
    return clf

In [6]:
def test_classifier(test_df, clf, target_names=None):
    test_df["prediction"] = test_df["vector"].apply(lambda x: clf.predict([x])[0])
    print(classification_report(test_df["label"], test_df["prediction"], target_names=target_names))