In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
import re
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from nltk.stem import WordNetLemmatizer
from sklearn.naive_bayes import MultinomialNB

from sklearn.model_selection import GridSearchCV

from sklearn.pipeline import Pipeline
from joblib import Memory
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

In [3]:
# Install Prefect into the kernel's own environment (%pip, not a bare shell pip),
# pinned to the version this notebook was last run with so re-runs are reproducible.
%pip install prefect==2.18.1

Collecting prefect
  Downloading prefect-2.18.1-py3-none-any.whl (5.8 MB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 5.8/5.8 MB 36.5 MB/s eta 0:00:00
Collecting aiosqlite>=0.17.0 (from prefect)
  Downloading aiosqlite-0.20.0-py3-none-any.whl (15 kB)
Collecting alembic<2.0.0,>=1.7.5 (from prefect)
  Downloading alembic-1.13.1-py3-none-any.whl (233 kB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 233.4/233.4 kB 22.5 MB/s eta 0:00:00
Collecting apprise<2.0.0,>=1.1.0 (from prefect)
  Downloading apprise-1.7.6-py3-none-any.whl (1.2 MB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 1.2/1.2 MB 46.0 MB/s eta 0:00:00
Collecting asyncpg>=0.23 (from prefect)
  Downloading asyncpg-0.29.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (2.7 MB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 2.7/2.7 MB 46.6 MB/s eta … [output truncated]

In [4]:
from prefect import task, flow

In [18]:
from sklearn import metrics

In [19]:
@task
def load_data(file_path):
    """
    Read a CSV file into a pandas DataFrame.

    Parameters
    ----------
    file_path
        Location of the CSV file; forwarded unchanged to ``pd.read_csv``.

    Returns
    -------
    pandas.DataFrame
        The parsed contents of the file.
    """
    frame = pd.read_csv(file_path)
    return frame


@task
def split_inputs_output(data, inputs, output):
    """
    Pull the model inputs and the target out of ``data``.

    Parameters
    ----------
    data
        Object indexed with ``data[inputs]`` and ``data[output]``.
    inputs
        Key selecting the input (feature) part of ``data``.
    output
        Key selecting the target part of ``data``.

    Returns
    -------
    tuple
        ``(data[inputs], data[output])``.
    """
    features, target = data[inputs], data[output]
    return features, target


@task
def split_train_test(X, y, test_size=0.25, random_state=0):
    """
    Partition ``X`` and ``y`` into training and test portions.

    Parameters
    ----------
    X, y
        Inputs and target, passed positionally to ``train_test_split``.
    test_size
        Forwarded to ``train_test_split`` (default 0.25).
    random_state
        Seed forwarded to ``train_test_split`` (default 0).

    Returns
    -------
    The value returned by ``train_test_split``; the caller unpacks it as
    ``X_train, X_test, y_train, y_test``.
    """
    split_options = {"test_size": test_size, "random_state": random_state}
    splits = train_test_split(X, y, **split_options)
    return splits


@task
def preprocess_data(X_train, X_test, y_train, y_test):
    """
    Convert the train and test inputs to count-vectorized features.

    A ``CountVectorizer`` is fitted on ``X_train`` only and then applied to
    ``X_test``, so the test inputs do not contribute to the vocabulary.
    The targets are returned untouched.

    Returns
    -------
    tuple
        ``(X_train_num, X_test_num, y_train, y_test)`` where the first two
        items are the vectorizer's outputs for the train and test inputs.
    """
    vectorizer = CountVectorizer()
    # Learn the vocabulary from the training inputs and encode them in one step.
    train_counts = vectorizer.fit_transform(X_train)
    # Encode the test inputs with the vocabulary learned above.
    test_counts = vectorizer.transform(X_test)
    return train_counts, test_counts, y_train, y_test


@task
def train_model(X_train_num, y_train, hyperparameters):
    """
    Fit a ``MultinomialNB`` classifier on the training features.

    Parameters
    ----------
    X_train_num
        Training features passed to ``MultinomialNB.fit``.
    y_train
        Training targets passed to ``MultinomialNB.fit``.
    hyperparameters
        Accepted for the caller's convenience but not read by this function;
        the classifier is constructed with its default settings.

    Returns
    -------
    MultinomialNB
        The fitted classifier.
    """
    classifier = MultinomialNB()
    # fit() returns the estimator itself, so the fitted model can be returned directly.
    return classifier.fit(X_train_num, y_train)


@task
def evaluate_model(model, X_train_num, y_train, X_test_num, y_test):
    """
    Compute the model's accuracy on the training and test sets.

    Parameters
    ----------
    model
        Fitted estimator exposing ``predict``.
    X_train_num, y_train
        Training features and their true targets.
    X_test_num, y_test
        Test features and their true targets.

    Returns
    -------
    tuple
        ``(train_score, test_score)`` as returned by
        ``metrics.accuracy_score``.
    """
    def _accuracy(features, targets):
        # Predict with the fitted model and compare against the true targets.
        return metrics.accuracy_score(targets, model.predict(features))

    train_score = _accuracy(X_train_num, y_train)
    test_score = _accuracy(X_test_num, y_test)
    return train_score, test_score

In [20]:
# Workflow

@flow(name="Naivebayes Training Flow")
def workflow(data_path="/content/my_data.csv", inputs='Review text', output='Sentiment'):
    """
    Run the Naive Bayes training pipeline end to end and print the scores.

    Steps: load the CSV, separate inputs from the target, split into train
    and test sets, count-vectorize the inputs, fit the model, and report
    train/test accuracy.

    Parameters
    ----------
    data_path
        CSV file to load. Defaults to the path this notebook was written
        against.
    inputs
        Name of the column holding the model inputs.
    output
        Name of the column holding the target.

    Returns
    -------
    None
        The scores are printed rather than returned.
    """
    # Handed to train_model, which currently does not read it.
    hyperparameters = {'vectorization': [CountVectorizer()]}

    # Load data
    reviews = load_data(data_path)

    # Identify inputs and output
    X, y = split_inputs_output(reviews, inputs, output)

    # Split data into train and test sets
    X_train, X_test, y_train, y_test = split_train_test(X, y)

    # Preprocess the data
    X_train_num, X_test_num, y_train, y_test = preprocess_data(X_train, X_test, y_train, y_test)

    # Build a model
    model = train_model(X_train_num, y_train, hyperparameters)

    # Evaluation
    train_score, test_score = evaluate_model(model, X_train_num, y_train, X_test_num, y_test)

    print("Train Score:", train_score)
    print("Test Score:", test_score)

In [21]:
# Entry point: run the training flow only when this code is executed as the
# main module, not when it is imported from elsewhere.
if __name__ == "__main__":
    workflow()

Train Score: 0.9505652100556774
Test Score: 0.930161943319838
