In [1]:
# Import necessary libraries
import logging
import yaml
import warnings
import os
import sys

In [4]:
# Import pipeline modules from module
from module.preprocess import Preprocessor
from module.train import Trainer
from module.evaluate import ModelEvaluator
from module.tuning import Tuner

Dask dataframe query planning is disabled because dask-expr is not installed.

You can install it with `pip install dask[dataframe]` or `conda install dask`.
This will raise in a future version.



In [5]:
# Ignore warnings (Optional)
warnings.filterwarnings('ignore')

In [6]:
# Load the pipeline configuration from the YAML file.
# The encoding is pinned to UTF-8 so that parsing does not depend on the
# platform's locale default, which open() would otherwise fall back to
# (for example a legacy code page on Windows).
config_path = './module/config.yaml'
with open(config_path, 'r', encoding='utf-8') as stream:
    # safe_load only builds plain Python objects, never arbitrary ones.
    config = yaml.safe_load(stream)

In [7]:
# Set up logging for the notebook
logging.basicConfig(format='%(asctime)s %(levelname)s %(message)s',
                    datefmt='%m/%d/%Y %I:%M:%S %p',
                    level=logging.INFO)

In [12]:
# Define a function to run the pipeline
def run_pipeline(run_tuning: bool = False) -> None:
    """Run the tweet sentiment pipeline end to end.

    Stages, in order: load the NLTK ``twitter_samples`` corpus, preprocess the
    tweets with ``Preprocessor``, train the models via
    ``Trainer.train_all_models``, then display the Logistic Regression error
    analysis produced by ``ModelEvaluator.error_analysis``.

    Args:
        run_tuning: When True, additionally run the Random Forest tuning step
            via ``Tuner``. Defaults to False, which matches the previous
            behaviour where this step was commented out.

    Raises:
        Exception: Any error raised by a stage is logged at ERROR level and
            then re-raised unchanged.
    """
    logging.info("Pipeline execution started.")

    try:
        # Step 1: Data Ingestion
        import nltk
        from nltk.corpus import twitter_samples

        # Download the corpus only when it is missing locally. An
        # unconditional download re-checks the package and prints the local
        # nltk_data path into the cell output on every run.
        try:
            nltk.data.find('corpora/twitter_samples')
        except LookupError:
            nltk.download('twitter_samples')

        all_positive_tweets_sen = twitter_samples.strings('positive_tweets.json')
        all_negative_tweets_sen = twitter_samples.strings('negative_tweets.json')
        logging.info("Data loading completed.")

        # Step 2: Preprocessing
        logging.info("Preprocessing started.")
        preprocessor = Preprocessor()
        train_x, test_x, y_train, y_test = preprocessor.preprocess_data(
            all_positive_tweets_sen, all_negative_tweets_sen)

        # Re-join each token list into a single string per tweet for the
        # models; the token lists (test_x) are kept for the error analysis.
        X_train = [' '.join(tokens) for tokens in train_x]
        X_test = [' '.join(tokens) for tokens in test_x]
        logging.info("Preprocessing finished.")

        # Step 3: Model Training
        # NOTE(review): the recorded run reports 1.00 accuracy for three of
        # the four models; worth confirming the features do not leak the
        # label — TODO confirm.
        logging.info("Model training started.")
        trainer = Trainer()
        trainer.train_all_models(X_train, y_train, X_test, y_test)
        logging.info("Model training completed.")

        # Step 4: Model Evaluation
        logging.info("Model evaluation started.")
        evaluator = ModelEvaluator()
        # NOTE(review): train_model() is called again here, which presumably
        # refits Logistic Regression after train_all_models() — TODO confirm
        # whether Trainer exposes the already-fitted model instead.
        logistic_model = trainer.train_model(
            'Logistic Regression', X_train, y_train)
        logistic_errors = evaluator.error_analysis(
            logistic_model, X_test, y_test, test_x)
        logging.info("Model evaluation completed.")

        # Show errors in the notebook
        print("Logistic Regression errors:")
        display(logistic_errors)

        # Step 5: Model Tuning (optional, off by default)
        if run_tuning:
            logging.info("Model tuning started.")
            tuner = Tuner()
            # Call shape taken from the previously commented-out code.
            _, rf_best_params = tuner.tune_random_forest(X_train, y_train)
            logging.info(
                f"Random Forest tuning completed. Best parameters: {rf_best_params}")
    except Exception as exc:
        # Record the failure in the log so a run never ends silently, then
        # let the notebook show the original traceback.
        logging.error("Pipeline execution failed: %s", exc)
        raise

    logging.info("Pipeline execution finished successfully.")

In [13]:
# Run the pipeline end to end; its log lines and the Logistic Regression
# error table appear in this cell's output.
run_pipeline()

10/07/2024 07:42:08 PM INFO Pipeline execution started.
[nltk_data] Downloading package twitter_samples to C:\Users\Hải
[nltk_data]     Nam\AppData\Roaming\nltk_data...
[nltk_data]   Package twitter_samples is already up-to-date!
10/07/2024 07:42:09 PM INFO Data loading completed.
10/07/2024 07:42:09 PM INFO Preprocessing started.
10/07/2024 07:42:12 PM INFO Preprocessing finished.
10/07/2024 07:42:12 PM INFO Model training started.


Logistic Regression Classification Report:
              precision    recall  f1-score   support

         0.0       1.00      0.99      1.00      1000
         1.0       1.00      1.00      1.00      1000

    accuracy                           1.00      2000
   macro avg       1.00      1.00      1.00      2000
weighted avg       1.00      1.00      1.00      2000

SVC Classification Report:
              precision    recall  f1-score   support

         0.0       1.00      1.00      1.00      1000
         1.0       1.00      1.00      1.00      1000

    accuracy                           1.00      2000
   macro avg       1.00      1.00      1.00      2000
weighted avg       1.00      1.00      1.00      2000



10/07/2024 07:42:20 PM INFO Model training completed.
10/07/2024 07:42:20 PM INFO Model evaluation started.


Random Forest Classification Report:
              precision    recall  f1-score   support

         0.0       1.00      0.99      1.00      1000
         1.0       0.99      1.00      1.00      1000

    accuracy                           1.00      2000
   macro avg       1.00      1.00      1.00      2000
weighted avg       1.00      1.00      1.00      2000

Naive Bayes Classification Report:
              precision    recall  f1-score   support

         0.0       0.96      0.96      0.96      1000
         1.0       0.96      0.96      0.96      1000

    accuracy                           0.96      2000
   macro avg       0.96      0.96      0.96      2000
weighted avg       0.96      0.96      0.96      2000



10/07/2024 07:42:20 PM INFO Model evaluation completed.


Logistic Regression errors:


Unnamed: 0,sentence,predicted_class,real_class
753,"[park, get, sunlight]",0.0,1.0
1298,"[u, prob, fun, david]",1.0,0.0
1544,"[pat, jay]",1.0,0.0
1756,"[belov, grandmoth]",1.0,0.0
1773,"[that, life, get, call, peopl, havent, seen, 2...",1.0,0.0
1853,"[sr, financi, analyst, expedia, inc, bellevu, ...",1.0,0.0


10/07/2024 07:42:20 PM INFO Pipeline execution finished successfully.
