1. Define a `tokenizer` and `tokenizer_porter` function to be used in the model training pipeline 

In [14]:
# Tokenizer function
def tokenizer(text):
    return text.split()

In [15]:
# Tokenizer porter from the NLTK Porter Stemning algorithm
# PIP install
!pip install nltk



In [16]:
from nltk.stem.porter import PorterStemmer
porter = PorterStemmer()
def tokenizer_porter(text):
    return [porter.stem(word) for word in text.split()]

2. Define a `stop_word` function to be used in the model training pipeline

In [17]:
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords

def stop_word():
    return stopwords.words('english')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Admin\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


3. Read the movie data and break them to train and test sets

In [18]:
# Read the data
import pandas as pd

def split_data(data_path):
    data = pd.read_csv(data_path, encoding='utf-8')

    # Data splits
    X_train = data.loc[:25000, 'review'].values
    y_train = data.loc[:25000, 'sentiment'].values
    X_test = data.loc[25000:, 'review'].values
    y_test = data.loc[25000:, 'sentiment'].values
    
    return X_train, y_train, X_test, y_test

4. Logistic Regression Model training pipeline, with `GridSearchCV` as hyperparameter search strategy, Bag of Words for word embedding, and `LIBLINEAR` solver as the classifier.

The earlier defined `tokenizer` and `tokenizer_porter` are also used for words' tokenization.

In [24]:
# Import the neccesary libraries
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import TfidfVectorizer

def train_test_data(train_x, train_y, test_x, test_y):
    # Initialize the Bag of Words Embeddings
    tfidf = TfidfVectorizer(strip_accents = None, lowercase = False, preprocessor = None)

    # Set the parameter grid for the GridSearchCV
    small_param_grid = [
            {
                'vect__ngram_range': [(1, 1)],
                'vect__stop_words': [None],
                'vect__tokenizer': [tokenizer, tokenizer_porter],
                'clf__penalty': ['l2'],
                'clf__C': [1.0, 10.0]
            },
            {
                'vect__ngram_range': [(1, 1)],
                'vect__stop_words': [stop_word, None],
                'vect__tokenizer': [tokenizer],
                'vect__use_idf': [False],
                'vect__norm': [None],
                'clf__penalty': ['l2'],
                'clf__C': [1.0, 10.0]
            },
            {
                'vect__ngram_range': [(1, 1)],
                'vect__stop_words': [stop_word, None],
                'vect__tokenizer': [tokenizer],
                'vect__use_idf': [True],
                'vect__norm': [None],
                'clf__penalty': ['l2', 'l1'],
                'clf__C': [1.0, 10.0]
            },
        ]

    # Initialize the Logistic Regression-Bag of Words model training pipeline
    lr_tfidf = Pipeline([
            ('vect', tfidf),
            ('clf', LogisticRegression(solver='liblinear'))
        ])

    # Attach the Logistic Regression-Bag of Words model training pipeline to the Hyperparameter search grid
    gs_lr_tfidf = GridSearchCV(lr_tfidf, small_param_grid, scoring = 'accuracy', cv = 10, verbose = 2, n_jobs = 1)

    # Fit the Grid search Logistic Regression-Bag of Words model training pipeline with the training set
    gs_lr_tfidf.fit(train_x, train_y)
    
    print("==============================================")
    print(f'Best parameter set: {gs_lr_tfidf.best_params_}')

    print("==============================================")
    print(f'CV Accuracy: {gs_lr_tfidf.best_score_:.3f}')

    print("==============================================")
    clf = gs_lr_tfidf.best_estimator_
    print(f'Test Accuracy: {clf.score(test_x, test_y):.3f}')


In [25]:

def call_pipeline(data_path, data_type):
    
    print(f'This is the pipeline for {data_type}')

    # Call the split_data function
    data_split = split_data(data_path)

    # Index the data split to get the respective splits
    X_train =  data_split[0]
    y_train = data_split[1]
    X_test = data_split[2]
    y_test = data_split[3]

    # Call the train test pipeline
    train_test_data(X_train, y_train, X_test, y_test)

In [26]:
# Unprocessed
unprocessed_data_path = r"C:\Users\Admin\Documents\Intro-to-ML\NLP\sentiment-analysis\movie-review-data\movie_data.csv"
call_pipeline(unprocessed_data_path , "Unprocessed Movie Data")



This is the pipeline for Unprocessed Movie Data
Fitting 10 folds for each of 16 candidates, totalling 160 fits
[CV] END clf__C=1.0, clf__penalty=l2, vect__ngram_range=(1, 1), vect__stop_words=None, vect__tokenizer=<function tokenizer at 0x000001ECD94FB6D0>; total time=   5.5s
[CV] END clf__C=1.0, clf__penalty=l2, vect__ngram_range=(1, 1), vect__stop_words=None, vect__tokenizer=<function tokenizer at 0x000001ECD94FB6D0>; total time=   5.3s
[CV] END clf__C=1.0, clf__penalty=l2, vect__ngram_range=(1, 1), vect__stop_words=None, vect__tokenizer=<function tokenizer at 0x000001ECD94FB6D0>; total time=   5.1s
[CV] END clf__C=1.0, clf__penalty=l2, vect__ngram_range=(1, 1), vect__stop_words=None, vect__tokenizer=<function tokenizer at 0x000001ECD94FB6D0>; total time=   5.1s
[CV] END clf__C=1.0, clf__penalty=l2, vect__ngram_range=(1, 1), vect__stop_words=None, vect__tokenizer=<function tokenizer at 0x000001ECD94FB6D0>; total time=   5.3s
[CV] END clf__C=1.0, clf__penalty=l2, vect__ngram_range=(1,

[CV] END clf__C=1.0, clf__penalty=l2, vect__ngram_range=(1, 1), vect__norm=None, vect__stop_words=None, vect__tokenizer=<function tokenizer at 0x000001ECD94FB6D0>, vect__use_idf=False; total time=  12.4s
[CV] END clf__C=1.0, clf__penalty=l2, vect__ngram_range=(1, 1), vect__norm=None, vect__stop_words=None, vect__tokenizer=<function tokenizer at 0x000001ECD94FB6D0>, vect__use_idf=False; total time=  11.5s
[CV] END clf__C=1.0, clf__penalty=l2, vect__ngram_range=(1, 1), vect__norm=None, vect__stop_words=None, vect__tokenizer=<function tokenizer at 0x000001ECD94FB6D0>, vect__use_idf=False; total time=  11.4s
[CV] END clf__C=1.0, clf__penalty=l2, vect__ngram_range=(1, 1), vect__norm=None, vect__stop_words=None, vect__tokenizer=<function tokenizer at 0x000001ECD94FB6D0>, vect__use_idf=False; total time=  11.5s
[CV] END clf__C=1.0, clf__penalty=l2, vect__ngram_range=(1, 1), vect__norm=None, vect__stop_words=None, vect__tokenizer=<function tokenizer at 0x000001ECD94FB6D0>, vect__use_idf=False;

[CV] END clf__C=1.0, clf__penalty=l2, vect__ngram_range=(1, 1), vect__norm=None, vect__stop_words=None, vect__tokenizer=<function tokenizer at 0x000001ECD94FB6D0>, vect__use_idf=True; total time=   9.2s
[CV] END clf__C=1.0, clf__penalty=l2, vect__ngram_range=(1, 1), vect__norm=None, vect__stop_words=None, vect__tokenizer=<function tokenizer at 0x000001ECD94FB6D0>, vect__use_idf=True; total time=   9.5s
[CV] END clf__C=1.0, clf__penalty=l2, vect__ngram_range=(1, 1), vect__norm=None, vect__stop_words=None, vect__tokenizer=<function tokenizer at 0x000001ECD94FB6D0>, vect__use_idf=True; total time=   9.8s
[CV] END clf__C=1.0, clf__penalty=l2, vect__ngram_range=(1, 1), vect__norm=None, vect__stop_words=None, vect__tokenizer=<function tokenizer at 0x000001ECD94FB6D0>, vect__use_idf=True; total time=   9.5s
[CV] END clf__C=1.0, clf__penalty=l2, vect__ngram_range=(1, 1), vect__norm=None, vect__stop_words=None, vect__tokenizer=<function tokenizer at 0x000001ECD94FB6D0>, vect__use_idf=True; tota

[CV] END clf__C=10.0, clf__penalty=l2, vect__ngram_range=(1, 1), vect__norm=None, vect__stop_words=None, vect__tokenizer=<function tokenizer at 0x000001ECD94FB6D0>, vect__use_idf=True; total time=   9.2s
[CV] END clf__C=10.0, clf__penalty=l2, vect__ngram_range=(1, 1), vect__norm=None, vect__stop_words=None, vect__tokenizer=<function tokenizer at 0x000001ECD94FB6D0>, vect__use_idf=True; total time=   9.1s
[CV] END clf__C=10.0, clf__penalty=l2, vect__ngram_range=(1, 1), vect__norm=None, vect__stop_words=None, vect__tokenizer=<function tokenizer at 0x000001ECD94FB6D0>, vect__use_idf=True; total time=   9.1s
[CV] END clf__C=10.0, clf__penalty=l2, vect__ngram_range=(1, 1), vect__norm=None, vect__stop_words=None, vect__tokenizer=<function tokenizer at 0x000001ECD94FB6D0>, vect__use_idf=True; total time=   9.0s
[CV] END clf__C=10.0, clf__penalty=l2, vect__ngram_range=(1, 1), vect__norm=None, vect__stop_words=None, vect__tokenizer=<function tokenizer at 0x000001ECD94FB6D0>, vect__use_idf=True;

60 fits failed out of a total of 160.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
60 fits failed with the following error:
Traceback (most recent call last):
  File "C:\Users\Admin\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.10_qbz5n2kfra8p0\LocalCache\local-packages\Python310\site-packages\sklearn\model_selection\_validation.py", line 686, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "C:\Users\Admin\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.10_qbz5n2kfra8p0\LocalCache\local-packages\Python310\site-packages\sklearn\pipeline.py", line 378, in fit
    Xt = self._fit(X, y, **fit_params_steps)
  File "C:\Users\Admin\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.10_qbz5n

Best parameter set: {'clf__C': 10.0, 'clf__penalty': 'l2', 'vect__ngram_range': (1, 1), 'vect__stop_words': None, 'vect__tokenizer': <function tokenizer at 0x000001ECD94FB6D0>}
CV Accuracy: 0.890
Test Accuracy: 0.893


In [27]:
# Preprocessed
preprocessed_data_path = r"C:\Users\Admin\Documents\Intro-to-ML\NLP\sentiment-analysis\movie-review-data\preprocessed_movie_data.csv"
call_pipeline(preprocessed_data_path, "Preprocessed Movie Data")

This is the pipeline for Preprocessed Movie Data
Fitting 10 folds for each of 16 candidates, totalling 160 fits
[CV] END clf__C=1.0, clf__penalty=l2, vect__ngram_range=(1, 1), vect__stop_words=None, vect__tokenizer=<function tokenizer at 0x000001ECD94FB6D0>; total time=   0.1s
[CV] END clf__C=1.0, clf__penalty=l2, vect__ngram_range=(1, 1), vect__stop_words=None, vect__tokenizer=<function tokenizer at 0x000001ECD94FB6D0>; total time=   0.0s
[CV] END clf__C=1.0, clf__penalty=l2, vect__ngram_range=(1, 1), vect__stop_words=None, vect__tokenizer=<function tokenizer at 0x000001ECD94FB6D0>; total time=   0.0s
[CV] END clf__C=1.0, clf__penalty=l2, vect__ngram_range=(1, 1), vect__stop_words=None, vect__tokenizer=<function tokenizer at 0x000001ECD94FB6D0>; total time=   0.1s
[CV] END clf__C=1.0, clf__penalty=l2, vect__ngram_range=(1, 1), vect__stop_words=None, vect__tokenizer=<function tokenizer at 0x000001ECD94FB6D0>; total time=   0.0s
[CV] END clf__C=1.0, clf__penalty=l2, vect__ngram_range=(1

[CV] END clf__C=1.0, clf__penalty=l2, vect__ngram_range=(1, 1), vect__norm=None, vect__stop_words=None, vect__tokenizer=<function tokenizer at 0x000001ECD94FB6D0>, vect__use_idf=False; total time=   0.0s
[CV] END clf__C=1.0, clf__penalty=l2, vect__ngram_range=(1, 1), vect__norm=None, vect__stop_words=None, vect__tokenizer=<function tokenizer at 0x000001ECD94FB6D0>, vect__use_idf=False; total time=   0.0s
[CV] END clf__C=1.0, clf__penalty=l2, vect__ngram_range=(1, 1), vect__norm=None, vect__stop_words=None, vect__tokenizer=<function tokenizer at 0x000001ECD94FB6D0>, vect__use_idf=False; total time=   0.0s
[CV] END clf__C=1.0, clf__penalty=l2, vect__ngram_range=(1, 1), vect__norm=None, vect__stop_words=None, vect__tokenizer=<function tokenizer at 0x000001ECD94FB6D0>, vect__use_idf=False; total time=   0.0s
[CV] END clf__C=1.0, clf__penalty=l2, vect__ngram_range=(1, 1), vect__norm=None, vect__stop_words=None, vect__tokenizer=<function tokenizer at 0x000001ECD94FB6D0>, vect__use_idf=False;

[CV] END clf__C=1.0, clf__penalty=l2, vect__ngram_range=(1, 1), vect__norm=None, vect__stop_words=None, vect__tokenizer=<function tokenizer at 0x000001ECD94FB6D0>, vect__use_idf=True; total time=   0.0s
[CV] END clf__C=1.0, clf__penalty=l2, vect__ngram_range=(1, 1), vect__norm=None, vect__stop_words=None, vect__tokenizer=<function tokenizer at 0x000001ECD94FB6D0>, vect__use_idf=True; total time=   0.0s
[CV] END clf__C=1.0, clf__penalty=l2, vect__ngram_range=(1, 1), vect__norm=None, vect__stop_words=None, vect__tokenizer=<function tokenizer at 0x000001ECD94FB6D0>, vect__use_idf=True; total time=   0.0s
[CV] END clf__C=1.0, clf__penalty=l2, vect__ngram_range=(1, 1), vect__norm=None, vect__stop_words=None, vect__tokenizer=<function tokenizer at 0x000001ECD94FB6D0>, vect__use_idf=True; total time=   0.0s
[CV] END clf__C=1.0, clf__penalty=l2, vect__ngram_range=(1, 1), vect__norm=None, vect__stop_words=None, vect__tokenizer=<function tokenizer at 0x000001ECD94FB6D0>, vect__use_idf=True; tota

[CV] END clf__C=10.0, clf__penalty=l2, vect__ngram_range=(1, 1), vect__norm=None, vect__stop_words=None, vect__tokenizer=<function tokenizer at 0x000001ECD94FB6D0>, vect__use_idf=True; total time=   0.1s
[CV] END clf__C=10.0, clf__penalty=l2, vect__ngram_range=(1, 1), vect__norm=None, vect__stop_words=None, vect__tokenizer=<function tokenizer at 0x000001ECD94FB6D0>, vect__use_idf=True; total time=   0.1s
[CV] END clf__C=10.0, clf__penalty=l2, vect__ngram_range=(1, 1), vect__norm=None, vect__stop_words=None, vect__tokenizer=<function tokenizer at 0x000001ECD94FB6D0>, vect__use_idf=True; total time=   0.1s
[CV] END clf__C=10.0, clf__penalty=l2, vect__ngram_range=(1, 1), vect__norm=None, vect__stop_words=None, vect__tokenizer=<function tokenizer at 0x000001ECD94FB6D0>, vect__use_idf=True; total time=   0.0s
[CV] END clf__C=10.0, clf__penalty=l2, vect__ngram_range=(1, 1), vect__norm=None, vect__stop_words=None, vect__tokenizer=<function tokenizer at 0x000001ECD94FB6D0>, vect__use_idf=True;

60 fits failed out of a total of 160.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
60 fits failed with the following error:
Traceback (most recent call last):
  File "C:\Users\Admin\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.10_qbz5n2kfra8p0\LocalCache\local-packages\Python310\site-packages\sklearn\model_selection\_validation.py", line 686, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "C:\Users\Admin\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.10_qbz5n2kfra8p0\LocalCache\local-packages\Python310\site-packages\sklearn\pipeline.py", line 378, in fit
    Xt = self._fit(X, y, **fit_params_steps)
  File "C:\Users\Admin\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.10_qbz5n

Test Accuracy: 0.505
