In [7]:
import nltk
# Fetch the NLTK data packages used further down in this notebook:
#   punkt / punkt_tab -> word_tokenize. NLTK >= 3.9 looks up 'punkt_tab' instead
#                        of 'punkt'; both are requested so the notebook runs on
#                        old and new NLTK releases alike.
#   stopwords         -> stopwords.words('english')
#   wordnet, omw-1.4  -> presumably both needed by WordNetLemmatizer
# nltk.download reports (rather than raises) when a package cannot be fetched.
for _nltk_resource in ('punkt', 'punkt_tab', 'stopwords', 'wordnet', 'omw-1.4'):
    nltk.download(_nltk_resource)
import string
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


In [8]:
# Load the corpus and labels
# Every sentence is written next to its class label so the two lists below
# cannot drift out of alignment. As paired here: 1 goes with the "positive"
# sentences, 0 with the "negative" ones and 2 with the "neutral" one.
_labelled_sentences = [
    ("This is a positive sentence.", 1),
    ("This is a negative sentence.", 0),
    ("This is a neutral sentence.", 2),
    ("This is a positive sentence.", 1),
    ("This is a negative sentence.", 0),
]
corpus = [sentence for sentence, _ in _labelled_sentences]
labels = [label for _, label in _labelled_sentences]

In [9]:
# Helper: split each document into a list of word tokens (wrapped by TokenizeTransformer below)
def tokenize_transformer(documents):
    """Tokenize every document with NLTK's ``word_tokenize``.

    Args:
        documents: Iterable of documents; each item is passed unchanged to
            ``word_tokenize`` (so it is expected to be a text string).

    Returns:
        A list with one entry per document, each entry being whatever
        ``word_tokenize`` returned for that document.
    """
    tokenized = []
    for text in documents:
        tokenized.append(word_tokenize(text))
    return tokenized

In [10]:
# Helper: drop English stop words from each tokenized document (wrapped by StopWordsTransformer below)
def stop_words_transformer(documents):
    """Remove English stop words from already-tokenized documents.

    The stop-word list is NLTK's ``stopwords.words('english')``. A token is
    dropped when its lower-cased form is in that list; tokens that are kept
    retain their original casing.

    Args:
        documents: Iterable of token sequences (one sequence per document).

    Returns:
        A list of token lists, in the same order as ``documents``.
    """
    english_stop_words = set(stopwords.words('english'))
    filtered_documents = []
    for tokens in documents:
        kept = [token for token in tokens if token.lower() not in english_stop_words]
        filtered_documents.append(kept)
    return filtered_documents

In [11]:
# Helper: lemmatize every token of each tokenized document (wrapped by LemmatizeTransformer below)
def lemmatize_transformer(documents):
    """Lemmatize each token of each tokenized document.

    One ``WordNetLemmatizer`` is created per call and its ``lemmatize`` method
    is applied to every token with default arguments.

    Args:
        documents: Iterable of token sequences (one sequence per document).

    Returns:
        A list of lists holding the lemmatized tokens, in input order.
    """
    to_lemma = WordNetLemmatizer().lemmatize
    return [list(map(to_lemma, tokens)) for tokens in documents]

In [12]:
# Helper: join each document's tokens back into a single space-separated string (wrapped by JoinWordsTransformer below)
def join_words_transformer(documents):
    """Join the tokens of each document into one space-separated string.

    Args:
        documents: Iterable of token sequences; every token must be a string.

    Returns:
        A list of strings, one per document, in input order.
    """
    join_with_space = ' '.join
    return list(map(join_with_space, documents))

In [13]:
# Wrap the functions as transformers
class TokenizeTransformer(BaseEstimator, TransformerMixin):
    """Pipeline step that tokenizes documents via ``tokenize_transformer``.

    The step has no parameters and stores nothing during ``fit``.
    """

    def fit(self, X, y=None):
        """Learn nothing; return ``self`` so the step can sit in a Pipeline."""
        return self

    def transform(self, X):
        """Return ``tokenize_transformer(X)``: one token list per document."""
        return tokenize_transformer(X)

In [14]:
class StopWordsTransformer(BaseEstimator, TransformerMixin):
    """Pipeline step that filters stop words via ``stop_words_transformer``.

    The step has no parameters and stores nothing during ``fit``.
    """

    def fit(self, X, y=None):
        """Learn nothing; return ``self`` so the step can sit in a Pipeline."""
        return self

    def transform(self, X):
        """Return ``stop_words_transformer(X)`` for the tokenized documents ``X``."""
        return stop_words_transformer(X)

In [15]:

class LemmatizeTransformer(BaseEstimator, TransformerMixin):
    """Pipeline step that lemmatizes tokens via ``lemmatize_transformer``.

    The step has no parameters and stores nothing during ``fit``.
    """

    def fit(self, X, y=None):
        """Learn nothing; return ``self`` so the step can sit in a Pipeline."""
        return self

    def transform(self, X):
        """Return ``lemmatize_transformer(X)`` for the tokenized documents ``X``."""
        return lemmatize_transformer(X)

In [16]:
class JoinWordsTransformer(BaseEstimator, TransformerMixin):
    """Pipeline step that re-joins tokens via ``join_words_transformer``.

    The step has no parameters and stores nothing during ``fit``.
    """

    def fit(self, X, y=None):
        """Learn nothing; return ``self`` so the step can sit in a Pipeline."""
        return self

    def transform(self, X):
        """Return ``join_words_transformer(X)``: one string per document."""
        return join_words_transformer(X)

In [17]:
# Define the pipeline
pipeline = Pipeline(steps=[
    # Text preprocessing: raw strings -> token lists -> cleaned strings.
    ('tokenize', TokenizeTransformer()),
    ('remove_stop_words', StopWordsTransformer()),
    ('lemmatize', LemmatizeTransformer()),
    ('join_words', JoinWordsTransformer()),
    # Feature extraction and classification on the cleaned strings.
    ('vectorize', TfidfVectorizer()),
    ('classify', MultinomialNB()),
])

In [18]:
# Train the model
# Fit every step in order on the raw sentences; the fitted pipeline is the
# cell's last expression, so the notebook displays its repr.
pipeline.fit(X=corpus, y=labels)

Pipeline(steps=[('tokenize', TokenizeTransformer()),
                ('remove_stop_words', StopWordsTransformer()),
                ('lemmatize', LemmatizeTransformer()),
                ('join_words', JoinWordsTransformer()),
                ('vectorize', TfidfVectorizer()),
                ('classify', MultinomialNB())])

In [19]:
# Evaluate the model
# NOTE: `corpus` is the same data the pipeline was fitted on, so the report
# below is an in-sample score and says nothing about generalization.
# TODO: evaluate on held-out data once more samples are available.
predictions = pipeline.predict(corpus)
print(classification_report(y_true=labels, y_pred=predictions))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00         2
           1       1.00      1.00      1.00         2
           2       1.00      1.00      1.00         1

    accuracy                           1.00         5
   macro avg       1.00      1.00      1.00         5
weighted avg       1.00      1.00      1.00         5

