In [12]:
import string

import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import classification_report
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline

# Fetch every NLTK resource this notebook uses so that it also runs on a
# machine where the data has not been installed yet.
nltk.download('punkt')      # tokenizer models used by word_tokenize
nltk.download('wordnet')    # lexical database used by WordNetLemmatizer
nltk.download('omw-1.4')    # downloaded alongside wordnet in the original cell
# The stop-word lists are read by stopwords.words('english'); without this
# download that call raises LookupError on a fresh environment.
nltk.download('stopwords')
# NOTE(review): newer NLTK releases may load a 'punkt_tab' resource inside
# word_tokenize instead of 'punkt' -- confirm against the installed version.

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


In [13]:
# Toy dataset: every row pairs one sentence with its integer class label.
_labelled_sentences = (
    ("This is the first sentence.", 0),
    ("This is the second sentence.", 0),
    ("This is the third sentence.", 1),
    ("This is the fourth sentence.", 1),
    ("This is the fifth sentence.", 1),
)

# Split the table into the two parallel lists used by the rest of the notebook.
corpus = [sentence for sentence, _ in _labelled_sentences]
labels = [label for _, label in _labelled_sentences]

In [14]:
# Pipeline step: raw document strings -> lists of word tokens
class TokenizeTransformer(BaseEstimator, TransformerMixin):
    """Stateless transformer that splits each document into word tokens."""

    def fit(self, X, y=None):
        """Learn nothing; return this transformer unchanged."""
        return self

    def transform(self, X):
        """Run ``word_tokenize`` on every document in ``X``.

        Returns a list holding one token list per document, in input order.
        """
        return list(map(word_tokenize, X))

In [15]:
# Pipeline step: token lists -> token lists without English stop words
class StopWordsTransformer(BaseEstimator, TransformerMixin):
    """Transformer that drops English stop words from tokenized documents."""

    def __init__(self):
        # Held as a set so each membership check in transform() is a hash lookup.
        self.stop_words = set(stopwords.words('english'))

    def fit(self, X, y=None):
        """Learn nothing; return this transformer unchanged."""
        return self

    def transform(self, X):
        """Filter every token list in ``X`` against ``self.stop_words``.

        A token is dropped when its lower-cased form is in the stop-word set;
        surviving tokens keep their original casing and order.
        """
        filtered_docs = []
        for tokens in X:
            kept = [token for token in tokens if token.lower() not in self.stop_words]
            filtered_docs.append(kept)
        return filtered_docs

In [16]:
# Pipeline step: token lists -> lists of lemmatized tokens
class LemmatizeTransformer(BaseEstimator, TransformerMixin):
    """Transformer that lemmatizes every token with a ``WordNetLemmatizer``."""

    def __init__(self):
        self.lemmatizer = WordNetLemmatizer()

    def fit(self, X, y=None):
        """Learn nothing; return this transformer unchanged."""
        return self

    def transform(self, X):
        """Lemmatize each token of each document in ``X``.

        Returns a list of token lists with the same nesting and order as the
        input.
        """
        lemmatize = self.lemmatizer.lemmatize
        return [list(map(lemmatize, tokens)) for tokens in X]

In [17]:
# Pipeline step: token lists -> single space-separated strings
class JoinWordsTransformer(BaseEstimator, TransformerMixin):
    """Stateless transformer that glues each token list back into one string."""

    def fit(self, X, y=None):
        """Learn nothing; return this transformer unchanged."""
        return self

    def transform(self, X):
        """Join the tokens of every document in ``X`` with a single space.

        Returns a list with one string per document, in input order.
        """
        return list(map(' '.join, X))

In [18]:
# Assemble the text classifier: the four custom preprocessing steps run first,
# then TF-IDF features are fed to a multinomial naive Bayes model.
pipeline_steps = [
    ('tokenize', TokenizeTransformer()),
    ('remove_stop_words', StopWordsTransformer()),
    ('lemmatize', LemmatizeTransformer()),
    ('join_words', JoinWordsTransformer()),
    ('vectorize', TfidfVectorizer()),
    ('classify', MultinomialNB()),
]
pipeline = Pipeline(steps=pipeline_steps)

In [19]:
# Fit all preprocessing steps and the classifier on the labelled corpus
pipeline.fit(X=corpus, y=labels)

Pipeline(steps=[('tokenize', TokenizeTransformer()),
                ('remove_stop_words', StopWordsTransformer()),
                ('lemmatize', LemmatizeTransformer()),
                ('join_words', JoinWordsTransformer()),
                ('vectorize', TfidfVectorizer()),
                ('classify', MultinomialNB())])

In [20]:
# Score the fitted pipeline on the same documents it was trained on, so the
# report below describes training fit rather than performance on unseen text.
predictions = pipeline.predict(corpus)
report_text = classification_report(y_true=labels, y_pred=predictions)
print(report_text)

              precision    recall  f1-score   support

           0       1.00      1.00      1.00         2
           1       1.00      1.00      1.00         3

    accuracy                           1.00         5
   macro avg       1.00      1.00      1.00         5
weighted avg       1.00      1.00      1.00         5

