In [None]:
import nltk
nltk.download(['punkt', 'wordnet', 'averaged_perceptron_tagger'])

import re
import numpy as np
import pandas as pd
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.pipeline import Pipeline, FeatureUnion

# from custom_transformer import StartingVerbExtractor

# Pattern that tokenize() uses to find http/https URLs in message text.
# Written as a raw string because the pattern contains "\(" and "\)", which are
# invalid escape sequences in an ordinary string literal and trigger a warning.
# The resulting pattern text is unchanged.
url_regex = r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+'

  url_regex = 'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+'
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


# 1. What is CountVectorizer?

CountVectorizer converts a collection of text documents into a matrix of token counts:

- tokenizes (splits text into words)

- builds a vocabulary

- counts how often each word appears

- outputs a document-term matrix

In [None]:
# Example text documents: three short sentences that share several words
documents = [
    "I love machine learning",
    "Machine learning is awesome",
    "I love programming and learning"
]

# 1. Create the vectorizer with its default settings
vectorizer = CountVectorizer()

# 2. Fit and transform: learn the vocabulary from `documents` and count each
#    term per document in a single call
X = vectorizer.fit_transform(documents)

In [None]:
# 3. Show vocabulary: the feature names learned during fitting, one per matrix column
print("Vocabulary:", vectorizer.get_feature_names_out())

Vocabulary: ['and' 'awesome' 'is' 'learning' 'love' 'machine' 'programming']


In [None]:
# 4. Show document-term matrix: one row per document, one column per vocabulary
#    term; .toarray() converts X to a dense array for printing
print("Document-term matrix:\n", X.toarray())

Document-term matrix:
 [[0 0 0 1 1 1 0]
 [0 1 1 1 0 1 0]
 [1 0 0 1 1 0 1]]


# 2. What TfidfTransformer Does

TfidfTransformer converts a raw count matrix (produced by CountVectorizer or any count-based representation) into TF-IDF scores.

TF (Term Frequency): how often a word appears in a document

IDF (Inverse Document Frequency): how “unique” a word is across documents

TF-IDF = TF × IDF → higher for important/rare words, lower for very common words

In [None]:
docs = [
    "the cat sat on the mat",
    "the dog sat on the log",
]

# Fit both stages up front: raw token counts first, then the TF-IDF weights
# derived from those counts.
count_vect = CountVectorizer()
X_counts = count_vect.fit_transform(docs)
tfidf = TfidfTransformer()
X_tfidf = tfidf.fit_transform(X_counts)

# Report the results in the same order the stages ran.
print("Count matrix:\n", X_counts.toarray())
print("Vocabulary:", count_vect.vocabulary_)
print("\nTF-IDF matrix:\n", X_tfidf.toarray())


Count matrix:
 [[1 0 0 1 1 1 2]
 [0 1 1 0 1 1 2]]
Vocabulary: {'the': 6, 'cat': 0, 'sat': 5, 'on': 4, 'mat': 3, 'dog': 1, 'log': 2}

TF-IDF matrix:
 [[0.44554752 0.         0.         0.44554752 0.31701073 0.31701073
  0.63402146]
 [0.         0.44554752 0.44554752 0.         0.31701073 0.31701073
  0.63402146]]


# 3. Machine Learning Workflow

In [None]:
def load_data(data_file='corporate_messaging.csv'):
    """Load the corporate messaging dataset and split it into messages and labels.

    Only rows whose ``category:confidence`` equals 1 and whose ``category`` is
    not ``'Exclude'`` are kept.

    Args:
        data_file: Path to the CSV file, read with latin-1 encoding. Defaults to
            ``'corporate_messaging.csv'`` so existing ``load_data()`` calls
            behave as before.

    Returns:
        tuple: ``(X, y)`` NumPy arrays holding the ``text`` column and the
        ``category`` column of the rows that were kept.
    """
    df = pd.read_csv(data_file, encoding='latin-1')
    keep = (df['category:confidence'] == 1) & (df['category'] != 'Exclude')
    df = df[keep]
    # Explicit column keys avoid clashing with DataFrame attributes/methods.
    X = df['text'].values
    y = df['category'].values
    return X, y


In [None]:
def tokenize(text):
    """Split a message into normalised, lemmatized tokens.

    URLs matching ``url_regex`` are replaced by the literal token
    ``"urlplaceholder"`` before the text is word-tokenized. Each token is then
    lowercased, stripped of surrounding whitespace and lemmatized.

    Args:
        text: The raw message string.

    Returns:
        list[str]: The cleaned tokens, in their original order.
    """
    # Substitute at the match positions. Replacing each found URL with
    # str.replace() corrupts the text when one URL is a prefix of another
    # (e.g. "http://a.co" and "http://a.com" would leave "urlplaceholderm").
    text = re.sub(url_regex, "urlplaceholder", text)

    lemmatizer = WordNetLemmatizer()

    # Normalise case *before* lemmatizing so that capitalised tokens
    # (e.g. "Dogs") are looked up in their lowercase form instead of being
    # passed through unchanged.
    return [
        lemmatizer.lemmatize(tok.lower().strip())
        for tok in word_tokenize(text)
    ]

In [None]:
def display_results(y_test, y_pred):
    """Print the class labels, confusion matrix and accuracy of a prediction run.

    Args:
        y_test: True labels (array-like).
        y_pred: Predicted labels (array-like, same length as ``y_test``).

    Returns:
        None. The results are written to standard output.
    """
    # Coerce to arrays so the element-wise comparison below also works when
    # plain lists are passed in.
    y_test = np.asarray(y_test)
    y_pred = np.asarray(y_pred)

    # Take the labels from both the true and the predicted values. Using only
    # the predicted labels would drop every class the model never predicted
    # from the confusion matrix, so its total would no longer match the number
    # of samples that the accuracy is computed over.
    labels = np.unique(np.concatenate((y_test, y_pred)))
    confusion_mat = confusion_matrix(y_test, y_pred, labels=labels)
    accuracy = (y_pred == y_test).mean()

    print("Labels:", labels)
    print("Confusion Matrix:\n", confusion_mat)
    print("Accuracy:", accuracy)

In [None]:
def main():
    """Train and evaluate the classifier by running every stage by hand.

    Loads the data, splits it, fits the count vectorizer, the TF-IDF
    transformer and the random forest on the training split, then prints the
    evaluation of the predictions made for the test split.
    """
    messages, categories = load_data()
    X_train, X_test, y_train, y_test = train_test_split(messages, categories)

    vect = CountVectorizer(tokenizer=tokenize)
    tfidf = TfidfTransformer()
    clf = RandomForestClassifier()

    # Training: both transformers are fitted on the training split only.
    train_features = tfidf.fit_transform(vect.fit_transform(X_train))
    clf.fit(train_features, y_train)

    # Prediction: reuse the fitted transformers (transform, not fit_transform).
    test_features = tfidf.transform(vect.transform(X_test))
    y_pred = clf.predict(test_features)

    display_results(y_test, y_pred)

# 4. Build a pipeline

In [None]:
def main():
    """Train and evaluate the classifier using a single scikit-learn Pipeline.

    The pipeline chains the count vectorizer, the TF-IDF transformer and the
    random forest, so one ``fit`` and one ``predict`` call replace the manual
    per-stage calls.
    """
    X, y = load_data()
    X_train, X_test, y_train, y_test = train_test_split(X, y)

    # create a pipeline:
    pipeline = Pipeline([
        ('vect', CountVectorizer(tokenizer=tokenize)),
        ('tfidf', TfidfTransformer()),
        ('clf', RandomForestClassifier())
    ])

    # train a classifier:
    pipeline.fit(X_train, y_train)

    # predict (the result must be bound to the same name that is displayed
    # below; it was previously stored as `y_preds`, causing a NameError):
    y_pred = pipeline.predict(X_test)

    # display results
    display_results(y_test, y_pred)

# 5. Creating Custom Transformer

Remember, all estimators have a fit method, and since this is a transformer, it also has a transform method.

- FIT METHOD: This takes in a 2d array X for the feature data and a 1d array y for the target labels. Inside the fit method, we simply return self. This allows us to chain methods together, since the result of calling fit on the transformer is still the transformer object. This method is required to be compatible with scikit-learn.

- TRANSFORM METHOD: The transform method is where we include the code that, well, transforms the data. In this case, we return the data in X multiplied by 10. This transform method also takes a 2d array X.


In [None]:
from sklearn.base import BaseEstimator, TransformerMixin

class TenMultiplier(BaseEstimator, TransformerMixin):
    """Toy transformer that multiplies its input by ten."""

    def __init__(self):
        """Nothing to configure: the transformer has no parameters."""

    def fit(self, X, y=None):
        """Learn nothing and return the transformer itself so calls can be chained."""
        return self

    def transform(self, X):
        """Return ``X`` multiplied by 10."""
        scaled = X * 10
        return scaled

In [None]:
# Try the transformer on a small array. No fit() call is made here because
# TenMultiplier.fit learns nothing.
multiplier = TenMultiplier()
# NOTE: this rebinds X, replacing the document-term matrix created in the
# CountVectorizer example cell.
X = np.array([6, 3, 7, 4, 7])
# Last expression of the cell: the transformed array is shown as the cell output.
multiplier.transform(X)

array([60, 30, 70, 40, 70])

In [None]:
class StartingVerbExtractor(BaseEstimator, TransformerMixin):
    """Transformer producing one boolean feature per message.

    The feature is True when any sentence of the message starts with a token
    tagged ``VB``/``VBP`` or with the token "RT" (compared case-insensitively).
    """

    def starting_verb(self, text):
        """Return True if any sentence in ``text`` starts with a verb or "RT".

        Args:
            text: A single message string.

        Returns:
            bool: True on the first sentence whose leading token matches,
            otherwise False.
        """
        for sentence in nltk.sent_tokenize(text):
            pos_tags = nltk.pos_tag(tokenize(sentence))
            # A sentence can produce no tokens at all; indexing pos_tags[0]
            # would then raise IndexError, so skip it.
            if not pos_tags:
                continue
            first_word, first_tag = pos_tags[0]
            # tokenize() lowercases every token, so a literal comparison with
            # 'RT' could never match; compare case-insensitively instead.
            if first_tag in ('VB', 'VBP') or first_word.lower() == 'rt':
                return True
        return False

    def fit(self, X, y=None):
        """Learn nothing and return the transformer itself."""
        return self

    def transform(self, X):
        """Apply ``starting_verb`` to every message in ``X``.

        Returns:
            pandas.DataFrame: A single column holding one boolean per message.
        """
        X_tagged = pd.Series(X).apply(self.starting_verb)
        return pd.DataFrame(X_tagged)

# 6. Feature Union

In [None]:
def model_pipeline():
    """Build the classification pipeline that combines two feature sources.

    A FeatureUnion joins the TF-IDF text features with the starting-verb flag;
    the combined features feed a random forest classifier.

    Returns:
        Pipeline: The unfitted pipeline with steps ``'features'`` and ``'clf'``.
    """
    # Bag-of-words counts re-weighted by TF-IDF.
    text_pipeline = Pipeline([
        ('vect', CountVectorizer(tokenizer=tokenize)),
        ('tfidf', TfidfTransformer()),
    ])

    # Both branches receive the same raw messages; their outputs are joined
    # column-wise by the FeatureUnion.
    features = FeatureUnion([
        ('text_pipeline', text_pipeline),
        ('starting_verb', StartingVerbExtractor()),
    ])

    return Pipeline([
        ('features', features),
        ('clf', RandomForestClassifier()),
    ])


In [None]:
def main():
    """Train the FeatureUnion pipeline and print its evaluation on the test split."""
    X, y = load_data()
    X_train, X_test, y_train, y_test = train_test_split(X, y)

    # Pipeline.fit returns the fitted pipeline, so building and training can
    # be written as one expression.
    model = model_pipeline().fit(X_train, y_train)

    display_results(y_test, model.predict(X_test))

# 7. Grid Search Pipeline

In [None]:
def build_model():
    """Wrap the FeatureUnion pipeline in a grid search over its hyperparameters.

    Returns:
        GridSearchCV: An unfitted grid search whose estimator is the pipeline
        returned by ``model_pipeline()``.
    """
    # Reuse the pipeline built by model_pipeline() rather than repeating its
    # definition. The parameter names below are derived from that pipeline's
    # step names ('features', 'text_pipeline', 'vect', 'tfidf', 'clf').
    pipeline = model_pipeline()

    # NOTE: this grid has 2 * 3 * 3 * 2 * 3 * 3 * 3 = 972 combinations, each of
    # which is fitted once per cross-validation fold, so expect a long run.
    parameters = {
        'features__text_pipeline__vect__ngram_range': ((1, 1), (1, 2)),
        'features__text_pipeline__vect__max_df': (0.5, 0.75, 1.0),
        'features__text_pipeline__vect__max_features': (None, 5000, 10000),
        'features__text_pipeline__tfidf__use_idf': (True, False),
        'clf__n_estimators': [50, 100, 200],
        'clf__min_samples_split': [2, 3, 4],
        'features__transformer_weights': (
            {'text_pipeline': 1, 'starting_verb': 0.5},
            {'text_pipeline': 0.5, 'starting_verb': 1},
            {'text_pipeline': 0.8, 'starting_verb': 1},
        )
    }

    # GridSearchCV comes from sklearn.model_selection (imported at the top of
    # the notebook).
    cv = GridSearchCV(pipeline, param_grid=parameters)

    return cv

In [None]:
def main():
    """Run the grid search, evaluate the best model and report its parameters."""
    X, y = load_data()
    X_train, X_test, y_train, y_test = train_test_split(X, y)

    model = build_model()
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)

    # display_results takes exactly (y_test, y_pred); passing the model as an
    # extra first argument raised a TypeError.
    display_results(y_test, y_pred)
    # Report what the search selected, which is what passing the model was
    # presumably meant to achieve.
    print("\nBest Parameters:", model.best_params_)