# Building a Machine Learning Pipeline using Sklearn

The purpose of this notebook is to build a scikit-learn pipeline that makes scoring new data much easier: new articles have to undergo exactly the same preprocessing and modeling steps as the training data.

## Import Modules

In [1]:
# import modules
from pathlib import Path
import numpy as np
import pandas as pd
from nltk import word_tokenize
from sklearn.model_selection import train_test_split          
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation
import pickle
from sklearn.pipeline import Pipeline

## Set Path Variables

In [2]:
# directories used by the notebook, given relative to the current working directory
DATAPATH = Path("data")
MODELPATH = Path("model")

# Utility Functions

## Import Data

In [4]:
# load the article corpus from the data directory and preview the first rows
df = pd.read_csv(DATAPATH.joinpath("npr.csv"))
df.head(5)

Unnamed: 0,Article
0,"In the Washington of 2016, even when the polic..."
1,Donald Trump has used Twitter — his prefe...
2,Donald Trump is unabashedly praising Russian...
3,"Updated at 2:50 p. m. ET, Russian President Vl..."
4,"From photography, illustration and video, to d..."


## Define Custom Tokenizer Class 

In [5]:
# callable tokenizer handed to the CountVectorizer instance via its `tokenizer` argument
class LemmaTokenizer:
    """Tokenize a text and reduce every token to its WordNet lemma.

    Instances are callable: calling one returns a list of lemmatized tokens,
    keeping only those lemmas that consist solely of alphabetic characters.
    """

    def __init__(self):
        # one lemmatizer instance is created up front and reused for every call
        self.wnl = WordNetLemmatizer()

    def __call__(self, articles):
        """Return the alphabetic lemmas of `articles`.

        `articles` is passed straight to `word_tokenize`; despite the plural
        name it is presumably the text of a single document.
        """
        # lemmatize first, then drop lemmas that are not purely alphabetic
        lemmas = (self.wnl.lemmatize(token) for token in word_tokenize(articles))
        return [lemma for lemma in lemmas if lemma.isalpha()]

## Define Preprocessing and Modeling Pipeline

In [6]:
# two-step pipeline: bag-of-words counts ('cv') followed by an LDA topic model ('lda')
# NOTE(review): stop words are filtered after the custom tokenizer has run, so
# lemmatized forms that are not in the stop list (e.g. 'ha', 'wa') can survive.
pipe = Pipeline(steps=[
    (
        'cv',
        CountVectorizer(
            tokenizer=LemmaTokenizer(),
            lowercase=True,
            stop_words='english',
            ngram_range=(1, 1),
            min_df=.05,
            max_df=.9,
            max_features=1000,
        ),
    ),
    (
        'lda',
        LatentDirichletAllocation(
            n_components=10,
            random_state=123,
            n_jobs=-1,
        ),
    ),
])

## Train and Test Split
We use a train/test split to fit the model on the training data and keep the test data for later, purely to score "new" data. The test data is of course not used to evaluate the model: this is an unsupervised setting, so the ground truth is unknown.

In [37]:
# hold out 33% of the rows as "new" data for scoring; the fixed seed makes the split repeatable
X_train, X_test = train_test_split(df, random_state=123, test_size=0.33)

In [33]:
# preview the training split
X_train.head(5)

Unnamed: 0,Article
8854,"Welcome to the third installment of Read, Watc..."
3298,Milwaukee has the nation’s publicly funded v...
8716,Donald Trump has released the names of his eco...
2246,South Korea and the U. S. confirm North Korea ...
7008,A international convoy carrying desperately ...


# Fit the Pipeline 

In [9]:
# fit every step of the pipeline on the training articles
pipe.fit(X_train['Article'])



Pipeline(steps=[('cv',
                 CountVectorizer(max_df=0.9, max_features=1000, min_df=0.05,
                                 stop_words='english',
                                 tokenizer=<__main__.LemmaTokenizer object at 0x7f1819cedfd0>)),
                ('lda',
                 LatentDirichletAllocation(n_jobs=-1, random_state=123))])

# Evaluate the functionality of the fitted pipeline and all its components

In [10]:
# check the whole pipeline on a single article...we expect an array with topic probabilities
# NOTE: this is a label lookup (the row whose index label is 0), which is not
# necessarily the positionally first row of the shuffled training split
pipe.transform([X_train['Article'].loc[0]])

array([[2.62533055e-04, 4.84698064e-02, 2.62549169e-04, 2.62537356e-04,
        2.62571962e-04, 2.06425127e-02, 2.14062831e-01, 6.26977393e-01,
        8.85347242e-02, 2.62541136e-04]])

In [11]:
# check the count vectorizer step on its own: turn the article with index label 0
# into a document-term matrix
dtm = pipe.named_steps['cv'].transform([X_train['Article'].loc[0]])
dtm

<1x982 sparse matrix of type '<class 'numpy.int64'>'
	with 200 stored elements in Compressed Sparse Row format>

In [12]:
# get the last ten terms of the fitted vocabulary
# get_feature_names() is deprecated/removed in newer scikit-learn releases;
# get_feature_names_out() is its replacement and returns an array, so it is
# converted back to a plain list for display
pipe['cv'].get_feature_names_out()[-10:].tolist()



['writer',
 'writes',
 'writing',
 'written',
 'wrong',
 'wrote',
 'year',
 'yes',
 'york',
 'young']

In [13]:
# check the lda step on its own: feed it the document-term matrix built by the vectorizer
pipe.named_steps['lda'].transform(dtm)

array([[2.62533055e-04, 4.84698064e-02, 2.62549169e-04, 2.62537356e-04,
        2.62571962e-04, 2.06425127e-02, 2.14062831e-01, 6.26977393e-01,
        8.85347242e-02, 2.62541136e-04]])

In [14]:
# inspect the fit result, i.e. the topic-term weights (at most the first ten rows)
pipe.named_steps['lda'].components_[:10]

array([[1.30561293e+02, 3.46584736e+02, 9.31554455e+01, ...,
        4.71670441e+01, 7.98119516e+01, 1.55920483e+02],
       [3.74494538e+00, 2.36232207e+02, 2.35982711e+01, ...,
        3.58311082e+01, 4.01715109e+01, 1.06517833e+03],
       [5.91813424e+01, 1.92565675e+02, 2.59814395e+02, ...,
        6.02530942e-01, 3.76251999e+02, 1.17452760e-01],
       ...,
       [6.01144189e+01, 1.56361991e+02, 3.17901120e+01, ...,
        7.35209429e+01, 1.03345443e+02, 7.36526349e+01],
       [6.27781282e+01, 1.08325839e+02, 9.28736704e+01, ...,
        1.23048859e+01, 3.06674595e+02, 1.47377892e+01],
       [8.35393196e+01, 3.44366849e+02, 1.96710825e+01, ...,
        2.95118344e+02, 6.91488465e+02, 6.82621916e+02]])

## Score New Data 

In [38]:
# preview the held-out split
X_test.head(5)

Unnamed: 0,Article
6807,"Hamilton, the Broadway musical about ”the scra..."
4640,Tennessee Ernie Ford was fed up with the trapp...
3160,"When Yomi Wrong was born in 1972, doctors told..."
6476,"As laptops become smaller and more ubiquitous,..."
9379,Republican presidential nominee Donald Trump i...


In [39]:
# reset index so the row labels run from 0 and coincide with the row positions;
# a new frame is bound instead of mutating the split result in place
X_test = X_test.reset_index(drop=True)
X_test.head()

Unnamed: 0,Article
0,"Hamilton, the Broadway musical about ”the scra..."
1,Tennessee Ernie Ford was fed up with the trapp...
2,"When Yomi Wrong was born in 1972, doctors told..."
3,"As laptops become smaller and more ubiquitous,..."
4,Republican presidential nominee Donald Trump i...


In [50]:
def get_article_topic(pipeline_obj: "Pipeline", article: pd.Series, article_num: int):
    """Return the number of the most probable topic for one article.

    pipeline_obj: fitted pipeline; its ``transform`` is called with a
        one-element list holding the article text.
    article: collection of article texts, read as ``article[article_num]``
        (for a pandas Series with an integer index this is a label lookup).
    article_num: key of the article to score.

    Returns the position of the largest value in the transformed output.
    """
    # use the pipeline that was passed in, not a global of the notebook
    topic_probs = pipeline_obj.transform([article[article_num]])
    return np.argmax(topic_probs)

def get_topic_terms(lda_obj, cv_obj, topic_num=0, top_n=3):
    """Return the top_n highest-weighted terms of one topic.

    lda_obj: the fitted latent Dirichlet allocation object (``components_`` is read).
    cv_obj: the fitted count-vectorizer object (``get_feature_names_out()`` is read).
    topic_num: the topic number, i.e. the row of ``components_`` to inspect.
    top_n: how many terms to return; values <= 0 give an empty list.

    The terms are ordered by increasing weight, so the strongest term comes last.
    """
    if top_n <= 0:
        # a slice of [-0:] would select every term instead of none
        return []
    # fetch the vocabulary once instead of once per returned term
    feature_names = cv_obj.get_feature_names_out()
    top_indices = lda_obj.components_[topic_num].argsort()[-top_n:]
    return [feature_names[ind] for ind in top_indices]

In [53]:
# find the dominant topic of one test article and print that topic's top terms
article_num = 0
top_n = 10
topic_num = get_article_topic(pipe, X_test['Article'], article_num)
top_terms = get_topic_terms(pipe['lda'], pipe['cv'], topic_num=topic_num, top_n=top_n)
print('The main topic for article #{} is topic #{}'.format(article_num, topic_num))
print('The top {} terms for topic #{} are: \n {}'.format(top_n, topic_num, top_terms))
print('\n')

The main topic for article #0 is topic #9
The top 10 terms for topic #9 are: 
 ['know', 'ha', 'people', 'year', 'say', 'time', 'just', 'like', 't', 'wa']


