# Building a Machine Learning Pipeline using Sklearn

The purpose of this notebook is to build a scikit-learn pipeline that makes scoring new data much easier: new articles have to undergo the same preprocessing and modeling steps as the training data, and a fitted pipeline applies all of them in a single call.

In [1]:
# import modules
from pathlib import Path
import numpy as np
import pandas as pd
from nltk import word_tokenize          
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation
import pickle
from sklearn.pipeline import Pipeline

In [2]:
# set path variables (relative to the notebook's working directory)
# DATAPATH: folder the input CSV is read from
# MODELPATH: presumably the folder for saved model artifacts -- it is not referenced in the cells shown here
DATAPATH = Path('data')
MODELPATH = Path('model')

In [3]:
# load the article data set from DATAPATH/npr.csv and preview the first rows;
# the text of each article lives in the 'Article' column used by the later cells
df = pd.read_csv(DATAPATH / "npr.csv")
df.head()

Unnamed: 0,Article
0,"In the Washington of 2016, even when the polic..."
1,Donald Trump has used Twitter — his prefe...
2,Donald Trump is unabashedly praising Russian...
3,"Updated at 2:50 p. m. ET, Russian President Vl..."
4,"From photography, illustration and video, to d..."


In [4]:
# write custom tokenizer class to be passed to CountVectorizer instance 
class LemmaTokenizer:
    """Callable tokenizer that yields alphabetic lemmas for one document.

    An instance is meant to be handed to ``CountVectorizer(tokenizer=...)``.
    Calling it splits the text with ``word_tokenize``, lemmatizes every token
    with a ``WordNetLemmatizer`` and drops each lemma that is not purely
    alphabetic (numbers, punctuation, mixed tokens).
    """

    def __init__(self):
        # One lemmatizer per tokenizer instance, reused for every document.
        self.wnl = WordNetLemmatizer()

    def __call__(self, articles):
        """Return the list of alphabetic lemmas found in the text ``articles``."""
        to_lemma = self.wnl.lemmatize
        kept_lemmas = []
        for raw_token in word_tokenize(articles):
            lemma = to_lemma(raw_token)
            # The alphabetic test is applied to the lemma, not the raw token.
            if lemma.isalpha():
                kept_lemmas.append(lemma)
        return kept_lemmas

In [5]:
# init pipeline object
pipe = Pipeline(
    steps=[
        # step 'cv': raw article text -> document-term count matrix
        (
            'cv',
            CountVectorizer(
                lowercase=True,
                stop_words='english',
                tokenizer=LemmaTokenizer(),
                ngram_range=(1, 1),
                max_df=0.9,
                min_df=0.05,
                max_features=1000,
            ),
        ),
        # step 'lda': document-term counts -> per-document topic distribution
        (
            'lda',
            LatentDirichletAllocation(
                n_components=10,
                random_state=123,
                n_jobs=-1,
            ),
        ),
    ]
)

In [6]:
# train the pipeline on the raw text of the 'Article' column:
# the 'cv' step is fitted first and its output is used to fit the 'lda' step
pipe.fit(df.Article)



Pipeline(steps=[('cv',
                 CountVectorizer(max_df=0.9, max_features=1000, min_df=0.05,
                                 stop_words='english',
                                 tokenizer=<__main__.LemmaTokenizer object at 0x7fa3e015b520>)),
                ('lda',
                 LatentDirichletAllocation(n_jobs=-1, random_state=123))])

# Evaluate the functionality of the fitted pipeline and all its components

In [30]:
# check the whole pipeline, i.e. apply it to the first article (wrapped in a list so that
# a single document is passed); we expect one row of topic probabilities
pipe.transform([df['Article'][0]])

array([[8.69250925e-02, 2.57175139e-04, 2.57153400e-04, 2.57148045e-04,
        6.06047398e-02, 4.56062952e-02, 2.57141754e-04, 2.57145035e-04,
        8.05320973e-01, 2.57136338e-04]])

In [35]:
# check the count vectorizer step of the pipeline in isolation, i.e. apply it to the first article;
# the resulting one-row document-term matrix `dtm` is reused by later cells
dtm = pipe['cv'].transform([df['Article'][0]])
dtm

<1x979 sparse matrix of type '<class 'numpy.int64'>'
	with 200 stored elements in Compressed Sparse Row format>

In [37]:
# get the last ten vocabulary terms learned by the count vectorizer;
# get_feature_names() was removed in scikit-learn 1.2, so use get_feature_names_out()
# (it returns an ndarray, hence .tolist() to keep displaying a plain Python list)
pipe['cv'].get_feature_names_out()[-10:].tolist()



['writer',
 'writes',
 'writing',
 'written',
 'wrong',
 'wrote',
 'year',
 'yes',
 'york',
 'young']

In [36]:
# check the LDA step of the pipeline in isolation, i.e. apply it to `dtm`, the document-term
# matrix of the first article; the result should equal the output of the full pipeline
pipe['lda'].transform(dtm)

array([[8.69250925e-02, 2.57175139e-04, 2.57153400e-04, 2.57148045e-04,
        6.06047398e-02, 4.56062952e-02, 2.57141754e-04, 2.57145035e-04,
        8.05320973e-01, 2.57136338e-04]])

In [38]:
# have a look at the fit result, i.e. the topics: each row of components_ holds the
# term weights of one topic (the slice keeps at most the first 10 rows)
pipe['lda'].components_[:10]

array([[1.43195650e+02, 2.34134679e+02, 6.01013378e+02, ...,
        6.82072223e+00, 5.01368642e+02, 1.06639117e-01],
       [1.14387199e+02, 1.98434510e+02, 1.47439641e+02, ...,
        5.62634844e+01, 1.61976624e+02, 1.43655510e+03],
       [2.98953324e+01, 3.09574025e+02, 4.07509224e+02, ...,
        1.75718095e+01, 7.06604170e+01, 1.03160090e+02],
       ...,
       [2.77909753e+02, 6.15394106e+02, 2.30462459e+02, ...,
        6.10713898e+01, 1.86539814e+02, 1.35200342e+02],
       [1.10928781e+02, 1.37682785e+02, 6.74706397e+01, ...,
        8.77417950e+01, 6.71249813e+02, 9.21314930e+00],
       [3.63136927e+01, 7.67916003e+01, 2.43411914e+01, ...,
        6.45430157e+01, 3.26636876e+02, 3.04070900e+02]])

## Utility Functions

In [39]:
def get_article_topic(lda_obj, article_num, doc_term_matrix=None):
    """Return the index of the most probable topic for one article.

    Parameters
    ----------
    lda_obj : fitted topic model exposing ``transform`` (e.g. ``pipe['lda']``).
    article_num : int
        Row position of the article inside the document-term matrix.
    doc_term_matrix : 2-D array or sparse matrix, optional
        Document-term counts, one row per article. When omitted, the
        notebook-level variable ``dtm`` is used, as the function did before.
        Note that in this notebook ``dtm`` is built from the first article
        only, so without this argument only ``article_num = 0`` is valid.

    Returns
    -------
    The position of the largest value in the topic distribution that
    ``lda_obj.transform`` returns for the selected row.

    Raises
    ------
    IndexError
        If ``article_num`` does not address a row of the matrix.
    """
    if doc_term_matrix is None:
        # Backward-compatible fallback to the global defined in an earlier cell.
        doc_term_matrix = dtm

    n_rows = doc_term_matrix.shape[0]
    if not -n_rows <= article_num < n_rows:
        raise IndexError(
            'article_num {} is out of range for a document-term matrix with {} row(s)'
            .format(article_num, n_rows)
        )

    # Index with a list so the row stays 2-D for dense arrays as well as sparse matrices.
    article_row = doc_term_matrix[[article_num]]
    topic_num = np.argmax(lda_obj.transform(article_row))
    return topic_num

def get_topic_terms(lda_obj, cv_obj, topic_num=0, top_n=3):
    """Return the ``top_n`` highest-weighted terms of one topic.

    Parameters
    ----------
    lda_obj : fitted latent Dirichlet allocation object exposing ``components_``.
    cv_obj : fitted count-vectorizer object exposing ``get_feature_names_out()``.
    topic_num : int
        Row of ``lda_obj.components_`` to inspect.
    top_n : int
        Number of terms to return.

    Returns
    -------
    list
        The ``top_n`` terms with the largest weights in the topic, ordered
        from the smallest to the largest of those weights. An empty list is
        returned when ``top_n`` is zero or negative.
    """
    if top_n <= 0:
        # A slice of [-0:] would select every term instead of none.
        return []

    # Fetch the vocabulary once instead of once per selected index.
    feature_names = cv_obj.get_feature_names_out()
    top_indices = lda_obj.components_[topic_num].argsort()[-top_n:]
    return [feature_names[ind] for ind in top_indices]

In [40]:
# select the article to score; get_article_topic reads the row from `dtm`, which
# holds only the first article here, so 0 is the only valid choice
article_num = 0
top_n = 5 # select top n terms
topic_num = get_article_topic(pipe['lda'], article_num)
print('The topic for article number {} is {}'.format(article_num, topic_num))

# look up the highest-weighted terms of that topic using the fitted pipeline steps
top_terms = get_topic_terms(pipe['lda'], pipe['cv'], topic_num, top_n)
print('The top {} terms in topic number {} are: \n {}'.format(top_n, topic_num, ', '.join(top_terms)))

The topic for article number 0 is 8
