# Building a Machine Learning Pipeline using Sklearn

The purpose of this notebook is to create a sklearn pipeline in order to make life a lot easier when scoring new data i.e. articles which have to undergo the same preprocessing and modeling strategy as the training data

## Import Modules

In [1]:
# import modules
from pathlib import Path
import numpy as np
import pandas as pd
from nltk import word_tokenize
from sklearn.model_selection import train_test_split          
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction import text
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation
import pickle
from sklearn.pipeline import Pipeline



## Set Path Variables

In [2]:
# set path variables
DATAPATH = Path(Path.cwd().parents[0] / 'data')
MODELPATH = Path(Path.cwd().parents[0] / 'model')

DATAPATH.mkdir(parents=True, exist_ok=True)
MODELPATH.mkdir(parents=True, exist_ok=True)

# Utility Functions

In [3]:
def get_article_topic(pipeline_obj:Pipeline, article:pd.DataFrame, article_num:int):
    """Generate article topics for given article and given i.e. trained pipeline
    """
    topic_num = np.argmax(pipeline_obj.transform([article[article_num]]))
    return topic_num 

def get_topic_terms(pipe_obj:Pipeline, topic_num=0, top_n=3):
    """ get the index position of the top 3 terms in a topic.
    input the fitted laten dirichtlet object.
    input the fitted count-vectorizer object.
    input the topic number.
    input the top-n words belonging to each topic.
    output the top_n words for topic_num. """
    return [pipe_obj['cv'].get_feature_names_out()[ind] for ind in pipe_obj['lda'].components_[topic_num].argsort()[-top_n:]]

## Import Data

In [4]:
# import data
df = pd.read_csv(DATAPATH / "npr.csv")
df.head()

Unnamed: 0,Article
0,"In the Washington of 2016, even when the polic..."
1,Donald Trump has used Twitter — his prefe...
2,Donald Trump is unabashedly praising Russian...
3,"Updated at 2:50 p. m. ET, Russian President Vl..."
4,"From photography, illustration and video, to d..."


## Define Custom Tokenizer Class 

In [5]:
# write custom tokenizer class to be passed to CountVectorizer instance 
class LemmaTokenizer(object):
    def __init__(self):
        self.wnl = WordNetLemmatizer()
    def __call__(self, articles):
        lemma_tokens = [self.wnl.lemmatize(t) for t in word_tokenize(articles)] # lemmatize words
        lemma_tokens_alpha = [t for t in lemma_tokens if t.isalpha()] # ensure words are alphabetic
        lemma_tokens_alpha_long = [t for t in lemma_tokens_alpha if len(t)>=3] # ensure minimum word lengths
        return lemma_tokens_alpha_long

In [6]:
# add a few stopwords
my_additional_stop_words = ['ha', 'le', 'u', 'wa']
stop_words = text.ENGLISH_STOP_WORDS.union(my_additional_stop_words)

## Define Preprocessing and Modeling Pipeline

In [7]:
# init pipeline object
pipe = Pipeline([('cv', CountVectorizer(lowercase=True,
                                        #stop_words='english',
                                        stop_words=stop_words, # use my extended list of stop words
                                        tokenizer=LemmaTokenizer(),
                                        ngram_range=(1,1),
                                        max_df=.9, 
                                        min_df=.05,
                                        max_features=1000 
                                        )), 
                ('lda', LatentDirichletAllocation(n_components=10,
                                                  random_state=123,
                                                  n_jobs=-1))])

## Train and Test Split
We use a train test split to train the model on the training data and use the test data later on just for scoring kind of "new" data; the test data is of course not used by any means to test the model since it´s an unsupervised case with the ground truth being unknown 

In [8]:
X_train, X_test = train_test_split(df, test_size=0.33, random_state=123)

In [9]:
X_train.head()

Unnamed: 0,Article
8854,"Welcome to the third installment of Read, Watc..."
3298,Milwaukee has the nation’s publicly funded v...
8716,Donald Trump has released the names of his eco...
2246,South Korea and the U. S. confirm North Korea ...
7008,A international convoy carrying desperately ...


In [10]:
# reset index
X_test.reset_index(drop=True, inplace=True)

# save test data as csv file for later scoring
X_test.to_csv(DATAPATH /'npr_test.csv', index=False)

# Fit the Pipeline 

In [11]:
# train the pipeline
pipe.fit(X_train.Article)

In [None]:
# save trained pipeline model
pickle.dump(pipe, open(MODELPATH / 'pipe_model', 'wb'))

# Evaluate the functionality of the fitted pipeline and all its components

In [None]:
# check the whole pipeline ie apply it to the first article...we expect an array with topic probabilities
pipe.transform([X_train['Article'][0]])

array([[4.08878385e-01, 2.82548644e-04, 2.82556483e-04, 2.82553258e-04,
        2.82553023e-04, 2.82555312e-04, 2.42628428e-02, 2.82544473e-04,
        5.09450070e-02, 5.14218455e-01]])

In [None]:
# check the count vectorizer part of the pipeline ie apply it to the first article
dtm = pipe['cv'].transform([X_train['Article'][0]])
dtm

<1x978 sparse matrix of type '<class 'numpy.int64'>'
	with 197 stored elements in Compressed Sparse Row format>

In [None]:
# get the terms
pipe['cv'].get_feature_names()[-10:]



['writer',
 'writes',
 'writing',
 'written',
 'wrong',
 'wrote',
 'year',
 'yes',
 'york',
 'young']

In [None]:
# check the lda part of the pipeline ie apply it to the first article
pipe['lda'].transform(dtm)

array([[4.08878385e-01, 2.82548644e-04, 2.82556483e-04, 2.82553258e-04,
        2.82553023e-04, 2.82555312e-04, 2.42628428e-02, 2.82544473e-04,
        5.09450070e-02, 5.14218455e-01]])

In [None]:
# have a look at the fit result i.e. the topics
pipe['lda'].components_[:10]

array([[5.38036783e+01, 1.39500262e+02, 2.38628861e+01, ...,
        1.23897442e+02, 5.12709381e+02, 8.24055752e+01],
       [1.63442873e+02, 3.24633594e+02, 1.86475641e+02, ...,
        3.88589007e+01, 8.16695987e+01, 1.39606030e+02],
       [3.95461911e+01, 4.24780261e+02, 2.60639167e+02, ...,
        4.46330601e+01, 2.89979667e+02, 1.58123414e+02],
       ...,
       [5.29379506e+01, 7.31809994e+01, 1.33144608e+02, ...,
        1.85740750e+01, 1.56915206e+02, 1.00041419e-01],
       [1.53347316e+02, 5.58021438e+02, 1.74043972e+01, ...,
        4.40423786e+02, 4.63731024e+02, 9.29886926e+02],
       [1.01626771e+02, 1.21435535e+02, 2.22278325e+02, ...,
        5.01889224e+00, 7.35283428e+01, 2.39248508e+00]])

## Score New Data 

In [None]:
# import the pipeline model
pipe = pickle.load(open(MODELPATH / 'pipe_model', 'rb'))

In [None]:
# import txt data as dataframe
X_test = pd.read_csv(DATAPATH / 'npr_test.csv')
X_test.head()

Unnamed: 0,Article
0,"Hamilton, the Broadway musical about ”the scra..."
1,Tennessee Ernie Ford was fed up with the trapp...
2,"When Yomi Wrong was born in 1972, doctors told..."
3,"As laptops become smaller and more ubiquitous,..."
4,Republican presidential nominee Donald Trump i...


In [None]:
# how many articles we have
X_test.shape

(3958, 1)

In [None]:
# get article topic for single article
article_num = 100
top_n = 5
topic_num = get_article_topic(pipe, X_test.Article, article_num)
top_terms = get_topic_terms(pipe, topic_num, top_n)
print('The main topic for article #{} is topic #{}'.format(article_num, topic_num))
print('The top {} terms for topic #{} are: \n {}'.format(top_n, topic_num, top_terms))
print('\n')

The main topic for article #100 is topic #6
The top 5 terms for topic #6 are: 
 ['country', 'said', 'family', 'say', 'woman']


