In [None]:
import pandas as pd
import numpy as np
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.preprocessing import StandardScaler
from scipy import sparse
from itertools import compress
import contractions, unicodedata, re
from nltk.stem import LancasterStemmer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
import pickle

from sklearn.pipeline import make_union, make_pipeline
from imblearn.under_sampling import RandomUnderSampler
from imblearn.over_sampling import SMOTE

## Classes and functions to be used in preprocessing steps

In [None]:
class FeatureExtractor(BaseEstimator, TransformerMixin):
    '''
    Pipeline step that picks a single column out of the input

    column: key used to index the input passed to transform (X[column])
    '''
    def __init__(self, column):
        self.column = column

    def fit(self, X, y=None):
        # Stateless step: nothing is learned from the data
        return self

    def transform(self, X, y=None):
        # Plain key lookup, so the result is whatever X[column] evaluates to
        selected = X[self.column]
        return selected

In [None]:
class ToNumeric(BaseEstimator, TransformerMixin):
    '''
    Pipeline step that converts its input to numeric values

    The converted values are returned as a single-column DataFrame whose
    column is named after `column`
    '''
    def __init__(self, column):
        self.column = column

    def fit(self, X, y=None):
        # Stateless step: nothing is learned from the data
        return self

    def transform(self, X, y=None):
        # pd.to_numeric raises on values it cannot parse (default errors='raise')
        converted = pd.to_numeric(X)
        return pd.DataFrame({self.column: converted})

In [None]:
class TextPreprocessor(BaseEstimator, TransformerMixin):
    '''
    Preprocessing for a pandas series containing text including:

    1. Replacing specific characters
    2. Expanding contractions
    3. Removing non-ASCII characters
    4. Convert to lowercase
    5. Remove punctuation
    6. Stem words

    replacement_dictionary: mapping of substring -> replacement applied in step 1
                            (None or empty means no replacements are made)
    column_header: name given to the Series returned by transform

    Every helper takes an iterable of strings and returns a list of strings of
    the same length, so the output stays aligned with the index of the input.
    '''

    def __init__(self, replacement_dictionary=None, column_header=None):
        self.replacement_dictionary = replacement_dictionary
        self.column_header = column_header

    def fit(self, X, *args):
        # Stateless step: nothing is learned from the data
        return self

    def _replace_characters(self, X, *args):
        '''
        Replaces specific substrings in each text of X based on replacement_dictionary
        '''
        # Treat a missing dictionary as "nothing to replace" instead of failing on None.items()
        replacements = self.replacement_dictionary or {}
        texts = list(X)
        for old, new in replacements.items():
            texts = [text.replace(old, new) for text in texts]
        return texts

    def _expand_contractions(self, X, *args):
        '''
        Replaces contractions with the expanded form of the word (e.g. can't to cannot) in each text of X
        '''
        return [contractions.fix(text) for text in X]

    def _remove_non_ascii(self, X, *args):
        '''
        Removes non-ascii characters from each text of X
        '''
        # NFKD splits accented characters into base character + combining mark,
        # so the ASCII encode keeps the base character and drops only the mark
        return [
            unicodedata.normalize('NFKD', text).encode('ascii', 'ignore').decode('utf-8', 'ignore')
            for text in X
        ]

    def _to_lowercase(self, X, *args):
        '''
        Converts all characters to lowercase in each text of X
        '''
        return [text.lower() for text in X]

    def _remove_punctuation(self, X, *args):
        '''
        Removes punctuation (anything that is not a word character or whitespace) from each text of X
        '''
        # Texts that end up empty are kept as '': dropping them would shorten the
        # list and break the re-alignment with X.index at the end of transform
        return [re.sub(r'[^\w\s]', '', text) for text in X]

    def _stem_words(self, X, *args):
        '''
        Stems every space-separated word in each text of X
        '''
        stemmer = LancasterStemmer()
        return [
            ' '.join(stemmer.stem(word) for word in text.split(' '))
            for text in X
        ]

    def transform(self, X, *args):
        '''
        Combines all preprocessing steps for X

        X must expose an `.index` (e.g. a pandas Series); the result is a Series
        with that same index, named `column_header`
        '''
        # (progress label, step) pairs, applied in order
        steps = [
            ('replacing characters', self._replace_characters),
            ('expanding contractions', self._expand_contractions),
            ('removing non-ascii characters', self._remove_non_ascii),
            ('converting characters to lowercase', self._to_lowercase),
            ('removal of punctuation', self._remove_punctuation),
            ('stemming words', self._stem_words),
        ]
        text_data = X
        for label, step in steps:
            print('Initialising ' + label + '...')
            text_data = step(text_data)
            print('Completed ' + label)
        return pd.Series(data=text_data, index=X.index, name=self.column_header)
    

In [None]:
class Dummifier(BaseEstimator, TransformerMixin):
    '''
    Dummifies a pandas series

    Ensures the resulting dummified columns match the fitted data after transformation,
    both in content and in order
    '''

    def __init__(self):
        # Column index learned in fit; None until the transformer has been fitted
        self.dummified_columns=None

    def fit(self, X, *args):
        '''
        Creates an index of dummified columns after dummification of X
        Stored as self.dummified_columns
        '''
        # drop_first=True removes the first level here, so that level is the
        # reference category of the fitted encoding
        self.dummified_columns = pd.get_dummies(X,drop_first=True).columns
        return self

    def transform(self, X, *args):
        '''
        Dummifies X and ensures the resulting columns match self.dummified_columns (created during fitting)

        Drops any columns in dummified X that are not in self.dummified_columns
        Adds a zero column for any columns in self.dummified_columns that are not in dummified X
        Returns the columns in the same order as self.dummified_columns
        '''
        # drop_first=False on purpose: the first level of X may differ from the
        # first level seen in fit, so keep every level and let the reindex below
        # decide which columns survive
        dummified_data = pd.get_dummies(X,drop_first=False)

        # reindex drops unseen columns, zero-fills missing ones and restores the
        # fitted column order. Appending missing columns at the end instead would
        # shift feature positions between the fit data and the transformed data.
        return dummified_data.reindex(columns=self.dummified_columns, fill_value=0)


## Process for creating pipeline and testing the accuracy of the model

#### X_train
- Undersampling
- PreProcessing (fit)
- PreProcessing (transform)
- Oversampling
- Model (fit)
- Pickle processing and model

#### X_test
- Unpickle preprocessing
- Unpickle model
- PreProcessing (transform)
- Model (predict)

## Create pipeline

In [None]:
replace_dict = {
    '\r':' ',
    '\n':' '
}


def _count_vector_pipe(column, *preprocessing_steps):
    '''Select `column`, run any extra preprocessing steps, then count-vectorise the text.'''
    return make_pipeline(
        FeatureExtractor(column),
        *preprocessing_steps,
        # Raw string: '\w' in a normal string literal is an invalid escape sequence
        CountVectorizer(token_pattern=r'\w+',stop_words='english',max_df=1.0,min_df=10)
    )


def _dummy_pipe(column):
    '''Select `column` and one-hot encode it with Dummifier.'''
    return make_pipeline(FeatureExtractor(column), Dummifier())


def _scaled_numeric_pipe(column):
    '''Select `column`, convert it to numeric and standardise it.'''
    return make_pipeline(FeatureExtractor(column), ToNumeric(column), StandardScaler())


# Create pipelines for each column

# Text columns
invoice_desc_pipe = _count_vector_pipe(
    'Invoice Desc',
    TextPreprocessor(replacement_dictionary=replace_dict,column_header='Invoice Desc')
)
supplier_pipe = _count_vector_pipe('Supplier Name')
currency_pipe = _count_vector_pipe('Invoice Currency')
project_pipe = _count_vector_pipe('Project Owning Org')
supp_grp_pipe = _count_vector_pipe('Supplier_Group')

# Categorical columns
b_u_pipe = _dummy_pipe('Business Unit')
datasource_pipe = _dummy_pipe('datasource')
legacy_pipe = _dummy_pipe('Legacy')
leakage_id_pipe = _dummy_pipe('Leakage_Identifier')
leakage_grp_pipe = _dummy_pipe('Leakage_Group')
americas_pipe = _dummy_pipe('Americas_Flag')

# Numeric columns
invoice_amt_pipe = _scaled_numeric_pipe('Invoice_Amt')
usd_amt_pipe = _scaled_numeric_pipe('USD_Amt')
year_pipe = _scaled_numeric_pipe('Year')


# Union pipelines together

data_processing = make_union(invoice_desc_pipe,
                           supplier_pipe,
                           currency_pipe,
                           project_pipe,
                           supp_grp_pipe,
                           b_u_pipe,
                           datasource_pipe,
                           legacy_pipe,
                           leakage_id_pipe,
                           leakage_grp_pipe,
                           americas_pipe,
                           invoice_amt_pipe,
                           usd_amt_pipe,
                           year_pipe)

## Process data

In [None]:
# Load the raw data; cells containing the literal string 'Unknown' are read as missing values
data = pd.read_csv('01 English by dropping_GBP.csv', na_values='Unknown')

# Separate the target from the features, leaving `data` itself untouched
y = data['Category_Group']
X = data.drop(columns='Category_Group')

# Stratified 50/50 train/test split with a fixed seed
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.5,
    stratify=y,
    random_state=1,
)

**Rebalance classes for training data**

In [None]:
# Remove categories with 100 or fewer training records (only counts > 100 are kept)
# NB: these will never be predicted by the final model

# Name the counts column explicitly: recent pandas versions call the result of
# value_counts() 'count', which would break the 'Category_Group' lookups that
# this cell and later cells perform on data_value_counts
data_value_counts = y_train.value_counts(dropna=False).rename('Category_Group').to_frame()

# NB: despite its name, this index holds the categories that are RETAINED
categories_overly_low = data_value_counts[data_value_counts['Category_Group']>100].index

y_train = y_train[y_train.isin(categories_overly_low)]

# Keep the feature rows aligned with the surviving labels
X_train = X_train.loc[y_train.index]

In [None]:
# Reduce number of records for categories with record counts > 10000
# NB: this cell overwrites X_train / y_train, so re-running it on its own
# undersamples data that has already been undersampled

categories_too_high = data_value_counts[data_value_counts['Category_Group']>10000].index

# Split the training data into the oversized categories and everything else
is_too_high = y_train.isin(categories_too_high)

y_train_too_high = y_train[is_too_high].copy()

X_train_too_high = X_train.loc[y_train_too_high.index].copy()

y_train_remainder = y_train[~is_too_high].copy()

X_train_remainder = X_train.loc[y_train_remainder.index].copy()

under_sampler = RandomUnderSampler(random_state=1)

# fit_resample is the current imbalanced-learn API; the older fit_sample alias has been removed
X_train_undersampled, y_train_undersampled = under_sampler.fit_resample(X_train_too_high,y_train_too_high)

# Wrap the sampler output so it can be concatenated whether the sampler
# returned pandas objects or plain arrays
y_train_undersampled = pd.Series(y_train_undersampled)

# pd.concat replaces Series.append / DataFrame.append, which were removed in pandas 2.0
y_train = pd.concat([y_train_undersampled, y_train_remainder], ignore_index=True)

X_train_undersampled = pd.DataFrame(X_train_undersampled,columns=X_train.columns)

X_train = pd.concat([X_train_undersampled, X_train_remainder], ignore_index=True)

### Fit pipeline and transform training data

In [None]:
# Fit every branch of the preprocessing union on the rebalanced training features only
data_processing.fit(X_train)

In [None]:
# Turn the training features into the combined feature matrix produced by the fitted union
X_train_sparse = data_processing.transform(X_train)

**Oversample minority classes with SMOTE**

In [None]:
# Oversample with SMOTE on the transformed (numeric) training features
# NOTE(review): newer imbalanced-learn releases deprecate SMOTE's n_jobs argument - confirm against the installed version
sampler = SMOTE(random_state=1,n_jobs=-1)
# fit_resample is the current imbalanced-learn API; the older fit_sample alias has been removed
X_resampled, y_resampled = sampler.fit_resample(X_train_sparse, y_train)

In [None]:
# Show the number of records per class after resampling
pd.Series(y_resampled).value_counts()

### Set up model and train on training data

In [None]:
# Seed the forest like every other stochastic step in this notebook (random_state=1)
# so that Restart & Run All reproduces the same model and accuracy
rf_model = RandomForestClassifier(n_estimators=100,n_jobs=-1,random_state=1)

In [None]:
# Train the random forest on the resampled training data
rf_model.fit(X_resampled,y_resampled)

### Transform test data

In [None]:
# Reuse the pipeline fitted on the training data; it is deliberately not refitted on the test set
X_test_sparse = data_processing.transform(X_test)

### Test accuracy of model on test data and compare to baseline

In [None]:
# Predict a category for every row of the transformed test set
rf_test_predictions = rf_model.predict(X_test_sparse)

In [None]:
# Fraction of test rows whose predicted category equals the true category
print(accuracy_score(y_test,rf_test_predictions))

In [None]:
# Baseline: accuracy obtained by always predicting the most frequent test category.
# value_counts() sorts by descending frequency, so the first entry is the majority
# class. Use .iloc for an explicit positional lookup: plain [0] on a label-indexed
# Series relies on a deprecated positional fallback.
baseline_test = (y_test.value_counts()/len(y_test)).iloc[0]
baseline_test

### Create and save preprocessing pipeline and model .pkl files

In [None]:
# save the data_processing pipeline and model to disk
# NOTE(review): pickle stores custom classes by reference, so the transformer
# classes used in the pipeline must be defined or importable wherever these
# files are loaded
preprocessing_filename = 'data_processing.pkl'
model_filename = 'model.pkl'

# Context managers make sure each file is flushed and closed once it is written
with open(preprocessing_filename, 'wb') as preprocessing_file:
    pickle.dump(data_processing, preprocessing_file)

with open(model_filename, 'wb') as model_file:
    pickle.dump(rf_model, model_file)