# Features Engineering and Prediction

In [62]:
import re
import pandas as pd
import nltk
from collections import Counter
import numpy as np
import scipy as sp
from nltk.stem import PorterStemmer
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
import gensim.downloader
# from scipy.stats import chi2_contingency
from sklearn.preprocessing import OneHotEncoder
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.dummy import DummyClassifier
from sklearn.metrics import classification_report
# from googletrans import Translator
from sklearn.decomposition import TruncatedSVD
import matplotlib.pyplot as plt
import nltk
from nltk.stem import WordNetLemmatizer


In [63]:
# %pip install watermark

In [64]:
%load_ext watermark

The watermark extension is already loaded. To reload it, use:
  %reload_ext watermark


In [65]:
%watermark -iv

scipy     : 1.7.3
gensim    : 4.1.2
pandas    : 1.4.2
matplotlib: 3.5.1
numpy     : 1.22.3
re        : 2.2.1
nltk      : 3.7



In [66]:
# %pip freeze > requirements2.txt

## Useful Functions

### Reading

In [67]:
# Minimum count a token must reach in the training `vocabulary` to be kept as a TF-IDF feature
MIN_DF = 50
# Seed used by the stratified sampling and as default random state of the classifiers
RANDOM_STATE = 42


def read_data(path, deli):
    """Loads a delimited text file into a pandas DataFrame
    Args
        path, location of the file (anything accepted by pd.read_csv)
        deli::string, column separator used in the file
    Return
        pandas DataFrame with the parsed content
    """
    return pd.read_csv(path, delimiter=deli)


def sample_data(df, fraction):
    """Draws a random sample stratified by the level 1 category
    Args
        df::pandas DataFrame, must contain the column puor_ds_level1_new
        fraction::float, share of rows to keep within each level 1 category
    Return
        pandas DataFrame with the sampled rows
    """

    def _sample_group(group):
        # every category is sampled with the same fraction and the same seed
        return group.sample(frac=fraction, random_state=RANDOM_STATE)

    by_level1 = df.groupby('puor_ds_level1_new', group_keys=False)
    return by_level1.apply(_sample_group)

def filtering_level1(df):
    """Discards the rows whose level 1 category is "OUT OF SCOPE"
    Args
        df::pandas DataFrame, must contain the column puor_ds_level1_new
    Return
        pandas DataFrame with the remaining rows (original index is kept)
    """
    in_scope = df["puor_ds_level1_new"] != "OUT OF SCOPE"
    return df.loc[in_scope]


def drop_na_records(df):
    """Removes the records that have at least one missing value
    Args
        df::pandas DataFrame
    Return
        pandas DataFrame containing only the complete rows
    """
    # a single null cell is enough for the whole row to be dropped
    complete_rows = df.dropna(axis=0, how="any")
    return complete_rows

def clean(text):
    """Normalises a raw description by converting it to lower case
    Args
        text::string
    Return
        the lower-cased string
    """
    return text.lower()

#translator = Translator()
#def translate2en(tokens):
#        translated_tokens = [translator.translate(t, src="auto", dest="en").text for t in tokens]
#        return translated_tokens             
             


### Features

In [68]:
def sklearn_split(text):
    """Splits a text into tokens with a regular expression
    A token is a letter or digit followed by at least two ASCII letters, so pure numbers,
    dates and one/two character words are discarded.
    Args
        text::string, text to be tokenized
    Return
        list of the matched tokens, in order of appearance
    """
    # Patterns tried before and discarded:
    #   r'(?u)\b\w\w+\b'         scikit-learn's TfidfVectorizer default
    #   r'(?u)\b[a-zA-Z]\w+\b'   starting with a letter
    #   r'(?u)[A-Za-z0-9][A-Za-z]+'
    # (?u) is the Unicode flag (already the default for str patterns in Python 3).
    # The retained pattern has no \b anchors, so a token may start inside a longer
    # alphanumeric run, e.g. "x2abc" gives "2abc".
    token_pattern = r"(?u)[A-Za-z0-9][A-Za-z][A-Za-z]+"
    return re.compile(token_pattern).findall(text)

# English stopword list from NLTK (uncomment the download line if the corpus is not available yet)
#nltk.download("stopwords")
stop_words = set(stopwords.words("english"))

# Stemming: Porter stemmer used by text_processing
ps = PorterStemmer()
# token -> number of occurrences; filled in by text_processing when train=True
vocabulary = {}

# Lemmatization with WordNet (uncomment the download lines if the corpora are not available yet)
#nltk.download('wordnet')
#nltk.download('omw-1.4')
lemmatizer = WordNetLemmatizer()


def text_processing(word_tokens, train=True, bigrams=False, range_grams=None, root_word_type='stemmatization'):
    """Reduces tokens to their root form, optionally builds bigrams and updates the vocabulary
    Args
        word_tokens::list, tokens of one document (e.g. the output of sklearn_split)
        train::boolean, if True (default) the returned tokens are counted in the global `vocabulary`.
                                Pass False for the test set to prevent data leakage
        bigrams::boolean, if False (default) only unigrams are returned, if True the n-grams are
                                selected with range_grams
        range_grams::string, only used when bigrams is True.
                                if "(1,2)" generates unigrams and bigrams, if "(2,2)" generates only bigrams, but
                                documents with a single token keep that token
                                e.g. transportation cost for UK --> [transport cost, cost UK]
                                e.g. 1213 transport --> [transport]
                                Any other value falls back to unigrams
        root_word_type::string, 'stemmatization' (default) removes the stopwords and applies the Porter stemmer,
                                'lemmatization' applies the WordNet lemmatizer with pos="v" (stopwords are kept)
    Return
        text_proc_ret::list, list of word tokens
    Raises
        ValueError, if root_word_type is neither 'stemmatization' nor 'lemmatization'

    """
    if root_word_type == 'stemmatization':
        text_proc = [ps.stem(w) for w in word_tokens if w not in stop_words]
    elif root_word_type == 'lemmatization':
        text_proc = [lemmatizer.lemmatize(word, pos="v") for word in word_tokens]
    else:
        # previously an unknown value failed later with an unrelated UnboundLocalError
        raise ValueError(
            f"root_word_type must be 'stemmatization' or 'lemmatization', got {root_word_type!r}"
        )

    pairs = [f"{first} {second}" for first, second in zip(text_proc, text_proc[1:])]

    if bigrams and range_grams == "(1,2)":
        text_proc_ret = text_proc + pairs
    elif bigrams and range_grams == "(2,2)":
        # a document with a single token has no bigram: keep the token itself
        text_proc_ret = pairs if pairs else list(text_proc)
    else:
        # unigram case (default arguments): previously text_proc_ret was never assigned here
        text_proc_ret = list(text_proc)

    if train:
        for w in text_proc_ret:
            vocabulary[w] = vocabulary.get(w, 0) + 1
    return text_proc_ret


def encoding_categorical_features(X_train, X_test, min_frq, drp="first", handle_uknwn="ignore"):
    """One hot encoding
    Args:
        X_train::2-D array with one column, the feature of the training set to be encoded
                                (callers pass a vector reshaped with .reshape(-1, 1))
        X_test::2-D array with one column, the same feature of the test set
        min_frq::int or None, minimum frequency for a value of the category to be retained;
                                None disables the grouping of infrequent values
        drp::string, passed to OneHotEncoder(drop=...); "first" drops one value because it can be
                                inferred from the others
        handle_uknwn::string, passed to OneHotEncoder(handle_unknown=...); "ignore" ignores new values
                                when they are encountered i.e. in the test set
    Return
        encodings::tuple, containing the encoding of the training set and of the test set, and a mapping
                                {position: category} built from the categories of the training set
    """
    enc = OneHotEncoder(drop=drp, handle_unknown=handle_uknwn, min_frequency=min_frq)
    X_train_enc = enc.fit_transform(X_train)
    cats = enc.categories_[0]
    map_id_to_cat = dict(enumerate(cats))
    print(f"Number of features:{len(cats)}")
    print(f"First five features:{cats[:5]}")

    if min_frq is not None:
        cats_infrequent = enc.infrequent_categories_[0]
        if cats_infrequent is None:
            # scikit-learn stores None when the feature has no infrequent category
            cats_infrequent = cats[:0]
        cats_infrequent_set = set(cats_infrequent)
        cats_retained = [c for c in cats if c not in cats_infrequent_set]
        print(f"Infrequent categories: {len(cats_infrequent)}")
        print(f"First five entries: {cats_infrequent[:5]}")
        print(
            f"Retained features (# Features - Infrequent features): {len(cats)-len(cats_infrequent)}"
        )
        # NOTE(review): this mapping enumerates the retained categories only; it does not account for
        # the column removed by `drp` nor for the column that aggregates the infrequent values, so the
        # positions may be shifted with respect to the encoded matrix -- verify against
        # enc.get_feature_names_out() before using it to label columns.
        map_id_to_cat = dict(enumerate(cats_retained))

    X_test_enc = enc.transform(X_test)
    encodings = (X_train_enc, X_test_enc, map_id_to_cat)
    return encodings

# Functions for Word2Vec
# Loading pre-trained embeddings from Google
# google_news = gensim.downloader.load('word2vec-google-news-300')

def remove_stop_word_to_tokens(row):
    """Returns the tokens of `row` that are not in `stop_words`, keeping their order"""
    kept = []
    for token in row:
        if token not in stop_words:
            kept.append(token)
    return kept


def remove_words_not_in_embeddings(row, embds):
    """Keeps only the tokens that have an embedding
    Args
        row::list, tokens of one document
        embds::dict, token -> embedding vector
    Return
        list with the tokens of `row` that are keys of `embds`, in the original order
    """
    # membership is tested on the mapping itself; building a keys() view per token is not needed
    return [w for w in row if w in embds]


def generate_word_vector_features(tokenized_texts, embds):
    """Builds one dense vector per document by summing the embeddings of its tokens
    Args
        tokenized_texts::iterable of lists, tokens of each document; every token must be a key of embds
        embds::dict, token -> embedding vector (all the vectors must have the same length)
    Return
        2-D numpy array with one row per document. A document without tokens gets a row of zeros
        (previously it silently reused the vector of the preceding document, or failed if it was the first).
        The row length is taken from the embeddings; 300 is used only if no document has any token
    """
    doc_vectors = []
    dim, dtype = None, None
    for tokens in tokenized_texts:
        if len(tokens) == 0:
            # placeholder, replaced by zeros once the embedding size is known
            doc_vectors.append(None)
            continue
        # stack and sum once per document instead of once per token
        row = np.sum(np.vstack([embds[word] for word in tokens]), axis=0)
        dim, dtype = row.shape[0], row.dtype
        doc_vectors.append(row)

    if dim is None:
        # no embedding was looked up: fall back to the size of the word2vec-google-news-300 vectors
        dim, dtype = 300, float
    if not doc_vectors:
        return np.empty((0, dim), dtype=dtype)

    zero_row = np.zeros(dim, dtype=dtype)
    return np.vstack([zero_row if row is None else row for row in doc_vectors])


### Modeling

In [69]:
def model(classifier, X_tr, y_tr, X_ts, y_ts, rnd_state=RANDOM_STATE, cls_wght=None, max_feat=None,
          random_forest=False, gradient_boosting=False, n_ests=0, lr=0):
    """fitting a machine learning model, provides accuracies at the class level and predicts values for the test set
    Args
        classifier, classifier class (not an instance) e.g. DecisionTreeClassifier
        X_tr, training set
        y_tr, target variable of training set i.e. level1, or level2, or level3 or level4 category
        X_ts, test set
        y_ts, target variable of test set
        rnd_state::int, random state to have reproducible results (also used for the dummy baseline)
        cls_wght::dict or string, class weights to balance the model e.g. balanced.
                                Not used when gradient_boosting is True
        max_feat, passed as max_features to the classifier. Not used when gradient_boosting is True
        random_forest::boolean, if True the classifier is built with n_estimators=n_ests
        gradient_boosting::boolean, if True the classifier is built with n_estimators=n_ests and
                                learning_rate=lr; takes precedence over random_forest
        n_ests::int, number of estimators for random forest / gradient boosting. The default 0 is a
                                placeholder: pass an explicit value when one of the two flags is True
        lr::float, learning rate for gradient boosting. The default 0 is a placeholder as well
    Return
        modeling::tuple, containing the model, the prediction, and the ground-truth values


    """
    if gradient_boosting:
        mdl = classifier(random_state=rnd_state, n_estimators=n_ests, learning_rate=lr)
    elif random_forest:
        mdl = classifier(random_state=rnd_state, class_weight=cls_wght, max_features=max_feat, n_estimators=n_ests)
    else:  # Decision Tree
        mdl = classifier(random_state=rnd_state, class_weight=cls_wght, max_features=max_feat)
    mdl.fit(X_tr, y_tr)
    print(f"Accuracy on training set: {mdl.score(X_tr, y_tr):.3f}")
    print(f"Accuracy on test set: {mdl.score(X_ts, y_ts):.3f}")
    # the stratified baseline draws random labels: seed it so that the reported score is reproducible
    dummy_cls = DummyClassifier(strategy="stratified", random_state=rnd_state).fit(X_tr, y_tr)
    print(f"Accuracy on test set by a DUMMY CLASSIFIER: {dummy_cls.score(X_ts, y_ts):.3f}")
    y_preds = mdl.predict(X_ts)
    y_trues = y_ts
    print("Classification Report for Test Set")
    print(classification_report(y_true=y_trues, y_pred=y_preds))
    print("Classification Report for Training Set")
    print(classification_report(y_true=y_tr, y_pred=mdl.predict(X_tr)))
    modeling = (mdl, y_preds, y_trues)
    return modeling


def integrating_prev_lev_predict_next_lev(X_train, X_test, level, model):
    """Integrates prediction of level i as a new feature for the next training predicting the level i+1 category
    Args
        X_train, sparse feature matrix of the training set
        X_test, sparse feature matrix of the test set
        level::string, category level e.g. puor_ds_level2_new
        model, fitted classifier for level i (this parameter hides the function `model` inside this body)
    Return
        train_test_new::tuple, (X_train, X_test) with the one hot encoded level i category appended as
                                additional columns

    """
    X_train_next_lvl_pred = model.predict(X_train).reshape(-1, 1)
    # NOTE(review): the test side reads the column `level` from the global DataFrame `test`, i.e. the
    # ground-truth labels, while the training side uses the predictions of `model`. Confirm that this is
    # intended; predicting on X_test would avoid using information that is unknown at inference time.
    X_test_next_lvl = np.array(test[level]).reshape(-1, 1)
    # the category mapping is not needed here (and the builtin `map` is no longer shadowed)
    X_train_next_lvl_pred_one, X_test_next_lvl_one, _ = encoding_categorical_features(
        X_train=X_train_next_lvl_pred, X_test=X_test_next_lvl, min_frq=None
    )
    X_train_new = sp.sparse.hstack([X_train, X_train_next_lvl_pred_one])
    X_test_new = sp.sparse.hstack([X_test, X_test_next_lvl_one])
    train_test_new = (X_train_new, X_test_new)
    return train_test_new

## Reading and exploring data

In [70]:
# Load the training data, then drop incomplete records and "OUT OF SCOPE" rows
train = read_data(
                    path=r"C:\Users\riccardoricci\Documents\data_MADS_CAPSTONE\training_data_2025-03-03.csv",
                  # path="data/training_data_2025-03-03.csv",
                  # Relative path is not possible as this is sensitive data
                  deli="|")
train = drop_na_records(train)
train = filtering_level1(train)
# number of training records left after the cleaning
print(len(train))

2074804


In [71]:
# Load the test data and apply the same cleaning steps used for the training data
test = read_data(
                #path="data/test_set_2025-03-03.csv",
                path=r"C:\Users\riccardoricci\Documents\data_MADS_CAPSTONE\test_set_2025-03-03.csv",
                deli="|")
test = drop_na_records(test)
test = filtering_level1(test)

In [72]:
# Relative frequency of each level 1 category in the (not yet sampled) training set and in the test set
train_dist = train['puor_ds_level1_new'].value_counts(normalize=True)
test_dist = test['puor_ds_level1_new'].value_counts(normalize=True)


In [135]:
# Inverse weighting: the weight of a class is the inverse of its share, rescaled so that the
# most frequent class has weight 1.0.
# NOTE(review): the shares are read from `test_dist` (test set), although the original comment
# referred to classes underrepresented in training. Using the test distribution to weight the
# training leaks information from the test set -- confirm whether `train_dist` was intended.
# The earlier loop that pre-filled `class_weights` was removed: its result was overwritten below,
# and its fallback branch could never run because every class comes from `test_dist.index`.
raw_weights = {cls: test_dist.sum() / test_dist[cls] for cls in test_dist.index}
min_weight = min(raw_weights.values())
class_weights = {cls: w / min_weight for cls, w in raw_weights.items()}

In [136]:
class_weights

{'GENERAL PROCUREMENT': 1.0,
 'TECHNICAL PROCUREMENT': 1.254064062210702,
 'SUPPLY CHAIN': 82.39902676399028}

In [75]:
# Keep a 15% sample of the training data, stratified by level 1 category, to make things faster
train = sample_data(train, 0.15)


In [76]:
len(train)

311220

In [77]:
train['puor_ds_level1_new'].value_counts(normalize=True)

SUPPLY CHAIN             0.571923
TECHNICAL PROCUREMENT    0.247748
GENERAL PROCUREMENT      0.180329
Name: puor_ds_level1_new, dtype: float64

In [78]:
print(f"Training set has {train.shape[0]} rows and {train.shape[1]} columns")
print(f"Test set has {test.shape[0]} rows and {test.shape[1]} columns")

Training set has 311220 rows and 11 columns
Test set has 61282 rows and 11 columns


In [79]:
train.sample(5)

Unnamed: 0,puor_id_order_line,puor_id_lfa1_supplier,puor_id_t001_company,puor_id_prgp_purchasing_group,puor_id_pror_purchasing_org,puor_ds_po_description_translated,puor_ds_level1_new,puor_ds_level2_new,puor_ds_level3_new,puor_ds_level4_new,puor_cd_mat_group
2394181,SAPECCFE3_8210304204_30,SAPECCFE3_0000196519,SAPECCFE3_BR10,SAPECCFE3_Y01,SAPECCFE3_BR06,Transport Guarulhos/SANTOS,SUPPLY CHAIN,SECONDARY FREIGHT,DISTRIBUTION,Distribution to Customers and Returned Goods f...,ZZ9450
332227,SAPECCFE3_8210530421_20,SAPECCFE3_0000196519,SAPECCFE3_BR10,SAPECCFE3_Y01,SAPECCFE3_BR06,Transport Guarulhos/SAOBERNARDO,SUPPLY CHAIN,SECONDARY FREIGHT,DISTRIBUTION,Distribution to Customers and Returned Goods f...,ZZ9450
2207674,SAPECCFE3_4570203911_10,SAPECCFE3_0000159806,SAPECCFE3_FR20,SAPECCFE3_299,SAPECCFE3_FR16,Transport 16.07.2024,SUPPLY CHAIN,PRIMARY TRANSPORTS,ROAD TRANSPORT,Road Transportation,ZZ0304
716010,SAPECCFE3_8009010928_10,SAPECCFE3_0000082164,SAPECCFE3_AU10,SAPECCFE3_H01,SAPECCFE3_AU09,EY Tax Consultancy Fee 24/25 - OPS,GENERAL PROCUREMENT,PROFESSIONAL SERVICES,CONSULTANCY SERVICES,"Strategic, Managerial & organizational consulting",ZZ0906
1187599,SAPECCFE3_5710190829_10,SAPECCFE3_0000188127,SAPECCFE3_RU10,SAPECCFE3_838,SAPECCFE3_RU06,"Transport /г.Уральск,",SUPPLY CHAIN,PRIMARY TRANSPORTS,ROAD TRANSPORT,Road Transportation,ZZ0304


In [80]:
# Number of distinct categories at each of the four levels, for the training and the test set
for lvl in range(1, 5):
    level_col = f"puor_ds_level{lvl}_new"
    for label, frame in (("TRAIN", train), ("TEST", test)):
        print(f"{label}: # categories at Level {lvl}:", len(frame[level_col].drop_duplicates()))

TRAIN: # categories at Level 1: 3
TEST: # categories at Level 1: 3
TRAIN: # categories at Level 2: 20
TEST: # categories at Level 2: 19
TRAIN: # categories at Level 3: 104
TEST: # categories at Level 3: 85
TRAIN: # categories at Level 4: 573
TEST: # categories at Level 4: 426


In [81]:
train["puor_ds_level1_new"].value_counts()

SUPPLY CHAIN             177994
TECHNICAL PROCUREMENT     77104
GENERAL PROCUREMENT       56122
Name: puor_ds_level1_new, dtype: int64

In [82]:
test["puor_ds_level1_new"].value_counts()

GENERAL PROCUREMENT      33866
TECHNICAL PROCUREMENT    27005
SUPPLY CHAIN               411
Name: puor_ds_level1_new, dtype: int64

SUPPLY CHAIN is the most frequent category in the training set, but it is the least represented category in the test set.

**Training and test datasets are not balanced with respect to the categories**. 


In [83]:
train[["puor_ds_level1_new","puor_ds_level2_new"]].value_counts()

puor_ds_level1_new     puor_ds_level2_new                    
SUPPLY CHAIN           PRIMARY TRANSPORTS                        134525
TECHNICAL PROCUREMENT  AUTOMATION & MRO                           66946
SUPPLY CHAIN           SECONDARY FREIGHT                          34240
GENERAL PROCUREMENT    SALES SERVICES                             23905
                       WORKPLACE SERVICES                         12254
TECHNICAL PROCUREMENT  INFRASTRUCTURE & INDUSTRIAL                 8102
GENERAL PROCUREMENT    MARKETING SERVICES                          6809
SUPPLY CHAIN           CUSTOM BROKERAGE & FORWARDING AGENCIES      6014
GENERAL PROCUREMENT    IT - INFORMATION TECHNOLOGY                 3866
                       PEOPLE SERVICES                             3803
SUPPLY CHAIN           WAREHOUSING                                 2887
GENERAL PROCUREMENT    MOBILITY                                    2603
                       PROFESSIONAL SERVICES                       2361
TE

In [84]:
test[["puor_ds_level1_new","puor_ds_level2_new"]].value_counts()

puor_ds_level1_new     puor_ds_level2_new                    
GENERAL PROCUREMENT    MARKETING SERVICES                        14748
TECHNICAL PROCUREMENT  AUTOMATION & MRO                          13038
GENERAL PROCUREMENT    WORKPLACE SERVICES                        10223
TECHNICAL PROCUREMENT  INFRASTRUCTURE & INDUSTRIAL                9292
                       PROCESS EQUIPMENT                          4530
GENERAL PROCUREMENT    PEOPLE SERVICES                            4039
                       SALES SERVICES                             2740
                       MOBILITY                                   1439
                       PROFESSIONAL SERVICES                       415
                       IT - INFORMATION TECHNOLOGY                 246
TECHNICAL PROCUREMENT  ENERGY                                      145
SUPPLY CHAIN           CUSTOM BROKERAGE & FORWARDING AGENCIES      117
                       PRIMARY TRANSPORTS                           87
               

In [85]:
train[["puor_ds_level1_new","puor_ds_level2_new","puor_ds_level3_new"]].value_counts().head(10)

puor_ds_level1_new     puor_ds_level2_new                      puor_ds_level3_new                                       
SUPPLY CHAIN           PRIMARY TRANSPORTS                      ROAD TRANSPORT                                               130765
TECHNICAL PROCUREMENT  AUTOMATION & MRO                        TECHNICAL MATERIALS, PPE & CLOTHES                            57346
SUPPLY CHAIN           SECONDARY FREIGHT                       DISTRIBUTION                                                  34240
GENERAL PROCUREMENT    SALES SERVICES                          POS MATERIAL                                                  18632
TECHNICAL PROCUREMENT  AUTOMATION & MRO                        MAINTENANCE SERVICES                                           9119
GENERAL PROCUREMENT    WORKPLACE SERVICES                      OFFICE SUPPLIES AND SERVICES                                   7698
SUPPLY CHAIN           CUSTOM BROKERAGE & FORWARDING AGENCIES  FREIGHT FORWARDING AGENCIES   

In [86]:
test[["puor_ds_level1_new","puor_ds_level2_new","puor_ds_level3_new"]].value_counts().head(10)

puor_ds_level1_new     puor_ds_level2_new           puor_ds_level3_new                                       
GENERAL PROCUREMENT    MARKETING SERVICES           CREATIVE DESIGN                                              10975
TECHNICAL PROCUREMENT  AUTOMATION & MRO             TECHNICAL MATERIALS, PPE & CLOTHES                            8932
GENERAL PROCUREMENT    WORKPLACE SERVICES           OFFICE SUPPLIES AND SERVICES                                  6884
TECHNICAL PROCUREMENT  INFRASTRUCTURE & INDUSTRIAL  FACILITY MANAGEMENT SOFTSERVICES FOR INDUSTRIAL LOCATIONS     3290
GENERAL PROCUREMENT    PEOPLE SERVICES              TRAINING                                                      2524
TECHNICAL PROCUREMENT  INFRASTRUCTURE & INDUSTRIAL  LABORATORY EQUIPMENT AND SERVICES                             2321
                       AUTOMATION & MRO             INDUSTRIAL AUTOMATION AND ROBOTICS                            2192
                                                    MAINT

In [87]:
train[["puor_ds_level1_new","puor_ds_level2_new","puor_ds_level3_new","puor_ds_level4_new"]].value_counts().head(10)

puor_ds_level1_new     puor_ds_level2_new  puor_ds_level3_new                  puor_ds_level4_new                                         
SUPPLY CHAIN           PRIMARY TRANSPORTS  ROAD TRANSPORT                      Road Transportation                                            104026
                       SECONDARY FREIGHT   DISTRIBUTION                        Distribution to Customers and Returned Goods from Customers     34208
GENERAL PROCUREMENT    SALES SERVICES      POS MATERIAL                        POS materials Storage and Logistic                              12574
SUPPLY CHAIN           PRIMARY TRANSPORTS  ROAD TRANSPORT                      Road Transportation Finished Products Export Costs               9408
TECHNICAL PROCUREMENT  AUTOMATION & MRO    MAINTENANCE SERVICES                OPX - Ordinary Maintenance                                       8929
                                           TECHNICAL MATERIALS, PPE & CLOTHES  Other Materials Immediate Use for Mai

In [88]:
test[["puor_ds_level1_new","puor_ds_level2_new","puor_ds_level3_new","puor_ds_level4_new"]].value_counts().head(10)

puor_ds_level1_new     puor_ds_level2_new           puor_ds_level3_new                                         puor_ds_level4_new                                          
GENERAL PROCUREMENT    MARKETING SERVICES           CREATIVE DESIGN                                            Packaging & POS - Creative Concept                              7186
                       WORKPLACE SERVICES           OFFICE SUPPLIES AND SERVICES                               Office supplies                                                 4053
TECHNICAL PROCUREMENT  AUTOMATION & MRO             TECHNICAL MATERIALS, PPE & CLOTHES                         Other Spare parts (not codified, catalogues items)              2958
GENERAL PROCUREMENT    PEOPLE SERVICES              TRAINING                                                   Training Design & Cost of Personal                              2524
                       MARKETING SERVICES           CREATIVE DESIGN                                         

### Exploring Features

In [89]:
train["puor_ds_po_description_translated"].sample(5)

141968                          7364_30.11.2023
1336238                             Transport /
2052323    DIE M8 PITCH 1.25 140300 M8 HOFFMANN
1237266                   Transports 2023.01.04
371758                    Transports 11/04/2023
Name: puor_ds_po_description_translated, dtype: object

## Feature Engineering

### PO Text

TF-IDF Weighting

In [90]:
# stemmatization
train["puor_ds_po_description_translated_token_sklearn_stem"] = (
    train["puor_ds_po_description_translated"]
    .apply(clean)
    .apply(sklearn_split)
    .apply(text_processing, bigrams=True, range_grams="(1,2)", root_word_type='stemmatization')
)

test["puor_ds_po_description_translated_token_sklearn_stem"] = (
    test["puor_ds_po_description_translated"]
    .apply(clean)
    .apply(sklearn_split)
    .apply(text_processing, train=False, bigrams=True, range_grams="(1,2)", root_word_type='stemmatization')
)

In [91]:
# lemmatization
# train["puor_ds_po_description_translated_token_sklearn_lem"] = (
#     train["puor_ds_po_description_translated"]
#     .apply(clean)
#     .apply(sklearn_split)
#     .apply(text_processing, bigrams=True, range_grams="(1,2)", root_word_type='lemmatization')
# )
# 
# test["puor_ds_po_description_translated_token_sklearn_lem"] = (
#     test["puor_ds_po_description_translated"]
#     .apply(clean)
#     .apply(sklearn_split)
#     .apply(text_processing, train=False, bigrams=True, range_grams="(1,2)", root_word_type='lemmatization')
# )

Word2Vec

In [92]:
# train['puor_ds_po_description_translated_tokens_w2v'] = train["puor_ds_po_description_translated"]\
#     .apply(clean)\
#     .apply(sklearn_split)
# test['puor_ds_po_description_translated_tokens_w2v'] = test["puor_ds_po_description_translated"]\
#     .apply(clean)\
#     .apply(sklearn_split)
# 
# vocabulary_no_stem = list(pd.Series(np.hstack(train['puor_ds_po_description_translated_tokens_w2v'].values)).drop_duplicates().values)
# 
# # word_index = {w:i for i,w in enumerate(vocabulary_no_stem)}
# 
# embeddings = {word: google_news[word] for word in vocabulary_no_stem if word in google_news}


In [93]:
# train['puor_ds_po_description_translated_tokens_clean_w2v'] = train['puor_ds_po_description_translated_tokens_w2v']\
#                         .apply(remove_stop_word_to_tokens)\
#                         .apply(remove_words_not_in_embeddings, embds=embeddings)
# 
# test['puor_ds_po_description_translated_tokens_clean_w2v'] = test['puor_ds_po_description_translated_tokens_w2v']\
#                         .apply(remove_stop_word_to_tokens)\
#                         .apply(remove_words_not_in_embeddings, embds=embeddings)

# X_train_word2vec = generate_word_vector_features(train['puor_ds_po_description_translated_tokens_clean_w2v'], embeddings)
# X_test_word2vec = generate_word_vector_features(test['puor_ds_po_description_translated_tokens_clean_w2v'], embeddings)

In [94]:
train[
    [
        "puor_ds_level1_new",
        "puor_ds_po_description_translated",
        "puor_ds_po_description_translated_token_sklearn_stem",
#        "puor_ds_po_description_translated_token_sklearn_lem",
#        "puor_ds_po_description_translated_tokens_clean_w2v",
    ]
].sample(10)

Unnamed: 0,puor_ds_level1_new,puor_ds_po_description_translated,puor_ds_po_description_translated_token_sklearn_stem
261143,GENERAL PROCUREMENT,Transport Guarulhos/SAOPAULO,"[transport, guarulho, saopaulo, transport guar..."
740651,SUPPLY CHAIN,501-SISCOMEX,[siscomex]
968855,TECHNICAL PROCUREMENT,BATTERY TYPE AA AC7 1.5V STYLUS,"[batteri, type, stylu, batteri type, type stylu]"
304218,SUPPLY CHAIN,Transport /,[transport]
576688,SUPPLY CHAIN,3273_09.02.2023,[]
1671249,SUPPLY CHAIN,15296_02.09.2024,[]
1838528,SUPPLY CHAIN,Transport Guarulhos/ARUJA,"[transport, guarulho, aruja, transport guarulh..."
1675564,SUPPLY CHAIN,20524_05.11.2024,[]
2157471,GENERAL PROCUREMENT,Nutella Ecu Activation Bread Bags,"[nutella, ecu, activ, bread, bag, nutella ecu,..."
347813,SUPPLY CHAIN,FREIGHT,[freight]


In [95]:
# train.loc[train["puor_id_order_line"]=='SAPECCFE3_1601004205_50']

In [96]:
# Number of tokens (unigrams and bigrams) generated for each training record
train["n_tokens_stem"] = train["puor_ds_po_description_translated_token_sklearn_stem"].apply(len)
# train["n_tokens_lem"] = train["puor_ds_po_description_translated_token_sklearn_stem"].apply(lambda x: len(x))
# train["n_tokens_w2v"] = train["puor_ds_po_description_translated_tokens_clean_w2v"].apply(lambda x: len(x))

mean_tokens_stem = np.mean(train["n_tokens_stem"])
print(f"Average number of tokens per record: {mean_tokens_stem}")
# print(f"Average number of tokens per record: {np.mean(train.n_tokens_lem)}")
# print(f"Average number of tokens per record (Word2Vec): {np.mean(train.n_tokens_w2v)}")

Average number of tokens per record: 3.6038911381016643


Some descriptions cannot be encoded because they contain only dates or codes, so no token is generated for them.

In [97]:
train[train["n_tokens_stem"] == 0][
    [
        "puor_ds_level4_new",
        "puor_ds_po_description_translated",
        "puor_ds_po_description_translated_token_sklearn_stem",
    ]
]

Unnamed: 0,puor_ds_level4_new,puor_ds_po_description_translated,puor_ds_po_description_translated_token_sklearn_stem
936389,Function related / professional consultancy,IT02I2B2412610 you 27 09 2024 cc 205965,[]
1958633,BUILDING MAINTENANCE,DC26,[]
186928,Employees tax services,565220,[]
2515962,Design & Management of non Digital Promotion,73510030,[]
2686538,Design & Management of non Digital Promotion,73510030,[]
...,...,...,...
1622964,OPX - Ordinary Maintenance,Гири M1,[]
917771,OPX - Ordinary Maintenance,ab 01.01.2022,[]
119963,ELECTRICAL / ELECTRONICAL COMPONENTS,64723460501 Рамка дверцы,[]
975596,SPECIFIC DRAWING COMPONENTS,LA31535D-0000-01,[]


In [98]:
print(f"Percentage of records that have no tokens: {len(train[train.n_tokens_stem == 0]) / len(train):.2%}")

Percentage of records that have no tokens: 6.46%


In [99]:
# train_expl = train[train_expl["n_tokens"]>0]
# train_expl.shape

In [100]:
# Keep only the tokens counted at least MIN_DF times in the training set
vocabulary_min_df = {}
for token, count in vocabulary.items():
    if count >= MIN_DF:
        vocabulary_min_df[token] = count
print(f"Length vocabulary: {len(vocabulary)}")
print(f"Length vocabulary with word frequency greater than {MIN_DF}: {len(vocabulary_min_df)}")

Length vocabulary: 200762
Length vocabulary with word frequency greater than 50: 2228


In [101]:
# Using TfidfVectorizer on documents that are already tokenized, see
# https://www.davidsbatista.net/blog/2018/02/28/TfidfVectorizer/


def dummy_fun(doc):
    """Identity function: returns the document unchanged"""
    return doc


# The documents are lists of tokens produced by text_processing, therefore both the tokenizer and
# the preprocessor are replaced by the identity function and no token pattern is used.
# The features are restricted to the tokens retained in vocabulary_min_df.
vectorizer = TfidfVectorizer(
    analyzer="word",
    tokenizer=dummy_fun,
    preprocessor=dummy_fun,
    token_pattern=None,
    vocabulary=vocabulary_min_df.keys(),
)

With Lemmatization

In [102]:
# X_train_text_proc_lem = vectorizer.fit_transform(
#     train["puor_ds_po_description_translated_token_sklearn_lem"]
# )
# X_test_text_proc_lem = vectorizer.transform(test["puor_ds_po_description_translated_token_sklearn_lem"])

# cats_map_text_lem = {v: k for v, k in enumerate(vectorizer.get_feature_names_out())}
# len(cats_map_text_lem)

In [103]:
# X_train_text_proc_lem.shape

In [104]:
# X_test_text_proc_lem.shape

With Stemmatization

In [105]:
# The vectorizer is fitted on the training documents only; the test documents are just transformed
stem_col = "puor_ds_po_description_translated_token_sklearn_stem"
X_train_text_proc_stem = vectorizer.fit_transform(train[stem_col])
X_test_text_proc_stem = vectorizer.transform(test[stem_col])

In [106]:
# column position in the TF-IDF matrix -> token
cats_map_text = dict(enumerate(vectorizer.get_feature_names_out()))
len(cats_map_text)

2228

In [107]:
# pd.crosstab(train.puor_id_lfa1_supplier, train.puor_ds_level1_new)
# chi2_contingency(pd.crosstab(train_expl.puor_id_lfa1_supplier, train_expl.puor_ds_level2_new))

In [108]:
# Exporting data for data viz
train.to_csv(r'C:\Users\riccardoricci\Documents\data_MADS_CAPSTONE\train_features.csv')
test.to_csv(r'C:\Users\riccardoricci\Documents\data_MADS_CAPSTONE\test_features.csv')

#### Latent Semantic Indexing

Using Singular Value Decomposition

In [109]:
X_train_text_proc_stem.shape

(311220, 2228)

In [110]:
#n_components = 500  # Choose number of singular values/vectors to keep
#svd = TruncatedSVD(n_components=n_components)
#X_train_text_proc_reduced = svd.fit_transform(X_train_text_proc_stem) 

#print("Reduced matrix shape:", X_train_text_proc_reduced.shape)
#cumulative_variance = np.cumsum(svd.explained_variance_ratio_)


Scree plot

In [111]:
#plt.plot(range(1, len(cumulative_variance) + 1), cumulative_variance, marker='o')
#plt.axhline(y=0.80, color='r', linestyle='--', label="80% variance")
#plt.xlabel("Number of Components")
#plt.ylabel("Cumulative Explained Variance")
#plt.title("Scree Plot")
#plt.legend()
#plt.show()

In [112]:
#X_test_text_proc_reduced = svd.transform(X_test_text_proc_stem)

### Supplier

In [113]:
# Supplier id as a single-column matrix, then one hot encoded (minimum frequency 10)
X_train_suppl = train["puor_id_lfa1_supplier"].to_numpy().reshape(-1, 1)
X_test_suppl = test["puor_id_lfa1_supplier"].to_numpy().reshape(-1, 1)
encoded_suppl = encoding_categorical_features(X_train=X_train_suppl, X_test=X_test_suppl, min_frq=10)
X_train_suppl_one, X_test_suppl_one, cats_map_suppl = encoded_suppl

Number of features:14649
First five features:['SAPECCFE3_0000010001' 'SAPECCFE3_0000010007' 'SAPECCFE3_0000010009'
 'SAPECCFE3_0000010011' 'SAPECCFE3_0000010018']
Infrequent categories: 11920
First five entries: ['SAPECCFE3_0000010009' 'SAPECCFE3_0000010011' 'SAPECCFE3_0000010018'
 'SAPECCFE3_0000010020' 'SAPECCFE3_0000010045']
Retained features (# Features - Infrequent features): 2729




In [114]:
X_train_suppl_one.shape

(311220, 2729)

In [115]:
X_test_suppl_one.shape

(61282, 2729)

### Purchasing Organization

In [116]:
# Purchasing organization id as a single-column matrix, then one hot encoded (minimum frequency 10)
X_train_porg = train["puor_id_pror_purchasing_org"].to_numpy().reshape(-1, 1)
X_test_porg = test["puor_id_pror_purchasing_org"].to_numpy().reshape(-1, 1)
encoded_porg = encoding_categorical_features(X_train=X_train_porg, X_test=X_test_porg, min_frq=10)
X_train_porg_one, X_test_porg_one, cats_map_porg = encoded_porg

Number of features:407
First five features:['SAPECCFE3_2A09' 'SAPECCFE3_2A10' 'SAPECCFE3_AE09' 'SAPECCFE3_AE29'
 'SAPECCFE3_AE33']
Infrequent categories: 89
First five entries: ['SAPECCFE3_AE09' 'SAPECCFE3_AE29' 'SAPECCFE3_AE38' 'SAPECCFE3_AE40'
 'SAPECCFE3_AR08']
Retained features (# Features - Infrequent features): 318




In [117]:
X_train_porg_one.shape

(311220, 318)

In [118]:
X_test_porg_one.shape

(61282, 318)

### Company

In [119]:
# Company id as a single-column matrix, then one hot encoded (minimum frequency 20)
X_train_comp = train["puor_id_t001_company"].to_numpy().reshape(-1, 1)
X_test_comp = test["puor_id_t001_company"].to_numpy().reshape(-1, 1)
encoded_comp = encoding_categorical_features(X_train=X_train_comp, X_test=X_test_comp, min_frq=20)
X_train_comp_one, X_test_comp_one, cats_map_comp = encoded_comp

Number of features:90
First five features:['SAPECCFE3_10ES' 'SAPECCFE3_10RO' 'SAPECCFE3_10SE' 'SAPECCFE3_11CN'
 'SAPECCFE3_11SE']
Infrequent categories: 7
First five entries: ['SAPECCFE3_AE10' 'SAPECCFE3_AE20' 'SAPECCFE3_BE30' 'SAPECCFE3_LU35'
 'SAPECCFE3_VN30']
Retained features (# Features - Infrequent features): 83


### Purchasing Group

In [120]:
# Purchasing group id as a single-column matrix, then one hot encoded (minimum frequency 10)
X_train_pgrp = train["puor_id_prgp_purchasing_group"].to_numpy().reshape(-1, 1)
X_test_pgrp = test["puor_id_prgp_purchasing_group"].to_numpy().reshape(-1, 1)
encoded_pgrp = encoding_categorical_features(X_train=X_train_pgrp, X_test=X_test_pgrp, min_frq=10)
X_train_pgrp_one, X_test_pgrp_one, cats_map_prgp = encoded_pgrp

Number of features:1272
First five features:['SAPECCFE3_000' 'SAPECCFE3_004' 'SAPECCFE3_100' 'SAPECCFE3_101'
 'SAPECCFE3_103']
Infrequent categories: 574
First five entries: ['SAPECCFE3_000' 'SAPECCFE3_004' 'SAPECCFE3_100' 'SAPECCFE3_101'
 'SAPECCFE3_10C']
Retained features (# Features - Infrequent features): 698




In [121]:
# Target variable: level 1 category
y_train = train["puor_ds_level1_new"]
y_test = test["puor_ds_level1_new"]

## Modeling

### Predicting Level 1

Let's use just text features to compare tf-idf and word2vec

TF-IDF only

In [122]:
#print("TF-IDF (Lemmatization)")
#tree, y_predictions, y_gold_labels = model(
#    DecisionTreeClassifier, X_tr=X_train_text_proc_lem, y_tr=y_train, X_ts=X_test_text_proc_lem, y_ts=y_test, cls_wght='balanced'
#)

In [123]:
#print("TF-IDF (Stemmatization)")
## weights = {"SUPPLY CHAIN": 0.43, "TECHNICAL PROCUREMENT": 0.75, "GENERAL PROCUREMENT": 0.82}
#tree, y_predictions, y_gold_labels = model(
#    DecisionTreeClassifier, X_tr=X_train_text_proc_stem, y_tr=y_train, X_ts=X_test_text_proc_stem, y_ts=y_test, cls_wght='balanced'
#)

TF-IDF (Stemmatization) with Latent Semantic Analysis

In [124]:
#print(f"TF-IDF (Stemmatization) with Latent Semantic Analysis (Number of components retained: {len(svd.components_)})")
#tree, y_predictions, y_gold_labels = model(
#    DecisionTreeClassifier, X_tr=X_train_text_proc_reduced, y_tr=y_train, X_ts=X_test_text_proc_reduced, y_ts=y_test, cls_wght='balanced'
#)

Using Word2Vec

In [125]:
# print("Word2Vec")
#tree, y_predictions, y_gold_labels = model(
#    DecisionTreeClassifier, X_tr=X_train_word2vec, y_tr=y_train, X_ts=X_test_word2vec, y_ts=y_test, cls_wght='balanced'
#)

I retain the TF-IDF model with stemming, as it gives the best performance and is also faster because it works on sparse vectors.

In the following, I concatenate the text features with the other features (e.g. suppliers, purchasing organizations, etc.).

### Train and Test Set Engineering

In [126]:
# Column-wise concatenation of the stemmed-text block with the four encoded
# categorical blocks. Both splits list their blocks in the same order
# (text, supplier, porg, company, pgrp), which is also the order in which the
# per-block mappings are merged into `categories_mapping`.
train_blocks = [
    X_train_text_proc_stem,
    X_train_suppl_one,
    X_train_porg_one,
    X_train_comp_one,
    X_train_pgrp_one,
]
X_train = sp.sparse.hstack(train_blocks)
# X_train = sp.sparse.hstack([X_train_text_proc,X_train_suppl_one ])

test_blocks = [
    X_test_text_proc_stem,
    X_test_suppl_one,
    X_test_porg_one,
    X_test_comp_one,
    X_test_pgrp_one,
]
X_test = sp.sparse.hstack(test_blocks)
# X_test = sp.sparse.hstack([X_test_text_proc,X_test_suppl_one])

In [127]:
# Per-block feature-name mappings, listed in the same order as the blocks that
# were stacked into X_train / X_test.
dicts = [cats_map_text, cats_map_suppl, cats_map_porg, cats_map_comp, cats_map_prgp]

# Flatten the values of every mapping into one dict keyed by a running index
# (0, 1, 2, ...) that continues across the mappings.
categories_mapping = dict(
    enumerate(name for mapping in dicts for name in mapping.values())
)
counter = len(categories_mapping)  # next free index after the last entry

len(categories_mapping)

6056

In [128]:
# Sanity check: the number of columns should equal len(categories_mapping),
# i.e. one mapping entry per stacked feature column.
X_train.shape

(311220, 6056)

Choosing the best model for level 1 among DecisionTree, Random Forest and Gradient Boosting

In [153]:
# Display `test_dist` (defined in another cell): the relative frequency of each
# level-1 class; the values match the per-class support in the test reports.
test_dist

GENERAL PROCUREMENT      0.552626
TECHNICAL PROCUREMENT    0.440668
SUPPLY CHAIN             0.006707
Name: puor_ds_level1_new, dtype: float64

In [185]:
# Class weights proportional to the inverse of each class's share in
# `test_dist`, normalised so that the most frequent class gets weight 1.0.
# NOTE(review): the shares come from `test_dist` (the test-set label
# distribution), not from the training labels — confirm this is intended.
total_share = test_dist.sum()

# A class with zero share would otherwise cause a division by zero; give it a
# large finite weight instead.
raw_weights = {
    cls: total_share / share if share > 0 else 1e6
    for cls, share in test_dist.items()
}
min_weight = min(raw_weights.values())
class_weights = {cls: w / min_weight for cls, w in raw_weights.items()}

# reducing weight of supply chain
# NOTE(review): the divisor 20 looks hand-tuned — confirm.
class_weights["SUPPLY CHAIN"] /= 20

In [186]:
# Display the final per-class weights passed to the weighted decision tree.
class_weights

{'GENERAL PROCUREMENT': 1.0,
 'TECHNICAL PROCUREMENT': 1.254064062210702,
 'SUPPLY CHAIN': 4.119951338199514}

In [187]:
# Decision tree fitted with the custom `class_weights` dictionary.
# The result is not assigned, so the notebook also displays the returned tuple
# (fitted estimator, test predictions, test labels) after the printed reports.
model(
    DecisionTreeClassifier, X_tr=X_train, y_tr=y_train, X_ts=X_test, y_ts=y_test, cls_wght=class_weights
)

Accuracy on training set: 0.964
Accuracy on test set: 0.956
Accuracy on test set by a DUMMY CLASSIFIER: 0.212
Classification Report for Test Set
                       precision    recall  f1-score   support

  GENERAL PROCUREMENT       0.98      0.95      0.96     33866
         SUPPLY CHAIN       0.33      0.92      0.48       411
TECHNICAL PROCUREMENT       0.96      0.96      0.96     27005

             accuracy                           0.96     61282
            macro avg       0.75      0.95      0.80     61282
         weighted avg       0.96      0.96      0.96     61282

Classification Report for Training Set
                       precision    recall  f1-score   support

  GENERAL PROCUREMENT       1.00      0.81      0.89     56122
         SUPPLY CHAIN       0.94      1.00      0.97    177994
TECHNICAL PROCUREMENT       1.00      1.00      1.00     77104

             accuracy                           0.96    311220
            macro avg       0.98      0.93      0.95   

(DecisionTreeClassifier(class_weight={'GENERAL PROCUREMENT': 1.0,
                                      'SUPPLY CHAIN': 4.119951338199514,
                                      'TECHNICAL PROCUREMENT': 1.254064062210702},
                        random_state=42),
 array(['GENERAL PROCUREMENT', 'SUPPLY CHAIN', 'GENERAL PROCUREMENT', ...,
        'GENERAL PROCUREMENT', 'GENERAL PROCUREMENT',
        'TECHNICAL PROCUREMENT'], dtype=object),
 0          GENERAL PROCUREMENT
 1        TECHNICAL PROCUREMENT
 2          GENERAL PROCUREMENT
 3          GENERAL PROCUREMENT
 4          GENERAL PROCUREMENT
                  ...          
 65241      GENERAL PROCUREMENT
 65242      GENERAL PROCUREMENT
 65244      GENERAL PROCUREMENT
 65245      GENERAL PROCUREMENT
 65246    TECHNICAL PROCUREMENT
 Name: puor_ds_level1_new, Length: 61282, dtype: object)

In [184]:
# Same decision tree without class weighting (`cls_wght=None`), for comparison
# with the weighted run. The returned tuple is displayed, not stored.
model(
    DecisionTreeClassifier, X_tr=X_train, y_tr=y_train, X_ts=X_test, y_ts=y_test, cls_wght=None
)

Accuracy on training set: 0.966
Accuracy on test set: 0.959
Accuracy on test set by a DUMMY CLASSIFIER: 0.215
Classification Report for Test Set
                       precision    recall  f1-score   support

  GENERAL PROCUREMENT       0.97      0.97      0.97     33866
         SUPPLY CHAIN       0.49      0.81      0.61       411
TECHNICAL PROCUREMENT       0.96      0.95      0.96     27005

             accuracy                           0.96     61282
            macro avg       0.81      0.91      0.84     61282
         weighted avg       0.96      0.96      0.96     61282

Classification Report for Training Set
                       precision    recall  f1-score   support

  GENERAL PROCUREMENT       0.98      0.83      0.90     56122
         SUPPLY CHAIN       0.95      0.99      0.97    177994
TECHNICAL PROCUREMENT       1.00      1.00      1.00     77104

             accuracy                           0.97    311220
            macro avg       0.97      0.94      0.96   

(DecisionTreeClassifier(random_state=42),
 array(['GENERAL PROCUREMENT', 'SUPPLY CHAIN', 'GENERAL PROCUREMENT', ...,
        'GENERAL PROCUREMENT', 'GENERAL PROCUREMENT',
        'TECHNICAL PROCUREMENT'], dtype=object),
 0          GENERAL PROCUREMENT
 1        TECHNICAL PROCUREMENT
 2          GENERAL PROCUREMENT
 3          GENERAL PROCUREMENT
 4          GENERAL PROCUREMENT
                  ...          
 65241      GENERAL PROCUREMENT
 65242      GENERAL PROCUREMENT
 65244      GENERAL PROCUREMENT
 65245      GENERAL PROCUREMENT
 65246    TECHNICAL PROCUREMENT
 Name: puor_ds_level1_new, Length: 61282, dtype: object)

In [130]:
# Always raises AssertionError.
# NOTE(review): presumably a deliberate stop so that "Run All" halts here
# before the slower cells below — confirm.
assert False

AssertionError: 

In [None]:
#model(
#    RandomForestClassifier, X_tr=X_train, y_tr=y_train, X_ts=X_test, y_ts=y_test, cls_wght='balanced', random_forest=True, n_ests=20
#)

In [None]:
#model(
#   GradientBoostingClassifier, X_tr=X_train, y_tr=y_train, X_ts=X_test, y_ts=y_test, gradient_boosting=True, n_ests=60, lr=0.01
#)

Best Model: RandomForestClassifier 

In [None]:
# Fit the selected level-1 model and keep it in `tree`; the feature-importance
# cells and the level-2 integration below read this variable.
# NOTE(review): unlike the commented-out random-forest call above, this call
# does not pass `random_forest=True`; whether `n_ests` is honoured without it
# depends on `model`, which is defined elsewhere — confirm.
tree, y_predictions, y_gold_labels = model(
    RandomForestClassifier, X_tr=X_train, y_tr=y_train, X_ts=X_test, y_ts=y_test, cls_wght='balanced', n_ests=10
    #DecisionTreeClassifier, X_tr=X_train, y_tr=y_train, X_ts=X_test, y_ts=y_test, cls_wght='balanced'
)

KeyboardInterrupt: 

In [None]:
# Display the column-index -> feature-name mapping built earlier.
categories_mapping

{0: 'nutella',
 1: 'social',
 2: 'aug',
 3: 'inv',
 4: 'transport',
 5: 'riodejanei',
 6: 'transport riodejanei',
 7: 'kinder',
 8: 'surpris',
 9: 'freight',
 10: 'product',
 11: 'rocher',
 12: 'tvc',
 13: 'high',
 14: 'cabinet',
 15: 'door',
 16: 'guarulho',
 17: 'jundiai',
 18: 'transport guarulho',
 19: 'guarulho jundiai',
 20: 'brasilia',
 21: 'transport brasilia',
 22: 'digit',
 23: 'septemb',
 24: 'post',
 25: 'roc',
 26: 'test',
 27: 'kit',
 28: 'piracicaba',
 29: 'guarulho piracicaba',
 30: 'saopaulo',
 31: 'guarulho saopaulo',
 32: 'payment',
 33: 'lock',
 34: 'lyreco',
 35: 'squar',
 36: 'oct',
 37: 'clean',
 38: 'servic',
 39: 'nielsen',
 40: 'report',
 41: 'chair',
 42: 'legal',
 43: 'nut',
 44: 'hof',
 45: 'fire',
 46: 'control',
 47: 'easter',
 48: 'expens',
 49: 'jan',
 50: 'part',
 51: 'purchas',
 52: 'cord',
 53: 'build',
 54: 'buyout',
 55: 'pack',
 56: 'batteri',
 57: 'rioclaro',
 58: 'guarulho rioclaro',
 59: 'insur',
 60: 'year',
 61: 'evd',
 62: 'sampl',
 63: 'lin

In [None]:
# Per-column importances of the fitted level-1 model, plus the column indices
# ordered from most to least important.
features_imp = tree.feature_importances_
ascending_idx = np.argsort(features_imp)
sort_idx = np.flip(ascending_idx)

In [None]:
# Print the 40 most important features, translating each column index to its
# feature name through `categories_mapping`.
top_columns = sort_idx[:40]
for idx, importance in zip(top_columns, features_imp[top_columns]):
    feature_name = categories_mapping[idx]
    print(f"Feature name: {feature_name}, Feature Importance: {importance}")

Feature name: transport, Feature Importance: 0.2743903094994677
Feature name: SAPECCFE3_LU19, Feature Importance: 0.058997724246649355
Feature name: SAPECCFE3_ITT3, Feature Importance: 0.05220116038421298
Feature name: SAPECCFE3_ZA19, Feature Importance: 0.05114810675447027
Feature name: SAPECCFE3_US29, Feature Importance: 0.0378379729139998
Feature name: SAPECCFE3_AU09, Feature Importance: 0.0327681862052771
Feature name: SAPECCFE3_NL09, Feature Importance: 0.03170465451697461
Feature name: SAPECCFE3_BE20, Feature Importance: 0.023981050429480084
Feature name: SAPECCFE3_BR10, Feature Importance: 0.018724902534549445
Feature name: SAPECCFE3_ES10, Feature Importance: 0.016728244732416422
Feature name: SAPECCFE3_MX03, Feature Importance: 0.01665593575606507
Feature name: SAPECCFE3_CH09, Feature Importance: 0.01665247436022043
Feature name: SAPECCFE3_CN34, Feature Importance: 0.01497297887098398
Feature name: SAPECCFE3_RS09, Feature Importance: 0.014808156533050355
Feature name: SAPECCFE3

In [None]:
# gbrt = GradientBoostingClassifier(random_state=RANDOM_STATE)
# gbrt.fit(X_train, y_train)
#
# print("Accuracy on training set: {:.3f}".format(gbrt.score(X_train, y_train)))
# print("Accuracy on test set: {:.3f}".format(gbrt.score(X_test, y_test)))

### Integrating Level1 and Predicting Level2

In [None]:
# Build the level-2 inputs from the level-1 matrices and the fitted level-1
# model `tree`, then switch the targets to the level-2 labels.
# NOTE(review): presumably the helper appends the encoded level-1 predictions
# as extra columns (its printout lists the three level-1 classes) — confirm
# against its definition.
X_train1, X_test1 = integrating_prev_lev_predict_next_lev(
    X_train, X_test, level="puor_ds_level1_new", model=tree
)
y_train1 = train.puor_ds_level2_new
y_test1 = test.puor_ds_level2_new

Number of features:3
First five features:['GENERAL PROCUREMENT' 'SUPPLY CHAIN' 'TECHNICAL PROCUREMENT']


In [None]:
# Level-2 candidate 1: decision tree with `cls_wght='balanced'`.
# `tree1` is reassigned by each of the level-2 cells that follow.
tree1, y_predictions1, y_gold_labels1 = model(
    DecisionTreeClassifier, X_tr=X_train1, y_tr=y_train1, X_ts=X_test1, y_ts=y_test1, cls_wght='balanced'
)

Accuracy on training set: 0.926
Accuracy on test set: 0.660
Accuracy on test set by a DUMMY CLASSIFIER: 0.066
Classification Report for Test Set


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


                                        precision    recall  f1-score   support

                      AUTOMATION & MRO       0.65      0.70      0.67     13038
CUSTOM BROKERAGE & FORWARDING AGENCIES       0.89      0.91      0.90       117
                                ENERGY       0.07      0.52      0.12       145
      ENERGY AND UTILITIES FOR OFFICES       0.28      0.80      0.41        15
           INFRASTRUCTURE & INDUSTRIAL       0.64      0.48      0.55      9292
           IT - INFORMATION TECHNOLOGY       0.13      0.64      0.22       246
                    MARKETING SERVICES       0.92      0.81      0.86     14748
                              MOBILITY       0.52      0.84      0.64      1439
                          OTHER NON GP       0.00      0.00      0.00         0
                   PALLETTS PURCHASING       0.72      0.57      0.63        23
                 PALLETTS RENT & LEASE       1.00      0.93      0.96        54
                       PEOPLE SERVICES 

In [None]:
# Level-2 candidate 2: random forest with `cls_wght='balanced'` and `n_ests=10`.
# Overwrites `tree1` and the prediction/label variables of the previous cell.
tree1, y_predictions1, y_gold_labels1 = model(
    RandomForestClassifier, X_tr=X_train1, y_tr=y_train1, X_ts=X_test1, y_ts=y_test1, cls_wght='balanced', n_ests=10
)

Accuracy on training set: 0.930
Accuracy on test set: 0.668
Accuracy on test set by a DUMMY CLASSIFIER: 0.068
Classification Report for Test Set


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


                                        precision    recall  f1-score   support

                      AUTOMATION & MRO       0.64      0.76      0.70     13038
CUSTOM BROKERAGE & FORWARDING AGENCIES       0.89      0.94      0.92       117
                                ENERGY       0.09      0.64      0.16       145
      ENERGY AND UTILITIES FOR OFFICES       0.55      0.80      0.65        15
           INFRASTRUCTURE & INDUSTRIAL       0.70      0.47      0.56      9292
           IT - INFORMATION TECHNOLOGY       0.14      0.65      0.23       246
                    MARKETING SERVICES       0.96      0.79      0.87     14748
                              MOBILITY       0.52      0.85      0.64      1439
                          OTHER NON GP       0.00      0.00      0.00         0
                   PALLETTS PURCHASING       0.76      0.57      0.65        23
                 PALLETTS RENT & LEASE       1.00      0.93      0.96        54
                       PEOPLE SERVICES 

In [None]:
# Level-2 candidate 3: gradient boosting (`gradient_boosting=True`); no
# `cls_wght` is passed in this call. Overwrites `tree1` again.
tree1, y_predictions1, y_gold_labels1 = model(
    GradientBoostingClassifier, X_tr=X_train1, y_tr=y_train1, X_ts=X_test1, y_ts=y_test1, gradient_boosting=True
)

Accuracy on training set: 0.910
Accuracy on test set: 0.678
Accuracy on test set by a DUMMY CLASSIFIER: 0.068
Classification Report for Test Set


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


                                        precision    recall  f1-score   support

                      AUTOMATION & MRO       0.54      0.96      0.69     13038
CUSTOM BROKERAGE & FORWARDING AGENCIES       0.81      0.87      0.84       117
                                ENERGY       0.67      0.46      0.54       145
      ENERGY AND UTILITIES FOR OFFICES       0.61      0.73      0.67        15
           INFRASTRUCTURE & INDUSTRIAL       0.90      0.20      0.33      9292
           IT - INFORMATION TECHNOLOGY       0.37      0.89      0.52       246
                    MARKETING SERVICES       0.97      0.78      0.87     14748
                              MOBILITY       0.68      0.83      0.75      1439
                          OTHER NON GP       0.00      0.00      0.00         0
                   PALLETTS PURCHASING       0.00      0.00      0.00        23
                 PALLETTS RENT & LEASE       0.83      0.96      0.89        54
                       PEOPLE SERVICES 

In [None]:
# Same call as the first level-2 cell: after the random-forest and
# gradient-boosting runs overwrote `tree1`, this rebinds it to a decision tree,
# which is the model handed to the level-3 integration below.
tree1, y_predictions1, y_gold_labels1 = model(
    DecisionTreeClassifier, X_tr=X_train1, y_tr=y_train1, X_ts=X_test1, y_ts=y_test1, cls_wght='balanced'
)

In [None]:
#X_test_lvl2_pred = tree1.predict(X_test1).reshape(-1, 1)
#n = 0
#range = np.arange(130, 140)
#for i in range + 1:
#    print(i)
#    print(f"PO Line: {test.puor_id_order_line.iloc[i]}")
#    if X_test_lvl2_pred[i][0] == test.puor_ds_level2_new.iloc[i]:
#        print("vvv CORRECT PREDICTION vvv")
#        n += 1
#    else:
#        print("xxx NOT CORRECT PREDICTION xxx")
#
#    print(f"Prediction is {X_test_lvl2_pred[i][0]}")
#    print(f"Truth is {test.puor_ds_level2_new.iloc[i]}")
#    print("------")
#print(f"Number of correct cases {n} out of {len(range)}")

131
PO Line: SAPECCFE3_6309078766_910
vvv CORRECT PREDICTION vvv
Prediction is MOBILITY
Truth is MOBILITY
------
132
PO Line: SAPECCFE3_6100163920_50001
xxx NOT CORRECT PREDICTION xxx
Prediction is PROCESS EQUIPMENT
Truth is AUTOMATION & MRO
------
133
PO Line: SAPECCFE3_1606016849_20
vvv CORRECT PREDICTION vvv
Prediction is MARKETING SERVICES
Truth is MARKETING SERVICES
------
134
PO Line: SAPECCFE3_9308016883_10
vvv CORRECT PREDICTION vvv
Prediction is INFRASTRUCTURE & INDUSTRIAL
Truth is INFRASTRUCTURE & INDUSTRIAL
------
135
PO Line: SAPECCFIP_5000197525_30
vvv CORRECT PREDICTION vvv
Prediction is SALES SERVICES
Truth is SALES SERVICES
------
136
PO Line: SAPECCFE3_4020006608_20
vvv CORRECT PREDICTION vvv
Prediction is WORKPLACE SERVICES
Truth is WORKPLACE SERVICES
------
137
PO Line: SAPECCFE3_1620001659_30
vvv CORRECT PREDICTION vvv
Prediction is MARKETING SERVICES
Truth is MARKETING SERVICES
------
138
PO Line: SAPECCFE3_3304001012_10
vvv CORRECT PREDICTION vvv
Prediction is WOR

### Integrating Level2 and Predicting Level3

In [None]:
# Build the level-3 inputs from the level-2 matrices and the level-2 model
# `tree1` (whichever estimator the last executed level-2 cell assigned), then
# switch the targets to the level-3 labels.
X_train2, X_test2 = integrating_prev_lev_predict_next_lev(
    X_train1, X_test1, level="puor_ds_level2_new", model=tree1
)
y_train2 = train.puor_ds_level3_new
y_test2 = test.puor_ds_level3_new

Number of features:20
First five features:['AUTOMATION & MRO' 'CUSTOM BROKERAGE & FORWARDING AGENCIES' 'ENERGY'
 'ENERGY AND UTILITIES FOR OFFICES' 'INFRASTRUCTURE & INDUSTRIAL']


In [None]:
# Level-3 model: decision tree with `cls_wght='balanced'`; `tree2` feeds the
# level-4 integration below.
tree2, y_predictions2, y_gold_labels2 = model(
    DecisionTreeClassifier, X_tr=X_train2, y_tr=y_train2, X_ts=X_test2, y_ts=y_test2, cls_wght='balanced'
)

Accuracy on training set: 0.908
Accuracy on test set: 0.662
Accuracy on test set by a DUMMY CLASSIFIER: 0.035
Classification Report for Test Set


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


                                                           precision    recall  f1-score   support

                                      ADMINISTRATION COST       0.00      0.00      0.00         0
                                  ADMINISTRATIVE SERVICES       0.47      0.85      0.61        52
                                B&U PROFESSIONAL SERVICES       0.11      0.45      0.17       329
                                           CERTIFICATIONS       0.02      0.50      0.03        26
                                                 CLEANING       0.83      0.37      0.51      1876
                                         CLEANING (LOCAL)       0.08      1.00      0.14        17
                         COMMERCIAL QUALITY AND FRESHNESS       0.91      0.68      0.78        44
                                     CONSULTANCY SERVICES       0.50      0.54      0.52       125
                               CONSUMER RESEARCH (AD-HOC)       0.28      0.84      0.41       237
         

In [None]:
#X_test_lvl3_pred = tree2.predict(X_test2).reshape(-1, 1)
#n = 0
#range = np.arange(130, 140)
#for i in range + 1:
#    print(i)
#    print(f"PO Line: {test.puor_id_order_line.iloc[i]}")
#    if X_test_lvl3_pred[i][0] == test.puor_ds_level3_new.iloc[i]:
#        print("vvv CORRECT PREDICTION vvv")
#        n += 1
#    else:
#        print("xxx NOT CORRECT PREDICTION xxx")
#
#    print(f"Prediction is {X_test_lvl3_pred[i][0]}")
#    print(f"Truth is {test.puor_ds_level3_new.iloc[i]}")
#    print("------")
#print(f"Number of correct cases {n} out of {len(range)}")

131
PO Line: SAPECCFE3_6309078766_910
vvv CORRECT PREDICTION vvv
Prediction is FLEET
Truth is FLEET
------
132
PO Line: SAPECCFE3_6100163920_50001
xxx NOT CORRECT PREDICTION xxx
Prediction is MAINTENANCE SERVICES
Truth is INDUSTRIAL AUTOMATION AND ROBOTICS
------
133
PO Line: SAPECCFE3_1606016849_20
vvv CORRECT PREDICTION vvv
Prediction is CREATIVE DESIGN
Truth is CREATIVE DESIGN
------
134
PO Line: SAPECCFE3_9308016883_10
xxx NOT CORRECT PREDICTION xxx
Prediction is MAINTENANCE & IMPROVEMENTS
Truth is FACILITY MANAGEMENT SOFTSERVICES FOR INDUSTRIAL LOCATIONS
------
135
PO Line: SAPECCFIP_5000197525_30
xxx NOT CORRECT PREDICTION xxx
Prediction is MERCHANDISING & HOSTESS & BROKER COMMISSIONS
Truth is CONSUMERS PROMOTION ACTIVITIES
------
136
PO Line: SAPECCFE3_4020006608_20
vvv CORRECT PREDICTION vvv
Prediction is OFFICE SUPPLIES AND SERVICES
Truth is OFFICE SUPPLIES AND SERVICES
------
137
PO Line: SAPECCFE3_1620001659_30
vvv CORRECT PREDICTION vvv
Prediction is CREATIVE DESIGN
Truth i

### Integrating Level3 and Predicting Level4

In [None]:
# Build the level-4 inputs from the level-3 matrices and the level-3 model
# `tree2`, then switch the targets to the level-4 labels.
X_train3, X_test3 = integrating_prev_lev_predict_next_lev(
    X_train2, X_test2, level="puor_ds_level3_new", model=tree2
)
y_train3 = train.puor_ds_level4_new
y_test3 = test.puor_ds_level4_new

Number of features:104
First five features:['ADMINISTRATION COST' 'ADMINISTRATIVE SERVICES' 'AIR FREIGHT TRANSPORT'
 'B&U PROFESSIONAL SERVICES' 'BONUS / MALUS\xa0(WAREHOUSING)']


In [None]:
# Level-4 model: decision tree with `cls_wght='balanced'`; `tree3` is used by
# the spot-check cell below.
tree3, y_predictions3, y_gold_labels3 = model(
    DecisionTreeClassifier, X_tr=X_train3, y_tr=y_train3, X_ts=X_test3, y_ts=y_test3, cls_wght='balanced'
)

Accuracy on training set: 0.887
Accuracy on test set: 0.641
Accuracy on test set by a DUMMY CLASSIFIER: 0.005
Classification Report for Test Set


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


                                                              precision    recall  f1-score   support

                                            1/4 Wood Pallets       1.00      1.00      1.00        14
                  1st Level Support Outsourcing Applications       0.49      0.88      0.63        26
                    1st Level Support Outsourcing Technology       0.00      0.00      0.00        11
                    2nd Level Support Outsourcing Technology       0.00      0.00      0.00        37
       Ad hoc reports/ analysis performed on Retail Panel DB       0.33      0.15      0.21        13
       Ad hoc reports/analysis on syndicated data (excl RMS)       0.28      0.42      0.33        12
         Additional Services and Costs for Real Estate Lease       0.44      0.92      0.59        26
                            Administrative Costs Warehousing       0.00      0.00      0.00         0
                        Application Perpetual Licenses CAPEX       1.00      1.00

In [None]:
# Spot-check the level-4 model on ten consecutive test rows: print the PO line,
# the predicted and the true level-4 label, and count the exact matches.
X_test_lvl4_pred = tree3.predict(X_test3).reshape(-1, 1)  # one prediction per row, shape (n_rows, 1)
n = 0  # number of correct predictions among the inspected rows

# Row positions 131..140. Stored under its own name: calling this array `range`
# would rebind the builtin `range` for every cell executed afterwards.
sample_positions = np.arange(130, 140) + 1
for i in sample_positions:
    predicted_label = X_test_lvl4_pred[i][0]
    true_label = test.puor_ds_level4_new.iloc[i]

    print(i)
    print(f"PO Line: {test.puor_id_order_line.iloc[i]}")
    if predicted_label == true_label:
        print("vvv CORRECT PREDICTION vvv")
        n += 1
    else:
        print("xxx NOT CORRECT PREDICTION xxx")

    print(f"Prediction is {predicted_label}")
    print(f"Truth is {true_label}")
    print("------")
print(f"Number of correct cases {n} out of {len(sample_positions)}")

131
PO Line: SAPECCFE3_6309078766_910
vvv CORRECT PREDICTION vvv
Prediction is Company Vehicles - Lease
Truth is Company Vehicles - Lease
------
132
PO Line: SAPECCFE3_6100163920_50001
vvv CORRECT PREDICTION vvv
Prediction is Automation professional services
Truth is Automation professional services
------
133
PO Line: SAPECCFE3_1606016849_20
vvv CORRECT PREDICTION vvv
Prediction is Packaging & POS - Creative Concept
Truth is Packaging & POS - Creative Concept
------
134
PO Line: SAPECCFE3_9308016883_10
vvv CORRECT PREDICTION vvv
Prediction is Pest Control
Truth is Pest Control
------
135
PO Line: SAPECCFIP_5000197525_30
vvv CORRECT PREDICTION vvv
Prediction is Design & Management of non Digital Promotion
Truth is Design & Management of non Digital Promotion
------
136
PO Line: SAPECCFE3_4020006608_20
vvv CORRECT PREDICTION vvv
Prediction is Office supplies
Truth is Office supplies
------
137
PO Line: SAPECCFE3_1620001659_30
vvv CORRECT PREDICTION vvv
Prediction is Surprises - Technica