In [1]:
import pandas as pd
import numpy as np

import warnings
from sklearn.exceptions import ConvergenceWarning, FitFailedWarning
warnings.filterwarnings(action='ignore', category=ConvergenceWarning)
warnings.filterwarnings(action='ignore', category=FitFailedWarning)
warnings.filterwarnings(action='ignore', category=UserWarning)

In [2]:
info = pd.read_excel('Final Cleaned Data.xlsx')
info.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1185 entries, 0 to 1184
Data columns (total 5 columns):
ASIN                    1185 non-null object
Description             1185 non-null object
Price                   1185 non-null float64
Verified Subcategory    1185 non-null object
Description_New         1185 non-null object
dtypes: float64(1), object(4)
memory usage: 46.4+ KB


In [3]:
'''
    Now we have our models trained on different features. 
    We want to combine their predictive force to make a even-better-performed model using stacking method
    But first we need to preprocess our models to make sure they work on the features that they are good at.
    We also use pipeline here just to make the process easier
'''

from sklearn.preprocessing import FunctionTransformer
from sklearn.pipeline import FeatureUnion, Pipeline
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import cross_val_score, train_test_split
from sklearn.metrics import classification_report, confusion_matrix
from joblib import load

#Lambda functions are replaced by regular functions since they can't be saved as pickle files
#get_text = FunctionTransformer(lambda x: x['Description_New'], validate=False)
#get_num = FunctionTransformer(lambda x: x['Price'].values.reshape(-1,1), validate=False)

def get_text_func(df):
    return df['Description_New']

def get_num_func(df):
    return df['Price'].values.reshape(-1,1)

get_text = FunctionTransformer(get_text_func, validate=False)
get_num = FunctionTransformer(get_num_func, validate=False)

price_model = load('Price_knn.pkl')
text_model_vote = load('Text_voting.pkl')
text_model_count = load('Text_RF_count.pkl')
text_model_ngram = load('Text_LR_ngram.pkl')
text_model_tfidf = load('Text_LR_tfidf.pkl')

In [4]:
'''
    The first stacking method I tried is VotingClassifier.
    It combines each estimator's predicted probability with a given weight (if using soft voting), or a majority vote if using hard voting
'''

from sklearn.ensemble import VotingClassifier
pl_price = Pipeline([('selector', get_num),
                     ('clf', price_model)])
pl_text_tfidf = Pipeline([('selector', get_text), 
                    ('vectorizer', TfidfVectorizer()),
                    ('clf', text_model_tfidf)])

pl_text_ngram = Pipeline([('selector', get_text), 
                    ('vectorizer', CountVectorizer(ngram_range=(2,3))),
                    ('clf', text_model_ngram)])

pl_text_count = Pipeline([('selector', get_text), 
                    ('vectorizer', CountVectorizer()),
                    ('clf', text_model_count)])

vote = VotingClassifier(
    estimators = [
        ('1', pl_text_tfidf),
        ('2', pl_text_count),
        ('3', pl_text_ngram),
        ('4', pl_price)
                 ],
    voting='soft', 
    weights=[0.25, 0.25, 0.25, 0.25]
)

print(cross_val_score(vote, info, info['Verified Subcategory'], cv=5, n_jobs=-1).mean())

X_train, X_test, y_train, y_test = train_test_split(info, info['Verified Subcategory'], test_size=0.2, stratify=info['Verified Subcategory'], random_state=1)
vote.fit(X_train, y_train)
print(vote.score(X_test, y_test))
print(classification_report(vote.predict(X_test), y_test))


0.8767932489451477
0.9324894514767933
                                precision    recall  f1-score   support

   DP and MDP Display Adapters       0.98      0.91      0.94        91
         HDMI and DVI Adapters       0.83      1.00      0.90        19
                          None       0.92      0.92      0.92        60
Thunderbolt 3 Display Adapters       0.33      1.00      0.50         1
        USB-A Display Adapters       0.95      1.00      0.97        18
        USB-C Display Adapters       0.96      0.94      0.95        48

                      accuracy                           0.93       237
                     macro avg       0.83      0.96      0.86       237
                  weighted avg       0.94      0.93      0.93       237



In [5]:
'''
    The second stacking method I tried is StackingClassifier.
    The base estimators take inputs first, provide outputs, and the meta estimators take these as inputs. 
    To be honest I don't know what's the best choice for final estimator but from what I found on different resources, 
    LogisticRegression is a commonly used one. 
'''
from sklearn.ensemble import StackingClassifier
from sklearn.linear_model import LogisticRegression

stack = StackingClassifier(
    estimators=[
        ('1', pl_text_tfidf),
        ('2', pl_text_count),
        ('3', pl_text_ngram),
        ('4', pl_price)], 
    final_estimator=LogisticRegression()
)

print(cross_val_score(stack, info, info['Verified Subcategory'], cv=5, n_jobs=-1).mean())
stack.fit(X_train, y_train)
print(stack.score(X_test, y_test))
print(classification_report(stack.predict(X_test), y_test))

0.8784810126582279
0.919831223628692
                                precision    recall  f1-score   support

   DP and MDP Display Adapters       0.94      0.91      0.92        88
         HDMI and DVI Adapters       0.87      0.91      0.89        22
                          None       0.93      0.89      0.91        63
Thunderbolt 3 Display Adapters       0.33      1.00      0.50         1
        USB-A Display Adapters       0.95      1.00      0.97        18
        USB-C Display Adapters       0.91      0.96      0.93        45

                      accuracy                           0.92       237
                     macro avg       0.82      0.94      0.86       237
                  weighted avg       0.93      0.92      0.92       237



In [6]:
'''
    Finally, let's save these models! 
'''
from joblib import dump

dump(vote, "Final_vote.pkl")
dump(stack, "Final_stack.pkl")

['Final_stack.pkl']

In [8]:
print(classification_report(vote.predict(info), info['Verified Subcategory']))

                                precision    recall  f1-score   support

   DP and MDP Display Adapters       1.00      0.98      0.99       429
         HDMI and DVI Adapters       0.96      1.00      0.98       110
                          None       0.98      0.98      0.98       300
Thunderbolt 3 Display Adapters       0.86      1.00      0.92        12
        USB-A Display Adapters       0.99      1.00      0.99        97
        USB-C Display Adapters       0.99      0.99      0.99       237

                      accuracy                           0.99      1185
                     macro avg       0.96      0.99      0.98      1185
                  weighted avg       0.99      0.99      0.99      1185

